diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,50217 +10,50217 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 2041.5848999023438, + "completion_length": 2035.6161193847656, "epoch": 0.00029870808752146963, - "grad_norm": 0.07196161895990372, + "grad_norm": 0.07377588003873825, "kl": 0.0, - "learning_rate": 2.985074626865672e-10, - "loss": 0.0032, - "reward": 0.3046875149011612, - "reward_std": 0.04920794372446835, - "rewards/accuracy_reward": 0.04687500116415322, + "learning_rate": 1.4925373134328358e-09, + "loss": 0.0095, + "reward": 0.2991071566939354, + "reward_std": 0.035552696557715535, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.2566964477300644, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 2016.7254638671875, + "completion_length": 2006.6161193847656, "epoch": 0.0005974161750429393, - "grad_norm": 0.06716308742761612, + "grad_norm": 0.07657723128795624, "kl": 0.0, - "learning_rate": 5.970149253731344e-10, - "loss": 0.0148, - "reward": 0.3364955484867096, - "reward_std": 0.014682387001812458, - "rewards/accuracy_reward": 0.0714285746216774, + "learning_rate": 2.9850746268656717e-09, + "loss": 0.0203, + "reward": 0.3392857313156128, + "reward_std": 0.023831423372030258, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.2656250149011612, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 2035.9375305175781, + "completion_length": 2046.8616333007812, "epoch": 0.0008961242625644089, - "grad_norm": 0.05492328107357025, - "kl": 1.329183578491211e-05, - "learning_rate": 8.955223880597015e-10, - "loss": 0.0091, - "reward": 0.2645089402794838, - "reward_std": 0.026757876621559262, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 0.059486813843250275, + "kl": 0.00015664100646972656, + "learning_rate": 4.477611940298507e-09, + "loss": 0.0006, + "reward": 0.2611607238650322, + "reward_std": 0.026519399601966143, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.2566964328289032, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 2046.9554138183594, + "completion_length": 2044.7544860839844, "epoch": 0.0011948323500858785, - "grad_norm": 0.04859047383069992, - "kl": 5.7578086853027344e-05, - "learning_rate": 1.1940298507462687e-09, - "loss": 0.0014, - "reward": 0.3286830484867096, - "reward_std": 0.027603261172771454, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 0.039279546588659286, + "kl": 0.0001628398895263672, + "learning_rate": 5.970149253731343e-09, + "loss": 0.0061, + "reward": 0.3231026902794838, + "reward_std": 0.006696428870782256, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901828289032, + "rewards/tag_count_reward": 0.2516741156578064, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 2007.4688415527344, + "completion_length": 2008.3393859863281, "epoch": 0.0014935404376073482, - "grad_norm": 0.08128504455089569, - "kl": 0.0001773834228515625, - "learning_rate": 1.4925373134328356e-09, - "loss": 0.0171, - "reward": 0.341517873108387, - "reward_std": 0.02414499269798398, + "grad_norm": 0.08565092086791992, + "kl": 0.00017762184143066406, + "learning_rate": 7.462686567164179e-09, + "loss": 0.0173, + "reward": 0.3404018059372902, + "reward_std": 0.023454802576452494, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.2689732238650322, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 2027.4732971191406, + "completion_length": 2036.5625610351562, "epoch": 0.0017922485251288178, - "grad_norm": 0.09373266994953156, - "kl": 0.00017452239990234375, - "learning_rate": 1.791044776119403e-09, - "loss": 0.0137, - "reward": 0.3275669738650322, - "reward_std": 0.09999084658920765, - "rewards/accuracy_reward": 0.06250000488944352, + "grad_norm": 0.06848834455013275, + "kl": 0.00017714500427246094, + "learning_rate": 8.955223880597015e-09, + "loss": 0.005, + "reward": 0.3108259066939354, + "reward_std": 0.05675417836755514, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.2617187649011612, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 2040.544677734375, + "completion_length": 2044.9017944335938, "epoch": 0.0020909566126502874, - "grad_norm": 0.054204873740673065, + "grad_norm": 0.04619584605097771, "kl": 0.0001800060272216797, - "learning_rate": 2.0895522388059703e-09, - "loss": 0.0063, - "reward": 0.2594866156578064, - "reward_std": 0.01549767074175179, + "learning_rate": 1.044776119402985e-08, + "loss": 0.0023, + "reward": 0.2539062649011612, + "reward_std": 0.008738514501601458, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.2539062649011612, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 2040.2500610351562, + "completion_length": 2034.5111999511719, "epoch": 0.002389664700171757, - "grad_norm": 0.06061135604977608, - "kl": 0.00017833709716796875, - "learning_rate": 2.3880597014925374e-09, - "loss": 0.009, - "reward": 0.3353794738650322, - "reward_std": 0.036229430697858334, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 0.05582153797149658, + "kl": 0.00017952919006347656, + "learning_rate": 1.1940298507462687e-08, + "loss": 0.0136, + "reward": 0.3314732238650322, + "reward_std": 0.0254965724889189, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.2555803656578064, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 2035.3214721679688, + "completion_length": 2038.3259582519531, "epoch": 0.0026883727876932267, - "grad_norm": 0.06799230724573135, - "kl": 0.00017833709716796875, - "learning_rate": 2.686567164179104e-09, - "loss": 0.01, - "reward": 0.2968750074505806, - "reward_std": 0.026149316923692822, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.0764269232749939, + "kl": 0.00017571449279785156, + "learning_rate": 1.3432835820895521e-08, + "loss": 0.0089, + "reward": 0.2996651902794838, + "reward_std": 0.04299068497493863, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.259486623108387, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 2026.1764221191406, + "completion_length": 2032.02685546875, "epoch": 0.0029870808752146963, - "grad_norm": 0.08426691591739655, - "kl": 0.0001728534698486328, - "learning_rate": 2.9850746268656713e-09, - "loss": 0.0157, - "reward": 0.3431919887661934, - "reward_std": 0.052618308225646615, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 0.06721504032611847, + "kl": 0.00016880035400390625, + "learning_rate": 1.4925373134328357e-08, + "loss": 0.013, + "reward": 0.3320312649011612, + "reward_std": 0.027131953742355108, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.2583705484867096, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 2015.7054748535156, + "completion_length": 1999.3639221191406, "epoch": 0.003285788962736166, - "grad_norm": 0.08199253678321838, - "kl": 0.00017452239990234375, - "learning_rate": 3.2835820895522384e-09, - "loss": 0.0141, - "reward": 0.314174123108387, - "reward_std": 0.04220882756635547, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.09062264859676361, + "kl": 0.00017023086547851562, + "learning_rate": 1.6417910447761193e-08, + "loss": 0.0221, + "reward": 0.3119419813156128, + "reward_std": 0.039642661809921265, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.2717634066939354, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 2041.2210998535156, + "completion_length": 2043.2790832519531, "epoch": 0.0035844970502576356, - "grad_norm": 0.0718466192483902, - "kl": 0.0001678466796875, - "learning_rate": 3.582089552238806e-09, - "loss": 0.0066, - "reward": 0.3387276902794838, - "reward_std": 0.04345958912745118, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 0.07396755367517471, + "kl": 0.00017070770263671875, + "learning_rate": 1.791044776119403e-08, + "loss": 0.0033, + "reward": 0.3376116156578064, + "reward_std": 0.04667802760377526, + "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705559372902, + "rewards/tag_count_reward": 0.259486623108387, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 2008.6272888183594, + "completion_length": 2009.6763916015625, "epoch": 0.003883205137779105, - "grad_norm": 0.08510968834161758, - "kl": 0.00017142295837402344, - "learning_rate": 3.880597014925373e-09, - "loss": 0.0153, - "reward": 0.2996651828289032, - "reward_std": 0.01867468887940049, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.06218832731246948, + "kl": 0.00017213821411132812, + "learning_rate": 1.9402985074626865e-08, + "loss": 0.0124, + "reward": 0.301897332072258, + "reward_std": 0.016571818618103862, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2639508992433548, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 2041.930908203125, + "completion_length": 2045.700927734375, "epoch": 0.004181913225300575, - "grad_norm": 0.07649057358503342, - "kl": 0.0001742839813232422, - "learning_rate": 4.179104477611941e-09, - "loss": 0.0085, - "reward": 0.3364955484867096, - "reward_std": 0.04992468957789242, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 0.05105062201619148, + "kl": 0.0001728534698486328, + "learning_rate": 2.08955223880597e-08, + "loss": 0.0035, + "reward": 0.3231026902794838, + "reward_std": 0.00974611658602953, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705559372902, + "rewards/tag_count_reward": 0.2516741082072258, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 2004.58935546875, + "completion_length": 2007.4286499023438, "epoch": 0.004480621312822045, - "grad_norm": 0.08563992381095886, - "kl": 0.0001800060272216797, - "learning_rate": 4.477611940298507e-09, - "loss": 0.0199, - "reward": 0.3359375149011612, - "reward_std": 0.08687988109886646, - "rewards/accuracy_reward": 0.06473214761354029, + "grad_norm": 0.07748989015817642, + "kl": 0.0001804828643798828, + "learning_rate": 2.2388059701492534e-08, + "loss": 0.0197, + "reward": 0.3297991156578064, + "reward_std": 0.0723750137258321, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2712053656578064, + "rewards/tag_count_reward": 0.2695312649011612, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 2038.9866333007812, + "completion_length": 2044.7009582519531, "epoch": 0.004779329400343514, - "grad_norm": 0.050895314663648605, - "kl": 0.00017523765563964844, - "learning_rate": 4.776119402985075e-09, - "loss": 0.0057, - "reward": 0.310267873108387, - "reward_std": 0.030567446490749717, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 0.06081807240843773, + "kl": 0.00017905235290527344, + "learning_rate": 2.3880597014925373e-08, + "loss": 0.001, + "reward": 0.2991071566939354, + "reward_std": 0.0357142873108387, + "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.254464291036129, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 2034.165283203125, + "completion_length": 2036.9978332519531, "epoch": 0.005078037487864984, - "grad_norm": 0.07308439910411835, - "kl": 0.00015878677368164062, - "learning_rate": 5.0746268656716416e-09, - "loss": 0.0115, - "reward": 0.3080357238650322, - "reward_std": 0.060329388128593564, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 0.07298179715871811, + "kl": 0.00016069412231445312, + "learning_rate": 2.5373134328358206e-08, + "loss": 0.0098, + "reward": 0.3046875149011612, + "reward_std": 0.04351334855891764, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.2622768059372902, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 2023.1340026855469, + "completion_length": 2031.7232971191406, "epoch": 0.005376745575386453, - "grad_norm": 0.07275012135505676, - "kl": 0.000179290771484375, - "learning_rate": 5.373134328358208e-09, - "loss": 0.0118, - "reward": 0.3359375149011612, - "reward_std": 0.034183790208771825, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 0.05950786545872688, + "kl": 0.00017952919006347656, + "learning_rate": 2.6865671641791042e-08, + "loss": 0.0092, + "reward": 0.3370535895228386, + "reward_std": 0.03425386408343911, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2589285895228386, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 1999.8415832519531, + "completion_length": 2000.6094360351562, "epoch": 0.0056754536629079234, - "grad_norm": 0.09680669754743576, - "kl": 0.0001761913299560547, - "learning_rate": 5.671641791044776e-09, - "loss": 0.0201, - "reward": 0.376116082072258, - "reward_std": 0.0977831743657589, - "rewards/accuracy_reward": 0.10714286053553224, + "grad_norm": 0.0792248547077179, + "kl": 0.00018167495727539062, + "learning_rate": 2.835820895522388e-08, + "loss": 0.0219, + "reward": 0.3571428656578064, + "reward_std": 0.07195467129349709, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.2656250074505806, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 2042.12060546875, + "completion_length": 2033.5491638183594, "epoch": 0.005974161750429393, - "grad_norm": 0.05396566540002823, - "kl": 0.00017571449279785156, - "learning_rate": 5.9701492537313425e-09, - "loss": 0.0055, - "reward": 0.345982164144516, - "reward_std": 0.05132496077567339, - "rewards/accuracy_reward": 0.08928571990691125, + "grad_norm": 0.06336899846792221, + "kl": 0.0001785755157470703, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.009, + "reward": 0.364955373108387, + "reward_std": 0.0678480037022382, + "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.2600446566939354, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 2019.0000305175781, + "completion_length": 2025.8326416015625, "epoch": 0.006272869837950863, - "grad_norm": 0.058006905019283295, - "kl": 0.00017523765563964844, - "learning_rate": 6.268656716417909e-09, - "loss": 0.0047, - "reward": 0.3325892984867096, - "reward_std": 0.017809624783694744, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 0.04469405114650726, + "kl": 0.00017404556274414062, + "learning_rate": 3.134328358208955e-08, + "loss": 0.0088, + "reward": 0.3309151902794838, + "reward_std": 0.008267207071185112, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285746216774, + "rewards/tag_count_reward": 0.2594866156578064, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 2036.7076416015625, + "completion_length": 2033.9129943847656, "epoch": 0.006571577925472332, - "grad_norm": 0.06518836319446564, - "kl": 0.00017547607421875, - "learning_rate": 6.567164179104477e-09, - "loss": 0.0148, - "reward": 0.2561384066939354, - "reward_std": 0.01694456161931157, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.058748260140419006, + "kl": 0.00017070770263671875, + "learning_rate": 3.2835820895522386e-08, + "loss": 0.0099, + "reward": 0.263392873108387, + "reward_std": 0.028387735132128, + "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.2566964402794838, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 2038.2857360839844, + "completion_length": 2042.3192138671875, "epoch": 0.006870286012993802, - "grad_norm": 0.05401337146759033, - "kl": 0.00017714500427246094, - "learning_rate": 6.865671641791044e-09, - "loss": 0.0049, - "reward": 0.2957589402794838, - "reward_std": 0.02767741889692843, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 0.05660548433661461, + "kl": 0.00017571449279785156, + "learning_rate": 3.432835820895522e-08, + "loss": 0.0034, + "reward": 0.3030134066939354, + "reward_std": 0.036540572764351964, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2555803656578064, + "rewards/tag_count_reward": 0.2561384066939354, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 2009.6250610351562, + "completion_length": 2004.4509887695312, "epoch": 0.007168994100515271, - "grad_norm": 0.08191654086112976, - "kl": 0.0001704692840576172, - "learning_rate": 7.164179104477612e-09, - "loss": 0.0214, - "reward": 0.4179687723517418, - "reward_std": 0.059019600972533226, - "rewards/accuracy_reward": 0.1495535783469677, + "grad_norm": 0.07296764850616455, + "kl": 0.0001685619354248047, + "learning_rate": 3.582089552238806e-08, + "loss": 0.0175, + "reward": 0.424107164144516, + "reward_std": 0.05331416008993983, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151977300644, + "rewards/tag_count_reward": 0.2700892984867096, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 2043.7120971679688, + "completion_length": 2046.154052734375, "epoch": 0.007467702188036741, - "grad_norm": 0.05558430030941963, - "kl": 0.00017499923706054688, - "learning_rate": 7.462686567164179e-09, - "loss": 0.0049, - "reward": 0.3074776902794838, - "reward_std": 0.02943751262500882, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 0.050552818924188614, + "kl": 0.00017547607421875, + "learning_rate": 3.731343283582089e-08, + "loss": 0.0016, + "reward": 0.3046875149011612, + "reward_std": 0.03753993893042207, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.255580373108387, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 2028.3013916015625, + "completion_length": 2035.3482666015625, "epoch": 0.00776641027555821, - "grad_norm": 0.07265523076057434, - "kl": 0.0001735687255859375, - "learning_rate": 7.761194029850746e-09, - "loss": 0.0109, - "reward": 0.3152901902794838, - "reward_std": 0.052725025452673435, - "rewards/accuracy_reward": 0.05133928684517741, + "grad_norm": 0.06812873482704163, + "kl": 0.00017070770263671875, + "learning_rate": 3.880597014925373e-08, + "loss": 0.0089, + "reward": 0.3113839328289032, + "reward_std": 0.04659886518493295, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.2600446566939354, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 2038.9822082519531, + "completion_length": 2041.4754943847656, "epoch": 0.00806511836307968, - "grad_norm": 0.06600334495306015, - "kl": 0.00019073486328125, - "learning_rate": 8.059701492537314e-09, - "loss": 0.0072, - "reward": 0.4179687574505806, - "reward_std": 0.03689300501719117, + "grad_norm": 0.060088448226451874, + "kl": 0.0001895427703857422, + "learning_rate": 4.029850746268657e-08, + "loss": 0.0076, + "reward": 0.4140625223517418, + "reward_std": 0.0314870816655457, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.2555803656578064, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 2032.3125610351562, + "completion_length": 2033.1295471191406, "epoch": 0.00836382645060115, - "grad_norm": 0.06140688806772232, - "kl": 0.00017452239990234375, - "learning_rate": 8.358208955223881e-09, - "loss": 0.0116, - "reward": 0.3074776977300644, - "reward_std": 0.03334940271452069, + "grad_norm": 0.07130387425422668, + "kl": 0.00017189979553222656, + "learning_rate": 4.17910447761194e-08, + "loss": 0.0102, + "reward": 0.309151791036129, + "reward_std": 0.03787641413509846, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.262276791036129, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 1986.5937805175781, + "completion_length": 1981.497802734375, "epoch": 0.008662534538122619, - "grad_norm": 0.08826440572738647, - "kl": 0.0001823902130126953, - "learning_rate": 8.656716417910449e-09, - "loss": 0.0199, - "reward": 0.329241082072258, - "reward_std": 0.07924155634827912, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 0.09127658605575562, + "kl": 0.000179290771484375, + "learning_rate": 4.328358208955224e-08, + "loss": 0.0169, + "reward": 0.3242187649011612, + "reward_std": 0.0687947750557214, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375074505806, + "rewards/tag_count_reward": 0.2728794813156128, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 1987.9442749023438, + "completion_length": 2005.3996276855469, "epoch": 0.00896124262564409, - "grad_norm": 0.09865257143974304, - "kl": 0.00017690658569335938, - "learning_rate": 8.955223880597015e-09, - "loss": 0.0263, - "reward": 0.3989955559372902, - "reward_std": 0.05678805522620678, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 0.08891815692186356, + "kl": 0.00018072128295898438, + "learning_rate": 4.477611940298507e-08, + "loss": 0.0183, + "reward": 0.3867187574505806, + "reward_std": 0.04750661668367684, + "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276977300644, + "rewards/tag_count_reward": 0.2706473395228386, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 2015.0268249511719, + "completion_length": 2021.1786193847656, "epoch": 0.009259950713165559, - "grad_norm": 0.08402200788259506, - "kl": 0.00018024444580078125, - "learning_rate": 9.253731343283582e-09, - "loss": 0.0181, - "reward": 0.3169643059372902, - "reward_std": 0.045405891723930836, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 0.07822487503290176, + "kl": 0.0001800060272216797, + "learning_rate": 4.6268656716417904e-08, + "loss": 0.0172, + "reward": 0.3147321566939354, + "reward_std": 0.04173394991084933, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.2611607313156128, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 2035.3728332519531, + "completion_length": 2030.0335693359375, "epoch": 0.009558658800687028, - "grad_norm": 0.08652578294277191, - "kl": 0.0001804828643798828, - "learning_rate": 9.55223880597015e-09, - "loss": 0.013, - "reward": 0.3465401902794838, - "reward_std": 0.04377779574133456, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 0.06654183566570282, + "kl": 0.0001800060272216797, + "learning_rate": 4.776119402985075e-08, + "loss": 0.01, + "reward": 0.3504464402794838, + "reward_std": 0.03687205188907683, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.2611607313156128, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 2041.72998046875, + "completion_length": 2036.5781555175781, "epoch": 0.009857366888208497, - "grad_norm": 0.059057775884866714, - "kl": 0.0001811981201171875, - "learning_rate": 9.850746268656716e-09, - "loss": 0.0054, - "reward": 0.3035714402794838, - "reward_std": 0.03986412403173745, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 0.05982429161667824, + "kl": 0.0001819133758544922, + "learning_rate": 4.925373134328358e-08, + "loss": 0.0073, + "reward": 0.3091517984867096, + "reward_std": 0.029121503233909607, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964477300644, + "rewards/tag_count_reward": 0.2578125074505806, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 2033.1942749023438, + "completion_length": 2041.2656860351562, "epoch": 0.010156074975729968, - "grad_norm": 0.07461834698915482, - "kl": 0.0001766681671142578, - "learning_rate": 1.0149253731343283e-08, - "loss": 0.0114, - "reward": 0.3191964477300644, - "reward_std": 0.05069577763788402, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 0.0702994167804718, + "kl": 0.00018072128295898438, + "learning_rate": 5.074626865671641e-08, + "loss": 0.0051, + "reward": 0.3203125149011612, + "reward_std": 0.048810507170856, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.2600446492433548, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 2019.3750305175781, + "completion_length": 2017.1295166015625, "epoch": 0.010454783063251438, - "grad_norm": 0.07312300056219101, - "kl": 0.00016927719116210938, - "learning_rate": 1.044776119402985e-08, - "loss": 0.0111, - "reward": 0.3303571492433548, - "reward_std": 0.06500917486846447, - "rewards/accuracy_reward": 0.06473214412108064, + "grad_norm": 0.08044405281543732, + "kl": 0.00017762184143066406, + "learning_rate": 5.223880597014925e-08, + "loss": 0.0197, + "reward": 0.325892873108387, + "reward_std": 0.07425616635009646, + "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.2633928656578064, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 2030.9978332519531, + "completion_length": 2024.4531860351562, "epoch": 0.010753491150772907, - "grad_norm": 0.0669090747833252, - "kl": 0.0001823902130126953, - "learning_rate": 1.0746268656716417e-08, - "loss": 0.0114, - "reward": 0.2840401902794838, - "reward_std": 0.03899030759930611, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 0.08431462198495865, + "kl": 0.00018262863159179688, + "learning_rate": 5.3731343283582085e-08, + "loss": 0.0135, + "reward": 0.2924107238650322, + "reward_std": 0.0366232821252197, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.2633928656578064, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 2033.30810546875, + "completion_length": 2035.3326721191406, "epoch": 0.011052199238294378, - "grad_norm": 0.08151762932538986, - "kl": 0.000179290771484375, - "learning_rate": 1.1044776119402984e-08, - "loss": 0.008, - "reward": 0.305803582072258, - "reward_std": 0.04058648692443967, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 0.09053537994623184, + "kl": 0.0001785755157470703, + "learning_rate": 5.522388059701493e-08, + "loss": 0.0089, + "reward": 0.3113839402794838, + "reward_std": 0.06095702131278813, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.2622767984867096, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 2045.8371276855469, + "completion_length": 2046.5714721679688, "epoch": 0.011350907325815847, - "grad_norm": 0.06721360236406326, - "kl": 0.00018358230590820312, - "learning_rate": 1.1343283582089552e-08, - "loss": 0.0029, - "reward": 0.2924107238650322, - "reward_std": 0.023688509361818433, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.039195768535137177, + "kl": 0.00018405914306640625, + "learning_rate": 5.671641791044776e-08, + "loss": 0.0017, + "reward": 0.2885044738650322, + "reward_std": 0.0066489099990576506, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2544642984867096, + "rewards/tag_count_reward": 0.2527901828289032, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 2040.7545166015625, + "completion_length": 2040.6607666015625, "epoch": 0.011649615413337316, - "grad_norm": 0.0658109113574028, - "kl": 0.0001850128173828125, - "learning_rate": 1.1641791044776118e-08, - "loss": 0.0053, - "reward": 0.2650669738650322, - "reward_std": 0.036447175312787294, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.061163000762462616, + "kl": 0.0001842975616455078, + "learning_rate": 5.820895522388059e-08, + "loss": 0.0079, + "reward": 0.2762276828289032, + "reward_std": 0.05739667871966958, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2561383992433548, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 2026.9286804199219, + "completion_length": 2029.3884582519531, "epoch": 0.011948323500858785, - "grad_norm": 0.07697001844644547, - "kl": 0.00016927719116210938, - "learning_rate": 1.1940298507462685e-08, - "loss": 0.0154, - "reward": 0.3588169887661934, - "reward_std": 0.06394700147211552, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 0.08882297575473785, + "kl": 0.00017404556274414062, + "learning_rate": 5.970149253731343e-08, + "loss": 0.018, + "reward": 0.3593750149011612, + "reward_std": 0.06567940767854452, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.2656250074505806, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 2044.2522888183594, + "completion_length": 2037.27685546875, "epoch": 0.012247031588380256, - "grad_norm": 0.054422251880168915, - "kl": 0.00018143653869628906, - "learning_rate": 1.2238805970149253e-08, - "loss": 0.0031, - "reward": 0.3777901977300644, - "reward_std": 0.044269343838095665, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 0.07564129680395126, + "kl": 0.00018024444580078125, + "learning_rate": 6.119402985074627e-08, + "loss": 0.0138, + "reward": 0.3800223395228386, + "reward_std": 0.0511107025668025, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.2617187649011612, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 2048.0, + "completion_length": 2047.5491333007812, "epoch": 0.012545739675901725, - "grad_norm": 0.03895112872123718, - "kl": 0.0001785755157470703, - "learning_rate": 1.2537313432835819e-08, - "loss": 0.0, - "reward": 0.2901785895228386, - "reward_std": 0.01785714365541935, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.05243632569909096, + "kl": 0.00017881393432617188, + "learning_rate": 6.26865671641791e-08, + "loss": 0.0006, + "reward": 0.2885044813156128, + "reward_std": 0.00974611658602953, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.25, + "rewards/tag_count_reward": 0.2527901977300644, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 2043.8995971679688, + "completion_length": 2043.0000915527344, "epoch": 0.012844447763423195, - "grad_norm": 0.07898052781820297, - "kl": 0.00016999244689941406, - "learning_rate": 1.2835820895522386e-08, - "loss": 0.0062, - "reward": 0.2712053656578064, - "reward_std": 0.04921467509120703, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 0.07800869643688202, + "kl": 0.00017881393432617188, + "learning_rate": 6.417910447761193e-08, + "loss": 0.0083, + "reward": 0.2700892984867096, + "reward_std": 0.06405318272300065, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.2566964402794838, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 2035.94873046875, + "completion_length": 2031.6116638183594, "epoch": 0.013143155850944664, - "grad_norm": 0.0645826980471611, - "kl": 0.00017762184143066406, - "learning_rate": 1.3134328358208954e-08, - "loss": 0.007, - "reward": 0.3113839402794838, - "reward_std": 0.03382947063073516, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 0.07709817588329315, + "kl": 0.00018334388732910156, + "learning_rate": 6.567164179104477e-08, + "loss": 0.0088, + "reward": 0.3225446566939354, + "reward_std": 0.05965192103758454, + "rewards/accuracy_reward": 0.060267861699685454, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2622767984867096, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 2031.2166137695312, + "completion_length": 2036.4643859863281, "epoch": 0.013441863938466135, - "grad_norm": 0.08989221602678299, - "kl": 0.00017714500427246094, - "learning_rate": 1.3432835820895521e-08, - "loss": 0.0138, - "reward": 0.302455373108387, - "reward_std": 0.031658967258408666, + "grad_norm": 0.06595548987388611, + "kl": 0.0001723766326904297, + "learning_rate": 6.71641791044776e-08, + "loss": 0.0057, + "reward": 0.2996651902794838, + "reward_std": 0.021251919912174344, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2667410895228386, + "rewards/tag_count_reward": 0.2639509066939354, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 2046.35498046875, + "completion_length": 2048.0, "epoch": 0.013740572025987604, - "grad_norm": 0.06916478276252747, - "kl": 0.0001862049102783203, - "learning_rate": 1.3731343283582089e-08, - "loss": 0.0022, - "reward": 0.3275669738650322, - "reward_std": 0.021724376128986478, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.03848166763782501, + "kl": 0.00019097328186035156, + "learning_rate": 6.865671641791045e-08, + "loss": 0.0, + "reward": 0.3270089402794838, + "reward_std": 0.0223214291036129, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.251116082072258, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 2030.2076110839844, + "completion_length": 2033.2969360351562, "epoch": 0.014039280113509073, - "grad_norm": 0.07553732395172119, - "kl": 0.00017642974853515625, - "learning_rate": 1.4029850746268656e-08, - "loss": 0.0082, - "reward": 0.3353794813156128, - "reward_std": 0.01886316225863993, + "grad_norm": 0.06175874546170235, + "kl": 0.00018286705017089844, + "learning_rate": 7.014925373134329e-08, + "loss": 0.008, + "reward": 0.3337053656578064, + "reward_std": 0.018003552686423063, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.262276791036129, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 2027.450927734375, + "completion_length": 2019.2522888183594, "epoch": 0.014337988201030542, - "grad_norm": 0.06145080178976059, - "kl": 0.00018668174743652344, - "learning_rate": 1.4328358208955224e-08, - "loss": 0.0061, - "reward": 0.2885044738650322, - "reward_std": 0.021592256845906377, - "rewards/accuracy_reward": 0.03125, + "grad_norm": 0.06305798888206482, + "kl": 0.00019073486328125, + "learning_rate": 7.164179104477612e-08, + "loss": 0.0071, + "reward": 0.2924107238650322, + "reward_std": 0.017975562950596213, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.258928582072258, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 2008.0067138671875, + "completion_length": 2011.7879638671875, "epoch": 0.014636696288552013, - "grad_norm": 0.05273529142141342, - "kl": 0.00016999244689941406, - "learning_rate": 1.4626865671641791e-08, - "loss": -0.0016, - "reward": 0.3030134066939354, - "reward_std": 0.027849020436406136, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 0.07480230927467346, + "kl": 0.0001747608184814453, + "learning_rate": 7.313432835820896e-08, + "loss": 0.0087, + "reward": 0.3002232238650322, + "reward_std": 0.0174260251224041, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.2600446492433548, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 2019.2768249511719, + "completion_length": 2022.6875610351562, "epoch": 0.014935404376073482, - "grad_norm": 0.05945578217506409, - "kl": 0.00016999244689941406, - "learning_rate": 1.4925373134328357e-08, - "loss": 0.0112, - "reward": 0.2845982238650322, - "reward_std": 0.0405397464055568, - "rewards/accuracy_reward": 0.022321430267766118, + "grad_norm": 0.05982408672571182, + "kl": 0.00017499923706054688, + "learning_rate": 7.462686567164178e-08, + "loss": 0.0099, + "reward": 0.2873883992433548, + "reward_std": 0.030299714766442776, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.262834832072258, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 2043.1004943847656, + "completion_length": 2037.3437805175781, "epoch": 0.015234112463594952, - "grad_norm": 0.04945945367217064, - "kl": 0.0001761913299560547, - "learning_rate": 1.5223880597014923e-08, - "loss": 0.0042, - "reward": 0.3621651902794838, - "reward_std": 0.00903881760314107, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 0.054498765617609024, + "kl": 0.0001773834228515625, + "learning_rate": 7.611940298507462e-08, + "loss": 0.0053, + "reward": 0.3688616305589676, + "reward_std": 0.024287256645038724, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2550223246216774, + "rewards/tag_count_reward": 0.2572544738650322, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 2031.2902221679688, + "completion_length": 2023.9777221679688, "epoch": 0.01553282055111642, - "grad_norm": 0.052842386066913605, - "kl": 0.00017714500427246094, - "learning_rate": 1.5522388059701492e-08, - "loss": 0.0078, - "reward": 0.3002232164144516, - "reward_std": 0.02366199530661106, - "rewards/accuracy_reward": 0.0401785746216774, + "grad_norm": 0.05936834216117859, + "kl": 0.0001766681671142578, + "learning_rate": 7.761194029850746e-08, + "loss": 0.0108, + "reward": 0.310825914144516, + "reward_std": 0.04481076914817095, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446492433548, + "rewards/tag_count_reward": 0.2639509066939354, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 2042.3125305175781, + "completion_length": 2044.587158203125, "epoch": 0.01583152863863789, - "grad_norm": 0.060569122433662415, - "kl": 0.00018668174743652344, - "learning_rate": 1.5820895522388058e-08, - "loss": 0.0051, - "reward": 0.3309151902794838, - "reward_std": 0.02112739998847246, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.08270423114299774, + "kl": 0.00019097328186035156, + "learning_rate": 7.910447761194029e-08, + "loss": 0.0052, + "reward": 0.3459821566939354, + "reward_std": 0.07618198730051517, + "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.2566964477300644, "step": 53 }, - { - "clip_ratio": 0.0, - "completion_length": 2001.6853332519531, - "epoch": 0.01613023672615936, - "grad_norm": 0.08046639710664749, - "kl": 0.0001723766326904297, - "learning_rate": 1.6119402985074627e-08, - "loss": 0.0203, - "reward": 0.365513414144516, - "reward_std": 0.040689456509426236, - "rewards/accuracy_reward": 0.0937500037252903, + { + "clip_ratio": 0.0, + "completion_length": 2016.7879943847656, + "epoch": 0.01613023672615936, + "grad_norm": 0.06661954522132874, + "kl": 0.00017571449279785156, + "learning_rate": 8.059701492537313e-08, + "loss": 0.0178, + "reward": 0.3565848395228386, + "reward_std": 0.03894307743757963, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2717633992433548, + "rewards/tag_count_reward": 0.2672991156578064, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 2030.2790832519531, + "completion_length": 2034.5335083007812, "epoch": 0.016428944813680832, - "grad_norm": 0.06016090139746666, - "kl": 0.00017023086547851562, - "learning_rate": 1.6417910447761193e-08, - "loss": 0.013, - "reward": 0.2801339328289032, - "reward_std": 0.03603593120351434, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 0.07243689894676208, + "kl": 0.00017905235290527344, + "learning_rate": 8.208955223880596e-08, + "loss": 0.0078, + "reward": 0.2723214477300644, + "reward_std": 0.041094141313806176, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262276791036129, + "rewards/tag_count_reward": 0.2611607238650322, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 1987.7322082519531, + "completion_length": 1993.5469360351562, "epoch": 0.0167276529012023, - "grad_norm": 0.10209909826517105, - "kl": 0.00016927719116210938, - "learning_rate": 1.6716417910447762e-08, - "loss": 0.0252, - "reward": 0.2717633992433548, - "reward_std": 0.028499033767729998, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.08347208052873611, + "kl": 0.00016999244689941406, + "learning_rate": 8.35820895522388e-08, + "loss": 0.0186, + "reward": 0.2739955484867096, + "reward_std": 0.03362244344316423, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2717633992433548, + "rewards/tag_count_reward": 0.2695312649011612, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 2021.0425109863281, + "completion_length": 2014.4041137695312, "epoch": 0.01702636098872377, - "grad_norm": 0.07740095257759094, - "kl": 0.00017023086547851562, - "learning_rate": 1.7014925373134328e-08, - "loss": 0.0205, - "reward": 0.3186384066939354, - "reward_std": 0.06622493849135935, - "rewards/accuracy_reward": 0.05357143026776612, + "grad_norm": 0.08523304760456085, + "kl": 0.00016808509826660156, + "learning_rate": 8.507462686567165e-08, + "loss": 0.0184, + "reward": 0.3080357313156128, + "reward_std": 0.039523518877103925, + "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.2678571566939354, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 2040.1183776855469, + "completion_length": 2045.7567138671875, "epoch": 0.017325069076245238, - "grad_norm": 0.06567845493555069, - "kl": 0.0001728534698486328, - "learning_rate": 1.7313432835820898e-08, - "loss": 0.0079, - "reward": 0.3080357238650322, - "reward_std": 0.05259916931390762, - "rewards/accuracy_reward": 0.05133928940631449, + "grad_norm": 0.05177473649382591, + "kl": 0.00017523765563964844, + "learning_rate": 8.656716417910448e-08, + "loss": 0.0014, + "reward": 0.306361623108387, + "reward_std": 0.03653183137066662, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.255022332072258, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 2031.7455749511719, + "completion_length": 2035.2054443359375, "epoch": 0.01762377716376671, - "grad_norm": 0.09236067533493042, - "kl": 0.0001761913299560547, - "learning_rate": 1.761194029850746e-08, - "loss": 0.0153, - "reward": 0.3364955484867096, - "reward_std": 0.031892696395516396, + "grad_norm": 0.08070658892393112, + "kl": 0.00017690658569335938, + "learning_rate": 8.805970149253731e-08, + "loss": 0.0116, + "reward": 0.333705373108387, + "reward_std": 0.025266710203140974, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.262276791036129, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 2036.372802734375, + "completion_length": 2038.529052734375, "epoch": 0.01792248525128818, - "grad_norm": 0.06612980365753174, - "kl": 0.0001735687255859375, - "learning_rate": 1.791044776119403e-08, - "loss": 0.007, - "reward": 0.2929687574505806, - "reward_std": 0.013503103284165263, + "grad_norm": 0.05810768902301788, + "kl": 0.0001773834228515625, + "learning_rate": 8.955223880597014e-08, + "loss": 0.0045, + "reward": 0.2924107313156128, + "reward_std": 0.011270960327237844, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.2566964402794838, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 2024.0558471679688, + "completion_length": 2013.5000915527344, "epoch": 0.018221193338809647, - "grad_norm": 0.06619381159543991, - "kl": 0.00017499923706054688, - "learning_rate": 1.8208955223880595e-08, - "loss": 0.0156, - "reward": 0.297991082072258, - "reward_std": 0.02393188956193626, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.0838748887181282, + "kl": 0.0001804828643798828, + "learning_rate": 9.104477611940298e-08, + "loss": 0.0184, + "reward": 0.309709832072258, + "reward_std": 0.04307688772678375, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2650669664144516, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 2014.3884582519531, + "completion_length": 2027.3326721191406, "epoch": 0.018519901426331118, - "grad_norm": 0.08293049037456512, - "kl": 0.0001857280731201172, - "learning_rate": 1.8507462686567164e-08, - "loss": 0.0073, - "reward": 0.2728794738650322, - "reward_std": 0.042606034548953176, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.07275748252868652, + "kl": 0.00019049644470214844, + "learning_rate": 9.253731343283581e-08, + "loss": 0.0139, + "reward": 0.271205373108387, + "reward_std": 0.03510888130404055, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.2622767984867096, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 2013.2277526855469, + "completion_length": 2017.0089721679688, "epoch": 0.01881860951385259, - "grad_norm": 0.08474146574735641, - "kl": 0.00016927719116210938, - "learning_rate": 1.880597014925373e-08, - "loss": 0.0163, - "reward": 0.2661830484867096, - "reward_std": 0.023563439259305596, + "grad_norm": 0.06982765346765518, + "kl": 0.0001742839813232422, + "learning_rate": 9.402985074626865e-08, + "loss": 0.0141, + "reward": 0.2617187574505806, + "reward_std": 0.015106174862012267, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.2617187574505806, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 2024.4889221191406, + "completion_length": 2036.0312805175781, "epoch": 0.019117317601374056, - "grad_norm": 0.08578862994909286, - "kl": 0.00017452239990234375, - "learning_rate": 1.91044776119403e-08, - "loss": 0.0166, - "reward": 0.3800223469734192, - "reward_std": 0.041049079271033406, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 0.06350357085466385, + "kl": 0.00017786026000976562, + "learning_rate": 9.55223880597015e-08, + "loss": 0.0088, + "reward": 0.3816964477300644, + "reward_std": 0.04492915794253349, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2633928656578064, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 2037.4152221679688, + "completion_length": 2032.7590026855469, "epoch": 0.019416025688895527, - "grad_norm": 0.07190332561731339, - "kl": 0.0001697540283203125, - "learning_rate": 1.9402985074626865e-08, - "loss": 0.0084, - "reward": 0.263392873108387, - "reward_std": 0.026824323693290353, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.06322678178548813, + "kl": 0.0001800060272216797, + "learning_rate": 9.701492537313432e-08, + "loss": 0.0101, + "reward": 0.259486623108387, + "reward_std": 0.015845492016524076, + "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.259486623108387, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 2031.3438110351562, + "completion_length": 2037.4486999511719, "epoch": 0.019714733776416995, - "grad_norm": 0.05696462467312813, - "kl": 0.0001804828643798828, - "learning_rate": 1.970149253731343e-08, - "loss": 0.0081, - "reward": 0.3013392984867096, - "reward_std": 0.02673462708480656, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 0.04374164342880249, + "kl": 0.00018858909606933594, + "learning_rate": 9.850746268656717e-08, + "loss": 0.0051, + "reward": 0.2985491156578064, + "reward_std": 0.026388566941022873, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285895228386, + "rewards/tag_count_reward": 0.258370541036129, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 2044.6026916503906, + "completion_length": 2041.3214721679688, "epoch": 0.020013441863938466, - "grad_norm": 0.043405309319496155, - "kl": 0.0001659393310546875, - "learning_rate": 2e-08, - "loss": 0.0029, - "reward": 0.2918526902794838, - "reward_std": 0.015374185051769018, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.05143103748559952, + "kl": 0.0001704692840576172, + "learning_rate": 1e-07, + "loss": 0.0072, + "reward": 0.290736623108387, + "reward_std": 0.011788201984018087, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.255022332072258, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 2034.90185546875, + "completion_length": 2038.4375610351562, "epoch": 0.020312149951459937, - "grad_norm": 0.075312040746212, - "kl": 0.00017499923706054688, - "learning_rate": 2.0298507462686566e-08, - "loss": 0.0065, - "reward": 0.3515625223517418, - "reward_std": 0.060005169128999114, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 0.09448003768920898, + "kl": 0.00017786026000976562, + "learning_rate": 1.0149253731343282e-07, + "loss": 0.0083, + "reward": 0.3554687649011612, + "reward_std": 0.06969616492278874, + "rewards/accuracy_reward": 0.0937500074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.2617187649011612, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 2021.7991638183594, + "completion_length": 2023.22998046875, "epoch": 0.020610858038981404, - "grad_norm": 0.07574271410703659, - "kl": 0.00017786026000976562, - "learning_rate": 2.0597014925373132e-08, - "loss": 0.0156, - "reward": 0.3113839402794838, - "reward_std": 0.046980494633316994, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 0.0896100401878357, + "kl": 0.000171661376953125, + "learning_rate": 1.0298507462686567e-07, + "loss": 0.0196, + "reward": 0.318080373108387, + "reward_std": 0.06904237624257803, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.2645089402794838, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 2040.5715026855469, + "completion_length": 2045.3013916015625, "epoch": 0.020909566126502875, - "grad_norm": 0.07929424941539764, - "kl": 0.0001761913299560547, - "learning_rate": 2.08955223880597e-08, - "loss": 0.0088, - "reward": 0.2661830484867096, - "reward_std": 0.051287868060171604, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.05585716664791107, + "kl": 0.00017952919006347656, + "learning_rate": 1.044776119402985e-07, + "loss": 0.0015, + "reward": 0.2695312649011612, + "reward_std": 0.03684081370010972, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.2561384066939354, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 2022.274658203125, + "completion_length": 2014.7322082519531, "epoch": 0.021208274214024346, - "grad_norm": 0.08987763524055481, - "kl": 0.00017571449279785156, - "learning_rate": 2.1194029850746267e-08, - "loss": 0.0178, - "reward": 0.2963169813156128, - "reward_std": 0.09084296086803079, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 0.0923839658498764, + "kl": 0.0001842975616455078, + "learning_rate": 1.0597014925373134e-07, + "loss": 0.0187, + "reward": 0.2924107313156128, + "reward_std": 0.08336303988471627, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.2723214477300644, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 2021.3304443359375, + "completion_length": 2022.5179748535156, "epoch": 0.021506982301545814, - "grad_norm": 0.09324962645769119, - "kl": 0.00017690658569335938, - "learning_rate": 2.1492537313432833e-08, - "loss": 0.0219, - "reward": 0.3560267984867096, - "reward_std": 0.06340587045997381, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 0.07934773713350296, + "kl": 0.00018072128295898438, + "learning_rate": 1.0746268656716417e-07, + "loss": 0.0192, + "reward": 0.345982164144516, + "reward_std": 0.0572027824819088, + "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.2633928656578064, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 1994.0380249023438, + "completion_length": 1991.4375305175781, "epoch": 0.021805690389067284, - "grad_norm": 0.07613562792539597, - "kl": 0.00017023086547851562, - "learning_rate": 2.1791044776119402e-08, - "loss": 0.0056, - "reward": 0.4229910895228386, - "reward_std": 0.0479008168913424, - "rewards/accuracy_reward": 0.15401786682195961, + "grad_norm": 0.09225533157587051, + "kl": 0.0001690387725830078, + "learning_rate": 1.0895522388059701e-07, + "loss": 0.0219, + "reward": 0.4408482387661934, + "reward_std": 0.05567918159067631, + "rewards/accuracy_reward": 0.17187500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732313156128, + "rewards/tag_count_reward": 0.2689732238650322, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 2022.07373046875, + "completion_length": 2034.4286193847656, "epoch": 0.022104398476588755, - "grad_norm": 0.08374299108982086, - "kl": 0.00016546249389648438, - "learning_rate": 2.2089552238805968e-08, - "loss": 0.0184, - "reward": 0.270647332072258, - "reward_std": 0.05400885874405503, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.07546935975551605, + "kl": 0.00016951560974121094, + "learning_rate": 1.1044776119402985e-07, + "loss": 0.0101, + "reward": 0.2645089477300644, + "reward_std": 0.03580968314781785, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.2600446566939354, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 1999.9643859863281, + "completion_length": 1989.0290832519531, "epoch": 0.022403106564110223, - "grad_norm": 0.0860162004828453, - "kl": 0.0001704692840576172, - "learning_rate": 2.2388059701492537e-08, - "loss": 0.0203, - "reward": 0.3242187649011612, - "reward_std": 0.06455188570544124, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 0.0822647288441658, + "kl": 0.00017499923706054688, + "learning_rate": 1.1194029850746268e-07, + "loss": 0.0132, + "reward": 0.3398437649011612, + "reward_std": 0.0308816097676754, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2728794738650322, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 2015.1116638183594, + "completion_length": 2013.0848693847656, "epoch": 0.022701814651631694, - "grad_norm": 0.06589464098215103, - "kl": 0.000179290771484375, - "learning_rate": 2.2686567164179103e-08, - "loss": 0.0156, - "reward": 0.2996651902794838, - "reward_std": 0.015545189147815108, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.08197572827339172, + "kl": 0.00018215179443359375, + "learning_rate": 1.1343283582089553e-07, + "loss": 0.0141, + "reward": 0.3046875223517418, + "reward_std": 0.032418082701042295, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.2667410895228386, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 2009.3527526855469, + "completion_length": 2012.8304138183594, "epoch": 0.02300052273915316, - "grad_norm": 0.07650793343782425, - "kl": 0.00017571449279785156, - "learning_rate": 2.298507462686567e-08, - "loss": 0.0169, - "reward": 0.3147321492433548, - "reward_std": 0.03730185888707638, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 0.07649620622396469, + "kl": 0.00017404556274414062, + "learning_rate": 1.1492537313432834e-07, + "loss": 0.017, + "reward": 0.325892873108387, + "reward_std": 0.03620903054252267, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250074505806, + "rewards/tag_count_reward": 0.2656250149011612, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 2044.9732360839844, + "completion_length": 2043.5357360839844, "epoch": 0.023299230826674632, - "grad_norm": 0.04902816191315651, - "kl": 0.00018596649169921875, - "learning_rate": 2.3283582089552235e-08, + "grad_norm": 0.04983355477452278, + "kl": 0.00018095970153808594, + "learning_rate": 1.1641791044776119e-07, "loss": 0.0017, - "reward": 0.3621651977300644, - "reward_std": 0.009038817370310426, - "rewards/accuracy_reward": 0.1071428619325161, + "reward": 0.3671875149011612, + "reward_std": 0.02090683183632791, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255022332072258, + "rewards/tag_count_reward": 0.2578125074505806, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 2036.7098693847656, + "completion_length": 2036.232177734375, "epoch": 0.023597938914196103, - "grad_norm": 0.056347351521253586, - "kl": 0.00017714500427246094, - "learning_rate": 2.3582089552238804e-08, - "loss": 0.0037, - "reward": 0.2851562649011612, - "reward_std": 0.012527794111520052, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.0644640326499939, + "kl": 0.00017499923706054688, + "learning_rate": 1.1791044776119401e-07, + "loss": 0.0046, + "reward": 0.294084832072258, + "reward_std": 0.03216286585666239, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2494419775903225, + "rewards/tag_count_reward": 0.2516741156578064, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 2045.1183166503906, + "completion_length": 2040.5491638183594, "epoch": 0.02389664700171757, - "grad_norm": 0.05747372284531593, - "kl": 0.0001723766326904297, - "learning_rate": 2.388059701492537e-08, - "loss": 0.002, - "reward": 0.2940848395228386, - "reward_std": 0.01933446922339499, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.05917118489742279, + "kl": 0.00016927719116210938, + "learning_rate": 1.1940298507462686e-07, + "loss": 0.0101, + "reward": 0.2912946566939354, + "reward_std": 0.016127016162499785, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561383992433548, + "rewards/tag_count_reward": 0.2555803656578064, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 2031.3550109863281, + "completion_length": 2025.88623046875, "epoch": 0.02419535508923904, - "grad_norm": 0.07150711119174957, - "kl": 0.0001697540283203125, - "learning_rate": 2.417910447761194e-08, - "loss": 0.005, - "reward": 0.3671875149011612, - "reward_std": 0.023165614809840918, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 0.07038428634405136, + "kl": 0.0001671314239501953, + "learning_rate": 1.208955223880597e-07, + "loss": 0.0085, + "reward": 0.364955373108387, + "reward_std": 0.02935676625929773, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2622767984867096, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 2044.1072082519531, + "completion_length": 2041.138427734375, "epoch": 0.024494063176760512, - "grad_norm": 0.06667262315750122, - "kl": 0.00017571449279785156, - "learning_rate": 2.4477611940298505e-08, - "loss": 0.005, - "reward": 0.255580373108387, - "reward_std": 0.017809624783694744, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.07874462753534317, + "kl": 0.00017833709716796875, + "learning_rate": 1.2238805970149254e-07, + "loss": 0.0071, + "reward": 0.2639509066939354, + "reward_std": 0.042566894786432385, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.2594866156578064, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 2009.9107971191406, + "completion_length": 2001.4063110351562, "epoch": 0.02479277126428198, - "grad_norm": 0.08205609023571014, - "kl": 0.00017952919006347656, - "learning_rate": 2.4776119402985074e-08, - "loss": 0.0201, - "reward": 0.2773437649011612, - "reward_std": 0.051001871936023235, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.08482035994529724, + "kl": 0.00017404556274414062, + "learning_rate": 1.2388059701492537e-07, + "loss": 0.022, + "reward": 0.2829241156578064, + "reward_std": 0.03838158119469881, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473395228386, + "rewards/tag_count_reward": 0.278459832072258, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 2021.6094665527344, + "completion_length": 2018.4666137695312, "epoch": 0.02509147935180345, - "grad_norm": 0.08158819377422333, - "kl": 0.00018286705017089844, - "learning_rate": 2.5074626865671637e-08, - "loss": 0.0146, - "reward": 0.3041294813156128, - "reward_std": 0.06569371977820992, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.06894195824861526, + "kl": 0.0001761913299560547, + "learning_rate": 1.253731343283582e-07, + "loss": 0.0142, + "reward": 0.3119419738650322, + "reward_std": 0.05138242733664811, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.2650669738650322, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 2016.5714721679688, + "completion_length": 2014.7991333007812, "epoch": 0.02539018743932492, - "grad_norm": 0.08918548375368118, - "kl": 0.00017952919006347656, - "learning_rate": 2.537313432835821e-08, - "loss": 0.0091, - "reward": 0.3950892984867096, - "reward_std": 0.08980707312002778, - "rewards/accuracy_reward": 0.1250000037252903, + "grad_norm": 0.10145162791013718, + "kl": 0.0001747608184814453, + "learning_rate": 1.2686567164179106e-07, + "loss": 0.0171, + "reward": 0.3989955484867096, + "reward_std": 0.12134538032114506, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.2717633992433548, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 2041.9777526855469, + "completion_length": 2041.6652221679688, "epoch": 0.02568889552684639, - "grad_norm": 0.060746967792510986, - "kl": 0.0001819133758544922, - "learning_rate": 2.5671641791044772e-08, - "loss": 0.0084, - "reward": 0.2991071566939354, - "reward_std": 0.045083843637257814, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.05340084061026573, + "kl": 0.0001823902130126953, + "learning_rate": 1.2835820895522386e-07, + "loss": 0.0067, + "reward": 0.2963169738650322, + "reward_std": 0.03224052069708705, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2544642984867096, + "rewards/tag_count_reward": 0.2539062574505806, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 2037.6920471191406, + "completion_length": 2021.9666137695312, "epoch": 0.02598760361436786, - "grad_norm": 0.08566693961620331, - "kl": 0.0001647472381591797, - "learning_rate": 2.5970149253731345e-08, - "loss": 0.01, - "reward": 0.3554687649011612, - "reward_std": 0.07413291302509606, - "rewards/accuracy_reward": 0.09151785937137902, + "grad_norm": 0.09708087891340256, + "kl": 0.00016617774963378906, + "learning_rate": 1.2985074626865672e-07, + "loss": 0.0223, + "reward": 0.353794664144516, + "reward_std": 0.07332677789963782, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.263950914144516, + "rewards/tag_count_reward": 0.266741082072258, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 2029.0982971191406, + "completion_length": 2021.8527526855469, "epoch": 0.026286311701889328, - "grad_norm": 0.06861446052789688, - "kl": 0.0001773834228515625, - "learning_rate": 2.6268656716417907e-08, - "loss": 0.0105, - "reward": 0.2801339477300644, - "reward_std": 0.04473195574246347, + "grad_norm": 0.06138848513364792, + "kl": 0.000179290771484375, + "learning_rate": 1.3134328358208955e-07, + "loss": 0.0059, + "reward": 0.2823660895228386, + "reward_std": 0.03848878503777087, "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622768059372902, + "rewards/tag_count_reward": 0.2645089477300644, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 2027.4889221191406, + "completion_length": 2027.1541137695312, "epoch": 0.0265850197894108, - "grad_norm": 0.10190900415182114, - "kl": 0.000179290771484375, - "learning_rate": 2.656716417910448e-08, - "loss": 0.0111, - "reward": 0.3521205559372902, - "reward_std": 0.08330107759684324, - "rewards/accuracy_reward": 0.08705357555299997, + "grad_norm": 0.10084333270788193, + "kl": 0.0001804828643798828, + "learning_rate": 1.328358208955224e-07, + "loss": 0.0158, + "reward": 0.3476562649011612, + "reward_std": 0.07806684356182814, + "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.2650669813156128, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 2028.7701416015625, + "completion_length": 2025.8303833007812, "epoch": 0.02688372787693227, - "grad_norm": 0.05903898924589157, + "grad_norm": 0.05966576561331749, "kl": 0.00017905235290527344, - "learning_rate": 2.6865671641791042e-08, - "loss": 0.0073, - "reward": 0.2974330559372902, - "reward_std": 0.020985058741644025, - "rewards/accuracy_reward": 0.03794643026776612, + "learning_rate": 1.343283582089552e-07, + "loss": 0.0094, + "reward": 0.293526791036129, + "reward_std": 0.008457264630123973, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.259486623108387, + "rewards/tag_count_reward": 0.2578125074505806, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 2026.0179443359375, + "completion_length": 2012.3058776855469, "epoch": 0.027182435964453737, - "grad_norm": 0.07696609944105148, - "kl": 0.0001819133758544922, - "learning_rate": 2.7164179104477608e-08, - "loss": 0.0089, - "reward": 0.3046875149011612, - "reward_std": 0.07518337294459343, - "rewards/accuracy_reward": 0.033482144586741924, + "grad_norm": 0.08248227089643478, + "kl": 0.00018858909606933594, + "learning_rate": 1.3582089552238803e-07, + "loss": 0.012, + "reward": 0.2929687574505806, + "reward_std": 0.07058397959917784, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2712053656578064, + "rewards/tag_count_reward": 0.2684151977300644, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 2027.1541137695312, + "completion_length": 2025.9219360351562, "epoch": 0.027481144051975208, - "grad_norm": 0.06858372688293457, - "kl": 0.0001697540283203125, - "learning_rate": 2.7462686567164177e-08, - "loss": 0.0171, - "reward": 0.2695312649011612, - "reward_std": 0.045459296088665724, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.07628428936004639, + "kl": 0.0001621246337890625, + "learning_rate": 1.373134328358209e-07, + "loss": 0.0196, + "reward": 0.2672991156578064, + "reward_std": 0.037932771956548095, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.262834832072258, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 2020.8125610351562, + "completion_length": 2007.9419860839844, "epoch": 0.027779852139496675, - "grad_norm": 0.0849822536110878, - "kl": 0.00016617774963378906, - "learning_rate": 2.7761194029850743e-08, - "loss": 0.0187, - "reward": 0.3292410895228386, - "reward_std": 0.08324511162936687, - "rewards/accuracy_reward": 0.06696428824216127, + "grad_norm": 0.08513079583644867, + "kl": 0.00016760826110839844, + "learning_rate": 1.3880597014925372e-07, + "loss": 0.02, + "reward": 0.3376116156578064, + "reward_std": 0.0704271961003542, + "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.266183041036129, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 2044.6160888671875, + "completion_length": 2043.3304443359375, "epoch": 0.028078560227018146, - "grad_norm": 0.05687003210186958, - "kl": 0.0001709461212158203, - "learning_rate": 2.8059701492537312e-08, - "loss": 0.0028, - "reward": 0.2617187649011612, - "reward_std": 0.03441810538060963, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.07018104195594788, + "kl": 0.00017690658569335938, + "learning_rate": 1.4029850746268658e-07, + "loss": 0.0082, + "reward": 0.2544642984867096, + "reward_std": 0.016442545456811786, + "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255022332072258, + "rewards/tag_count_reward": 0.2544642984867096, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 2041.9888916015625, + "completion_length": 2038.9465026855469, "epoch": 0.028377268314539617, - "grad_norm": 0.04352659732103348, - "kl": 0.00018644332885742188, - "learning_rate": 2.8358208955223878e-08, - "loss": 0.007, - "reward": 0.2890625149011612, - "reward_std": 0.00845726439729333, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.051355332136154175, + "kl": 0.00018906593322753906, + "learning_rate": 1.4179104477611938e-07, + "loss": 0.0034, + "reward": 0.2935268059372902, + "reward_std": 0.014339675661176443, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2533482313156128, + "rewards/tag_count_reward": 0.2555803656578064, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 2031.5379943847656, + "completion_length": 2035.7902221679688, "epoch": 0.028675976402061085, - "grad_norm": 0.05200180411338806, - "kl": 0.00018525123596191406, - "learning_rate": 2.8656716417910448e-08, - "loss": 0.0036, - "reward": 0.3024553656578064, - "reward_std": 0.03631146717816591, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 0.06005255505442619, + "kl": 0.0001811981201171875, + "learning_rate": 1.4328358208955223e-07, + "loss": 0.0061, + "reward": 0.3074776977300644, + "reward_std": 0.04148227581754327, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.2583705559372902, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 2008.5781555175781, + "completion_length": 2009.4754943847656, "epoch": 0.028974684489582556, - "grad_norm": 0.07165008783340454, - "kl": 0.00016880035400390625, - "learning_rate": 2.8955223880597013e-08, - "loss": 0.0036, - "reward": 0.2779018059372902, - "reward_std": 0.039677223190665245, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.05179405212402344, + "kl": 0.00016689300537109375, + "learning_rate": 1.4477611940298506e-07, + "loss": 0.0022, + "reward": 0.2806919738650322, + "reward_std": 0.03668789169751108, + "rewards/accuracy_reward": 0.01116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732313156128, + "rewards/tag_count_reward": 0.2695312574505806, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 2040.2991943359375, + "completion_length": 2039.5647888183594, "epoch": 0.029273392577104027, - "grad_norm": 0.05489411950111389, - "kl": 0.00018477439880371094, - "learning_rate": 2.9253731343283583e-08, - "loss": 0.0047, - "reward": 0.3716517984867096, - "reward_std": 0.06736839306540787, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 0.06932333111763, + "kl": 0.00017881393432617188, + "learning_rate": 1.4626865671641792e-07, + "loss": 0.0054, + "reward": 0.3816964402794838, + "reward_std": 0.08063330501317978, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262276791036129, + "rewards/tag_count_reward": 0.2611607238650322, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 2027.7277526855469, + "completion_length": 2024.915283203125, "epoch": 0.029572100664625494, - "grad_norm": 0.06458111107349396, - "kl": 0.00017571449279785156, - "learning_rate": 2.955223880597015e-08, - "loss": 0.0147, - "reward": 0.2600446492433548, - "reward_std": 0.018784933490678668, + "grad_norm": 0.0861809253692627, + "kl": 0.0001742839813232422, + "learning_rate": 1.4776119402985075e-07, + "loss": 0.0137, + "reward": 0.2639509066939354, + "reward_std": 0.027508222497999668, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446492433548, + "rewards/tag_count_reward": 0.2639509066939354, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 1999.7054443359375, + "completion_length": 1998.0201721191406, "epoch": 0.029870808752146965, - "grad_norm": 0.07467396557331085, - "kl": 0.0001888275146484375, - "learning_rate": 2.9850746268656714e-08, - "loss": 0.0003, - "reward": 0.3214285895228386, - "reward_std": 0.04537490173242986, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 0.07582151889801025, + "kl": 0.00018906593322753906, + "learning_rate": 1.4925373134328355e-07, + "loss": 0.0295, + "reward": 0.3164062649011612, + "reward_std": 0.04657869855873287, + "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.2628348395228386, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 2040.5022888183594, + "completion_length": 2043.654052734375, "epoch": 0.030169516839668432, - "grad_norm": 0.06994633376598358, - "kl": 0.0001704692840576172, - "learning_rate": 3.014925373134328e-08, - "loss": 0.0092, - "reward": 0.310267873108387, - "reward_std": 0.049059626180678606, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 0.07266262918710709, + "kl": 0.00016546249389648438, + "learning_rate": 1.507462686567164e-07, + "loss": 0.0059, + "reward": 0.2963169738650322, + "reward_std": 0.02936145500279963, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285895228386, + "rewards/tag_count_reward": 0.2561384066939354, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 2039.0045471191406, + "completion_length": 2044.7969360351562, "epoch": 0.030468224927189903, - "grad_norm": 0.057079847902059555, - "kl": 0.000179290771484375, - "learning_rate": 3.0447761194029846e-08, - "loss": 0.0086, - "reward": 0.3270089477300644, - "reward_std": 0.01402034517377615, + "grad_norm": 0.06327736377716064, + "kl": 0.0001804828643798828, + "learning_rate": 1.5223880597014924e-07, + "loss": 0.0031, + "reward": 0.3264509066939354, + "reward_std": 0.013739095069468021, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.255022332072258, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 2024.466552734375, + "completion_length": 2034.2098693847656, "epoch": 0.030766933014711374, - "grad_norm": 0.0730607733130455, - "kl": 0.00017881393432617188, - "learning_rate": 3.074626865671642e-08, - "loss": 0.0142, - "reward": 0.301897332072258, - "reward_std": 0.03335842536762357, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 0.060658276081085205, + "kl": 0.00017571449279785156, + "learning_rate": 1.537313432835821e-07, + "loss": 0.0087, + "reward": 0.2935267984867096, + "reward_std": 0.01807763520628214, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.259486623108387, + "rewards/tag_count_reward": 0.255580373108387, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 2018.685302734375, + "completion_length": 2018.9576416015625, "epoch": 0.03106564110223284, - "grad_norm": 0.06794976443052292, - "kl": 0.00017833709716796875, - "learning_rate": 3.1044776119402985e-08, - "loss": 0.012, - "reward": 0.369419664144516, - "reward_std": 0.04871091921813786, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 0.0794973373413086, + "kl": 0.00017499923706054688, + "learning_rate": 1.5522388059701492e-07, + "loss": 0.0126, + "reward": 0.368861623108387, + "reward_std": 0.051864683628082275, + "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.2617187574505806, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 2047.3817138671875, + "completion_length": 2044.6964416503906, "epoch": 0.03136434918975431, - "grad_norm": 0.04564929008483887, - "kl": 0.0001766681671142578, - "learning_rate": 3.134328358208955e-08, - "loss": 0.0008, - "reward": 0.3236607313156128, - "reward_std": 0.007513973629102111, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 0.05705650523304939, + "kl": 0.000179290771484375, + "learning_rate": 1.5671641791044775e-07, + "loss": 0.0062, + "reward": 0.325892873108387, + "reward_std": 0.017857143888249993, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2522321492433548, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 2046.8326416015625, + "completion_length": 2047.9464416503906, "epoch": 0.03166305727727578, - "grad_norm": 0.044195279479026794, - "kl": 0.00017595291137695312, - "learning_rate": 3.1641791044776116e-08, - "loss": 0.002, - "reward": 0.2617187649011612, - "reward_std": 0.023535723332315683, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.03761906176805496, + "kl": 0.00017333030700683594, + "learning_rate": 1.5820895522388058e-07, + "loss": 0.0001, + "reward": 0.2578125074505806, + "reward_std": 0.024980786256492138, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901902794838, + "rewards/tag_count_reward": 0.251116082072258, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 2041.3147583007812, + "completion_length": 2037.4933776855469, "epoch": 0.03196176536479725, - "grad_norm": 0.07494223117828369, - "kl": 0.0001723766326904297, - "learning_rate": 3.194029850746269e-08, - "loss": 0.0081, - "reward": 0.3007812649011612, - "reward_std": 0.04224912659265101, + "grad_norm": 0.08437875658273697, + "kl": 0.0001685619354248047, + "learning_rate": 1.5970149253731344e-07, + "loss": 0.0108, + "reward": 0.3035714402794838, + "reward_std": 0.040224216878414154, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.2611607238650322, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 2042.4152526855469, + "completion_length": 2042.9509582519531, "epoch": 0.03226047345231872, - "grad_norm": 0.08929815888404846, - "kl": 0.00018024444580078125, - "learning_rate": 3.2238805970149255e-08, - "loss": 0.0055, - "reward": 0.2672991305589676, - "reward_std": 0.054948376258835196, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.06780313700437546, + "kl": 0.00017261505126953125, + "learning_rate": 1.6119402985074627e-07, + "loss": 0.006, + "reward": 0.2712053656578064, + "reward_std": 0.043657024623826146, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.2578125149011612, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 2020.5580749511719, + "completion_length": 2022.8505249023438, "epoch": 0.03255918153984019, - "grad_norm": 0.067294642329216, - "kl": 0.00019121170043945312, - "learning_rate": 3.2537313432835814e-08, - "loss": 0.0159, - "reward": 0.3727678656578064, - "reward_std": 0.025727201718837023, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 0.08141493797302246, + "kl": 0.00018525123596191406, + "learning_rate": 1.626865671641791e-07, + "loss": 0.0163, + "reward": 0.3727678805589676, + "reward_std": 0.035714288242161274, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2633928656578064, + "rewards/tag_count_reward": 0.2611607313156128, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 1996.27685546875, + "completion_length": 1993.1295471191406, "epoch": 0.032857889627361664, - "grad_norm": 0.09131090342998505, - "kl": 0.0001862049102783203, - "learning_rate": 3.2835820895522386e-08, - "loss": 0.0224, - "reward": 0.2762276977300644, - "reward_std": 0.039576003327965736, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.08384981006383896, + "kl": 0.0001888275146484375, + "learning_rate": 1.6417910447761193e-07, + "loss": 0.0195, + "reward": 0.2840401902794838, + "reward_std": 0.0435154908336699, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.271763414144516, + "rewards/tag_count_reward": 0.2751116156578064, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 2041.0982666015625, + "completion_length": 2039.0670166015625, "epoch": 0.03315659771488313, - "grad_norm": 0.04531782865524292, - "kl": 0.00018095970153808594, - "learning_rate": 3.313432835820895e-08, - "loss": 0.0044, - "reward": 0.2611607238650322, - "reward_std": 0.02203672006726265, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.07148782163858414, + "kl": 0.00017333030700683594, + "learning_rate": 1.6567164179104476e-07, + "loss": 0.0052, + "reward": 0.2661830559372902, + "reward_std": 0.03985769907012582, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964328289032, + "rewards/tag_count_reward": 0.2594866305589676, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 2004.0513610839844, + "completion_length": 2002.8214721679688, "epoch": 0.0334553058024046, - "grad_norm": 0.08483865112066269, - "kl": 0.00019097328186035156, - "learning_rate": 3.3432835820895525e-08, - "loss": 0.0148, - "reward": 0.3013392984867096, - "reward_std": 0.021968542132526636, + "grad_norm": 0.07812437415122986, + "kl": 0.000186920166015625, + "learning_rate": 1.671641791044776e-07, + "loss": 0.0165, + "reward": 0.3002232238650322, + "reward_std": 0.021530525060370564, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.263392873108387, + "rewards/tag_count_reward": 0.262276791036129, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 2047.0312805175781, + "completion_length": 2044.3326416015625, "epoch": 0.03375401388992607, - "grad_norm": 0.05587835982441902, - "kl": 0.00019240379333496094, - "learning_rate": 3.3731343283582084e-08, - "loss": 0.0012, - "reward": 0.3599330559372902, - "reward_std": 0.00974611658602953, + "grad_norm": 0.05895126983523369, + "kl": 0.00018262863159179688, + "learning_rate": 1.6865671641791044e-07, + "loss": 0.0049, + "reward": 0.361049123108387, + "reward_std": 0.011113195912912488, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901828289032, + "rewards/tag_count_reward": 0.2539062649011612, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 2033.5648193359375, + "completion_length": 2018.3460693359375, "epoch": 0.03405272197744754, - "grad_norm": 0.0658949688076973, - "kl": 0.00017976760864257812, - "learning_rate": 3.4029850746268657e-08, - "loss": 0.0073, - "reward": 0.3437500149011612, - "reward_std": 0.04002849687822163, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 0.08534523099660873, + "kl": 0.000171661376953125, + "learning_rate": 1.701492537313433e-07, + "loss": 0.0188, + "reward": 0.3588169813156128, + "reward_std": 0.06383948028087616, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.263392873108387, + "rewards/tag_count_reward": 0.2695312574505806, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 2045.2544860839844, + "completion_length": 2042.1027221679688, "epoch": 0.03435143006496901, - "grad_norm": 0.052605319768190384, - "kl": 0.00018310546875, - "learning_rate": 3.432835820895522e-08, - "loss": 0.0028, - "reward": 0.2572544738650322, - "reward_std": 0.023817782290279865, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.0591108538210392, + "kl": 0.0001659393310546875, + "learning_rate": 1.716417910447761e-07, + "loss": 0.0048, + "reward": 0.258928582072258, + "reward_std": 0.022477608872577548, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901902794838, + "rewards/tag_count_reward": 0.2566964402794838, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 2040.357177734375, + "completion_length": 2036.5558776855469, "epoch": 0.034650138152490476, - "grad_norm": 0.06905686855316162, - "kl": 0.00018167495727539062, - "learning_rate": 3.4626865671641795e-08, - "loss": 0.0079, - "reward": 0.2578125149011612, - "reward_std": 0.017102325102314353, + "grad_norm": 0.0832672119140625, + "kl": 0.0001685619354248047, + "learning_rate": 1.7313432835820896e-07, + "loss": 0.0105, + "reward": 0.2594866156578064, + "reward_std": 0.0251810597255826, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.2594866156578064, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 2047.5625305175781, + "completion_length": 2048.0, "epoch": 0.034948846240011947, - "grad_norm": 0.04538140445947647, - "kl": 0.00018262863159179688, - "learning_rate": 3.4925373134328354e-08, - "loss": 0.0004, - "reward": 0.3242187574505806, - "reward_std": 0.008063508197665215, + "grad_norm": 0.0015277797356247902, + "kl": 0.0001728534698486328, + "learning_rate": 1.7462686567164178e-07, + "loss": 0.0, + "reward": 0.321428582072258, + "reward_std": 0.0, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901902794838, + "rewards/tag_count_reward": 0.25, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 2026.6808776855469, + "completion_length": 2019.4844360351562, "epoch": 0.03524755432753342, - "grad_norm": 0.08781690150499344, - "kl": 0.0001785755157470703, - "learning_rate": 3.522388059701492e-08, - "loss": 0.0144, - "reward": 0.407924123108387, - "reward_std": 0.02946685627102852, + "grad_norm": 0.08572108298540115, + "kl": 0.0001685619354248047, + "learning_rate": 1.7611940298507461e-07, + "loss": 0.0139, + "reward": 0.4123884066939354, + "reward_std": 0.02840944961644709, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.2695312723517418, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 2042.5358276367188, + "completion_length": 2033.4978332519531, "epoch": 0.03554626241505489, - "grad_norm": 0.07069502770900726, - "kl": 0.00018286705017089844, - "learning_rate": 3.552238805970149e-08, - "loss": 0.004, - "reward": 0.3097098395228386, - "reward_std": 0.04894589842297137, - "rewards/accuracy_reward": 0.05133928777649999, + "grad_norm": 0.07758825272321701, + "kl": 0.00016450881958007812, + "learning_rate": 1.7761194029850744e-07, + "loss": 0.0108, + "reward": 0.3225446566939354, + "reward_std": 0.05096272611990571, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705559372902, + "rewards/tag_count_reward": 0.2600446492433548, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 2023.2254943847656, + "completion_length": 2034.2254943847656, "epoch": 0.03584497050257636, - "grad_norm": 0.05157100781798363, - "kl": 0.00017905235290527344, - "learning_rate": 3.582089552238806e-08, - "loss": 0.0152, - "reward": 0.3035714328289032, - "reward_std": 0.028138648718595505, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.049877848476171494, + "kl": 0.0001666545867919922, + "learning_rate": 1.7910447761194027e-07, + "loss": 0.0058, + "reward": 0.309151791036129, + "reward_std": 0.029003681614995003, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285746216774, + "rewards/tag_count_reward": 0.2600446492433548, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 1999.3572387695312, + "completion_length": 1999.0000610351562, "epoch": 0.03614367859009783, - "grad_norm": 0.0842297226190567, - "kl": 0.000179290771484375, - "learning_rate": 3.6119402985074624e-08, - "loss": 0.0213, - "reward": 0.3297991156578064, - "reward_std": 0.07239642925560474, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 0.08883936703205109, + "kl": 0.0001862049102783203, + "learning_rate": 1.8059701492537313e-07, + "loss": 0.0261, + "reward": 0.3191964402794838, + "reward_std": 0.0875182505697012, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276828289032, + "rewards/tag_count_reward": 0.274553582072258, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 2008.0848999023438, + "completion_length": 2005.6429138183594, "epoch": 0.036442386677619294, - "grad_norm": 0.07199127227067947, - "kl": 0.0001804828643798828, - "learning_rate": 3.641791044776119e-08, - "loss": 0.0145, - "reward": 0.2672991156578064, - "reward_std": 0.017887852154672146, + "grad_norm": 0.09178085625171661, + "kl": 0.00017452239990234375, + "learning_rate": 1.8208955223880596e-07, + "loss": 0.0188, + "reward": 0.2695312574505806, + "reward_std": 0.026280531659722328, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.2695312574505806, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 2019.888427734375, + "completion_length": 2016.9197082519531, "epoch": 0.036741094765140765, - "grad_norm": 0.07131193578243256, - "kl": 0.0001747608184814453, - "learning_rate": 3.671641791044776e-08, - "loss": 0.0104, - "reward": 0.301897332072258, - "reward_std": 0.019554960541427135, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.06954555213451385, + "kl": 0.00016760826110839844, + "learning_rate": 1.8358208955223881e-07, + "loss": 0.0176, + "reward": 0.3052455559372902, + "reward_std": 0.038904729299247265, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.2650669813156128, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 2036.712158203125, + "completion_length": 2041.5916137695312, "epoch": 0.037039802852662236, - "grad_norm": 0.0755307525396347, - "kl": 0.000186920166015625, - "learning_rate": 3.701492537313433e-08, - "loss": 0.0112, - "reward": 0.3147321566939354, - "reward_std": 0.04349109251052141, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 0.06397812813520432, + "kl": 0.00016689300537109375, + "learning_rate": 1.8507462686567162e-07, + "loss": 0.0067, + "reward": 0.3018973246216774, + "reward_std": 0.030726489843800664, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.2572544738650322, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 2016.1607971191406, + "completion_length": 2006.9866638183594, "epoch": 0.03733851094018371, - "grad_norm": 0.10289999097585678, - "kl": 0.00016307830810546875, - "learning_rate": 3.7313432835820895e-08, - "loss": 0.0209, - "reward": 0.3482142984867096, - "reward_std": 0.05298853572458029, + "grad_norm": 0.09185361117124557, + "kl": 0.00017261505126953125, + "learning_rate": 1.8656716417910447e-07, + "loss": 0.0185, + "reward": 0.3487723395228386, + "reward_std": 0.042637437116354704, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.2706473246216774, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 2046.1384582519531, + "completion_length": 2046.5848999023438, "epoch": 0.03763721902770518, - "grad_norm": 0.04824988171458244, - "kl": 0.000186920166015625, - "learning_rate": 3.761194029850746e-08, - "loss": 0.0024, - "reward": 0.3638393059372902, - "reward_std": 0.01971272611990571, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 0.05983275547623634, + "kl": 0.00016832351684570312, + "learning_rate": 1.880597014925373e-07, + "loss": 0.0026, + "reward": 0.3621651977300644, + "reward_std": 0.020089286379516125, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2522321566939354, + "rewards/tag_count_reward": 0.2527901977300644, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 2021.0134582519531, + "completion_length": 2016.3952026367188, "epoch": 0.03793592711522664, - "grad_norm": 0.08407486975193024, - "kl": 0.00017404556274414062, - "learning_rate": 3.7910447761194026e-08, - "loss": 0.021, - "reward": 0.3811384215950966, - "reward_std": 0.046502585522830486, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 0.09380698204040527, + "kl": 0.00017547607421875, + "learning_rate": 1.8955223880597013e-07, + "loss": 0.0224, + "reward": 0.380580373108387, + "reward_std": 0.045238421531394124, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.2689732238650322, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 2046.6116333007812, + "completion_length": 2045.8505249023438, "epoch": 0.03823463520274811, - "grad_norm": 0.05819227546453476, - "kl": 0.000179290771484375, - "learning_rate": 3.82089552238806e-08, - "loss": -0.0002, - "reward": 0.2890625149011612, - "reward_std": 0.010295651154592633, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.08567311614751816, + "kl": 0.00015878677368164062, + "learning_rate": 1.91044776119403e-07, + "loss": 0.0034, + "reward": 0.298549123108387, + "reward_std": 0.0419728672131896, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2533482238650322, + "rewards/tag_count_reward": 0.2561384066939354, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 1997.6585388183594, + "completion_length": 1984.0290832519531, "epoch": 0.038533343290269584, - "grad_norm": 0.09039855003356934, - "kl": 0.00017690658569335938, - "learning_rate": 3.850746268656716e-08, - "loss": 0.0232, - "reward": 0.435267873108387, - "reward_std": 0.08239764836616814, - "rewards/accuracy_reward": 0.16294643376022577, + "grad_norm": 0.11230553686618805, + "kl": 0.00020241737365722656, + "learning_rate": 1.925373134328358e-07, + "loss": 0.0317, + "reward": 0.4575893059372902, + "reward_std": 0.10603193193674088, + "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.2790178656578064, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 2014.3505249023438, + "completion_length": 2000.2120971679688, "epoch": 0.038832051377791055, - "grad_norm": 0.09833338856697083, - "kl": 0.0001723766326904297, - "learning_rate": 3.880597014925373e-08, - "loss": 0.0221, - "reward": 0.346540205180645, - "reward_std": 0.11547893099486828, - "rewards/accuracy_reward": 0.07142857508733869, + "grad_norm": 0.08793243020772934, + "kl": 0.00019097328186035156, + "learning_rate": 1.9402985074626865e-07, + "loss": 0.0255, + "reward": 0.3510044738650322, + "reward_std": 0.0863843634724617, + "rewards/accuracy_reward": 0.07142857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "rewards/tag_count_reward": 0.2795759066939354, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 2040.9107971191406, + "completion_length": 2026.97998046875, "epoch": 0.039130759465312526, - "grad_norm": 0.06471653282642365, - "kl": 0.0001780986785888672, - "learning_rate": 3.9104477611940297e-08, - "loss": 0.0075, - "reward": 0.2695312574505806, - "reward_std": 0.04992468957789242, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 0.06578430533409119, + "kl": 0.00017571449279785156, + "learning_rate": 1.9552238805970148e-07, + "loss": 0.0128, + "reward": 0.274553582072258, + "reward_std": 0.048977102153003216, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.2611607238650322, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 2035.5402526855469, + "completion_length": 2028.12060546875, "epoch": 0.03942946755283399, - "grad_norm": 0.05135807394981384, - "kl": 0.00016546249389648438, - "learning_rate": 3.940298507462686e-08, - "loss": 0.0081, - "reward": 0.2617187649011612, - "reward_std": 0.024574029492214322, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.07726174592971802, + "kl": 0.00016307830810546875, + "learning_rate": 1.9701492537313433e-07, + "loss": 0.0166, + "reward": 0.2779017984867096, + "reward_std": 0.05110340751707554, + "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.2600446566939354, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 2041.8236999511719, + "completion_length": 2040.2098693847656, "epoch": 0.03972817564035546, - "grad_norm": 0.07172587513923645, - "kl": 0.0001876354217529297, - "learning_rate": 3.970149253731343e-08, - "loss": 0.0054, - "reward": 0.3660714477300644, - "reward_std": 0.01917670457623899, + "grad_norm": 0.0705326646566391, + "kl": 0.0001780986785888672, + "learning_rate": 1.9850746268656716e-07, + "loss": 0.0041, + "reward": 0.368861623108387, + "reward_std": 0.019334469456225634, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.2617187574505806, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 2041.3348388671875, + "completion_length": 2035.1786193847656, "epoch": 0.04002688372787693, - "grad_norm": 0.08621275424957275, - "kl": 0.000179290771484375, - "learning_rate": 4e-08, - "loss": 0.0048, - "reward": 0.3314732238650322, - "reward_std": 0.025291305501013994, + "grad_norm": 0.08128512650728226, + "kl": 0.00018453598022460938, + "learning_rate": 2e-07, + "loss": 0.0095, + "reward": 0.334263414144516, + "reward_std": 0.02304393658414483, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.262834832072258, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 2016.38623046875, + "completion_length": 2009.9956359863281, "epoch": 0.0403255918153984, - "grad_norm": 0.06165175512433052, - "kl": 0.00018739700317382812, - "learning_rate": 4.029850746268657e-08, - "loss": 0.0103, - "reward": 0.3281250149011612, - "reward_std": 0.028875397285446525, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 0.09043879806995392, + "kl": 0.0001990795135498047, + "learning_rate": 2.0149253731343282e-07, + "loss": 0.0151, + "reward": 0.3342634066939354, + "reward_std": 0.05122541659511626, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250074505806, + "rewards/tag_count_reward": 0.2695312649011612, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 2046.7388610839844, + "completion_length": 2045.732177734375, "epoch": 0.04062429990291987, - "grad_norm": 0.052536625415086746, - "kl": 0.0001842975616455078, - "learning_rate": 4.059701492537313e-08, - "loss": 0.0022, - "reward": 0.2572544813156128, - "reward_std": 0.02194486791267991, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.05822138860821724, + "kl": 0.00017023086547851562, + "learning_rate": 2.0298507462686565e-07, + "loss": 0.003, + "reward": 0.2539062574505806, + "reward_std": 0.01252779457718134, + "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2527901902794838, + "rewards/tag_count_reward": 0.2539062574505806, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 2040.5915832519531, + "completion_length": 2042.2790832519531, "epoch": 0.040923007990441344, - "grad_norm": 0.07016008347272873, - "kl": 0.0001823902130126953, - "learning_rate": 4.08955223880597e-08, - "loss": 0.01, - "reward": 0.2656250074505806, - "reward_std": 0.041813663206994534, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.06518001854419708, + "kl": 0.00017118453979492188, + "learning_rate": 2.044776119402985e-07, + "loss": 0.0047, + "reward": 0.2566964402794838, + "reward_std": 0.017070032423362136, + "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2566964402794838, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 2043.1004638671875, + "completion_length": 2039.3973999023438, "epoch": 0.04122171607796281, - "grad_norm": 0.05937737226486206, - "kl": 0.00018715858459472656, - "learning_rate": 4.1194029850746264e-08, - "loss": 0.0044, - "reward": 0.2940848246216774, - "reward_std": 0.020207706140354276, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.06714539229869843, + "kl": 0.00017905235290527344, + "learning_rate": 2.0597014925373134e-07, + "loss": 0.0059, + "reward": 0.3074776977300644, + "reward_std": 0.04616858554072678, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561383992433548, + "rewards/tag_count_reward": 0.2583705484867096, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 2019.2076721191406, + "completion_length": 2017.4219360351562, "epoch": 0.04152042416548428, - "grad_norm": 0.08936110883951187, - "kl": 0.00019240379333496094, - "learning_rate": 4.149253731343284e-08, - "loss": 0.0189, - "reward": 0.4453125074505806, - "reward_std": 0.08725984394550323, - "rewards/accuracy_reward": 0.17633928963914514, + "grad_norm": 0.07822741568088531, + "kl": 0.00020813941955566406, + "learning_rate": 2.0746268656716416e-07, + "loss": 0.017, + "reward": 0.4458705633878708, + "reward_std": 0.06650421163067222, + "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.2672991156578064, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 2012.0179138183594, + "completion_length": 1998.2366943359375, "epoch": 0.04181913225300575, - "grad_norm": 0.08243108540773392, - "kl": 0.0001614093780517578, - "learning_rate": 4.17910447761194e-08, - "loss": 0.0155, - "reward": 0.2695312723517418, - "reward_std": 0.03069381252862513, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.07849594205617905, + "kl": 0.0001964569091796875, + "learning_rate": 2.08955223880597e-07, + "loss": 0.0147, + "reward": 0.2963169738650322, + "reward_std": 0.07019457221031189, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.273995541036129, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 2038.0357971191406, + "completion_length": 2026.83935546875, "epoch": 0.04211784034052722, - "grad_norm": 0.0669916495680809, - "kl": 0.00017952919006347656, - "learning_rate": 4.208955223880597e-08, - "loss": 0.0083, - "reward": 0.2700892984867096, - "reward_std": 0.036209032172337174, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 0.07670233398675919, + "kl": 0.00019168853759765625, + "learning_rate": 2.1044776119402985e-07, + "loss": 0.0124, + "reward": 0.2829241156578064, + "reward_std": 0.052819991018623114, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.262834832072258, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 2030.12060546875, + "completion_length": 2015.1161193847656, "epoch": 0.04241654842804869, - "grad_norm": 0.09221672266721725, - "kl": 0.00018095970153808594, - "learning_rate": 4.2388059701492535e-08, - "loss": 0.0098, - "reward": 0.2801339328289032, - "reward_std": 0.06609318405389786, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 0.08329468965530396, + "kl": 0.00020742416381835938, + "learning_rate": 2.1194029850746268e-07, + "loss": 0.0171, + "reward": 0.2756696492433548, + "reward_std": 0.0507113803178072, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089328289032, + "rewards/tag_count_reward": 0.2667410746216774, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 2017.1719055175781, + "completion_length": 2008.1317749023438, "epoch": 0.042715256515570156, - "grad_norm": 0.07622247189283371, - "kl": 0.0001888275146484375, - "learning_rate": 4.268656716417911e-08, - "loss": 0.0083, - "reward": 0.3459821492433548, - "reward_std": 0.0325979134067893, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 0.07765533030033112, + "kl": 0.00021910667419433594, + "learning_rate": 2.1343283582089554e-07, + "loss": 0.0167, + "reward": 0.3582589328289032, + "reward_std": 0.037035543005913496, + "rewards/accuracy_reward": 0.0915178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2633928656578064, + "rewards/tag_count_reward": 0.266741082072258, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 2018.6005249023438, + "completion_length": 2003.9531860351562, "epoch": 0.04301396460309163, - "grad_norm": 0.06551355123519897, - "kl": 0.00017523765563964844, - "learning_rate": 4.2985074626865666e-08, - "loss": 0.0068, - "reward": 0.2728794738650322, - "reward_std": 0.0328283435665071, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.07212961465120316, + "kl": 0.0002162456512451172, + "learning_rate": 2.1492537313432834e-07, + "loss": 0.0095, + "reward": 0.2818080484867096, + "reward_std": 0.04215896176174283, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.266183041036129, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 2027.1763916015625, + "completion_length": 2016.5804138183594, "epoch": 0.0433126726906131, - "grad_norm": 0.07970447838306427, - "kl": 0.00018167495727539062, - "learning_rate": 4.328358208955223e-08, - "loss": 0.0127, - "reward": 0.3113839402794838, - "reward_std": 0.042309300508350134, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.09445866197347641, + "kl": 0.00021696090698242188, + "learning_rate": 2.1641791044776117e-07, + "loss": 0.0177, + "reward": 0.3085937574505806, + "reward_std": 0.05049149505794048, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2667410746216774, + "rewards/tag_count_reward": 0.2684151828289032, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 2028.2790832519531, + "completion_length": 2025.1273498535156, "epoch": 0.04361138077813457, - "grad_norm": 0.06940615177154541, - "kl": 0.00017523765563964844, - "learning_rate": 4.3582089552238805e-08, - "loss": 0.009, - "reward": 0.3030134066939354, - "reward_std": 0.02637713449075818, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 0.07148821651935577, + "kl": 0.0002124309539794922, + "learning_rate": 2.1791044776119402e-07, + "loss": 0.0092, + "reward": 0.310267873108387, + "reward_std": 0.03736744727939367, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.2678571566939354, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 2040.2522888183594, + "completion_length": 2030.2411499023438, "epoch": 0.04391008886565604, - "grad_norm": 0.07751772552728653, - "kl": 0.00018668174743652344, - "learning_rate": 4.388059701492537e-08, - "loss": 0.0086, - "reward": 0.3119419813156128, - "reward_std": 0.07020024140365422, - "rewards/accuracy_reward": 0.05357143236324191, + "grad_norm": 0.0774233415722847, + "kl": 0.000213623046875, + "learning_rate": 2.1940298507462685e-07, + "loss": 0.0142, + "reward": 0.3219866156578064, + "reward_std": 0.07287575164809823, + "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.2639509066939354, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 2001.8326416015625, + "completion_length": 1998.4263916015625, "epoch": 0.04420879695317751, - "grad_norm": 0.06476179510354996, - "kl": 0.00017142295837402344, - "learning_rate": 4.4179104477611936e-08, - "loss": 0.0071, - "reward": 0.3085937649011612, - "reward_std": 0.02855393895879388, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.0685528814792633, + "kl": 0.00023698806762695312, + "learning_rate": 2.208955223880597e-07, + "loss": 0.0138, + "reward": 0.3002232238650322, + "reward_std": 0.008928572060540318, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2645089402794838, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 2019.3416137695312, + "completion_length": 2000.587158203125, "epoch": 0.044507505040698975, - "grad_norm": 0.08871780335903168, - "kl": 0.00016951560974121094, - "learning_rate": 4.44776119402985e-08, - "loss": 0.0174, - "reward": 0.3085937649011612, - "reward_std": 0.04574006795883179, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 0.09035354852676392, + "kl": 0.00025200843811035156, + "learning_rate": 2.223880597014925e-07, + "loss": 0.0241, + "reward": 0.314174123108387, + "reward_std": 0.04314139811322093, + "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2739955484867096, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 2042.857177734375, + "completion_length": 2043.4085693359375, "epoch": 0.044806213128220446, - "grad_norm": 0.0747586041688919, - "kl": 0.00017952919006347656, - "learning_rate": 4.4776119402985075e-08, + "grad_norm": 0.059354864060878754, + "kl": 0.0001952648162841797, + "learning_rate": 2.2388059701492537e-07, "loss": 0.0062, - "reward": 0.2639508992433548, - "reward_std": 0.04494157922454178, - "rewards/accuracy_reward": 0.006696428870782256, + "reward": 0.2594866156578064, + "reward_std": 0.02636163542047143, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.255022332072258, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 2032.5670166015625, + "completion_length": 2017.7724304199219, "epoch": 0.04510492121574192, - "grad_norm": 0.07108791172504425, - "kl": 0.000186920166015625, - "learning_rate": 4.507462686567164e-08, - "loss": 0.0065, - "reward": 0.2717634066939354, - "reward_std": 0.032258107559755445, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 0.08813710510730743, + "kl": 0.0002467632293701172, + "learning_rate": 2.253731343283582e-07, + "loss": 0.0144, + "reward": 0.2896205484867096, + "reward_std": 0.052120857406407595, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.2672991156578064, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 2016.2254943847656, + "completion_length": 2014.7522888183594, "epoch": 0.04540362930326339, - "grad_norm": 0.0805094987154007, - "kl": 0.00017309188842773438, - "learning_rate": 4.5373134328358207e-08, - "loss": 0.0174, - "reward": 0.3370535969734192, - "reward_std": 0.03155971853993833, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 0.07390749454498291, + "kl": 0.00024819374084472656, + "learning_rate": 2.2686567164179105e-07, + "loss": 0.015, + "reward": 0.345424123108387, + "reward_std": 0.04857433959841728, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.262834832072258, "step": 152 }, - { - "clip_ratio": 0.0, - "completion_length": 2039.6183776855469, - "epoch": 0.04570233739078486, - "grad_norm": 0.060844846069812775, - "kl": 0.000171661376953125, - "learning_rate": 4.567164179104477e-08, - "loss": 0.0094, - "reward": 0.3666294813156128, - "reward_std": 0.024978037225082517, - "rewards/accuracy_reward": 0.1093750037252903, + { + "clip_ratio": 0.0, + "completion_length": 2034.9889221191406, + "epoch": 0.04570233739078486, + "grad_norm": 0.07716519385576248, + "kl": 0.0002162456512451172, + "learning_rate": 2.2835820895522386e-07, + "loss": 0.0137, + "reward": 0.3738839402794838, + "reward_std": 0.046810401137918234, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544887661934, + "rewards/tag_count_reward": 0.2600446566939354, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 2033.9219360351562, + "completion_length": 2038.6629943847656, "epoch": 0.04600104547830632, - "grad_norm": 0.07296580076217651, - "kl": 0.0001838207244873047, - "learning_rate": 4.597014925373134e-08, - "loss": 0.0114, - "reward": 0.2600446566939354, - "reward_std": 0.027713506016880274, + "grad_norm": 0.06641247123479843, + "kl": 0.00021338462829589844, + "learning_rate": 2.2985074626865669e-07, + "loss": 0.0085, + "reward": 0.2606026902794838, + "reward_std": 0.02775944024324417, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.2583705484867096, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 1999.1741638183594, + "completion_length": 2012.1495971679688, "epoch": 0.04629975356582779, - "grad_norm": 0.07672085613012314, - "kl": 0.00018215179443359375, - "learning_rate": 4.626865671641791e-08, - "loss": 0.0129, - "reward": 0.3844866305589676, - "reward_std": 0.0428217698354274, - "rewards/accuracy_reward": 0.12053571827709675, + "grad_norm": 0.07931293547153473, + "kl": 0.00025773048400878906, + "learning_rate": 2.3134328358208954e-07, + "loss": 0.0142, + "reward": 0.3995535895228386, + "reward_std": 0.05525131174363196, + "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.267857164144516, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 2047.2678833007812, + "completion_length": 2048.0, "epoch": 0.046598461653349264, - "grad_norm": 0.02432144619524479, - "kl": 0.00018858909606933594, - "learning_rate": 4.656716417910447e-08, - "loss": 0.0014, - "reward": 0.3577009066939354, - "reward_std": 0.0022321429569274187, + "grad_norm": 0.004591322038322687, + "kl": 0.00019311904907226562, + "learning_rate": 2.3283582089552237e-07, + "loss": 0.0, + "reward": 0.357142873108387, + "reward_std": 0.0, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.250558041036129, + "rewards/tag_count_reward": 0.25, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 2031.7947082519531, + "completion_length": 2019.1652526855469, "epoch": 0.046897169740870735, - "grad_norm": 0.07927047461271286, - "kl": 0.00017952919006347656, - "learning_rate": 4.686567164179104e-08, - "loss": 0.0158, - "reward": 0.3085937723517418, - "reward_std": 0.056583448546007276, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 0.0849694237112999, + "kl": 0.0002593994140625, + "learning_rate": 2.3432835820895523e-07, + "loss": 0.015, + "reward": 0.314174123108387, + "reward_std": 0.05707283806987107, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187723517418, + "rewards/tag_count_reward": 0.2695312649011612, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 2028.1183776855469, + "completion_length": 2024.0000305175781, "epoch": 0.047195877828392206, - "grad_norm": 0.06557939946651459, - "kl": 0.00017189979553222656, - "learning_rate": 4.716417910447761e-08, - "loss": 0.0072, - "reward": 0.2723214477300644, - "reward_std": 0.03487544087693095, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.066276416182518, + "kl": 0.0002455711364746094, + "learning_rate": 2.3582089552238803e-07, + "loss": 0.0057, + "reward": 0.2806919664144516, + "reward_std": 0.05360085237771273, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2678571566939354, + "rewards/tag_count_reward": 0.2695312574505806, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 2035.5201416015625, + "completion_length": 2023.3482666015625, "epoch": 0.04749458591591367, - "grad_norm": 0.06396804749965668, - "kl": 0.0001773834228515625, - "learning_rate": 4.746268656716418e-08, + "grad_norm": 0.07106047123670578, + "kl": 0.00024819374084472656, + "learning_rate": 2.3731343283582089e-07, "loss": 0.0123, - "reward": 0.329799123108387, - "reward_std": 0.016879975330084562, - "rewards/accuracy_reward": 0.0714285746216774, + "reward": 0.3454241082072258, + "reward_std": 0.03872924018651247, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258370541036129, + "rewards/tag_count_reward": 0.2650669738650322, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 2038.2277221679688, + "completion_length": 2034.930908203125, "epoch": 0.04779329400343514, - "grad_norm": 0.07285719364881516, - "kl": 0.0001881122589111328, - "learning_rate": 4.776119402985074e-08, - "loss": 0.0109, - "reward": 0.4051339477300644, - "reward_std": 0.029109308030456305, - "rewards/accuracy_reward": 0.1495535783469677, + "grad_norm": 0.08272335678339005, + "kl": 0.00025153160095214844, + "learning_rate": 2.388059701492537e-07, + "loss": 0.0087, + "reward": 0.439174123108387, + "reward_std": 0.07193068787455559, + "rewards/accuracy_reward": 0.1741071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2555803656578064, + "rewards/tag_count_reward": 0.2650669738650322, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 2013.3906860351562, + "completion_length": 2000.8148193359375, "epoch": 0.04809200209095661, - "grad_norm": 0.07732822746038437, - "kl": 0.0001659393310546875, - "learning_rate": 4.805970149253731e-08, - "loss": 0.0172, - "reward": 0.321986623108387, - "reward_std": 0.06705568102188408, - "rewards/accuracy_reward": 0.053571430034935474, + "grad_norm": 0.09384261071681976, + "kl": 0.0003211498260498047, + "learning_rate": 2.4029850746268654e-07, + "loss": 0.0226, + "reward": 0.3387276902794838, + "reward_std": 0.07356675760820508, + "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2739955559372902, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 2008.3861999511719, + "completion_length": 2002.7879943847656, "epoch": 0.04839071017847808, - "grad_norm": 0.0659981518983841, - "kl": 0.00017261505126953125, - "learning_rate": 4.835820895522388e-08, - "loss": 0.0075, - "reward": 0.2996651902794838, - "reward_std": 0.015153693500906229, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.08281215280294418, + "kl": 0.00030422210693359375, + "learning_rate": 2.417910447761194e-07, + "loss": 0.0133, + "reward": 0.3069196566939354, + "reward_std": 0.030919374199584126, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.266741082072258, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 2047.544677734375, + "completion_length": 2042.8572082519531, "epoch": 0.048689418265999554, - "grad_norm": 0.07228440791368484, - "kl": 0.00017380714416503906, - "learning_rate": 4.8656716417910445e-08, - "loss": 0.0008, - "reward": 0.2929687649011612, - "reward_std": 0.025445276405662298, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.06405437737703323, + "kl": 0.000217437744140625, + "learning_rate": 2.432835820895522e-07, + "loss": 0.0066, + "reward": 0.2963169738650322, + "reward_std": 0.0340583180077374, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2550223395228386, + "rewards/tag_count_reward": 0.2561383992433548, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 2041.5447082519531, + "completion_length": 2036.7857971191406, "epoch": 0.048988126353521025, - "grad_norm": 0.04018021747469902, - "kl": 0.0001761913299560547, - "learning_rate": 4.895522388059701e-08, - "loss": 0.0078, - "reward": 0.2940848395228386, - "reward_std": 0.021473561646416783, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.0808204635977745, + "kl": 0.0002453327178955078, + "learning_rate": 2.447761194029851e-07, + "loss": 0.0067, + "reward": 0.3125000149011612, + "reward_std": 0.056629103841260076, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062649011612, + "rewards/tag_count_reward": 0.2633928656578064, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 2018.22998046875, + "completion_length": 2017.0023193359375, "epoch": 0.04928683444104249, - "grad_norm": 0.07271625101566315, - "kl": 0.00017833709716796875, - "learning_rate": 4.9253731343283576e-08, - "loss": 0.0175, - "reward": 0.2773437574505806, - "reward_std": 0.044676147401332855, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 0.0806288942694664, + "kl": 0.0002999305725097656, + "learning_rate": 2.4626865671641786e-07, + "loss": 0.0133, + "reward": 0.305803582072258, + "reward_std": 0.0739367906935513, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.2723214402794838, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 2009.87060546875, + "completion_length": 1993.0089721679688, "epoch": 0.04958554252856396, - "grad_norm": 0.09698507934808731, - "kl": 0.00018310546875, - "learning_rate": 4.955223880597015e-08, - "loss": 0.0197, - "reward": 0.404017873108387, - "reward_std": 0.10822395700961351, + "grad_norm": 0.10269467532634735, + "kl": 0.00039839744567871094, + "learning_rate": 2.4776119402985074e-07, + "loss": 0.0286, + "reward": 0.4095982313156128, + "reward_std": 0.11181990662589669, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.274553582072258, + "rewards/tag_count_reward": 0.2801339477300644, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 2018.7322082519531, + "completion_length": 2015.4710693359375, "epoch": 0.04988425061608543, - "grad_norm": 0.09137682616710663, - "kl": 0.00017786026000976562, - "learning_rate": 4.9850746268656715e-08, - "loss": 0.0131, - "reward": 0.3716518059372902, - "reward_std": 0.026358082424849272, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 0.09286423772573471, + "kl": 0.00032258033752441406, + "learning_rate": 2.492537313432836e-07, + "loss": 0.0183, + "reward": 0.3833705559372902, + "reward_std": 0.049257969949394464, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089477300644, + "rewards/tag_count_reward": 0.2695312574505806, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 2032.6830749511719, + "completion_length": 2012.0982666015625, "epoch": 0.0501829587036069, - "grad_norm": 0.07489944249391556, - "kl": 0.00016999244689941406, - "learning_rate": 5.0149253731343274e-08, - "loss": 0.0124, - "reward": 0.2650669738650322, - "reward_std": 0.030723849078640342, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.09242697060108185, + "kl": 0.0003287792205810547, + "learning_rate": 2.507462686567164e-07, + "loss": 0.0272, + "reward": 0.275111623108387, + "reward_std": 0.059367209672927856, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262834832072258, + "rewards/tag_count_reward": 0.2684151902794838, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 1987.0536193847656, + "completion_length": 1978.33935546875, "epoch": 0.05048166679112837, - "grad_norm": 0.08938367664813995, - "kl": 0.00017786026000976562, - "learning_rate": 5.0447761194029847e-08, - "loss": 0.0174, - "reward": 0.2779017984867096, - "reward_std": 0.04185536922886968, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 0.08841448277235031, + "kl": 0.0004553794860839844, + "learning_rate": 2.5223880597014923e-07, + "loss": 0.0081, + "reward": 0.2756696566939354, + "reward_std": 0.033615279011428356, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2712053656578064, + "rewards/tag_count_reward": 0.271205373108387, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 2043.1986999511719, + "completion_length": 2044.5781555175781, "epoch": 0.05078037487864984, - "grad_norm": 0.04914980009198189, - "kl": 0.0001773834228515625, - "learning_rate": 5.074626865671642e-08, - "loss": 0.0023, - "reward": 0.3521205484867096, - "reward_std": 0.044650423573330045, - "rewards/accuracy_reward": 0.09375000465661287, + "grad_norm": 0.06271202862262726, + "kl": 0.0002613067626953125, + "learning_rate": 2.537313432835821e-07, + "loss": 0.0011, + "reward": 0.368303582072258, + "reward_std": 0.07509586028754711, + "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258370541036129, + "rewards/tag_count_reward": 0.2633928656578064, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 1984.7389526367188, + "completion_length": 1972.055908203125, "epoch": 0.05107908296617131, - "grad_norm": 0.08516225963830948, - "kl": 0.00017333030700683594, - "learning_rate": 5.1044776119402985e-08, - "loss": 0.0223, - "reward": 0.3420759066939354, - "reward_std": 0.04381162440404296, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 0.09658797830343246, + "kl": 0.0004730224609375, + "learning_rate": 2.5522388059701494e-07, + "loss": 0.0326, + "reward": 0.3498884066939354, + "reward_std": 0.03977184183895588, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2795758992433548, + "rewards/tag_count_reward": 0.2806919738650322, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 2017.30810546875, + "completion_length": 2000.3907165527344, "epoch": 0.05137779105369278, - "grad_norm": 0.07000594586133957, - "kl": 0.00018310546875, - "learning_rate": 5.1343283582089544e-08, - "loss": 0.0111, - "reward": 0.309709832072258, - "reward_std": 0.061625832226127386, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 0.08945363014936447, + "kl": 0.0004181861877441406, + "learning_rate": 2.567164179104477e-07, + "loss": 0.0162, + "reward": 0.340401791036129, + "reward_std": 0.08885344956070185, + "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.2756696566939354, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 2025.2054138183594, + "completion_length": 2023.0536499023438, "epoch": 0.05167649914121425, - "grad_norm": 0.052084676921367645, - "kl": 0.00018596649169921875, - "learning_rate": 5.164179104477612e-08, - "loss": 0.0102, - "reward": 0.301897332072258, - "reward_std": 0.022677436005324125, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.06620240211486816, + "kl": 0.0003552436828613281, + "learning_rate": 2.582089552238806e-07, + "loss": 0.0092, + "reward": 0.3158482238650322, + "reward_std": 0.041362314485013485, + "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187574505806, + "rewards/tag_count_reward": 0.2645089402794838, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 2036.9554138183594, + "completion_length": 2024.8192443847656, "epoch": 0.05197520722873572, - "grad_norm": 0.06566321104764938, - "kl": 0.0001876354217529297, - "learning_rate": 5.194029850746269e-08, + "grad_norm": 0.07562818378210068, + "kl": 0.0003762245178222656, + "learning_rate": 2.5970149253731343e-07, "loss": 0.0108, - "reward": 0.2572544813156128, - "reward_std": 0.017195779364556074, - "rewards/accuracy_reward": 0.0, + "reward": 0.2689732238650322, + "reward_std": 0.029865912161767483, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.266741082072258, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 2040.0111694335938, + "completion_length": 2031.05810546875, "epoch": 0.052273915316257184, - "grad_norm": 0.052309151738882065, - "kl": 0.0001685619354248047, - "learning_rate": 5.223880597014925e-08, - "loss": 0.0056, - "reward": 0.2918526902794838, - "reward_std": 0.011307123582810163, + "grad_norm": 0.06956419348716736, + "kl": 0.0003361701965332031, + "learning_rate": 2.611940298507462e-07, + "loss": 0.0113, + "reward": 0.299107164144516, + "reward_std": 0.020119994645938277, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.263392873108387, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 2041.57373046875, + "completion_length": 2034.6072082519531, "epoch": 0.052572623403778655, - "grad_norm": 0.0631534680724144, - "kl": 0.00017595291137695312, - "learning_rate": 5.2537313432835814e-08, - "loss": 0.008, - "reward": 0.3152901902794838, - "reward_std": 0.06942915916442871, - "rewards/accuracy_reward": 0.05803571711294353, + "grad_norm": 0.08091429620981216, + "kl": 0.00033473968505859375, + "learning_rate": 2.626865671641791e-07, + "loss": 0.0096, + "reward": 0.3286830484867096, + "reward_std": 0.07504337094724178, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544738650322, + "rewards/tag_count_reward": 0.2617187649011612, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 2038.9397888183594, + "completion_length": 2033.3750610351562, "epoch": 0.052871331491300126, - "grad_norm": 0.08446615934371948, - "kl": 0.00018095970153808594, - "learning_rate": 5.283582089552239e-08, - "loss": 0.0099, - "reward": 0.3956473395228386, - "reward_std": 0.06817971775308251, - "rewards/accuracy_reward": 0.1339285783469677, + "grad_norm": 0.07236970961093903, + "kl": 0.0003561973571777344, + "learning_rate": 2.641791044776119e-07, + "loss": 0.0104, + "reward": 0.388392873108387, + "reward_std": 0.05889256298542023, + "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187574505806, + "rewards/tag_count_reward": 0.2611607313156128, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 2023.1116638183594, + "completion_length": 2021.0201721191406, "epoch": 0.0531700395788216, - "grad_norm": 0.07789263129234314, - "kl": 0.00018787384033203125, - "learning_rate": 5.313432835820896e-08, - "loss": 0.013, - "reward": 0.3655134066939354, - "reward_std": 0.06300813518464565, - "rewards/accuracy_reward": 0.09821428847499192, + "grad_norm": 0.08121785521507263, + "kl": 0.0004134178161621094, + "learning_rate": 2.656716417910448e-07, + "loss": 0.0165, + "reward": 0.3588169887661934, + "reward_std": 0.07267192704603076, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.2650669813156128, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 2024.2143249511719, + "completion_length": 2010.5983276367188, "epoch": 0.05346874766634307, - "grad_norm": 0.08232831209897995, - "kl": 0.00017905235290527344, - "learning_rate": 5.343283582089552e-08, - "loss": 0.0162, - "reward": 0.3710937723517418, - "reward_std": 0.06551747000776231, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 0.09636127948760986, + "kl": 0.00045299530029296875, + "learning_rate": 2.671641791044776e-07, + "loss": 0.0223, + "reward": 0.3839285895228386, + "reward_std": 0.06966402754187584, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830559372902, + "rewards/tag_count_reward": 0.274553582072258, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 2039.1719665527344, + "completion_length": 2035.7701721191406, "epoch": 0.05376745575386454, - "grad_norm": 0.06550660729408264, - "kl": 0.00017905235290527344, - "learning_rate": 5.3731343283582085e-08, - "loss": 0.0085, + "grad_norm": 0.09264959394931793, + "kl": 0.0003719329833984375, + "learning_rate": 2.686567164179104e-07, + "loss": 0.0126, "reward": 0.314174123108387, - "reward_std": 0.06533667258918285, - "rewards/accuracy_reward": 0.05803571757860482, + "reward_std": 0.06786144804209471, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.2628348395228386, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 1987.5022888183594, + "completion_length": 1970.44873046875, "epoch": 0.054066163841386, - "grad_norm": 0.08046255260705948, - "kl": 0.00017452239990234375, - "learning_rate": 5.402985074626866e-08, - "loss": 0.0174, - "reward": 0.3158482238650322, - "reward_std": 0.041268323780968785, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 0.09745245426893234, + "kl": 0.0006742477416992188, + "learning_rate": 2.701492537313433e-07, + "loss": 0.0225, + "reward": 0.318080373108387, + "reward_std": 0.040587071562185884, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375074505806, + "rewards/tag_count_reward": 0.2779018059372902, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 2030.5045471191406, + "completion_length": 2022.649658203125, "epoch": 0.054364871928907474, - "grad_norm": 0.06388089060783386, - "kl": 0.00018215179443359375, - "learning_rate": 5.4328358208955216e-08, - "loss": 0.0041, - "reward": 0.3772321566939354, - "reward_std": 0.035156750585883856, - "rewards/accuracy_reward": 0.11607143376022577, + "grad_norm": 0.09323485195636749, + "kl": 0.0004668235778808594, + "learning_rate": 2.7164179104477607e-07, + "loss": 0.0139, + "reward": 0.3777901977300644, + "reward_std": 0.047922884579747915, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.2661830484867096, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 2040.950927734375, + "completion_length": 2027.26123046875, "epoch": 0.054663580016428945, - "grad_norm": 0.08246054500341415, - "kl": 0.0001652240753173828, - "learning_rate": 5.462686567164179e-08, - "loss": 0.0047, - "reward": 0.2779017984867096, - "reward_std": 0.07219411921687424, - "rewards/accuracy_reward": 0.017857143422588706, + "grad_norm": 0.10046170651912689, + "kl": 0.00045013427734375, + "learning_rate": 2.7313432835820895e-07, + "loss": 0.0174, + "reward": 0.3270089402794838, + "reward_std": 0.1327594155445695, + "rewards/accuracy_reward": 0.049107145983725786, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2779017984867096, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 2026.6339721679688, + "completion_length": 1997.7746276855469, "epoch": 0.054962288103950416, - "grad_norm": 0.06341168284416199, - "kl": 0.0001735687255859375, - "learning_rate": 5.4925373134328355e-08, - "loss": 0.0118, - "reward": 0.3487723395228386, - "reward_std": 0.034377917647361755, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 0.08770574629306793, + "kl": 0.0006394386291503906, + "learning_rate": 2.746268656716418e-07, + "loss": 0.0186, + "reward": 0.3794643059372902, + "reward_std": 0.04777905996888876, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.2767857238650322, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 2036.0313110351562, + "completion_length": 2012.6808776855469, "epoch": 0.05526099619147189, - "grad_norm": 0.06304925680160522, - "kl": 0.0001742839813232422, - "learning_rate": 5.522388059701493e-08, - "loss": 0.0077, - "reward": 0.2963169813156128, - "reward_std": 0.022600972559303045, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.08510146290063858, + "kl": 0.000522613525390625, + "learning_rate": 2.761194029850746e-07, + "loss": 0.0111, + "reward": 0.3231026902794838, + "reward_std": 0.064674180932343, + "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705559372902, + "rewards/tag_count_reward": 0.2717634066939354, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 2006.1763916015625, + "completion_length": 1981.0090026855469, "epoch": 0.05555970427899335, - "grad_norm": 0.07361990213394165, - "kl": 0.00017380714416503906, - "learning_rate": 5.5522388059701486e-08, - "loss": 0.0018, - "reward": 0.3431919813156128, - "reward_std": 0.04154403507709503, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 0.09995196014642715, + "kl": 0.0007429122924804688, + "learning_rate": 2.7761194029850744e-07, + "loss": 0.0247, + "reward": 0.3482143059372902, + "reward_std": 0.04264641599729657, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262834832072258, + "rewards/tag_count_reward": 0.2723214402794838, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 2042.7947082519531, + "completion_length": 2036.5826721191406, "epoch": 0.05585841236651482, - "grad_norm": 0.06344898790121078, - "kl": 0.00017642974853515625, - "learning_rate": 5.582089552238806e-08, - "loss": 0.0015, - "reward": 0.3526785895228386, - "reward_std": 0.038312783697620034, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 0.06772337108850479, + "kl": 0.0004420280456542969, + "learning_rate": 2.7910447761194027e-07, + "loss": 0.0054, + "reward": 0.3554687649011612, + "reward_std": 0.04325869586318731, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285895228386, + "rewards/tag_count_reward": 0.2617187574505806, "step": 187 }, { - "clip_ratio": 0.0, - "completion_length": 2023.6429443359375, - "epoch": 0.05615712045403629, - "grad_norm": 0.08292632550001144, - "kl": 0.00018215179443359375, - "learning_rate": 5.6119402985074625e-08, - "loss": 0.0105, - "reward": 0.3643973395228386, - "reward_std": 0.050688936840742826, - "rewards/accuracy_reward": 0.09598215040750802, + "clip_ratio": 0.0, + "completion_length": 2017.743408203125, + "epoch": 0.05615712045403629, + "grad_norm": 0.08378896862268448, + "kl": 0.0005846023559570312, + "learning_rate": 2.8059701492537315e-07, + "loss": 0.012, + "reward": 0.3738839477300644, + "reward_std": 0.060776281636208296, + "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.2756696566939354, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 2029.2611694335938, + "completion_length": 2031.747802734375, "epoch": 0.056455828541557763, - "grad_norm": 0.03447030857205391, - "kl": 0.00017380714416503906, - "learning_rate": 5.64179104477612e-08, - "loss": -0.0011, - "reward": 0.3348214402794838, - "reward_std": 0.016624096781015396, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 0.060328032821416855, + "kl": 0.0004730224609375, + "learning_rate": 2.82089552238806e-07, + "loss": 0.0074, + "reward": 0.336495541036129, + "reward_std": 0.0339016814250499, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.254464291036129, + "rewards/tag_count_reward": 0.2561384066939354, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 2035.6138916015625, + "completion_length": 2010.9331359863281, "epoch": 0.056754536629079234, - "grad_norm": 0.09496764838695526, - "kl": 0.0001647472381591797, - "learning_rate": 5.6716417910447757e-08, - "loss": 0.01, - "reward": 0.3203125149011612, - "reward_std": 0.06926085543818772, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 0.1147855892777443, + "kl": 0.000637054443359375, + "learning_rate": 2.8358208955223876e-07, + "loss": 0.0203, + "reward": 0.3415178805589676, + "reward_std": 0.07420498505234718, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.2745535895228386, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 2038.6541137695312, + "completion_length": 2015.8929138183594, "epoch": 0.057053244716600705, - "grad_norm": 0.05893353745341301, - "kl": 0.00018167495727539062, - "learning_rate": 5.701492537313433e-08, - "loss": 0.0053, - "reward": 0.2957589402794838, - "reward_std": 0.055102501064538956, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 0.0833522379398346, + "kl": 0.000637054443359375, + "learning_rate": 2.8507462686567164e-07, + "loss": 0.0149, + "reward": 0.3074776977300644, + "reward_std": 0.0650724545121193, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.267299123108387, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 2044.8080444335938, + "completion_length": 2033.3504943847656, "epoch": 0.05735195280412217, - "grad_norm": 0.05859486386179924, - "kl": 0.00018548965454101562, - "learning_rate": 5.7313432835820895e-08, - "loss": 0.0019, - "reward": 0.344866082072258, - "reward_std": 0.040178572526201606, + "grad_norm": 0.07649856805801392, + "kl": 0.000545501708984375, + "learning_rate": 2.8656716417910447e-07, + "loss": 0.0104, + "reward": 0.3470982238650322, + "reward_std": 0.05379192181862891, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.2578125074505806, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 2025.6318054199219, + "completion_length": 1989.2947387695312, "epoch": 0.05765066089164364, - "grad_norm": 0.06982550024986267, - "kl": 0.00018024444580078125, - "learning_rate": 5.7611940298507454e-08, - "loss": 0.0156, - "reward": 0.3504464477300644, - "reward_std": 0.046196039067581296, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 0.11090464890003204, + "kl": 0.0008726119995117188, + "learning_rate": 2.880597014925373e-07, + "loss": 0.0297, + "reward": 0.3828125149011612, + "reward_std": 0.0587000735104084, + "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.2756696566939354, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 2036.7098999023438, + "completion_length": 2020.8683776855469, "epoch": 0.05794936897916511, - "grad_norm": 0.0578986294567585, - "kl": 0.00018095970153808594, - "learning_rate": 5.791044776119403e-08, - "loss": 0.0074, - "reward": 0.2935267984867096, - "reward_std": 0.014906345633789897, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.09388872981071472, + "kl": 0.0006723403930664062, + "learning_rate": 2.8955223880597013e-07, + "loss": 0.0177, + "reward": 0.3046875074505806, + "reward_std": 0.03817676939070225, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.266741082072258, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 2019.1407165527344, + "completion_length": 1999.3527221679688, "epoch": 0.05824807706668658, - "grad_norm": 0.08268532156944275, - "kl": 0.00017595291137695312, - "learning_rate": 5.820895522388059e-08, - "loss": 0.0091, - "reward": 0.3018973246216774, - "reward_std": 0.08250449411571026, - "rewards/accuracy_reward": 0.04017857299186289, + "grad_norm": 0.07774338871240616, + "kl": 0.0007452964782714844, + "learning_rate": 2.9104477611940296e-07, + "loss": 0.0094, + "reward": 0.3264508992433548, + "reward_std": 0.07910588569939137, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187574505806, + "rewards/tag_count_reward": 0.2684151865541935, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 2030.3638916015625, + "completion_length": 2013.7143859863281, "epoch": 0.05854678515420805, - "grad_norm": 0.05854840576648712, - "kl": 0.00018405914306640625, - "learning_rate": 5.8507462686567165e-08, - "loss": 0.0106, - "reward": 0.3325893059372902, - "reward_std": 0.017306024441495538, + "grad_norm": 0.10577115416526794, + "kl": 0.0007781982421875, + "learning_rate": 2.9253731343283584e-07, + "loss": 0.0289, + "reward": 0.345424123108387, + "reward_std": 0.04334849026054144, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.2739955559372902, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 2010.5648193359375, + "completion_length": 1994.1607666015625, "epoch": 0.05884549324172952, - "grad_norm": 0.06504366546869278, - "kl": 0.00017571449279785156, - "learning_rate": 5.8805970149253724e-08, - "loss": 0.0114, - "reward": 0.2695312649011612, - "reward_std": 0.027531619183719158, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.08896368741989136, + "kl": 0.000904083251953125, + "learning_rate": 2.940298507462686e-07, + "loss": 0.0191, + "reward": 0.2801339402794838, + "reward_std": 0.045548498164862394, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.2712053656578064, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 2028.884033203125, + "completion_length": 2018.2813110351562, "epoch": 0.05914420132925099, - "grad_norm": 0.07553350925445557, - "kl": 0.00016498565673828125, - "learning_rate": 5.91044776119403e-08, - "loss": 0.0178, - "reward": 0.325892873108387, - "reward_std": 0.0671832945663482, - "rewards/accuracy_reward": 0.06473214598372579, + "grad_norm": 0.09556843340396881, + "kl": 0.0007686614990234375, + "learning_rate": 2.955223880597015e-07, + "loss": 0.0131, + "reward": 0.3565848395228386, + "reward_std": 0.10666803759522736, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.2717634066939354, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 2038.5380249023438, + "completion_length": 2009.1942749023438, "epoch": 0.05944290941677246, - "grad_norm": 0.043797723948955536, - "kl": 0.0001773834228515625, - "learning_rate": 5.940298507462686e-08, - "loss": 0.0068, - "reward": 0.258928582072258, - "reward_std": 0.01954634115099907, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.0993853360414505, + "kl": 0.0008955001831054688, + "learning_rate": 2.9701492537313433e-07, + "loss": 0.0222, + "reward": 0.2812500149011612, + "reward_std": 0.0642466563731432, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.2700892984867096, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 2021.8326110839844, + "completion_length": 2008.6295166015625, "epoch": 0.05974161750429393, - "grad_norm": 0.06777962297201157, - "kl": 0.00017690658569335938, - "learning_rate": 5.970149253731343e-08, - "loss": 0.008, - "reward": 0.270089291036129, - "reward_std": 0.0323697947897017, + "grad_norm": 0.09655937552452087, + "kl": 0.0009717941284179688, + "learning_rate": 2.985074626865671e-07, + "loss": 0.0233, + "reward": 0.275111623108387, + "reward_std": 0.042582636466249824, "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2633928656578064, + "rewards/tag_count_reward": 0.2684151902794838, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 2047.810302734375, + "completion_length": 2045.0759582519531, "epoch": 0.0600403255918154, - "grad_norm": 0.03844344988465309, - "kl": 0.0001952648162841797, - "learning_rate": 6e-08, - "loss": 0.0002, - "reward": 0.2873883992433548, - "reward_std": 0.005281830672174692, + "grad_norm": 0.0667165145277977, + "kl": 0.0006380081176757812, + "learning_rate": 3e-07, + "loss": 0.0034, + "reward": 0.2929687649011612, + "reward_std": 0.017887577414512634, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2516741082072258, + "rewards/tag_count_reward": 0.2572544738650322, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 2014.3884582519531, + "completion_length": 1981.1585083007812, "epoch": 0.060339033679336865, - "grad_norm": 0.06974104046821594, - "kl": 0.00018215179443359375, - "learning_rate": 6.029850746268656e-08, - "loss": 0.0169, - "reward": 0.3364955484867096, - "reward_std": 0.029421457555145025, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 0.08809357136487961, + "kl": 0.0011606216430664062, + "learning_rate": 3.014925373134328e-07, + "loss": 0.0175, + "reward": 0.369419664144516, + "reward_std": 0.026709757279604673, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.266741082072258, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 2026.5803833007812, + "completion_length": 2015.9219360351562, "epoch": 0.060637741766858336, - "grad_norm": 0.07938475906848907, - "kl": 0.0001747608184814453, - "learning_rate": 6.059701492537314e-08, - "loss": 0.0059, - "reward": 0.2706473246216774, - "reward_std": 0.03765821922570467, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.09069853276014328, + "kl": 0.0008955001831054688, + "learning_rate": 3.029850746268657e-07, + "loss": 0.0076, + "reward": 0.2885044738650322, + "reward_std": 0.0576875985134393, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266183041036129, + "rewards/tag_count_reward": 0.275111623108387, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 2033.1942443847656, + "completion_length": 2000.76123046875, "epoch": 0.06093644985437981, - "grad_norm": 0.07190212607383728, - "kl": 0.000186920166015625, - "learning_rate": 6.089552238805969e-08, - "loss": 0.0121, - "reward": 0.3498884066939354, - "reward_std": 0.04718865570612252, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 0.09514117985963821, + "kl": 0.0011749267578125, + "learning_rate": 3.044776119402985e-07, + "loss": 0.0227, + "reward": 0.3738839477300644, + "reward_std": 0.0642862762324512, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.2734375149011612, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 2022.5447082519531, + "completion_length": 1987.99560546875, "epoch": 0.06123515794190128, - "grad_norm": 0.08950942754745483, - "kl": 0.0001690387725830078, - "learning_rate": 6.119402985074627e-08, - "loss": 0.0164, - "reward": 0.3253348395228386, - "reward_std": 0.07001883792690933, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 0.13551844656467438, + "kl": 0.001445770263671875, + "learning_rate": 3.059701492537313e-07, + "loss": 0.0227, + "reward": 0.365513414144516, + "reward_std": 0.09118174319155514, + "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.2918526902794838, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 2036.4554138183594, + "completion_length": 2022.5960693359375, "epoch": 0.06153386602942275, - "grad_norm": 0.06489112228155136, - "kl": 0.0001857280731201172, - "learning_rate": 6.149253731343284e-08, - "loss": 0.0098, - "reward": 0.258928582072258, - "reward_std": 0.0246277314145118, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.07157505303621292, + "kl": 0.0010194778442382812, + "learning_rate": 3.074626865671642e-07, + "loss": 0.0168, + "reward": 0.2689732238650322, + "reward_std": 0.03119064774364233, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.2645089402794838, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 2025.58935546875, + "completion_length": 1997.1228637695312, "epoch": 0.06183257411694422, - "grad_norm": 0.07683850079774857, - "kl": 0.00018262863159179688, - "learning_rate": 6.17910447761194e-08, - "loss": 0.0146, - "reward": 0.3945312723517418, - "reward_std": 0.08876218646764755, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 0.13191767036914825, + "kl": 0.00151824951171875, + "learning_rate": 3.08955223880597e-07, + "loss": 0.0221, + "reward": 0.4386160969734192, + "reward_std": 0.11744646169245243, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.2845982238650322, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 2025.40185546875, + "completion_length": 2015.1875915527344, "epoch": 0.06213128220446568, - "grad_norm": 0.06477277725934982, - "kl": 0.00017714500427246094, - "learning_rate": 6.208955223880597e-08, - "loss": 0.0186, - "reward": 0.258928582072258, - "reward_std": 0.018223890103399754, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.10034525394439697, + "kl": 0.001201629638671875, + "learning_rate": 3.1044776119402985e-07, + "loss": 0.0177, + "reward": 0.2745535895228386, + "reward_std": 0.0458978358656168, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.2700893059372902, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 1994.9286193847656, + "completion_length": 1959.9041137695312, "epoch": 0.062429990291987154, - "grad_norm": 0.06651078164577484, - "kl": 0.0001800060272216797, - "learning_rate": 6.238805970149253e-08, - "loss": 0.0207, - "reward": 0.2996651902794838, - "reward_std": 0.007592200767248869, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.09279676526784897, + "kl": 0.001972198486328125, + "learning_rate": 3.119402985074627e-07, + "loss": 0.0086, + "reward": 0.3130580484867096, + "reward_std": 0.029862043214961886, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.275111623108387, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 2033.6964721679688, + "completion_length": 2003.9018249511719, "epoch": 0.06272869837950862, - "grad_norm": 0.0769982859492302, - "kl": 0.00017023086547851562, - "learning_rate": 6.26865671641791e-08, - "loss": 0.0047, - "reward": 0.3041294813156128, - "reward_std": 0.04018836980685592, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 0.11280187964439392, + "kl": 0.0014495849609375, + "learning_rate": 3.134328358208955e-07, + "loss": 0.0172, + "reward": 0.3231026902794838, + "reward_std": 0.0484922849573195, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.2806919813156128, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 1991.6831665039062, + "completion_length": 1956.9063415527344, "epoch": 0.06302740646703009, - "grad_norm": 0.0942424014210701, - "kl": 0.0001850128173828125, - "learning_rate": 6.298507462686567e-08, - "loss": 0.0228, - "reward": 0.364955373108387, - "reward_std": 0.05182072729803622, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 0.10100255906581879, + "kl": 0.002162933349609375, + "learning_rate": 3.1492537313432833e-07, + "loss": 0.005, + "reward": 0.3973214328289032, + "reward_std": 0.06160887097939849, + "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.285714291036129, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 2017.8572082519531, + "completion_length": 1979.3326721191406, "epoch": 0.06332611455455156, - "grad_norm": 0.08709774166345596, - "kl": 0.00017762184143066406, - "learning_rate": 6.328358208955223e-08, - "loss": 0.0155, - "reward": 0.3409598395228386, - "reward_std": 0.03618597984313965, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 0.12217862159013748, + "kl": 0.00196075439453125, + "learning_rate": 3.1641791044776116e-07, + "loss": 0.0191, + "reward": 0.368861623108387, + "reward_std": 0.07665435736998916, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.2795758992433548, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 2004.0245971679688, + "completion_length": 1962.6607666015625, "epoch": 0.06362482264207303, - "grad_norm": 0.07841403037309647, - "kl": 0.0001785755157470703, - "learning_rate": 6.35820895522388e-08, - "loss": 0.0095, - "reward": 0.3381696566939354, - "reward_std": 0.05418444913811982, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 0.10036396980285645, + "kl": 0.0022592544555664062, + "learning_rate": 3.1791044776119405e-07, + "loss": 0.0117, + "reward": 0.3549107313156128, + "reward_std": 0.07280908105894923, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732313156128, + "rewards/tag_count_reward": 0.2834821566939354, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 2010.6585388183594, + "completion_length": 1977.3594665527344, "epoch": 0.0639235307295945, - "grad_norm": 0.06797685474157333, - "kl": 0.0001823902130126953, - "learning_rate": 6.388059701492538e-08, - "loss": 0.0132, - "reward": 0.3694196492433548, - "reward_std": 0.034742471762001514, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 0.1393975466489792, + "kl": 0.00225067138671875, + "learning_rate": 3.194029850746269e-07, + "loss": 0.0257, + "reward": 0.3978794813156128, + "reward_std": 0.07919775880873203, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089328289032, + "rewards/tag_count_reward": 0.2818080484867096, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 2041.3460388183594, + "completion_length": 2023.7500915527344, "epoch": 0.06422223881711597, - "grad_norm": 0.06770811975002289, - "kl": 0.00017976760864257812, - "learning_rate": 6.417910447761193e-08, - "loss": 0.0062, - "reward": 0.2968750149011612, - "reward_std": 0.027413203148171306, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.14747878909111023, + "kl": 0.001598358154296875, + "learning_rate": 3.2089552238805965e-07, + "loss": 0.02, + "reward": 0.3119419738650322, + "reward_std": 0.0637703463435173, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.2717634066939354, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 2038.0156555175781, + "completion_length": 2020.6675109863281, "epoch": 0.06452094690463744, - "grad_norm": 0.07012468576431274, - "kl": 0.0001742839813232422, - "learning_rate": 6.447761194029851e-08, - "loss": 0.01, - "reward": 0.2678571566939354, - "reward_std": 0.049191949190571904, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 0.13431069254875183, + "kl": 0.0016956329345703125, + "learning_rate": 3.2238805970149253e-07, + "loss": 0.0227, + "reward": 0.2935268059372902, + "reward_std": 0.08633354329504073, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964477300644, + "rewards/tag_count_reward": 0.2734375149011612, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 2007.2165832519531, + "completion_length": 1989.3192749023438, "epoch": 0.06481965499215891, - "grad_norm": 0.08644060045480728, - "kl": 0.000171661376953125, - "learning_rate": 6.477611940298508e-08, - "loss": 0.0129, - "reward": 0.3839285969734192, - "reward_std": 0.03191607724875212, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 0.10301192104816437, + "kl": 0.0023717880249023438, + "learning_rate": 3.2388059701492536e-07, + "loss": 0.0061, + "reward": 0.3889509066939354, + "reward_std": 0.03769204462878406, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.2818080484867096, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 2032.7322387695312, + "completion_length": 2009.4532470703125, "epoch": 0.06511836307968039, - "grad_norm": 0.0795249342918396, - "kl": 0.00018215179443359375, - "learning_rate": 6.507462686567163e-08, - "loss": 0.0141, - "reward": 0.270089291036129, - "reward_std": 0.04693942563608289, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.11261283606290817, + "kl": 0.0020198822021484375, + "learning_rate": 3.253731343283582e-07, + "loss": 0.0203, + "reward": 0.286272332072258, + "reward_std": 0.059363205917179585, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.2728794738650322, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 2044.7098693847656, + "completion_length": 2034.0290832519531, "epoch": 0.06541707116720186, - "grad_norm": 0.05372878164052963, - "kl": 0.0001728534698486328, - "learning_rate": 6.537313432835821e-08, - "loss": 0.0003, - "reward": 0.2924107238650322, - "reward_std": 0.01779635902494192, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.1040838211774826, + "kl": 0.0014801025390625, + "learning_rate": 3.26865671641791e-07, + "loss": 0.011, + "reward": 0.3191964477300644, + "reward_std": 0.06018436141312122, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.254464291036129, + "rewards/tag_count_reward": 0.2723214402794838, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 2036.2835388183594, + "completion_length": 2023.2723999023438, "epoch": 0.06571577925472333, - "grad_norm": 0.06349258869886398, - "kl": 0.0001747608184814453, - "learning_rate": 6.567164179104477e-08, - "loss": 0.0073, - "reward": 0.333705373108387, - "reward_std": 0.025559590198099613, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.10817437618970871, + "kl": 0.0018329620361328125, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.0171, + "reward": 0.3465401977300644, + "reward_std": 0.04764120141044259, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446492433548, + "rewards/tag_count_reward": 0.270647332072258, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 2024.2433776855469, + "completion_length": 2003.1853332519531, "epoch": 0.06601448734224478, - "grad_norm": 0.0750095546245575, - "kl": 0.00018548965454101562, - "learning_rate": 6.597014925373134e-08, - "loss": 0.0094, - "reward": 0.2979910895228386, - "reward_std": 0.015027947491034865, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.1078382208943367, + "kl": 0.0023593902587890625, + "learning_rate": 3.2985074626865673e-07, + "loss": 0.0206, + "reward": 0.3125000223517418, + "reward_std": 0.03786096489056945, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622768059372902, + "rewards/tag_count_reward": 0.2745535895228386, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 2024.4777526855469, + "completion_length": 1965.6384582519531, "epoch": 0.06631319542976626, - "grad_norm": 0.060976795852184296, - "kl": 0.00018405914306640625, - "learning_rate": 6.62686567164179e-08, - "loss": 0.0131, - "reward": 0.3303571566939354, - "reward_std": 0.05990245519205928, - "rewards/accuracy_reward": 0.06919643236324191, + "grad_norm": 0.10188373178243637, + "kl": 0.003177642822265625, + "learning_rate": 3.313432835820895e-07, + "loss": 0.0258, + "reward": 0.3800223395228386, + "reward_std": 0.06845005787909031, + "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.2840401977300644, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 2044.7232666015625, + "completion_length": 2022.44873046875, "epoch": 0.06661190351728773, - "grad_norm": 0.07003290206193924, - "kl": 0.0001823902130126953, - "learning_rate": 6.656716417910447e-08, - "loss": 0.0032, - "reward": 0.344866082072258, - "reward_std": 0.04686067556031048, - "rewards/accuracy_reward": 0.08705357694998384, + "grad_norm": 0.12264679372310638, + "kl": 0.0021533966064453125, + "learning_rate": 3.328358208955224e-07, + "loss": 0.0181, + "reward": 0.3683035895228386, + "reward_std": 0.0722658357117325, + "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.2745535895228386, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 2044.7634582519531, + "completion_length": 2020.4911499023438, "epoch": 0.0669106116048092, - "grad_norm": 0.06935867667198181, - "kl": 0.00017404556274414062, - "learning_rate": 6.686567164179105e-08, - "loss": 0.002, - "reward": 0.3744419887661934, - "reward_std": 0.04867021832615137, - "rewards/accuracy_reward": 0.11607143259607255, + "grad_norm": 0.12676876783370972, + "kl": 0.0020999908447265625, + "learning_rate": 3.343283582089552e-07, + "loss": 0.0238, + "reward": 0.3822544813156128, + "reward_std": 0.06705653108656406, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.270647332072258, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 2045.0870971679688, + "completion_length": 2028.0960693359375, "epoch": 0.06720931969233067, - "grad_norm": 0.06359179317951202, - "kl": 0.000171661376953125, - "learning_rate": 6.716417910447762e-08, - "loss": 0.0019, - "reward": 0.3002232164144516, - "reward_std": 0.0347850089892745, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 0.0972866490483284, + "kl": 0.002017974853515625, + "learning_rate": 3.3582089552238805e-07, + "loss": 0.0126, + "reward": 0.3130580484867096, + "reward_std": 0.0542047219350934, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.2728794738650322, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 2037.4777221679688, + "completion_length": 2003.4643859863281, "epoch": 0.06750802777985214, - "grad_norm": 0.04886753484606743, - "kl": 0.0001704692840576172, - "learning_rate": 6.746268656716417e-08, - "loss": 0.0055, - "reward": 0.294084832072258, - "reward_std": 0.0158645196352154, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 0.11222253739833832, + "kl": 0.0025501251220703125, + "learning_rate": 3.373134328358209e-07, + "loss": 0.0246, + "reward": 0.3242187723517418, + "reward_std": 0.07053999602794647, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.279575914144516, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 2043.0402526855469, + "completion_length": 2014.6853637695312, "epoch": 0.06780673586737361, - "grad_norm": 0.0828593447804451, - "kl": 0.0001742839813232422, - "learning_rate": 6.776119402985075e-08, - "loss": 0.0044, - "reward": 0.2639508992433548, - "reward_std": 0.03887422033585608, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.11316769570112228, + "kl": 0.0024852752685546875, + "learning_rate": 3.388059701492537e-07, + "loss": 0.0155, + "reward": 0.3030134066939354, + "reward_std": 0.09731542132794857, + "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.2829241156578064, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 2022.3549499511719, + "completion_length": 1990.4375610351562, "epoch": 0.06810544395489508, - "grad_norm": 0.07621882855892181, - "kl": 0.000171661376953125, - "learning_rate": 6.805970149253731e-08, - "loss": 0.0121, - "reward": 0.2991071566939354, - "reward_std": 0.06104383198544383, - "rewards/accuracy_reward": 0.03348214365541935, + "grad_norm": 0.12635187804698944, + "kl": 0.00305938720703125, + "learning_rate": 3.402985074626866e-07, + "loss": 0.026, + "reward": 0.3331473395228386, + "reward_std": 0.130776546895504, + "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.2840401902794838, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 2041.3795166015625, + "completion_length": 2024.805908203125, "epoch": 0.06840415204241655, - "grad_norm": 0.07036849856376648, - "kl": 0.00016951560974121094, - "learning_rate": 6.835820895522388e-08, - "loss": 0.0061, - "reward": 0.2678571566939354, - "reward_std": 0.04284272133372724, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.10100309550762177, + "kl": 0.002277374267578125, + "learning_rate": 3.4179104477611937e-07, + "loss": 0.021, + "reward": 0.2929687649011612, + "reward_std": 0.07502161804586649, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.2728794813156128, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 2019.6161804199219, + "completion_length": 1949.1451721191406, "epoch": 0.06870286012993802, - "grad_norm": 0.07226591557264328, - "kl": 0.0001785755157470703, - "learning_rate": 6.865671641791045e-08, - "loss": 0.0188, - "reward": 0.3052455559372902, - "reward_std": 0.03287034225650132, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.11945144832134247, + "kl": 0.004306793212890625, + "learning_rate": 3.432835820895522e-07, + "loss": 0.0318, + "reward": 0.3420759066939354, + "reward_std": 0.0771923428401351, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.297433041036129, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 2008.0536499023438, + "completion_length": 1954.1429443359375, "epoch": 0.0690015682174595, - "grad_norm": 0.0774410143494606, - "kl": 0.0001819133758544922, - "learning_rate": 6.895522388059701e-08, - "loss": 0.0244, - "reward": 0.3286830484867096, - "reward_std": 0.061982205137610435, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 0.1123351976275444, + "kl": 0.004077911376953125, + "learning_rate": 3.447761194029851e-07, + "loss": 0.0268, + "reward": 0.3789062574505806, + "reward_std": 0.10282119456678629, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266183041036129, + "rewards/tag_count_reward": 0.2918526902794838, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 2012.5849304199219, + "completion_length": 1985.8438110351562, "epoch": 0.06930027630498095, - "grad_norm": 0.06406281888484955, - "kl": 0.0001881122589111328, - "learning_rate": 6.925373134328359e-08, - "loss": 0.014, - "reward": 0.3420759066939354, - "reward_std": 0.028559773229062557, + "grad_norm": 0.11126238852739334, + "kl": 0.003376007080078125, + "learning_rate": 3.462686567164179e-07, + "loss": 0.0137, + "reward": 0.3593750149011612, + "reward_std": 0.060739551205188036, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.2812500074505806, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 2018.4888610839844, + "completion_length": 1960.2098999023438, "epoch": 0.06959898439250242, - "grad_norm": 0.06883462518453598, - "kl": 0.00018668174743652344, - "learning_rate": 6.955223880597014e-08, - "loss": 0.0181, - "reward": 0.3348214402794838, - "reward_std": 0.019397195195779204, + "grad_norm": 0.11529677361249924, + "kl": 0.004241943359375, + "learning_rate": 3.477611940298507e-07, + "loss": 0.036, + "reward": 0.3599330484867096, + "reward_std": 0.04292834363877773, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2633928656578064, + "rewards/tag_count_reward": 0.2885044813156128, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 2000.4531860351562, + "completion_length": 1950.6429443359375, "epoch": 0.06989769248002389, - "grad_norm": 0.09442322701215744, - "kl": 0.00018167495727539062, - "learning_rate": 6.985074626865671e-08, - "loss": 0.0216, - "reward": 0.3554687649011612, - "reward_std": 0.047269190894439816, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 0.14292439818382263, + "kl": 0.0045623779296875, + "learning_rate": 3.4925373134328357e-07, + "loss": 0.0525, + "reward": 0.3883928805589676, + "reward_std": 0.07674214523285627, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473395228386, + "rewards/tag_count_reward": 0.294642873108387, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 2038.3170166015625, + "completion_length": 2024.9487609863281, "epoch": 0.07019640056754536, - "grad_norm": 0.06465152651071548, - "kl": 0.00017976760864257812, - "learning_rate": 7.014925373134329e-08, - "loss": 0.007, - "reward": 0.2979910895228386, - "reward_std": 0.035179960541427135, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 0.1095770075917244, + "kl": 0.0025806427001953125, + "learning_rate": 3.507462686567164e-07, + "loss": 0.0118, + "reward": 0.318080373108387, + "reward_std": 0.06389593239873648, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.2779017984867096, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 2039.341552734375, + "completion_length": 1993.9398498535156, "epoch": 0.07049510865506683, - "grad_norm": 0.05660472810268402, - "kl": 0.0001723766326904297, - "learning_rate": 7.044776119402984e-08, - "loss": 0.0053, - "reward": 0.3816964477300644, - "reward_std": 0.030476388055831194, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 0.12697340548038483, + "kl": 0.003467559814453125, + "learning_rate": 3.5223880597014923e-07, + "loss": 0.0335, + "reward": 0.4174107313156128, + "reward_std": 0.08734581060707569, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285746216774, + "rewards/tag_count_reward": 0.2879464402794838, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 2031.8371276855469, + "completion_length": 1992.6808776855469, "epoch": 0.0707938167425883, - "grad_norm": 0.07402362674474716, - "kl": 0.00017333030700683594, - "learning_rate": 7.07462686567164e-08, - "loss": 0.0067, - "reward": 0.305803582072258, - "reward_std": 0.05324844201095402, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.11588187515735626, + "kl": 0.003673553466796875, + "learning_rate": 3.5373134328358206e-07, + "loss": 0.0204, + "reward": 0.349330373108387, + "reward_std": 0.08447225950658321, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.2912946566939354, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 2007.1763916015625, + "completion_length": 1927.74560546875, "epoch": 0.07109252483010978, - "grad_norm": 0.09320622682571411, - "kl": 0.00018334388732910156, - "learning_rate": 7.104477611940299e-08, - "loss": 0.0111, - "reward": 0.3046875074505806, - "reward_std": 0.029972289223223925, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 0.15074247121810913, + "kl": 0.005779266357421875, + "learning_rate": 3.552238805970149e-07, + "loss": 0.0528, + "reward": 0.3822544738650322, + "reward_std": 0.11215895228087902, + "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.3063616156578064, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 2040.5759582519531, + "completion_length": 1994.2389221191406, "epoch": 0.07139123291763125, - "grad_norm": 0.06865691393613815, - "kl": 0.00017786026000976562, - "learning_rate": 7.134328358208955e-08, - "loss": 0.0092, - "reward": 0.329799123108387, - "reward_std": 0.027131953742355108, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.1309901624917984, + "kl": 0.003681182861328125, + "learning_rate": 3.5671641791044777e-07, + "loss": 0.0373, + "reward": 0.3560268133878708, + "reward_std": 0.07334023341536522, + "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561383992433548, + "rewards/tag_count_reward": 0.2779017984867096, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 2038.9219665527344, + "completion_length": 1966.6362609863281, "epoch": 0.07168994100515272, - "grad_norm": 0.07534286379814148, - "kl": 0.00018548965454101562, - "learning_rate": 7.164179104477612e-08, - "loss": 0.0089, - "reward": 0.306361623108387, - "reward_std": 0.04350207722745836, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 0.1274717003107071, + "kl": 0.005023956298828125, + "learning_rate": 3.5820895522388055e-07, + "loss": 0.02, + "reward": 0.357142873108387, + "reward_std": 0.09366848599165678, + "rewards/accuracy_reward": 0.053571432596072555, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.3035714477300644, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 2014.1719055175781, + "completion_length": 1948.4732971191406, "epoch": 0.07198864909267419, - "grad_norm": 0.08272381871938705, - "kl": 0.00018286705017089844, - "learning_rate": 7.194029850746268e-08, - "loss": 0.0079, - "reward": 0.3638392984867096, - "reward_std": 0.07487664488144219, - "rewards/accuracy_reward": 0.09598214598372579, + "grad_norm": 0.11765734851360321, + "kl": 0.00518798828125, + "learning_rate": 3.5970149253731343e-07, + "loss": 0.0277, + "reward": 0.4274553805589676, + "reward_std": 0.124588243663311, + "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267857164144516, + "rewards/tag_count_reward": 0.3024553656578064, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 2046.6630249023438, + "completion_length": 1994.3416137695312, "epoch": 0.07228735718019566, - "grad_norm": 0.0700802430510521, - "kl": 0.0001876354217529297, - "learning_rate": 7.223880597014925e-08, - "loss": 0.0018, - "reward": 0.325892873108387, - "reward_std": 0.015027947491034865, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 0.13728979229927063, + "kl": 0.004390716552734375, + "learning_rate": 3.6119402985074626e-07, + "loss": 0.0298, + "reward": 0.3867187723517418, + "reward_std": 0.0792832737788558, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2544642984867096, + "rewards/tag_count_reward": 0.2907366305589676, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 2031.5469360351562, + "completion_length": 2010.3750915527344, "epoch": 0.07258606526771712, - "grad_norm": 0.05091555789113045, - "kl": 0.0001709461212158203, - "learning_rate": 7.253731343283583e-08, - "loss": 0.0081, - "reward": 0.3152901902794838, - "reward_std": 0.027603259775787592, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 0.12306191027164459, + "kl": 0.003597259521484375, + "learning_rate": 3.626865671641791e-07, + "loss": 0.0291, + "reward": 0.3353794887661934, + "reward_std": 0.08254489116370678, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544664144516, + "rewards/tag_count_reward": 0.279575914144516, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 2039.2567443847656, + "completion_length": 1989.7612609863281, "epoch": 0.07288477335523859, - "grad_norm": 0.06395834684371948, - "kl": 0.00017213821411132812, - "learning_rate": 7.283582089552238e-08, - "loss": 0.0099, - "reward": 0.4029018059372902, - "reward_std": 0.027721676276996732, - "rewards/accuracy_reward": 0.14732143515720963, + "grad_norm": 0.1261904090642929, + "kl": 0.0045013427734375, + "learning_rate": 3.641791044776119e-07, + "loss": 0.0369, + "reward": 0.4497768133878708, + "reward_std": 0.09010925143957138, + "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.2868303656578064, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 2013.1763916015625, + "completion_length": 1933.2344665527344, "epoch": 0.07318348144276006, - "grad_norm": 0.07908426225185394, - "kl": 0.00017595291137695312, - "learning_rate": 7.313432835820895e-08, - "loss": 0.0161, - "reward": 0.271205373108387, - "reward_std": 0.02809590636752546, + "grad_norm": 0.10987003892660141, + "kl": 0.0060577392578125, + "learning_rate": 3.6567164179104475e-07, + "loss": 0.025, + "reward": 0.3018973395228386, + "reward_std": 0.05150948045775294, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.2996651977300644, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 2038.1384582519531, + "completion_length": 2002.0357971191406, "epoch": 0.07348218953028153, - "grad_norm": 0.06283558905124664, - "kl": 0.00017762184143066406, - "learning_rate": 7.343283582089553e-08, - "loss": 0.0081, - "reward": 0.4229910969734192, - "reward_std": 0.05596610438078642, - "rewards/accuracy_reward": 0.16294643399305642, + "grad_norm": 0.1187586858868599, + "kl": 0.0045013427734375, + "learning_rate": 3.6716417910447763e-07, + "loss": 0.0237, + "reward": 0.4743303880095482, + "reward_std": 0.11379939876496792, + "rewards/accuracy_reward": 0.18750000093132257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2868303656578064, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 2045.5134582519531, + "completion_length": 2016.8058471679688, "epoch": 0.073780897617803, - "grad_norm": 0.07397637516260147, - "kl": 0.00017333030700683594, - "learning_rate": 7.373134328358208e-08, - "loss": 0.0046, - "reward": 0.3297991156578064, - "reward_std": 0.03348214295692742, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 0.1318998634815216, + "kl": 0.00414276123046875, + "learning_rate": 3.686567164179104e-07, + "loss": 0.0252, + "reward": 0.3755580484867096, + "reward_std": 0.11344264447689056, + "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.2885044813156128, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 2034.9107666015625, + "completion_length": 1981.5514221191406, "epoch": 0.07407960570532447, - "grad_norm": 0.06232650950551033, - "kl": 0.00017714500427246094, - "learning_rate": 7.402985074626866e-08, - "loss": 0.0096, - "reward": 0.2706473395228386, - "reward_std": 0.04423070792108774, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 0.12940748035907745, + "kl": 0.005161285400390625, + "learning_rate": 3.7014925373134323e-07, + "loss": 0.0354, + "reward": 0.3069196566939354, + "reward_std": 0.08730226382613182, + "rewards/accuracy_reward": 0.01116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.2957589402794838, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 2043.3304138183594, + "completion_length": 1978.3795776367188, "epoch": 0.07437831379284594, - "grad_norm": 0.07078929245471954, - "kl": 0.0001819133758544922, - "learning_rate": 7.432835820895522e-08, - "loss": 0.0039, - "reward": 0.3370535895228386, - "reward_std": 0.03600323898717761, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 0.1312132328748703, + "kl": 0.005615234375, + "learning_rate": 3.716417910447761e-07, + "loss": 0.0268, + "reward": 0.3816964477300644, + "reward_std": 0.08822421077638865, + "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285895228386, + "rewards/tag_count_reward": 0.294642873108387, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 2039.7277221679688, + "completion_length": 2002.4532165527344, "epoch": 0.07467702188036741, - "grad_norm": 0.06750496476888657, - "kl": 0.00018525123596191406, - "learning_rate": 7.462686567164179e-08, - "loss": 0.0065, - "reward": 0.2578125149011612, - "reward_std": 0.017809625016525388, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.13362780213356018, + "kl": 0.00518035888671875, + "learning_rate": 3.7313432835820895e-07, + "loss": 0.032, + "reward": 0.2996651902794838, + "reward_std": 0.07552485540509224, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.2974330484867096, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 2045.8661193847656, + "completion_length": 1987.8326721191406, "epoch": 0.07497572996788888, - "grad_norm": 0.06724486500024796, - "kl": 0.0001697540283203125, - "learning_rate": 7.492537313432836e-08, - "loss": 0.0027, - "reward": 0.2606026902794838, - "reward_std": 0.036972816567867994, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.12010441720485687, + "kl": 0.00563812255859375, + "learning_rate": 3.746268656716418e-07, + "loss": 0.0278, + "reward": 0.3599330484867096, + "reward_std": 0.12494660541415215, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.2996651902794838, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 2041.01123046875, + "completion_length": 1997.212158203125, "epoch": 0.07527443805541036, - "grad_norm": 0.07399393618106842, - "kl": 0.0001735687255859375, - "learning_rate": 7.522388059701492e-08, - "loss": 0.0082, - "reward": 0.294642873108387, - "reward_std": 0.02121879020705819, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.1480213850736618, + "kl": 0.0054168701171875, + "learning_rate": 3.761194029850746e-07, + "loss": 0.0296, + "reward": 0.3325892984867096, + "reward_std": 0.07889928296208382, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.294642873108387, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 2031.8750915527344, + "completion_length": 1922.7322082519531, "epoch": 0.07557314614293181, - "grad_norm": 0.09286254644393921, - "kl": 0.00017213821411132812, - "learning_rate": 7.552238805970149e-08, - "loss": 0.01, - "reward": 0.3856026977300644, - "reward_std": 0.09623699868097901, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 0.14510825276374817, + "kl": 0.00768280029296875, + "learning_rate": 3.7761194029850743e-07, + "loss": 0.0486, + "reward": 0.5000000149011612, + "reward_std": 0.1937486194074154, + "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2739955484867096, + "rewards/tag_count_reward": 0.330357164144516, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 2042.3103332519531, + "completion_length": 1985.1451721191406, "epoch": 0.07587185423045328, - "grad_norm": 0.0871184840798378, - "kl": 0.00017309188842773438, - "learning_rate": 7.582089552238805e-08, - "loss": 0.0073, - "reward": 0.2728794813156128, - "reward_std": 0.060657752910628915, - "rewards/accuracy_reward": 0.013392857508733869, + "grad_norm": 0.13130295276641846, + "kl": 0.00585174560546875, + "learning_rate": 3.7910447761194026e-07, + "loss": 0.0307, + "reward": 0.3342634066939354, + "reward_std": 0.12067104503512383, + "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.259486623108387, + "rewards/tag_count_reward": 0.2918526902794838, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 2036.0469665527344, + "completion_length": 1950.2768859863281, "epoch": 0.07617056231797475, - "grad_norm": 0.07985838502645493, - "kl": 0.00016999244689941406, - "learning_rate": 7.611940298507462e-08, - "loss": 0.0064, - "reward": 0.3264509066939354, - "reward_std": 0.06861974066123366, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 0.14331762492656708, + "kl": 0.006862640380859375, + "learning_rate": 3.805970149253731e-07, + "loss": 0.0403, + "reward": 0.407924123108387, + "reward_std": 0.1356019787490368, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.314174123108387, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 2030.6897888183594, + "completion_length": 1987.290283203125, "epoch": 0.07646927040549623, - "grad_norm": 0.05969567969441414, - "kl": 0.00017142295837402344, - "learning_rate": 7.64179104477612e-08, - "loss": 0.0083, - "reward": 0.2963169813156128, - "reward_std": 0.019323222106322646, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 0.1288195699453354, + "kl": 0.006378173828125, + "learning_rate": 3.82089552238806e-07, + "loss": 0.0266, + "reward": 0.3526785895228386, + "reward_std": 0.11634229216724634, + "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.2968750074505806, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 2009.6786193847656, + "completion_length": 1938.4063415527344, "epoch": 0.0767679784930177, - "grad_norm": 0.07523754239082336, - "kl": 0.00017714500427246094, - "learning_rate": 7.671641791044776e-08, - "loss": 0.0127, - "reward": 0.3130580559372902, - "reward_std": 0.0608008645940572, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 0.13025054335594177, + "kl": 0.00708770751953125, + "learning_rate": 3.835820895522388e-07, + "loss": 0.0269, + "reward": 0.3811384066939354, + "reward_std": 0.1112741008400917, + "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.305245541036129, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 2032.7657165527344, + "completion_length": 1939.4778137207031, "epoch": 0.07706668658053917, - "grad_norm": 0.06983206421136856, - "kl": 0.0001742839813232422, - "learning_rate": 7.701492537313432e-08, - "loss": 0.0142, - "reward": 0.4012276902794838, - "reward_std": 0.019254656974226236, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 0.1458735316991806, + "kl": 0.0081024169921875, + "learning_rate": 3.850746268656716e-07, + "loss": 0.0466, + "reward": 0.4693080559372902, + "reward_std": 0.10791886784136295, + "rewards/accuracy_reward": 0.1540178640279919, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258370541036129, + "rewards/tag_count_reward": 0.3152901828289032, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 2030.0558166503906, + "completion_length": 1945.0000915527344, "epoch": 0.07736539466806064, - "grad_norm": 0.07675250619649887, - "kl": 0.00019025802612304688, - "learning_rate": 7.73134328358209e-08, - "loss": 0.0118, - "reward": 0.3018973395228386, - "reward_std": 0.04445280064828694, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 0.1510995477437973, + "kl": 0.0080413818359375, + "learning_rate": 3.8656716417910446e-07, + "loss": 0.0499, + "reward": 0.388392873108387, + "reward_std": 0.12511307187378407, + "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.259486623108387, + "rewards/tag_count_reward": 0.305803582072258, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 2047.5848999023438, + "completion_length": 2011.9040832519531, "epoch": 0.07766410275558211, - "grad_norm": 0.0632825717329979, - "kl": 0.0001671314239501953, - "learning_rate": 7.761194029850746e-08, - "loss": 0.0005, - "reward": 0.255022332072258, - "reward_std": 0.015577482059597969, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.13501103222370148, + "kl": 0.006011962890625, + "learning_rate": 3.880597014925373e-07, + "loss": 0.03, + "reward": 0.3091518059372902, + "reward_std": 0.12465482205152512, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255022332072258, + "rewards/tag_count_reward": 0.2935267984867096, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 2020.1853637695312, + "completion_length": 1950.7210693359375, "epoch": 0.07796281084310358, - "grad_norm": 0.07898829132318497, - "kl": 0.00017404556274414062, - "learning_rate": 7.791044776119403e-08, - "loss": 0.0191, - "reward": 0.2857142984867096, - "reward_std": 0.04993052873760462, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 0.13436302542686462, + "kl": 0.00799560546875, + "learning_rate": 3.895522388059702e-07, + "loss": 0.0391, + "reward": 0.3325892984867096, + "reward_std": 0.09527294337749481, + "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.3147321566939354, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 2033.3683776855469, + "completion_length": 1989.2880249023438, "epoch": 0.07826151893062505, - "grad_norm": 0.08013086766004562, - "kl": 0.00017571449279785156, - "learning_rate": 7.820895522388059e-08, - "loss": 0.0078, - "reward": 0.322544664144516, - "reward_std": 0.04490569024346769, - "rewards/accuracy_reward": 0.06250000116415322, + "grad_norm": 0.1438877284526825, + "kl": 0.00701141357421875, + "learning_rate": 3.9104477611940295e-07, + "loss": 0.0371, + "reward": 0.366629496216774, + "reward_std": 0.11085704527795315, + "rewards/accuracy_reward": 0.0736607147846371, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2929687649011612, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 2038.9174499511719, + "completion_length": 1978.3773498535156, "epoch": 0.07856022701814652, - "grad_norm": 0.06517571955919266, - "kl": 0.00017762184143066406, - "learning_rate": 7.850746268656716e-08, - "loss": 0.0047, - "reward": 0.338169664144516, - "reward_std": 0.02882780064828694, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 0.11992578208446503, + "kl": 0.00756072998046875, + "learning_rate": 3.925373134328358e-07, + "loss": 0.027, + "reward": 0.3794642984867096, + "reward_std": 0.0741656506434083, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.2924107313156128, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 2037.65185546875, + "completion_length": 1958.2032165527344, "epoch": 0.07885893510566798, - "grad_norm": 0.06402231007814407, - "kl": 0.00017976760864257812, - "learning_rate": 7.880597014925372e-08, - "loss": 0.0055, - "reward": 0.3113839402794838, - "reward_std": 0.06150377355515957, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 0.1292649656534195, + "kl": 0.00836944580078125, + "learning_rate": 3.9402985074626866e-07, + "loss": 0.0389, + "reward": 0.3777901902794838, + "reward_std": 0.1510040182620287, + "rewards/accuracy_reward": 0.07366071571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.3041294813156128, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 2016.54248046875, + "completion_length": 1912.3460693359375, "epoch": 0.07915764319318945, - "grad_norm": 0.07130460441112518, - "kl": 0.0001773834228515625, - "learning_rate": 7.910447761194029e-08, - "loss": 0.0157, - "reward": 0.2929687649011612, - "reward_std": 0.031008854741230607, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 0.14024987816810608, + "kl": 0.009674072265625, + "learning_rate": 3.9552238805970144e-07, + "loss": 0.0295, + "reward": 0.3588169813156128, + "reward_std": 0.09611312951892614, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.3164062649011612, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 2038.3170776367188, + "completion_length": 1977.9710998535156, "epoch": 0.07945635128071092, - "grad_norm": 0.06842610239982605, - "kl": 0.00017833709716796875, - "learning_rate": 7.940298507462686e-08, - "loss": 0.0085, - "reward": 0.2639508992433548, - "reward_std": 0.03004591865465045, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.1375216543674469, + "kl": 0.00830078125, + "learning_rate": 3.970149253731343e-07, + "loss": 0.0299, + "reward": 0.326450914144516, + "reward_std": 0.11287194304168224, + "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.3063616156578064, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 2040.05810546875, + "completion_length": 1963.9532165527344, "epoch": 0.07975505936823239, - "grad_norm": 0.06028510257601738, - "kl": 0.0001671314239501953, - "learning_rate": 7.970149253731344e-08, - "loss": 0.0018, - "reward": 0.3007812649011612, - "reward_std": 0.02960525150410831, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.14886994659900665, + "kl": 0.0087890625, + "learning_rate": 3.9850746268656715e-07, + "loss": 0.0378, + "reward": 0.3604910895228386, + "reward_std": 0.12182177230715752, + "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.3113839402794838, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 2033.4375610351562, + "completion_length": 1931.149658203125, "epoch": 0.08005376745575386, - "grad_norm": 0.06567355990409851, - "kl": 0.00017786026000976562, - "learning_rate": 8e-08, - "loss": 0.0084, - "reward": 0.321428582072258, - "reward_std": 0.037495787255465984, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 0.14939945936203003, + "kl": 0.009857177734375, + "learning_rate": 4e-07, + "loss": 0.0574, + "reward": 0.389508955180645, + "reward_std": 0.12413572147488594, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2633928656578064, + "rewards/tag_count_reward": 0.318080373108387, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 2030.3550109863281, + "completion_length": 1889.2657165527344, "epoch": 0.08035247554327533, - "grad_norm": 0.08356145024299622, - "kl": 0.0001742839813232422, - "learning_rate": 8.029850746268655e-08, - "loss": 0.0127, - "reward": 0.349888414144516, - "reward_std": 0.043405378703027964, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 0.15972210466861725, + "kl": 0.0113372802734375, + "learning_rate": 4.014925373134328e-07, + "loss": 0.0609, + "reward": 0.4397321566939354, + "reward_std": 0.12599130906164646, + "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.3281250149011612, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 2005.1652221679688, + "completion_length": 1956.0313110351562, "epoch": 0.0806511836307968, - "grad_norm": 0.06689172983169556, - "kl": 0.00017786026000976562, - "learning_rate": 8.059701492537313e-08, - "loss": 0.0123, - "reward": 0.3984375223517418, - "reward_std": 0.05883302283473313, - "rewards/accuracy_reward": 0.129464291036129, + "grad_norm": 0.14033837616443634, + "kl": 0.00965118408203125, + "learning_rate": 4.0298507462686564e-07, + "loss": 0.0267, + "reward": 0.4776785969734192, + "reward_std": 0.15621951408684254, + "rewards/accuracy_reward": 0.17410714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.3035714402794838, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 2025.0469360351562, + "completion_length": 1930.5447082519531, "epoch": 0.08094989171831828, - "grad_norm": 0.07057687640190125, - "kl": 0.00016927719116210938, - "learning_rate": 8.08955223880597e-08, - "loss": 0.0148, - "reward": 0.321428582072258, - "reward_std": 0.0509061892516911, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 0.16332978010177612, + "kl": 0.010589599609375, + "learning_rate": 4.044776119402985e-07, + "loss": 0.0448, + "reward": 0.3995535895228386, + "reward_std": 0.15651614032685757, + "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250074505806, + "rewards/tag_count_reward": 0.3258928656578064, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 2022.2054443359375, + "completion_length": 1905.2076416015625, "epoch": 0.08124859980583975, - "grad_norm": 0.08516235649585724, - "kl": 0.00016117095947265625, - "learning_rate": 8.119402985074627e-08, - "loss": 0.013, - "reward": 0.3002232313156128, - "reward_std": 0.029787883860990405, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 0.1436133086681366, + "kl": 0.0112457275390625, + "learning_rate": 4.059701492537313e-07, + "loss": 0.0511, + "reward": 0.3710937649011612, + "reward_std": 0.10712305642664433, + "rewards/accuracy_reward": 0.05580357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622768059372902, + "rewards/tag_count_reward": 0.3152901902794838, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 2043.2054443359375, + "completion_length": 1935.8907165527344, "epoch": 0.08154730789336122, - "grad_norm": 0.05657872185111046, - "kl": 0.00017571449279785156, - "learning_rate": 8.149253731343283e-08, - "loss": 0.0037, - "reward": 0.271205373108387, - "reward_std": 0.032184453681111336, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.15039677917957306, + "kl": 0.01129150390625, + "learning_rate": 4.0746268656716413e-07, + "loss": 0.0554, + "reward": 0.3549107313156128, + "reward_std": 0.11849102191627026, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.3214285895228386, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 2036.40185546875, + "completion_length": 1928.9375610351562, "epoch": 0.08184601598088269, - "grad_norm": 0.07261265069246292, - "kl": 0.00017714500427246094, - "learning_rate": 8.17910447761194e-08, - "loss": 0.0129, - "reward": 0.4475446566939354, - "reward_std": 0.037911696126684546, - "rewards/accuracy_reward": 0.18750000838190317, + "grad_norm": 0.17037099599838257, + "kl": 0.0119171142578125, + "learning_rate": 4.08955223880597e-07, + "loss": 0.061, + "reward": 0.541294664144516, + "reward_std": 0.1505908127874136, + "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.333705373108387, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 1998.4888916015625, + "completion_length": 1842.02685546875, "epoch": 0.08214472406840415, - "grad_norm": 0.09104951471090317, - "kl": 0.00019168853759765625, - "learning_rate": 8.208955223880598e-08, - "loss": 0.0098, - "reward": 0.337611623108387, - "reward_std": 0.037211291724815965, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 0.1686246395111084, + "kl": 0.0147247314453125, + "learning_rate": 4.1044776119402984e-07, + "loss": 0.0457, + "reward": 0.4207589477300644, + "reward_std": 0.1368700247257948, + "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.3359375074505806, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 2036.4554443359375, + "completion_length": 1908.7768859863281, "epoch": 0.08244343215592562, - "grad_norm": 0.055579010397195816, - "kl": 0.0001800060272216797, - "learning_rate": 8.238805970149253e-08, - "loss": 0.0101, - "reward": 0.2946428656578064, - "reward_std": 0.016362733440473676, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.15488971769809723, + "kl": 0.013427734375, + "learning_rate": 4.1194029850746267e-07, + "loss": 0.0419, + "reward": 0.4185267984867096, + "reward_std": 0.1520002130419016, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.344866082072258, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 2024.263427734375, + "completion_length": 1918.1652526855469, "epoch": 0.08274214024344709, - "grad_norm": 0.07032553851604462, - "kl": 0.0001621246337890625, - "learning_rate": 8.26865671641791e-08, - "loss": 0.0091, - "reward": 0.3074776977300644, - "reward_std": 0.04421520931646228, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 0.1761590540409088, + "kl": 0.012664794921875, + "learning_rate": 4.134328358208955e-07, + "loss": 0.0539, + "reward": 0.407366082072258, + "reward_std": 0.14757522195577621, + "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.3314732313156128, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 2023.3393859863281, + "completion_length": 1892.0916137695312, "epoch": 0.08304084833096856, - "grad_norm": 0.08701573312282562, - "kl": 0.0001761913299560547, - "learning_rate": 8.298507462686567e-08, - "loss": 0.0164, - "reward": 0.2728794738650322, - "reward_std": 0.05483417911455035, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.17069895565509796, + "kl": 0.0139617919921875, + "learning_rate": 4.1492537313432833e-07, + "loss": 0.0582, + "reward": 0.3627232313156128, + "reward_std": 0.13992940448224545, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.3337053656578064, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 1975.8415832519531, + "completion_length": 1831.7121276855469, "epoch": 0.08333955641849003, - "grad_norm": 0.09772663563489914, - "kl": 0.00018930435180664062, - "learning_rate": 8.328358208955224e-08, - "loss": 0.0136, - "reward": 0.3164062649011612, - "reward_std": 0.039832796435803175, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.17750589549541473, + "kl": 0.0163726806640625, + "learning_rate": 4.164179104477612e-07, + "loss": 0.0518, + "reward": 0.4045759066939354, + "reward_std": 0.11226295493543148, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276828289032, + "rewards/tag_count_reward": 0.3443080484867096, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 2044.8661499023438, + "completion_length": 1964.7657165527344, "epoch": 0.0836382645060115, - "grad_norm": 0.042674124240875244, - "kl": 0.00017070770263671875, - "learning_rate": 8.35820895522388e-08, - "loss": 0.0042, - "reward": 0.2935267984867096, - "reward_std": 0.02107980544678867, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.15312790870666504, + "kl": 0.0133056640625, + "learning_rate": 4.17910447761194e-07, + "loss": 0.0483, + "reward": 0.368303582072258, + "reward_std": 0.11700172908604145, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2533482238650322, + "rewards/tag_count_reward": 0.3169642984867096, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 2029.9375305175781, + "completion_length": 1957.99560546875, "epoch": 0.08393697259353297, - "grad_norm": 0.04566473513841629, - "kl": 0.00016498565673828125, - "learning_rate": 8.388059701492537e-08, - "loss": 0.01, - "reward": 0.298549123108387, - "reward_std": 0.02875154372304678, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 0.16639547049999237, + "kl": 0.0134735107421875, + "learning_rate": 4.1940298507462687e-07, + "loss": 0.045, + "reward": 0.3649553805589676, + "reward_std": 0.11487031169235706, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561383992433548, + "rewards/tag_count_reward": 0.3225446566939354, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 2045.69873046875, + "completion_length": 1979.2255249023438, "epoch": 0.08423568068105444, - "grad_norm": 0.07262533158063889, - "kl": 0.00016307830810546875, - "learning_rate": 8.417910447761194e-08, - "loss": 0.0016, - "reward": 0.3660714402794838, - "reward_std": 0.026321796234697104, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 0.13930590450763702, + "kl": 0.0135345458984375, + "learning_rate": 4.208955223880597e-07, + "loss": 0.039, + "reward": 0.4335937723517418, + "reward_std": 0.1112994160503149, + "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964402794838, + "rewards/tag_count_reward": 0.3152901902794838, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 2036.5848693847656, + "completion_length": 1907.8884582519531, "epoch": 0.08453438876857591, - "grad_norm": 0.0558808296918869, - "kl": 0.00017452239990234375, - "learning_rate": 8.44776119402985e-08, - "loss": 0.0101, - "reward": 0.294084832072258, - "reward_std": 0.013885503867641091, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.15998511016368866, + "kl": 0.0160064697265625, + "learning_rate": 4.223880597014925e-07, + "loss": 0.0495, + "reward": 0.380580373108387, + "reward_std": 0.11969679035246372, + "rewards/accuracy_reward": 0.05580357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.3247768059372902, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 2038.8259887695312, + "completion_length": 1902.8326721191406, "epoch": 0.08483309685609738, - "grad_norm": 0.08060611039400101, - "kl": 0.0001804828643798828, - "learning_rate": 8.477611940298507e-08, - "loss": 0.0086, - "reward": 0.3443080559372902, - "reward_std": 0.06067852070555091, - "rewards/accuracy_reward": 0.08258928940631449, + "grad_norm": 0.16308756172657013, + "kl": 0.017120361328125, + "learning_rate": 4.2388059701492536e-07, + "loss": 0.0597, + "reward": 0.4787946715950966, + "reward_std": 0.17327484115958214, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.3448660895228386, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 2039.7166137695312, + "completion_length": 1934.0134887695312, "epoch": 0.08513180494361886, - "grad_norm": 0.06980064511299133, - "kl": 0.00018548965454101562, - "learning_rate": 8.507462686567163e-08, - "loss": 0.0089, - "reward": 0.2974330559372902, - "reward_std": 0.033961516339331865, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 0.15444514155387878, + "kl": 0.015869140625, + "learning_rate": 4.253731343283582e-07, + "loss": 0.0489, + "reward": 0.3677455559372902, + "reward_std": 0.11584950052201748, + "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.3186383992433548, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 2041.3817138671875, + "completion_length": 1907.7813110351562, "epoch": 0.08543051303114031, - "grad_norm": 0.06692071259021759, - "kl": 0.00017833709716796875, - "learning_rate": 8.537313432835821e-08, - "loss": 0.0045, - "reward": 0.3761160895228386, - "reward_std": 0.03253185795620084, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 0.165074422955513, + "kl": 0.017547607421875, + "learning_rate": 4.2686567164179107e-07, + "loss": 0.0528, + "reward": 0.4810268133878708, + "reward_std": 0.12199302576482296, + "rewards/accuracy_reward": 0.14062500977888703, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.3404018059372902, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 2046.6361694335938, + "completion_length": 1947.7277526855469, "epoch": 0.08572922111866178, - "grad_norm": 0.06754952669143677, - "kl": 0.00016927719116210938, - "learning_rate": 8.567164179104477e-08, - "loss": 0.0025, - "reward": 0.3632812649011612, - "reward_std": 0.023138975026085973, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 0.16399633884429932, + "kl": 0.01678466796875, + "learning_rate": 4.2835820895522385e-07, + "loss": 0.0547, + "reward": 0.4386160895228386, + "reward_std": 0.11736038699746132, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.3270089477300644, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 2043.1563415527344, + "completion_length": 1851.12060546875, "epoch": 0.08602792920618325, - "grad_norm": 0.08280470222234726, - "kl": 0.00018787384033203125, - "learning_rate": 8.597014925373133e-08, - "loss": 0.0053, - "reward": 0.2617187649011612, - "reward_std": 0.0324064576998353, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.16809551417827606, + "kl": 0.020721435546875, + "learning_rate": 4.298507462686567e-07, + "loss": 0.0541, + "reward": 0.368303582072258, + "reward_std": 0.1017408948391676, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.3616071566939354, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 2045.3169860839844, + "completion_length": 1920.4241943359375, "epoch": 0.08632663729370472, - "grad_norm": 0.06793849170207977, - "kl": 0.0001647472381591797, - "learning_rate": 8.626865671641791e-08, - "loss": 0.0015, - "reward": 0.3487723395228386, - "reward_std": 0.030627010855823755, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 0.152552992105484, + "kl": 0.018463134765625, + "learning_rate": 4.3134328358208956e-07, + "loss": 0.0512, + "reward": 0.443080373108387, + "reward_std": 0.12782950140535831, + "rewards/accuracy_reward": 0.11383929569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2550223395228386, + "rewards/tag_count_reward": 0.329241082072258, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 2021.2835693359375, + "completion_length": 1861.1161499023438, "epoch": 0.0866253453812262, - "grad_norm": 0.0964171439409256, - "kl": 0.00018310546875, - "learning_rate": 8.656716417910446e-08, - "loss": 0.0212, - "reward": 0.3465401828289032, - "reward_std": 0.06944564543664455, - "rewards/accuracy_reward": 0.08482142887078226, + "grad_norm": 0.1719377040863037, + "kl": 0.02117919921875, + "learning_rate": 4.3283582089552234e-07, + "loss": 0.0622, + "reward": 0.4765625223517418, + "reward_std": 0.16266490519046783, + "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187574505806, + "rewards/tag_count_reward": 0.349330373108387, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 2016.0335998535156, + "completion_length": 1811.7366943359375, "epoch": 0.08692405346874767, - "grad_norm": 0.07801539450883865, - "kl": 0.00019502639770507812, - "learning_rate": 8.686567164179104e-08, - "loss": 0.0168, - "reward": 0.3705357313156128, - "reward_std": 0.051721637370064855, - "rewards/accuracy_reward": 0.10491071874275804, + "grad_norm": 0.1808566302061081, + "kl": 0.022918701171875, + "learning_rate": 4.343283582089552e-07, + "loss": 0.0661, + "reward": 0.494419664144516, + "reward_std": 0.14984366297721863, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.3515625149011612, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 2015.5156860351562, + "completion_length": 1776.18310546875, "epoch": 0.08722276155626914, - "grad_norm": 0.10252294689416885, - "kl": 0.00018167495727539062, - "learning_rate": 8.716417910447761e-08, - "loss": 0.0221, - "reward": 0.2890625149011612, - "reward_std": 0.08257917687296867, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 0.17152146995067596, + "kl": 0.024078369140625, + "learning_rate": 4.3582089552238805e-07, + "loss": 0.0508, + "reward": 0.4017857313156128, + "reward_std": 0.13406887464225292, + "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2756696566939354, + "rewards/tag_count_reward": 0.368303582072258, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 2029.3861999511719, + "completion_length": 1864.5916137695312, "epoch": 0.08752146964379061, - "grad_norm": 0.06602814048528671, - "kl": 0.00018310546875, - "learning_rate": 8.746268656716418e-08, - "loss": 0.0105, - "reward": 0.419642873108387, - "reward_std": 0.04341266420669854, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 0.18318171799182892, + "kl": 0.023651123046875, + "learning_rate": 4.373134328358209e-07, + "loss": 0.0787, + "reward": 0.5340402126312256, + "reward_std": 0.13480005599558353, + "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.3621651902794838, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 2039.5067443847656, + "completion_length": 1870.7366943359375, "epoch": 0.08782017773131208, - "grad_norm": 0.058905210345983505, - "kl": 0.00017762184143066406, - "learning_rate": 8.776119402985074e-08, - "loss": 0.0048, - "reward": 0.2723214477300644, - "reward_std": 0.03193584317341447, - "rewards/accuracy_reward": 0.0133928582072258, + "grad_norm": 0.17291474342346191, + "kl": 0.02362060546875, + "learning_rate": 4.388059701492537e-07, + "loss": 0.0645, + "reward": 0.3699776977300644, + "reward_std": 0.14799103513360023, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2589285895228386, + "rewards/tag_count_reward": 0.349888414144516, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 2047.5044860839844, + "completion_length": 1889.7567749023438, "epoch": 0.08811888581883355, - "grad_norm": 0.06724610924720764, - "kl": 0.0001621246337890625, - "learning_rate": 8.805970149253731e-08, - "loss": 0.0004, - "reward": 0.2963169813156128, - "reward_std": 0.03058863733895123, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 0.16443517804145813, + "kl": 0.02313232421875, + "learning_rate": 4.4029850746268654e-07, + "loss": 0.0547, + "reward": 0.4090401902794838, + "reward_std": 0.1376936323940754, + "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.256138414144516, + "rewards/tag_count_reward": 0.353236623108387, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 2003.524658203125, + "completion_length": 1825.0670776367188, "epoch": 0.08841759390635502, - "grad_norm": 0.08900944143533707, - "kl": 0.00018167495727539062, - "learning_rate": 8.835820895522387e-08, - "loss": 0.0182, - "reward": 0.2823660895228386, - "reward_std": 0.05768031952902675, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 0.1642121970653534, + "kl": 0.025177001953125, + "learning_rate": 4.417910447761194e-07, + "loss": 0.0426, + "reward": 0.3928571566939354, + "reward_std": 0.14122464135289192, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.3571428656578064, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 2038.4353637695312, + "completion_length": 1829.2099304199219, "epoch": 0.08871630199387648, - "grad_norm": 0.06259770691394806, - "kl": 0.00017595291137695312, - "learning_rate": 8.865671641791045e-08, - "loss": 0.0115, - "reward": 0.3270089402794838, - "reward_std": 0.01402034517377615, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 0.17570245265960693, + "kl": 0.027008056640625, + "learning_rate": 4.4328358208955225e-07, + "loss": 0.0623, + "reward": 0.4849330484867096, + "reward_std": 0.1573524996638298, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.368861623108387, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 2018.1518859863281, + "completion_length": 1836.1630554199219, "epoch": 0.08901501008139795, - "grad_norm": 0.07403476536273956, - "kl": 0.00019788742065429688, - "learning_rate": 8.8955223880597e-08, - "loss": 0.0113, - "reward": 0.301897332072258, - "reward_std": 0.05154251237399876, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.18993113934993744, + "kl": 0.0281982421875, + "learning_rate": 4.44776119402985e-07, + "loss": 0.0595, + "reward": 0.4090401902794838, + "reward_std": 0.16793609783053398, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.3643973395228386, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 2030.7277526855469, + "completion_length": 1814.8460693359375, "epoch": 0.08931371816891942, - "grad_norm": 0.07169602066278458, - "kl": 0.00018787384033203125, - "learning_rate": 8.925373134328358e-08, - "loss": 0.0112, - "reward": 0.2996651902794838, - "reward_std": 0.022979625733569264, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.20305633544921875, + "kl": 0.0294189453125, + "learning_rate": 4.462686567164179e-07, + "loss": 0.061, + "reward": 0.428571455180645, + "reward_std": 0.12973556481301785, + "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.377232164144516, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 1985.841552734375, + "completion_length": 1752.0068054199219, "epoch": 0.08961242625644089, - "grad_norm": 0.09746788442134857, - "kl": 0.0002048015594482422, - "learning_rate": 8.955223880597015e-08, - "loss": 0.0201, - "reward": 0.451450914144516, - "reward_std": 0.0374235340859741, - "rewards/accuracy_reward": 0.17857143026776612, + "grad_norm": 0.19890770316123962, + "kl": 0.03204345703125, + "learning_rate": 4.4776119402985074e-07, + "loss": 0.0752, + "reward": 0.564174123108387, + "reward_std": 0.13523666746914387, + "rewards/accuracy_reward": 0.19642857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794813156128, + "rewards/tag_count_reward": 0.3677455559372902, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 2040.8236999511719, + "completion_length": 1866.7545471191406, "epoch": 0.08991113434396236, - "grad_norm": 0.06310214102268219, - "kl": 0.00017881393432617188, - "learning_rate": 8.98507462686567e-08, - "loss": 0.0052, - "reward": 0.2606026902794838, - "reward_std": 0.02447376074269414, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.1725112944841385, + "kl": 0.029510498046875, + "learning_rate": 4.492537313432835e-07, + "loss": 0.0422, + "reward": 0.400669664144516, + "reward_std": 0.12570929899811745, + "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.3515625149011612, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 2032.5045166015625, + "completion_length": 1807.4264221191406, "epoch": 0.09020984243148383, - "grad_norm": 0.07202090322971344, - "kl": 0.00018095970153808594, - "learning_rate": 9.014925373134328e-08, - "loss": 0.0115, - "reward": 0.3080357238650322, - "reward_std": 0.041298309341073036, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 0.18205706775188446, + "kl": 0.032073974609375, + "learning_rate": 4.507462686567164e-07, + "loss": 0.0644, + "reward": 0.434709832072258, + "reward_std": 0.15390844270586967, + "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.3632812649011612, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 2022.0357666015625, + "completion_length": 1794.4107971191406, "epoch": 0.0905085505190053, - "grad_norm": 0.07451662421226501, - "kl": 0.00019216537475585938, - "learning_rate": 9.044776119402985e-08, - "loss": 0.0119, - "reward": 0.2896205484867096, - "reward_std": 0.03663508058525622, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 0.18576619029045105, + "kl": 0.03338623046875, + "learning_rate": 4.522388059701492e-07, + "loss": 0.0694, + "reward": 0.4185268133878708, + "reward_std": 0.14628276601433754, + "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.369419664144516, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 2043.9219055175781, + "completion_length": 1854.3973999023438, "epoch": 0.09080725860652678, - "grad_norm": 0.08032865077257156, - "kl": 0.000164031982421875, - "learning_rate": 9.074626865671641e-08, - "loss": 0.0035, - "reward": 0.2645089477300644, - "reward_std": 0.04309586458839476, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 0.19164440035820007, + "kl": 0.032928466796875, + "learning_rate": 4.537313432835821e-07, + "loss": 0.0607, + "reward": 0.3945312649011612, + "reward_std": 0.13554842211306095, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.260044664144516, + "rewards/tag_count_reward": 0.3722098395228386, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 1963.9620971679688, + "completion_length": 1732.3996276855469, "epoch": 0.09110596669404825, - "grad_norm": 0.10249388962984085, - "kl": 0.0002167224884033203, - "learning_rate": 9.104477611940298e-08, - "loss": 0.0366, - "reward": 0.2801339402794838, - "reward_std": 0.03935734392143786, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.1819790154695511, + "kl": 0.03582763671875, + "learning_rate": 4.552238805970149e-07, + "loss": 0.0548, + "reward": 0.4017857387661934, + "reward_std": 0.14014336094260216, + "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2779017984867096, + "rewards/tag_count_reward": 0.377232164144516, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 2015.5156860351562, + "completion_length": 1821.6786804199219, "epoch": 0.09140467478156972, - "grad_norm": 0.0784238800406456, - "kl": 0.00019931793212890625, - "learning_rate": 9.134328358208955e-08, - "loss": 0.0143, - "reward": 0.3521205484867096, - "reward_std": 0.0655469261109829, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 0.18072210252285004, + "kl": 0.0360107421875, + "learning_rate": 4.567164179104477e-07, + "loss": 0.0614, + "reward": 0.4776785969734192, + "reward_std": 0.17274628207087517, + "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312574505806, + "rewards/tag_count_reward": 0.3727678805589676, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 2026.372802734375, + "completion_length": 1810.9978332519531, "epoch": 0.09170338286909117, - "grad_norm": 0.06298230588436127, - "kl": 0.00018835067749023438, - "learning_rate": 9.164179104477612e-08, - "loss": 0.0165, - "reward": 0.3666294813156128, - "reward_std": 0.058321744203567505, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 0.2081269770860672, + "kl": 0.03955078125, + "learning_rate": 4.582089552238806e-07, + "loss": 0.0743, + "reward": 0.5055803805589676, + "reward_std": 0.17858647927641869, + "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187574505806, + "rewards/tag_count_reward": 0.3805803805589676, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 2044.7879638671875, + "completion_length": 1865.5469360351562, "epoch": 0.09200209095661264, - "grad_norm": 0.05014129355549812, - "kl": 0.00017070770263671875, - "learning_rate": 9.194029850746268e-08, - "loss": 0.002, - "reward": 0.255580373108387, - "reward_std": 0.01575813489034772, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.18621991574764252, + "kl": 0.0396728515625, + "learning_rate": 4.5970149253731337e-07, + "loss": 0.0589, + "reward": 0.4146205484867096, + "reward_std": 0.173295671120286, + "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2533482313156128, + "rewards/tag_count_reward": 0.3722098395228386, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 2009.7991638183594, + "completion_length": 1689.26123046875, "epoch": 0.09230079904413412, - "grad_norm": 0.0935504212975502, - "kl": 0.00020003318786621094, - "learning_rate": 9.223880597014924e-08, - "loss": 0.0248, - "reward": 0.301897332072258, - "reward_std": 0.08109899330884218, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 0.21320658922195435, + "kl": 0.04302978515625, + "learning_rate": 4.6119402985074625e-07, + "loss": 0.0762, + "reward": 0.4832589402794838, + "reward_std": 0.1881660707294941, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.270647332072258, + "rewards/tag_count_reward": 0.4073660969734192, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 1984.3014221191406, + "completion_length": 1722.2500610351562, "epoch": 0.09259950713165559, - "grad_norm": 0.11777330935001373, - "kl": 0.0002219676971435547, - "learning_rate": 9.253731343283582e-08, - "loss": 0.0416, - "reward": 0.3878348469734192, - "reward_std": 0.049569939263165, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 0.18839113414287567, + "kl": 0.04547119140625, + "learning_rate": 4.626865671641791e-07, + "loss": 0.0672, + "reward": 0.4888393059372902, + "reward_std": 0.10311116091907024, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.3772321566939354, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 2003.8326721191406, + "completion_length": 1770.6295471191406, "epoch": 0.09289821521917706, - "grad_norm": 0.08007004112005234, - "kl": 0.00019478797912597656, - "learning_rate": 9.283582089552239e-08, - "loss": 0.0191, - "reward": 0.294084832072258, - "reward_std": 0.05850633792579174, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 0.19653482735157013, + "kl": 0.0462646484375, + "learning_rate": 4.641791044776119e-07, + "loss": 0.0656, + "reward": 0.3934151977300644, + "reward_std": 0.11920988373458385, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.3710937649011612, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 2003.6965026855469, + "completion_length": 1748.5647888183594, "epoch": 0.09319692330669853, - "grad_norm": 0.08561088889837265, - "kl": 0.00020742416381835938, - "learning_rate": 9.313432835820894e-08, - "loss": 0.0192, - "reward": 0.3521205484867096, - "reward_std": 0.0483149653300643, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 0.17870937287807465, + "kl": 0.0498046875, + "learning_rate": 4.6567164179104474e-07, + "loss": 0.05, + "reward": 0.5050223395228386, + "reward_std": 0.1616399846971035, + "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.273995541036129, + "rewards/tag_count_reward": 0.3800223395228386, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 2000.2389221191406, + "completion_length": 1728.7679138183594, "epoch": 0.09349563139422, - "grad_norm": 0.08387095481157303, - "kl": 0.00019884109497070312, - "learning_rate": 9.343283582089552e-08, - "loss": 0.0146, - "reward": 0.3091517984867096, - "reward_std": 0.03914292505942285, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 0.22175025939941406, + "kl": 0.0509033203125, + "learning_rate": 4.6716417910447757e-07, + "loss": 0.0752, + "reward": 0.454799123108387, + "reward_std": 0.1308348085731268, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.3945312723517418, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 2041.0447387695312, + "completion_length": 1712.8594970703125, "epoch": 0.09379433948174147, - "grad_norm": 0.07228925079107285, - "kl": 0.00017309188842773438, - "learning_rate": 9.373134328358209e-08, - "loss": 0.0072, - "reward": 0.2946428656578064, - "reward_std": 0.023437693249434233, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.277186781167984, + "kl": 0.050048828125, + "learning_rate": 4.6865671641791045e-07, + "loss": 0.109, + "reward": 0.4760044813156128, + "reward_std": 0.14633923582732677, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.4023437723517418, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 2030.4129943847656, + "completion_length": 1684.7322387695312, "epoch": 0.09409304756926294, - "grad_norm": 0.09286735951900482, - "kl": 0.00018525123596191406, - "learning_rate": 9.402985074626865e-08, - "loss": 0.0119, - "reward": 0.2751116156578064, - "reward_std": 0.06346299429424107, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 0.20811457931995392, + "kl": 0.052490234375, + "learning_rate": 4.701492537313433e-07, + "loss": 0.0649, + "reward": 0.4810268059372902, + "reward_std": 0.1761021763086319, + "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.4274553805589676, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 2039.9129943847656, + "completion_length": 1752.2456359863281, "epoch": 0.09439175565678441, - "grad_norm": 0.07319129258394241, - "kl": 0.0001919269561767578, - "learning_rate": 9.432835820895522e-08, - "loss": 0.0059, - "reward": 0.3649553656578064, - "reward_std": 0.02126630791462958, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 0.21474464237689972, + "kl": 0.05609130859375, + "learning_rate": 4.7164179104477606e-07, + "loss": 0.0747, + "reward": 0.5669642984867096, + "reward_std": 0.18811174109578133, + "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.4151785969734192, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 2002.5692749023438, + "completion_length": 1734.7322387695312, "epoch": 0.09469046374430588, - "grad_norm": 0.07555918395519257, - "kl": 0.0002181529998779297, - "learning_rate": 9.462686567164178e-08, - "loss": 0.0175, - "reward": 0.3264509066939354, - "reward_std": 0.04147233534604311, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 0.18936482071876526, + "kl": 0.0614013671875, + "learning_rate": 4.7313432835820894e-07, + "loss": 0.0559, + "reward": 0.4547991305589676, + "reward_std": 0.12690494582057, + "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "rewards/tag_count_reward": 0.3833705484867096, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 2038.7857666015625, + "completion_length": 1835.7098693847656, "epoch": 0.09498917183182734, - "grad_norm": 0.044198375195264816, - "kl": 0.0001919269561767578, - "learning_rate": 9.492537313432836e-08, - "loss": 0.0056, - "reward": 0.266741082072258, - "reward_std": 0.025516751455143094, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.2056136578321457, + "kl": 0.0660400390625, + "learning_rate": 4.7462686567164177e-07, + "loss": 0.0638, + "reward": 0.396205373108387, + "reward_std": 0.14485053718090057, + "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125149011612, + "rewards/tag_count_reward": 0.3738839477300644, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 2021.8616943359375, + "completion_length": 1714.7009887695312, "epoch": 0.09528787991934881, - "grad_norm": 0.07965958118438721, - "kl": 0.00019788742065429688, - "learning_rate": 9.522388059701491e-08, - "loss": 0.0113, - "reward": 0.3203125074505806, - "reward_std": 0.04675568360835314, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 0.25511667132377625, + "kl": 0.06439208984375, + "learning_rate": 4.761194029850746e-07, + "loss": 0.0798, + "reward": 0.467075914144516, + "reward_std": 0.15433309227228165, + "rewards/accuracy_reward": 0.060267857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.4068080559372902, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 2019.6406860351562, + "completion_length": 1714.1228332519531, "epoch": 0.09558658800687028, - "grad_norm": 0.08278489857912064, - "kl": 0.00020313262939453125, - "learning_rate": 9.552238805970148e-08, - "loss": 0.011, - "reward": 0.2661830559372902, - "reward_std": 0.030269039096310735, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 0.23663757741451263, + "kl": 0.06732177734375, + "learning_rate": 4.776119402985074e-07, + "loss": 0.0842, + "reward": 0.4179687649011612, + "reward_std": 0.12431371957063675, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.263950914144516, + "rewards/tag_count_reward": 0.4068080559372902, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 2034.6897583007812, + "completion_length": 1771.9777526855469, "epoch": 0.09588529609439175, - "grad_norm": 0.07489446550607681, - "kl": 0.00019288063049316406, - "learning_rate": 9.582089552238806e-08, - "loss": 0.0049, - "reward": 0.3052455484867096, - "reward_std": 0.05098936823196709, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 0.24057133495807648, + "kl": 0.073974609375, + "learning_rate": 4.791044776119403e-07, + "loss": 0.074, + "reward": 0.4581473395228386, + "reward_std": 0.1377399731427431, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.3911830484867096, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 2038.7790832519531, + "completion_length": 1752.3973999023438, "epoch": 0.09618400418191322, - "grad_norm": 0.07859861850738525, - "kl": 0.00017976760864257812, - "learning_rate": 9.611940298507463e-08, - "loss": 0.0039, - "reward": 0.3069196492433548, - "reward_std": 0.04866099264472723, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 0.2211206555366516, + "kl": 0.0755615234375, + "learning_rate": 4.805970149253731e-07, + "loss": 0.0675, + "reward": 0.4603794738650322, + "reward_std": 0.148013174533844, + "rewards/accuracy_reward": 0.06473214761354029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446492433548, + "rewards/tag_count_reward": 0.3956473395228386, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 2038.794677734375, + "completion_length": 1853.9889221191406, "epoch": 0.0964827122694347, - "grad_norm": 0.05863932892680168, - "kl": 0.00018358230590820312, - "learning_rate": 9.641791044776119e-08, - "loss": -0.0101, - "reward": 0.3242187649011612, - "reward_std": 0.018674688646569848, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.2801364064216614, + "kl": 0.0860595703125, + "learning_rate": 4.820895522388059e-07, + "loss": 0.0911, + "reward": 0.4581473469734192, + "reward_std": 0.13438985869288445, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.250558041036129, + "rewards/tag_count_reward": 0.373325914144516, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 2016.6451416015625, + "completion_length": 1773.8929138183594, "epoch": 0.09678142035695617, - "grad_norm": 0.06797696650028229, - "kl": 0.0001995563507080078, - "learning_rate": 9.671641791044776e-08, - "loss": 0.0151, - "reward": 0.2650669738650322, - "reward_std": 0.019131171517074108, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.259675532579422, + "kl": 0.0850830078125, + "learning_rate": 4.835820895522387e-07, + "loss": 0.0754, + "reward": 0.400111623108387, + "reward_std": 0.11311941407620907, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669738650322, + "rewards/tag_count_reward": 0.3911830484867096, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 2029.7701721191406, + "completion_length": 1688.2277526855469, "epoch": 0.09708012844447764, - "grad_norm": 0.0735718384385109, - "kl": 0.0002014636993408203, - "learning_rate": 9.701492537313432e-08, - "loss": 0.0129, - "reward": 0.3002232313156128, - "reward_std": 0.024462405126541853, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.2579168677330017, + "kl": 0.09033203125, + "learning_rate": 4.850746268656717e-07, + "loss": 0.0653, + "reward": 0.5011161044239998, + "reward_std": 0.1576140597462654, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.4229910895228386, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 2037.4219055175781, + "completion_length": 1763.8639221191406, "epoch": 0.09737883653199911, - "grad_norm": 0.0683262050151825, - "kl": 0.0001983642578125, - "learning_rate": 9.731343283582089e-08, - "loss": 0.0071, - "reward": 0.3018973246216774, - "reward_std": 0.03931520436890423, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 0.25518760085105896, + "kl": 0.0960693359375, + "learning_rate": 4.865671641791044e-07, + "loss": 0.0593, + "reward": 0.5189732387661934, + "reward_std": 0.19704899564385414, + "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.4029017984867096, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 2033.4598999023438, + "completion_length": 1743.3192749023438, "epoch": 0.09767754461952058, - "grad_norm": 0.07419373840093613, - "kl": 0.0001900196075439453, - "learning_rate": 9.761194029850746e-08, - "loss": 0.0083, - "reward": 0.3459821566939354, - "reward_std": 0.0563548025675118, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 0.30585822463035583, + "kl": 0.101806640625, + "learning_rate": 4.880597014925372e-07, + "loss": 0.0767, + "reward": 0.5251116380095482, + "reward_std": 0.16676925867795944, + "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607313156128, + "rewards/tag_count_reward": 0.4179687723517418, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 2028.4308776855469, + "completion_length": 1614.1050109863281, "epoch": 0.09797625270704205, - "grad_norm": 0.08804039657115936, - "kl": 0.00019431114196777344, - "learning_rate": 9.791044776119402e-08, - "loss": 0.0155, - "reward": 0.3493303805589676, - "reward_std": 0.052786584943532944, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 0.2727620303630829, + "kl": 0.1014404296875, + "learning_rate": 4.895522388059702e-07, + "loss": 0.0724, + "reward": 0.5585937649011612, + "reward_std": 0.14034350588917732, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.4402901977300644, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 2032.0335693359375, + "completion_length": 1653.1094055175781, "epoch": 0.0982749607945635, - "grad_norm": 0.08725815266370773, - "kl": 0.0002033710479736328, - "learning_rate": 9.82089552238806e-08, - "loss": 0.0089, - "reward": 0.3699776977300644, - "reward_std": 0.07343383063562214, - "rewards/accuracy_reward": 0.09821429010480642, + "grad_norm": 0.2347412258386612, + "kl": 0.10986328125, + "learning_rate": 4.91044776119403e-07, + "loss": 0.0454, + "reward": 0.5641741454601288, + "reward_std": 0.1534999329596758, + "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2717634066939354, + "rewards/tag_count_reward": 0.4302455633878708, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 2012.7009582519531, + "completion_length": 1732.0536193847656, "epoch": 0.09857366888208498, - "grad_norm": 0.08189118653535843, - "kl": 0.0002143383026123047, - "learning_rate": 9.850746268656715e-08, - "loss": 0.0148, - "reward": 0.2840401977300644, - "reward_std": 0.04473430826328695, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 0.2581503391265869, + "kl": 0.1248779296875, + "learning_rate": 4.925373134328357e-07, + "loss": 0.0574, + "reward": 0.4486607313156128, + "reward_std": 0.14246662892401218, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830559372902, + "rewards/tag_count_reward": 0.4218750223517418, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 1993.19873046875, + "completion_length": 1610.3683776855469, "epoch": 0.09887237696960645, - "grad_norm": 0.08750807493925095, - "kl": 0.00023889541625976562, - "learning_rate": 9.880597014925373e-08, - "loss": 0.0122, - "reward": 0.3258928656578064, - "reward_std": 0.051168052945286036, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 0.2832465171813965, + "kl": 0.121337890625, + "learning_rate": 4.940298507462687e-07, + "loss": 0.0659, + "reward": 0.5178571566939354, + "reward_std": 0.12371615506708622, + "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500074505806, + "rewards/tag_count_reward": 0.450892873108387, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 2037.8370971679688, + "completion_length": 1716.2835693359375, "epoch": 0.09917108505712792, - "grad_norm": 0.05927595496177673, - "kl": 0.00018596649169921875, - "learning_rate": 9.91044776119403e-08, - "loss": 0.0054, - "reward": 0.3426339402794838, - "reward_std": 0.041056130547076464, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 0.25233137607574463, + "kl": 0.133056640625, + "learning_rate": 4.955223880597015e-07, + "loss": 0.0446, + "reward": 0.5502232313156128, + "reward_std": 0.12946095131337643, + "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2600446566939354, + "rewards/tag_count_reward": 0.447544664144516, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 2043.6094360351562, + "completion_length": 1556.8638916015625, "epoch": 0.09946979314464939, - "grad_norm": 0.0580446720123291, - "kl": 0.00019073486328125, - "learning_rate": 9.940298507462686e-08, - "loss": 0.0036, - "reward": 0.2656250074505806, - "reward_std": 0.04026610730215907, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.219815194606781, + "kl": 0.1318359375, + "learning_rate": 4.970149253731343e-07, + "loss": 0.0492, + "reward": 0.5156250223517418, + "reward_std": 0.13348649349063635, + "rewards/accuracy_reward": 0.04017857415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.4754464477300644, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 2027.0536499023438, + "completion_length": 1565.5157165527344, "epoch": 0.09976850123217086, - "grad_norm": 0.0873844102025032, - "kl": 0.00019598007202148438, - "learning_rate": 9.970149253731343e-08, - "loss": 0.0172, - "reward": 0.3164062574505806, - "reward_std": 0.06343613378703594, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 0.250499427318573, + "kl": 0.1373291015625, + "learning_rate": 4.985074626865671e-07, + "loss": 0.0394, + "reward": 0.5485491305589676, + "reward_std": 0.13868961296975613, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.4659598469734192, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 2033.450927734375, + "completion_length": 1544.6228332519531, "epoch": 0.10006720931969233, - "grad_norm": 0.05294236168265343, - "kl": 0.00021719932556152344, - "learning_rate": 1e-07, - "loss": 0.0069, - "reward": 0.2929687649011612, - "reward_std": 0.010835815919563174, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.25107425451278687, + "kl": 0.152099609375, + "learning_rate": 5e-07, + "loss": 0.035, + "reward": 0.5558035969734192, + "reward_std": 0.1405537649989128, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.4754464477300644, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 2005.2433166503906, + "completion_length": 1516.6094360351562, "epoch": 0.1003659174072138, - "grad_norm": 0.05616692826151848, - "kl": 0.00023412704467773438, - "learning_rate": 9.999997280245025e-08, - "loss": 0.0059, - "reward": 0.267299123108387, - "reward_std": 0.015723891090601683, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.2001575380563736, + "kl": 0.1507568359375, + "learning_rate": 4.999998640122513e-07, + "loss": 0.0491, + "reward": 0.510602705180645, + "reward_std": 0.10413178242743015, + "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.4815848469734192, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 2025.07373046875, + "completion_length": 1602.8058471679688, "epoch": 0.10066462549473527, - "grad_norm": 0.06285136193037033, - "kl": 0.0002086162567138672, - "learning_rate": 9.999989120983064e-08, - "loss": 0.0124, - "reward": 0.2940848395228386, - "reward_std": 0.011788202216848731, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.21289537847042084, + "kl": 0.162353515625, + "learning_rate": 4.999994560491531e-07, + "loss": 0.037, + "reward": 0.5156250223517418, + "reward_std": 0.0598137229681015, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705559372902, + "rewards/tag_count_reward": 0.4776785969734192, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 1977.07373046875, + "completion_length": 1461.9353332519531, "epoch": 0.10096333358225675, - "grad_norm": 0.08971189707517624, - "kl": 0.00027370452880859375, - "learning_rate": 9.99997552222299e-08, - "loss": 0.021, - "reward": 0.3677455484867096, - "reward_std": 0.05426981672644615, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 0.19642992317676544, + "kl": 0.148681640625, + "learning_rate": 4.999987761111494e-07, + "loss": 0.0374, + "reward": 0.579241082072258, + "reward_std": 0.08721552789211273, + "rewards/accuracy_reward": 0.09375000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276828289032, + "rewards/tag_count_reward": 0.485491082072258, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 2031.6786193847656, + "completion_length": 1540.4866638183594, "epoch": 0.10126204166977822, - "grad_norm": 0.07708151638507843, - "kl": 0.0002033710479736328, - "learning_rate": 9.999956483979597e-08, - "loss": 0.0059, - "reward": 0.2985491305589676, - "reward_std": 0.018627170007675886, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.22309331595897675, + "kl": 0.160400390625, + "learning_rate": 4.999978241989798e-07, + "loss": 0.0356, + "reward": 0.5306919813156128, + "reward_std": 0.07228913344442844, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2628348395228386, + "rewards/tag_count_reward": 0.4838169813156128, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 2042.2009887695312, + "completion_length": 1526.8706359863281, "epoch": 0.10156074975729967, - "grad_norm": 0.09123651683330536, - "kl": 0.00020647048950195312, - "learning_rate": 9.999932006273599e-08, - "loss": 0.0062, - "reward": 0.2773437649011612, - "reward_std": 0.06865235092118382, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 0.2615892291069031, + "kl": 0.162353515625, + "learning_rate": 4.999966003136799e-07, + "loss": 0.0214, + "reward": 0.560825914144516, + "reward_std": 0.1372564658522606, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.4849330559372902, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 2021.4911499023438, + "completion_length": 1486.16748046875, "epoch": 0.10185945784482114, - "grad_norm": 0.07573254406452179, - "kl": 0.00021314620971679688, - "learning_rate": 9.999902089131624e-08, - "loss": 0.0111, - "reward": 0.3437500149011612, - "reward_std": 0.03392154281027615, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 0.2156354933977127, + "kl": 0.158935546875, + "learning_rate": 4.999951044565813e-07, + "loss": 0.0273, + "reward": 0.667410746216774, + "reward_std": 0.10443050414323807, + "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.4933035969734192, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 2002.2210693359375, + "completion_length": 1449.8861999511719, "epoch": 0.10215816593234262, - "grad_norm": 0.10430095344781876, - "kl": 0.0002627372741699219, - "learning_rate": 9.99986673258622e-08, - "loss": 0.0246, - "reward": 0.3404017984867096, - "reward_std": 0.07465368276461959, - "rewards/accuracy_reward": 0.06696429033763707, + "grad_norm": 0.21398331224918365, + "kl": 0.15478515625, + "learning_rate": 4.99993336629311e-07, + "loss": 0.043, + "reward": 0.5669643208384514, + "reward_std": 0.13367497269064188, + "rewards/accuracy_reward": 0.08482143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375074505806, + "rewards/tag_count_reward": 0.482142873108387, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 2031.1563110351562, + "completion_length": 1495.2590026855469, "epoch": 0.10245687401986409, - "grad_norm": 0.0816178023815155, - "kl": 0.0002219676971435547, - "learning_rate": 9.999825936675848e-08, - "loss": 0.011, - "reward": 0.337611623108387, - "reward_std": 0.03053134703077376, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 0.24133774638175964, + "kl": 0.16748046875, + "learning_rate": 4.999912968337924e-07, + "loss": 0.0337, + "reward": 0.5920759290456772, + "reward_std": 0.0974612906575203, + "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.4849330633878708, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 2030.009033203125, + "completion_length": 1573.4353332519531, "epoch": 0.10275558210738556, - "grad_norm": 0.08904030919075012, - "kl": 0.00021386146545410156, - "learning_rate": 9.999779701444895e-08, - "loss": 0.0121, - "reward": 0.3119419813156128, - "reward_std": 0.04666244680993259, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 0.20990975201129913, + "kl": 0.1728515625, + "learning_rate": 4.999889850722448e-07, + "loss": 0.034, + "reward": 0.572544664144516, + "reward_std": 0.11599421501159668, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.4854910895228386, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 2002.8438415527344, + "completion_length": 1439.26123046875, "epoch": 0.10305429019490703, - "grad_norm": 0.09175792336463928, - "kl": 0.00025010108947753906, - "learning_rate": 9.999728026943656e-08, - "loss": 0.022, - "reward": 0.2812500149011612, - "reward_std": 0.04562442330643535, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.20129843056201935, + "kl": 0.1591796875, + "learning_rate": 4.999864013471828e-07, + "loss": 0.0263, + "reward": 0.5301339477300644, + "reward_std": 0.09216642938554287, + "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.4921875223517418, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 2030.41748046875, + "completion_length": 1588.8304443359375, "epoch": 0.1033529982824285, - "grad_norm": 0.0571923665702343, - "kl": 0.00020766258239746094, - "learning_rate": 9.999670913228351e-08, - "loss": 0.014, - "reward": 0.3046875223517418, - "reward_std": 0.044879546854645014, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 0.24424806237220764, + "kl": 0.16796875, + "learning_rate": 4.999835456614175e-07, + "loss": 0.0271, + "reward": 0.5412946566939354, + "reward_std": 0.08315367996692657, + "rewards/accuracy_reward": 0.05580357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.260044664144516, + "rewards/tag_count_reward": 0.4854910895228386, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 2041.5469665527344, + "completion_length": 1638.3973693847656, "epoch": 0.10365170636994997, - "grad_norm": 0.07314113527536392, - "kl": 0.0001881122589111328, - "learning_rate": 9.999608360361112e-08, - "loss": 0.0052, - "reward": 0.3108258992433548, - "reward_std": 0.04613212775439024, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 0.22653424739837646, + "kl": 0.171142578125, + "learning_rate": 4.999804180180557e-07, + "loss": 0.025, + "reward": 0.549107164144516, + "reward_std": 0.11042520403862, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.4821428805589676, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 2035.7835693359375, + "completion_length": 1573.3639221191406, "epoch": 0.10395041445747144, - "grad_norm": 0.07516453415155411, - "kl": 0.00019979476928710938, - "learning_rate": 9.999540368409992e-08, - "loss": 0.0127, - "reward": 0.3097098395228386, - "reward_std": 0.0609456398524344, - "rewards/accuracy_reward": 0.04910714505240321, + "grad_norm": 0.2334415465593338, + "kl": 0.16064453125, + "learning_rate": 4.999770184204996e-07, + "loss": 0.0197, + "reward": 0.573660746216774, + "reward_std": 0.13343196082860231, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.4888393059372902, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 1987.0625610351562, + "completion_length": 1538.7611999511719, "epoch": 0.10424912254499291, - "grad_norm": 0.09729699045419693, - "kl": 0.0002868175506591797, - "learning_rate": 9.999466937448959e-08, - "loss": 0.0221, - "reward": 0.3309151902794838, - "reward_std": 0.0878829259891063, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 0.21041445434093475, + "kl": 0.15380859375, + "learning_rate": 4.999733468724479e-07, + "loss": 0.0195, + "reward": 0.5770089477300644, + "reward_std": 0.15266066789627075, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.279575914144516, + "rewards/tag_count_reward": 0.4877232313156128, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 2014.868408203125, + "completion_length": 1588.4085693359375, "epoch": 0.10454783063251437, - "grad_norm": 0.09050702303647995, - "kl": 0.00024771690368652344, - "learning_rate": 9.999388067557898e-08, - "loss": 0.0179, - "reward": 0.3638393059372902, - "reward_std": 0.07765219919383526, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 0.24560904502868652, + "kl": 0.158447265625, + "learning_rate": 4.999694033778949e-07, + "loss": 0.0166, + "reward": 0.643973246216774, + "reward_std": 0.10717487148940563, + "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.4832589477300644, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 2031.4420471191406, + "completion_length": 1590.0313415527344, "epoch": 0.10484653872003584, - "grad_norm": 0.06980008631944656, - "kl": 0.0002262592315673828, - "learning_rate": 9.999303758822612e-08, - "loss": 0.0088, - "reward": 0.3738839402794838, - "reward_std": 0.030581308994442225, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 0.229527086019516, + "kl": 0.1650390625, + "learning_rate": 4.999651879411306e-07, + "loss": 0.0158, + "reward": 0.5954241454601288, + "reward_std": 0.05622940510511398, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.4838169887661934, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 2015.3929138183594, + "completion_length": 1580.8839721679688, "epoch": 0.10514524680755731, - "grad_norm": 0.08226103335618973, - "kl": 0.00024366378784179688, - "learning_rate": 9.99921401133482e-08, - "loss": 0.0132, - "reward": 0.3404017984867096, - "reward_std": 0.043066670186817646, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 0.2322050780057907, + "kl": 0.157470703125, + "learning_rate": 4.999607005667411e-07, + "loss": 0.0269, + "reward": 0.592075914144516, + "reward_std": 0.13284130953252316, + "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.4849330559372902, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 1958.9197082519531, + "completion_length": 1606.4197082519531, "epoch": 0.10544395489507878, - "grad_norm": 0.08646747469902039, - "kl": 0.0003533363342285156, - "learning_rate": 9.99911882519216e-08, - "loss": 0.0232, - "reward": 0.365513414144516, - "reward_std": 0.08070567063987255, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 0.2224269062280655, + "kl": 0.14599609375, + "learning_rate": 4.99955941259608e-07, + "loss": 0.0405, + "reward": 0.5703125149011612, + "reward_std": 0.1476429346948862, + "rewards/accuracy_reward": 0.09375000325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2873884066939354, + "rewards/tag_count_reward": 0.4765625223517418, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 2022.6004943847656, + "completion_length": 1645.9866943359375, "epoch": 0.10574266298260025, - "grad_norm": 0.08341144025325775, - "kl": 0.0002422332763671875, - "learning_rate": 9.999018200498186e-08, - "loss": 0.0169, - "reward": 0.3950893133878708, - "reward_std": 0.05845392681658268, - "rewards/accuracy_reward": 0.12946429336443543, + "grad_norm": 0.20698876678943634, + "kl": 0.15283203125, + "learning_rate": 4.999509100249093e-07, + "loss": 0.0223, + "reward": 0.6473214477300644, + "reward_std": 0.12398159131407738, + "rewards/accuracy_reward": 0.16294643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250074505806, + "rewards/tag_count_reward": 0.4843750223517418, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 2009.5246276855469, + "completion_length": 1606.7835693359375, "epoch": 0.10604137107012172, - "grad_norm": 0.08120117336511612, - "kl": 0.0002818107604980469, - "learning_rate": 9.998912137362365e-08, - "loss": 0.0121, - "reward": 0.3046875074505806, - "reward_std": 0.02364099072292447, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.22010749578475952, + "kl": 0.14794921875, + "learning_rate": 4.999456068681183e-07, + "loss": 0.0103, + "reward": 0.521205373108387, + "reward_std": 0.05684919096529484, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.4810267984867096, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 2037.5156860351562, + "completion_length": 1661.2969665527344, "epoch": 0.1063400791576432, - "grad_norm": 0.06339839100837708, - "kl": 0.00021457672119140625, - "learning_rate": 9.998800635900084e-08, - "loss": 0.0089, - "reward": 0.3080357313156128, - "reward_std": 0.03561924980022013, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 0.22447185218334198, + "kl": 0.146484375, + "learning_rate": 4.999400317950042e-07, + "loss": 0.0407, + "reward": 0.5424107313156128, + "reward_std": 0.10662781819701195, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.4754464477300644, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 2047.1116333007812, + "completion_length": 1757.6786804199219, "epoch": 0.10663878724516467, - "grad_norm": 0.05277043208479881, - "kl": 0.00020051002502441406, - "learning_rate": 9.998683696232649e-08, - "loss": 0.0016, - "reward": 0.2879464402794838, - "reward_std": 0.008928571827709675, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.2322104573249817, + "kl": 0.154541015625, + "learning_rate": 4.999341848116324e-07, + "loss": 0.0247, + "reward": 0.5301339626312256, + "reward_std": 0.10411343909800053, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2522321566939354, + "rewards/tag_count_reward": 0.4765625223517418, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 2035.87060546875, + "completion_length": 1714.8460388183594, "epoch": 0.10693749533268614, - "grad_norm": 0.08872954547405243, - "kl": 0.00022363662719726562, - "learning_rate": 9.998561318487273e-08, - "loss": 0.0084, - "reward": 0.2773437649011612, - "reward_std": 0.05461431504227221, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 0.20849138498306274, + "kl": 0.14892578125, + "learning_rate": 4.999280659243637e-07, + "loss": 0.0362, + "reward": 0.5156250298023224, + "reward_std": 0.15004741959273815, + "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830559372902, + "rewards/tag_count_reward": 0.4620535969734192, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 2030.6942749023438, + "completion_length": 1749.5781860351562, "epoch": 0.10723620342020761, - "grad_norm": 0.08758485317230225, - "kl": 0.0002319812774658203, - "learning_rate": 9.998433502797095e-08, - "loss": 0.0148, - "reward": 0.2700892984867096, - "reward_std": 0.061269809026271105, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.2080266773700714, + "kl": 0.15087890625, + "learning_rate": 4.999216751398547e-07, + "loss": 0.0252, + "reward": 0.5440848469734192, + "reward_std": 0.16194296814501286, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.4704241305589676, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 2038.0870971679688, + "completion_length": 1712.55810546875, "epoch": 0.10753491150772908, - "grad_norm": 0.07489731162786484, - "kl": 0.0002269744873046875, - "learning_rate": 9.998300249301166e-08, - "loss": 0.009, - "reward": 0.3002232238650322, - "reward_std": 0.03227905975654721, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 0.24691513180732727, + "kl": 0.137451171875, + "learning_rate": 4.999150124650583e-07, + "loss": 0.0429, + "reward": 0.5792411044239998, + "reward_std": 0.1408797036856413, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.4698660895228386, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 2004.4398498535156, + "completion_length": 1715.5536499023438, "epoch": 0.10783361959525053, - "grad_norm": 0.09362268447875977, - "kl": 0.0002892017364501953, - "learning_rate": 9.998161558144452e-08, - "loss": 0.0242, - "reward": 0.2728794738650322, - "reward_std": 0.03354328637942672, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.20999087393283844, + "kl": 0.138671875, + "learning_rate": 4.999080779072225e-07, + "loss": 0.0387, + "reward": 0.4927455559372902, + "reward_std": 0.1048511303961277, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.4726562798023224, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 2040.5558471679688, + "completion_length": 1730.04248046875, "epoch": 0.108132327682772, - "grad_norm": 0.06933708488941193, - "kl": 0.0002346038818359375, - "learning_rate": 9.998017429477833e-08, - "loss": 0.0037, - "reward": 0.2689732238650322, - "reward_std": 0.04026029841043055, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 0.22684308886528015, + "kl": 0.142822265625, + "learning_rate": 4.999008714738917e-07, + "loss": 0.0347, + "reward": 0.518415205180645, + "reward_std": 0.13277038373053074, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2578125074505806, + "rewards/tag_count_reward": 0.4715401977300644, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 1981.7389526367188, + "completion_length": 1600.7500915527344, "epoch": 0.10843103577029348, - "grad_norm": 0.11206962168216705, - "kl": 0.0003619194030761719, - "learning_rate": 9.99786786345811e-08, - "loss": 0.0369, - "reward": 0.4213169813156128, - "reward_std": 0.04299068497493863, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 0.2272636741399765, + "kl": 0.13671875, + "learning_rate": 4.998933931729055e-07, + "loss": 0.0348, + "reward": 0.6417411044239998, + "reward_std": 0.06803015759214759, + "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276977300644, + "rewards/tag_count_reward": 0.478794664144516, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 2037.6942749023438, + "completion_length": 1686.4308776855469, "epoch": 0.10872974385781495, - "grad_norm": 0.086862713098526, - "kl": 0.000232696533203125, - "learning_rate": 9.997712860247996e-08, - "loss": 0.0095, - "reward": 0.4369419813156128, - "reward_std": 0.022431674879044294, - "rewards/accuracy_reward": 0.1785714365541935, + "grad_norm": 0.2002456933259964, + "kl": 0.13623046875, + "learning_rate": 4.998856430123998e-07, + "loss": 0.0255, + "reward": 0.6707589626312256, + "reward_std": 0.1025421367958188, + "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2583705484867096, + "rewards/tag_count_reward": 0.4720982313156128, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 2026.2411499023438, + "completion_length": 1701.0514221191406, "epoch": 0.10902845194533642, - "grad_norm": 0.08561345934867859, - "kl": 0.00027060508728027344, - "learning_rate": 9.997552420016118e-08, - "loss": 0.0223, - "reward": 0.3404018059372902, - "reward_std": 0.047190384240821004, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 0.2080105096101761, + "kl": 0.14697265625, + "learning_rate": 4.998776210008058e-07, + "loss": 0.0267, + "reward": 0.6138393133878708, + "reward_std": 0.1752930535003543, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.4598214477300644, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 1997.2232666015625, + "completion_length": 1713.3750915527344, "epoch": 0.10932716003285789, - "grad_norm": 0.0980398878455162, - "kl": 0.0003418922424316406, - "learning_rate": 9.99738654293702e-08, - "loss": 0.0166, - "reward": 0.3431919887661934, - "reward_std": 0.060346872778609395, - "rewards/accuracy_reward": 0.06696428591385484, + "grad_norm": 0.27657926082611084, + "kl": 0.139892578125, + "learning_rate": 4.99869327146851e-07, + "loss": 0.0413, + "reward": 0.5306919887661934, + "reward_std": 0.13885026797652245, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.4547991305589676, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 2031.3482971191406, + "completion_length": 1821.3125915527344, "epoch": 0.10962586812037936, - "grad_norm": 0.08116894215345383, - "kl": 0.0002276897430419922, - "learning_rate": 9.99721522919116e-08, - "loss": 0.0069, - "reward": 0.2996651977300644, - "reward_std": 0.021676857490092516, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.2439846694469452, + "kl": 0.150634765625, + "learning_rate": 4.99860761459558e-07, + "loss": 0.0256, + "reward": 0.5234375223517418, + "reward_std": 0.12117623444646597, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.4542410895228386, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 2030.10498046875, + "completion_length": 1756.5134582519531, "epoch": 0.10992457620790083, - "grad_norm": 0.0938415601849556, - "kl": 0.00026988983154296875, - "learning_rate": 9.99703847896491e-08, - "loss": 0.0139, - "reward": 0.279017873108387, - "reward_std": 0.0704977223649621, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 0.2327897548675537, + "kl": 0.136962890625, + "learning_rate": 4.998519239482455e-07, + "loss": 0.0436, + "reward": 0.4983259066939354, + "reward_std": 0.14618432894349098, + "rewards/accuracy_reward": 0.03571428684517741, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2678571566939354, + "rewards/tag_count_reward": 0.4626116305589676, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 2045.1563110351562, + "completion_length": 1784.024658203125, "epoch": 0.1102232842954223, - "grad_norm": 0.0738474577665329, - "kl": 0.00024127960205078125, - "learning_rate": 9.996856292450557e-08, - "loss": 0.0039, - "reward": 0.255580373108387, - "reward_std": 0.016395026817917824, - "rewards/accuracy_reward": 0.0, + "grad_norm": 0.23025494813919067, + "kl": 0.137451171875, + "learning_rate": 4.998428146225279e-07, + "loss": 0.0288, + "reward": 0.5139509215950966, + "reward_std": 0.1309387218207121, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.4693080633878708, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 2039.4822082519531, + "completion_length": 1837.1407165527344, "epoch": 0.11052199238294377, - "grad_norm": 0.06614664196968079, - "kl": 0.00026416778564453125, - "learning_rate": 9.996668669846305e-08, - "loss": 0.004, - "reward": 0.3922991305589676, - "reward_std": 0.03429269348271191, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 0.2914640009403229, + "kl": 0.153564453125, + "learning_rate": 4.998334334923153e-07, + "loss": 0.0342, + "reward": 0.6065848544239998, + "reward_std": 0.13887898437678814, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.4525669887661934, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 2022.9263916015625, + "completion_length": 1643.8840026855469, "epoch": 0.11082070047046524, - "grad_norm": 0.09342680126428604, - "kl": 0.0002887248992919922, - "learning_rate": 9.996475611356263e-08, - "loss": 0.0217, - "reward": 0.2784598395228386, - "reward_std": 0.07547460705973208, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 0.24380235373973846, + "kl": 0.1292724609375, + "learning_rate": 4.998237805678131e-07, + "loss": 0.0439, + "reward": 0.5306919887661934, + "reward_std": 0.16333149373531342, + "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.474888414144516, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 2045.3616638183594, + "completion_length": 1780.2232971191406, "epoch": 0.1111194085579867, - "grad_norm": 0.060608625411987305, - "kl": 0.0002288818359375, - "learning_rate": 9.996277117190464e-08, - "loss": 0.002, - "reward": 0.263392873108387, - "reward_std": 0.03764077113009989, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 0.35335609316825867, + "kl": 0.141845703125, + "learning_rate": 4.998138558595232e-07, + "loss": 0.0404, + "reward": 0.5212053880095482, + "reward_std": 0.1817568503320217, + "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2566964477300644, + "rewards/tag_count_reward": 0.4609375298023224, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 2032.9755249023438, + "completion_length": 1721.6853332519531, "epoch": 0.11141811664550817, - "grad_norm": 0.07344838231801987, - "kl": 0.00029468536376953125, - "learning_rate": 9.996073187564851e-08, - "loss": 0.0168, - "reward": 0.387834832072258, - "reward_std": 0.0435672253370285, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 0.3098659813404083, + "kl": 0.1392822265625, + "learning_rate": 4.998036593782426e-07, + "loss": 0.016, + "reward": 0.6004464626312256, + "reward_std": 0.11609909869730473, + "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262834832072258, + "rewards/tag_count_reward": 0.4687500223517418, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 2029.3973388671875, + "completion_length": 1813.0223999023438, "epoch": 0.11171682473302964, - "grad_norm": 0.06355759501457214, - "kl": 0.0002627372741699219, - "learning_rate": 9.995863822701276e-08, - "loss": 0.0166, - "reward": 0.294642873108387, - "reward_std": 0.017857143422588706, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 0.31774982810020447, + "kl": 0.146240234375, + "learning_rate": 4.997931911350638e-07, + "loss": 0.0234, + "reward": 0.5569196715950966, + "reward_std": 0.11959274858236313, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.463169664144516, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 2019.5380249023438, + "completion_length": 1698.4732971191406, "epoch": 0.11201553282055111, - "grad_norm": 0.09003506600856781, - "kl": 0.00031876564025878906, - "learning_rate": 9.995649022827509e-08, - "loss": 0.0124, - "reward": 0.3638393059372902, - "reward_std": 0.0748761473223567, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 0.38727355003356934, + "kl": 0.139892578125, + "learning_rate": 4.997824511413755e-07, + "loss": 0.037, + "reward": 0.6422991305589676, + "reward_std": 0.18701894208788872, + "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2678571566939354, + "rewards/tag_count_reward": 0.4659598395228386, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 2013.9286193847656, + "completion_length": 1693.9085693359375, "epoch": 0.11231424090807259, - "grad_norm": 0.10776465386152267, - "kl": 0.0003268718719482422, - "learning_rate": 9.995428788177231e-08, - "loss": 0.0181, - "reward": 0.3292410895228386, - "reward_std": 0.08695672079920769, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 0.5233216881752014, + "kl": 0.135498046875, + "learning_rate": 4.997714394088615e-07, + "loss": 0.0319, + "reward": 0.5970982387661934, + "reward_std": 0.16860965825617313, + "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375074505806, + "rewards/tag_count_reward": 0.467633955180645, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 2038.638427734375, + "completion_length": 1803.0000915527344, "epoch": 0.11261294899559406, - "grad_norm": 0.06493032723665237, - "kl": 0.0002639293670654297, - "learning_rate": 9.995203118990039e-08, - "loss": 0.0069, - "reward": 0.2968750149011612, - "reward_std": 0.02414092468097806, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 0.9928394556045532, + "kl": 0.15234375, + "learning_rate": 4.99760155949502e-07, + "loss": 0.0308, + "reward": 0.5212053805589676, + "reward_std": 0.09945615381002426, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.258928582072258, + "rewards/tag_count_reward": 0.4720982387661934, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 2034.82373046875, + "completion_length": 1747.2701721191406, "epoch": 0.11291165708311553, - "grad_norm": 0.07489624619483948, - "kl": 0.0002868175506591797, - "learning_rate": 9.994972015511433e-08, - "loss": 0.0057, - "reward": 0.3119419813156128, - "reward_std": 0.036171177634969354, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 0.9551793932914734, + "kl": 0.1475830078125, + "learning_rate": 4.997486007755717e-07, + "loss": 0.028, + "reward": 0.5446428880095482, + "reward_std": 0.15482873655855656, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262834832072258, + "rewards/tag_count_reward": 0.450892873108387, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 2028.68310546875, + "completion_length": 1748.4666137695312, "epoch": 0.113210365170637, - "grad_norm": 0.08131052553653717, - "kl": 0.0003223419189453125, - "learning_rate": 9.994735477992835e-08, - "loss": 0.0159, - "reward": 0.3046875074505806, - "reward_std": 0.0372473462484777, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 0.43657103180885315, + "kl": 0.16357421875, + "learning_rate": 4.997367738996418e-07, + "loss": 0.0317, + "reward": 0.5563616454601288, + "reward_std": 0.16272744163870811, + "rewards/accuracy_reward": 0.09598214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.262276791036129, + "rewards/tag_count_reward": 0.4603794887661934, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 2032.950927734375, + "completion_length": 1762.6942443847656, "epoch": 0.11350907325815847, - "grad_norm": 0.07796020060777664, - "kl": 0.00028252601623535156, - "learning_rate": 9.994493506691575e-08, - "loss": 0.0133, - "reward": 0.3264508992433548, - "reward_std": 0.06595724122598767, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 0.6815599203109741, + "kl": 0.1396484375, + "learning_rate": 4.997246753345788e-07, + "loss": 0.0329, + "reward": 0.554129496216774, + "reward_std": 0.13164770789444447, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.4693080633878708, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 2044.1406555175781, + "completion_length": 1746.5045776367188, "epoch": 0.11380778134567994, - "grad_norm": 0.07322216033935547, - "kl": 0.00024890899658203125, - "learning_rate": 9.994246101870891e-08, - "loss": 0.0046, - "reward": 0.3666294738650322, - "reward_std": 0.02817943482659757, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 1.869083046913147, + "kl": 0.15673828125, + "learning_rate": 4.997123050935445e-07, + "loss": 0.0367, + "reward": 0.6065848469734192, + "reward_std": 0.14629404246807098, + "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.463727705180645, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 1966.1563110351562, + "completion_length": 1617.2545471191406, "epoch": 0.11410648943320141, - "grad_norm": 0.1076650321483612, - "kl": 0.0005140304565429688, - "learning_rate": 9.993993263799938e-08, - "loss": 0.0171, - "reward": 0.325334832072258, - "reward_std": 0.06196014815941453, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 1.0162982940673828, + "kl": 0.139404296875, + "learning_rate": 4.996996631899968e-07, + "loss": 0.0422, + "reward": 0.5345982387661934, + "reward_std": 0.15161003172397614, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2806919738650322, + "rewards/tag_count_reward": 0.4631696715950966, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 1951.94873046875, + "completion_length": 1616.4621276855469, "epoch": 0.11440519752072287, - "grad_norm": 0.10082155466079712, - "kl": 0.0005621910095214844, - "learning_rate": 9.993734992753776e-08, - "loss": 0.0174, - "reward": 0.361049123108387, - "reward_std": 0.05429393844678998, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 0.9915550351142883, + "kl": 0.1331787109375, + "learning_rate": 4.996867496376888e-07, + "loss": 0.0323, + "reward": 0.6261160969734192, + "reward_std": 0.13551262579858303, + "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2806919664144516, + "rewards/tag_count_reward": 0.4720982387661934, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 2015.3504943847656, + "completion_length": 1723.87060546875, "epoch": 0.11470390560824434, - "grad_norm": 0.08529632538557053, - "kl": 0.00037169456481933594, - "learning_rate": 9.993471289013382e-08, - "loss": 0.0171, - "reward": 0.364397332072258, - "reward_std": 0.0997677103150636, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 2.39125919342041, + "kl": 0.165283203125, + "learning_rate": 4.99673564450669e-07, + "loss": 0.0426, + "reward": 0.6383928805589676, + "reward_std": 0.22527769207954407, + "rewards/accuracy_reward": 0.17410715040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151902794838, + "rewards/tag_count_reward": 0.4642857313156128, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 2031.4822082519531, + "completion_length": 1731.6630554199219, "epoch": 0.11500261369576581, - "grad_norm": 0.06340638548135757, - "kl": 0.0003058910369873047, - "learning_rate": 9.993202152865639e-08, - "loss": 0.0081, - "reward": 0.2840401902794838, - "reward_std": 0.053718479350209236, - "rewards/accuracy_reward": 0.024553572526201606, + "grad_norm": 0.9077367186546326, + "kl": 0.151123046875, + "learning_rate": 4.996601076432819e-07, + "loss": 0.0374, + "reward": 0.5373884215950966, + "reward_std": 0.15877192094922066, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "rewards/tag_count_reward": 0.4704241305589676, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 2035.82373046875, + "completion_length": 1788.90185546875, "epoch": 0.11530132178328728, - "grad_norm": 0.11445873230695724, - "kl": 0.0002903938293457031, - "learning_rate": 9.992927584603338e-08, - "loss": 0.012, - "reward": 0.3822544813156128, - "reward_std": 0.06826452631503344, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 2.999178647994995, + "kl": 0.16748046875, + "learning_rate": 4.996463792301669e-07, + "loss": 0.0492, + "reward": 0.5948660895228386, + "reward_std": 0.13126266933977604, + "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639509066939354, + "rewards/tag_count_reward": 0.4542410969734192, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 2008.8438415527344, + "completion_length": 1669.9710693359375, "epoch": 0.11560002987080875, - "grad_norm": 0.09532254934310913, - "kl": 0.0004134178161621094, - "learning_rate": 9.992647584525186e-08, - "loss": 0.0259, - "reward": 0.4017857387661934, - "reward_std": 0.09913647500798106, - "rewards/accuracy_reward": 0.13169643841683865, + "grad_norm": 1.6843479871749878, + "kl": 0.164794921875, + "learning_rate": 4.996323792262593e-07, + "loss": 0.0594, + "reward": 0.6545759215950966, + "reward_std": 0.1734354868531227, + "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.4559151977300644, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 1999.1317749023438, + "completion_length": 1723.0983276367188, "epoch": 0.11589873795833022, - "grad_norm": 0.08176741749048233, - "kl": 0.0004038810729980469, - "learning_rate": 9.992362152935794e-08, - "loss": 0.0196, - "reward": 0.3409598395228386, - "reward_std": 0.019302175613120198, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 8.725236892700195, + "kl": 0.219970703125, + "learning_rate": 4.996181076467897e-07, + "loss": 0.049, + "reward": 0.5758928880095482, + "reward_std": 0.16588085889816284, + "rewards/accuracy_reward": 0.1250000095460564, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.450892873108387, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 2023.3952026367188, + "completion_length": 1720.8728332519531, "epoch": 0.1161974460458517, - "grad_norm": 0.0992458239197731, - "kl": 0.0003676414489746094, - "learning_rate": 9.992071290145683e-08, - "loss": 0.0176, - "reward": 0.3498884066939354, - "reward_std": 0.06594702531583607, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 3.7546021938323975, + "kl": 0.15234375, + "learning_rate": 4.996035645072842e-07, + "loss": 0.0472, + "reward": 0.6199777126312256, + "reward_std": 0.2237107828259468, + "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.4525669813156128, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 2033.0291137695312, + "completion_length": 1721.5715026855469, "epoch": 0.11649615413337316, - "grad_norm": 0.08157746493816376, - "kl": 0.0003228187561035156, - "learning_rate": 9.991774996471283e-08, - "loss": 0.0155, - "reward": 0.3554687574505806, - "reward_std": 0.06806525238789618, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 2.650991439819336, + "kl": 0.144775390625, + "learning_rate": 4.995887498235642e-07, + "loss": 0.0347, + "reward": 0.5937500298023224, + "reward_std": 0.20694916136562824, + "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.4531250149011612, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 2040.07373046875, + "completion_length": 1739.0157165527344, "epoch": 0.11679486222089464, - "grad_norm": 0.08157145231962204, - "kl": 0.0003228187561035156, - "learning_rate": 9.991473272234937e-08, - "loss": 0.0114, - "reward": 0.2940848395228386, - "reward_std": 0.029909562319517136, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 5.359323501586914, + "kl": 0.18017578125, + "learning_rate": 4.995736636117468e-07, + "loss": 0.0665, + "reward": 0.5000000223517418, + "reward_std": 0.1435579787939787, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.4486607313156128, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 2031.2165832519531, + "completion_length": 1737.6116943359375, "epoch": 0.1170935703084161, - "grad_norm": 0.08052168041467667, - "kl": 0.0003447532653808594, - "learning_rate": 9.991166117764884e-08, - "loss": 0.0143, - "reward": 0.372209832072258, - "reward_std": 0.03719330835156143, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 17.036531448364258, + "kl": 0.298095703125, + "learning_rate": 4.995583058882442e-07, + "loss": 0.0465, + "reward": 0.5931920036673546, + "reward_std": 0.14481880329549313, + "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026977300644, + "rewards/tag_count_reward": 0.4458705633878708, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 2023.5536499023438, + "completion_length": 1744.5759582519531, "epoch": 0.11739227839593756, - "grad_norm": 0.06540744751691818, - "kl": 0.00034999847412109375, - "learning_rate": 9.990853533395283e-08, - "loss": 0.0078, - "reward": 0.2611607238650322, - "reward_std": 0.015499930828809738, - "rewards/accuracy_reward": 0.0, + "grad_norm": 72.09225463867188, + "kl": 0.545166015625, + "learning_rate": 4.995426766697641e-07, + "loss": 0.0643, + "reward": 0.4581473469734192, + "reward_std": 0.1382804922759533, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.4447544813156128, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 1991.5648193359375, + "completion_length": 1720.3348693847656, "epoch": 0.11769098648345903, - "grad_norm": 0.0881509780883789, - "kl": 0.00047588348388671875, - "learning_rate": 9.990535519466195e-08, - "loss": 0.0175, - "reward": 0.313616082072258, - "reward_std": 0.024884309619665146, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 8.964143753051758, + "kl": 0.230224609375, + "learning_rate": 4.995267759733098e-07, + "loss": 0.0379, + "reward": 0.5703125149011612, + "reward_std": 0.18131808936595917, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.277901791036129, + "rewards/tag_count_reward": 0.447544664144516, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 2036.9687805175781, + "completion_length": 1819.1473693847656, "epoch": 0.1179896945709805, - "grad_norm": 0.07943782955408096, - "kl": 0.0003108978271484375, - "learning_rate": 9.990212076323585e-08, - "loss": 0.0077, - "reward": 0.2968750149011612, - "reward_std": 0.02395651931874454, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 4.219891548156738, + "kl": 0.21484375, + "learning_rate": 4.995106038161793e-07, + "loss": 0.0494, + "reward": 0.5128348469734192, + "reward_std": 0.1619175262749195, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2611607238650322, + "rewards/tag_count_reward": 0.4369419887661934, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 2017.2947082519531, + "completion_length": 1678.4063110351562, "epoch": 0.11828840265850198, - "grad_norm": 0.08892307430505753, - "kl": 0.0004248619079589844, - "learning_rate": 9.98988320431933e-08, - "loss": 0.0162, - "reward": 0.3794643059372902, - "reward_std": 0.06652729329653084, - "rewards/accuracy_reward": 0.10714286309666932, + "grad_norm": 9.964911460876465, + "kl": 0.28759765625, + "learning_rate": 4.994941602159665e-07, + "loss": 0.0561, + "reward": 0.5954241380095482, + "reward_std": 0.17944439128041267, + "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214477300644, + "rewards/tag_count_reward": 0.4369419813156128, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 1994.4777526855469, + "completion_length": 1665.4063110351562, "epoch": 0.11858711074602345, - "grad_norm": 0.10685969889163971, - "kl": 0.0005130767822265625, - "learning_rate": 9.989548903811211e-08, - "loss": 0.0282, - "reward": 0.373325914144516, - "reward_std": 0.09651419147849083, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 6.321689128875732, + "kl": 0.24365234375, + "learning_rate": 4.994774451905606e-07, + "loss": 0.0578, + "reward": 0.5691964477300644, + "reward_std": 0.20852927677333355, + "rewards/accuracy_reward": 0.12276786030270159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.275111623108387, + "rewards/tag_count_reward": 0.4464285895228386, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 2039.35498046875, + "completion_length": 1783.6362609863281, "epoch": 0.11888581883354492, - "grad_norm": 0.07939071953296661, - "kl": 0.0003390312194824219, - "learning_rate": 9.989209175162912e-08, - "loss": 0.0072, - "reward": 0.3515625223517418, - "reward_std": 0.06434143520891666, - "rewards/accuracy_reward": 0.08705357415601611, + "grad_norm": 23.902652740478516, + "kl": 0.435546875, + "learning_rate": 4.994604587581456e-07, + "loss": 0.057, + "reward": 0.5563616454601288, + "reward_std": 0.20811011269688606, + "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089328289032, + "rewards/tag_count_reward": 0.4291294887661934, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 2033.1719360351562, + "completion_length": 1746.5402526855469, "epoch": 0.11918452692106639, - "grad_norm": 0.0915946438908577, - "kl": 0.0003371238708496094, - "learning_rate": 9.988864018744026e-08, - "loss": 0.0097, - "reward": 0.3242187649011612, - "reward_std": 0.06812636647373438, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 5.511786460876465, + "kl": 0.239990234375, + "learning_rate": 4.994432009372012e-07, + "loss": 0.0547, + "reward": 0.5100446715950966, + "reward_std": 0.14884588681161404, + "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.443080373108387, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 2046.4263916015625, + "completion_length": 1792.8460693359375, "epoch": 0.11948323500858786, - "grad_norm": 0.04640355706214905, - "kl": 0.0003046989440917969, - "learning_rate": 9.988513434930049e-08, - "loss": 0.0018, - "reward": 0.325334832072258, - "reward_std": 0.008738514268770814, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 6.302380084991455, + "kl": 0.267578125, + "learning_rate": 4.994256717465024e-07, + "loss": 0.0486, + "reward": 0.5357143133878708, + "reward_std": 0.15216713212430477, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2539062574505806, + "rewards/tag_count_reward": 0.439732164144516, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 2033.9420166015625, + "completion_length": 1730.0781860351562, "epoch": 0.11978194309610933, - "grad_norm": 0.09490390121936798, - "kl": 0.0003647804260253906, - "learning_rate": 9.988157424102379e-08, - "loss": 0.012, - "reward": 0.3465401977300644, - "reward_std": 0.05210266471840441, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 4.950244426727295, + "kl": 0.23779296875, + "learning_rate": 4.99407871205119e-07, + "loss": 0.0591, + "reward": 0.5446428880095482, + "reward_std": 0.17566783353686333, + "rewards/accuracy_reward": 0.11160714970901608, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473395228386, + "rewards/tag_count_reward": 0.4330357238650322, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 1982.5603637695312, + "completion_length": 1691.9732971191406, "epoch": 0.1200806511836308, - "grad_norm": 0.10779646039009094, - "kl": 0.0005803108215332031, - "learning_rate": 9.987795986648326e-08, - "loss": 0.0283, - "reward": 0.3331473395228386, - "reward_std": 0.05424899375066161, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 4.869670391082764, + "kl": 0.2041015625, + "learning_rate": 4.993897993324163e-07, + "loss": 0.0691, + "reward": 0.5094866305589676, + "reward_std": 0.15183800272643566, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401902794838, + "rewards/tag_count_reward": 0.4469866305589676, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 2043.7523193359375, + "completion_length": 1793.5558776855469, "epoch": 0.12037935927115227, - "grad_norm": 0.0778711661696434, - "kl": 0.0003237724304199219, - "learning_rate": 9.987429122961095e-08, - "loss": 0.0061, - "reward": 0.2952009066939354, - "reward_std": 0.03159623988904059, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 4.653181076049805, + "kl": 0.35205078125, + "learning_rate": 4.993714561480547e-07, + "loss": 0.0653, + "reward": 0.4698660895228386, + "reward_std": 0.16444317623972893, + "rewards/accuracy_reward": 0.05133928684517741, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2572544813156128, + "rewards/tag_count_reward": 0.4185268133878708, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 1983.4978332519531, + "completion_length": 1730.368408203125, "epoch": 0.12067806735867373, - "grad_norm": 0.09483948349952698, - "kl": 0.0005345344543457031, - "learning_rate": 9.9870568334398e-08, - "loss": 0.0319, - "reward": 0.3242187574505806, - "reward_std": 0.06835525296628475, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 2.515561103820801, + "kl": 0.248291015625, + "learning_rate": 4.9935284167199e-07, + "loss": 0.0418, + "reward": 0.522321455180645, + "reward_std": 0.19265655055642128, + "rewards/accuracy_reward": 0.09151786239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "rewards/tag_count_reward": 0.4308035895228386, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 2028.4554138183594, + "completion_length": 1764.7813110351562, "epoch": 0.1209767754461952, - "grad_norm": 0.08337409049272537, - "kl": 0.0004220008850097656, - "learning_rate": 9.986679118489453e-08, - "loss": 0.0136, - "reward": 0.313616082072258, - "reward_std": 0.055838066851720214, - "rewards/accuracy_reward": 0.05133928777649999, + "grad_norm": 4.770638942718506, + "kl": 0.2587890625, + "learning_rate": 4.993339559244727e-07, + "loss": 0.0525, + "reward": 0.4949776977300644, + "reward_std": 0.14385286159813404, + "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "rewards/tag_count_reward": 0.4369419887661934, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 2023.2166137695312, + "completion_length": 1728.9197082519531, "epoch": 0.12127548353371667, - "grad_norm": 0.09205210208892822, - "kl": 0.0004630088806152344, - "learning_rate": 9.986295978520973e-08, - "loss": 0.0201, - "reward": 0.3470982313156128, - "reward_std": 0.047903148690238595, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 5.24013090133667, + "kl": 0.33544921875, + "learning_rate": 4.993147989260487e-07, + "loss": 0.0712, + "reward": 0.5340402126312256, + "reward_std": 0.180188599973917, + "rewards/accuracy_reward": 0.10937500675208867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2667410895228386, + "rewards/tag_count_reward": 0.424665205180645, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 2030.4889221191406, + "completion_length": 1707.6675109863281, "epoch": 0.12157419162123814, - "grad_norm": 0.10746953636407852, - "kl": 0.0004253387451171875, - "learning_rate": 9.985907413951179e-08, - "loss": 0.0106, - "reward": 0.3510044813156128, - "reward_std": 0.06026653037406504, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 6.4574103355407715, + "kl": 0.30908203125, + "learning_rate": 4.992953706975589e-07, + "loss": 0.0739, + "reward": 0.5479911044239998, + "reward_std": 0.1909767296165228, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830559372902, + "rewards/tag_count_reward": 0.4252232313156128, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 2008.44873046875, + "completion_length": 1753.4532165527344, "epoch": 0.12187289970875961, - "grad_norm": 0.08425290137529373, - "kl": 0.00047969818115234375, - "learning_rate": 9.985513425202789e-08, - "loss": 0.0215, - "reward": 0.361607164144516, - "reward_std": 0.07848737575113773, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 7.233785629272461, + "kl": 0.32666015625, + "learning_rate": 4.992756712601395e-07, + "loss": 0.0609, + "reward": 0.545200914144516, + "reward_std": 0.21929758042097092, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.4380580484867096, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 1958.305908203125, + "completion_length": 1683.8438415527344, "epoch": 0.12217160779628108, - "grad_norm": 0.10558472573757172, - "kl": 0.000728607177734375, - "learning_rate": 9.985114012704425e-08, - "loss": 0.0381, - "reward": 0.3191964477300644, - "reward_std": 0.04820820363238454, - "rewards/accuracy_reward": 0.03571428847499192, + "grad_norm": 3.7974467277526855, + "kl": 0.358154296875, + "learning_rate": 4.992557006352213e-07, + "loss": 0.0642, + "reward": 0.464285746216774, + "reward_std": 0.16011319309473038, + "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2834821566939354, + "rewards/tag_count_reward": 0.4241071715950966, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 2046.6228332519531, + "completion_length": 1794.7880249023438, "epoch": 0.12247031588380256, - "grad_norm": 0.06753968447446823, - "kl": 0.0003485679626464844, - "learning_rate": 9.984709176890609e-08, - "loss": 0.0023, - "reward": 0.2566964402794838, - "reward_std": 0.023956519551575184, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 6.321704864501953, + "kl": 0.35498046875, + "learning_rate": 4.992354588445304e-07, + "loss": 0.0716, + "reward": 0.4637276902794838, + "reward_std": 0.21002240851521492, + "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2544642984867096, + "rewards/tag_count_reward": 0.3945312649011612, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 2009.68310546875, + "completion_length": 1760.6384582519531, "epoch": 0.12276902397132403, - "grad_norm": 0.08590006083250046, - "kl": 0.0004878044128417969, - "learning_rate": 9.984298918201763e-08, - "loss": 0.0207, - "reward": 0.2890625149011612, - "reward_std": 0.043353511253371835, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 5.116876602172852, + "kl": 0.51953125, + "learning_rate": 4.992149459100881e-07, + "loss": 0.0869, + "reward": 0.419642873108387, + "reward_std": 0.1749109923839569, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.3973214402794838, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 2019.8259582519531, + "completion_length": 1694.9665832519531, "epoch": 0.1230677320588455, - "grad_norm": 0.09461798518896103, - "kl": 0.000530242919921875, - "learning_rate": 9.983883237084206e-08, - "loss": 0.0111, - "reward": 0.353236623108387, - "reward_std": 0.06527710473164916, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 3.7119460105895996, + "kl": 0.44677734375, + "learning_rate": 4.991941618542103e-07, + "loss": 0.0801, + "reward": 0.5245535969734192, + "reward_std": 0.19552000239491463, + "rewards/accuracy_reward": 0.10267857764847577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.4218750223517418, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 1976.6004943847656, + "completion_length": 1696.7076721191406, "epoch": 0.12336644014636697, - "grad_norm": 0.09853121638298035, - "kl": 0.0006632804870605469, - "learning_rate": 9.983462133990161e-08, - "loss": 0.0178, - "reward": 0.3309151902794838, - "reward_std": 0.0644544882234186, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 17.278169631958008, + "kl": 0.5771484375, + "learning_rate": 4.991731066995081e-07, + "loss": 0.0754, + "reward": 0.4916294887661934, + "reward_std": 0.2157577946782112, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2795759066939354, + "rewards/tag_count_reward": 0.4135044813156128, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 2025.35498046875, + "completion_length": 1641.4665832519531, "epoch": 0.12366514823388844, - "grad_norm": 0.1151774525642395, - "kl": 0.0004830360412597656, - "learning_rate": 9.983035609377747e-08, - "loss": 0.0179, - "reward": 0.3750000074505806, - "reward_std": 0.11451391875743866, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 6.896962642669678, + "kl": 0.55859375, + "learning_rate": 4.991517804688874e-07, + "loss": 0.1012, + "reward": 0.5390625074505806, + "reward_std": 0.1763317808508873, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700893059372902, + "rewards/tag_count_reward": 0.4274553805589676, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 2003.8147888183594, + "completion_length": 1647.5245971679688, "epoch": 0.1239638563214099, - "grad_norm": 0.0943358987569809, - "kl": 0.0005965232849121094, - "learning_rate": 9.98260366371098e-08, - "loss": 0.0234, - "reward": 0.3867187723517418, - "reward_std": 0.0648807268589735, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 45.734519958496094, + "kl": 0.98388671875, + "learning_rate": 4.99130183185549e-07, + "loss": 0.1151, + "reward": 0.545200914144516, + "reward_std": 0.19960248842835426, + "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.4112723395228386, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 2041.2166137695312, + "completion_length": 1768.7813415527344, "epoch": 0.12426256440893137, - "grad_norm": 0.08462145179510117, - "kl": 0.0003681182861328125, - "learning_rate": 9.982166297459773e-08, - "loss": 0.0067, - "reward": 0.2963169738650322, - "reward_std": 0.02465246245265007, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 32.52640914916992, + "kl": 0.828125, + "learning_rate": 4.991083148729887e-07, + "loss": 0.1071, + "reward": 0.4520089402794838, + "reward_std": 0.17455745488405228, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.4051339402794838, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 2022.1897888183594, + "completion_length": 1768.30810546875, "epoch": 0.12456127249645284, - "grad_norm": 0.06408986449241638, - "kl": 0.00047588348388671875, - "learning_rate": 9.981723511099941e-08, - "loss": 0.0107, - "reward": 0.3002232238650322, - "reward_std": 0.017494095489382744, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 36.33440399169922, + "kl": 0.9228515625, + "learning_rate": 4.99086175554997e-07, + "loss": 0.0832, + "reward": 0.4642857387661934, + "reward_std": 0.1830338705331087, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089328289032, + "rewards/tag_count_reward": 0.3973214402794838, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 2031.4688110351562, + "completion_length": 1749.8795471191406, "epoch": 0.12485998058397431, - "grad_norm": 0.092833012342453, - "kl": 0.0004711151123046875, - "learning_rate": 9.98127530511319e-08, - "loss": 0.0163, - "reward": 0.3152901977300644, - "reward_std": 0.0847532032057643, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 26.102466583251953, + "kl": 0.888671875, + "learning_rate": 4.990637652556595e-07, + "loss": 0.1025, + "reward": 0.486607164144516, + "reward_std": 0.19966524839401245, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266183041036129, + "rewards/tag_count_reward": 0.4107142984867096, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 2025.3259582519531, + "completion_length": 1715.1295471191406, "epoch": 0.1251586886714958, - "grad_norm": 0.08006705343723297, - "kl": 0.0005154609680175781, - "learning_rate": 9.980821679987123e-08, - "loss": 0.0133, - "reward": 0.2963169813156128, - "reward_std": 0.06918483856134117, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 18.954999923706055, + "kl": 0.77734375, + "learning_rate": 4.990410839993562e-07, + "loss": 0.0854, + "reward": 0.509486623108387, + "reward_std": 0.24269787967205048, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.4135044813156128, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 2022.9844360351562, + "completion_length": 1750.8728332519531, "epoch": 0.12545739675901724, - "grad_norm": 0.09156250208616257, - "kl": 0.0004725456237792969, - "learning_rate": 9.980362636215242e-08, - "loss": 0.0154, - "reward": 0.3906250223517418, - "reward_std": 0.05697209690697491, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 16.162120819091797, + "kl": 0.7392578125, + "learning_rate": 4.990181318107622e-07, + "loss": 0.0835, + "reward": 0.5323660895228386, + "reward_std": 0.17519783973693848, + "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.4029018059372902, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 2014.1831359863281, + "completion_length": 1725.7567749023438, "epoch": 0.1257561048465387, - "grad_norm": 0.09178797900676727, - "kl": 0.00058746337890625, - "learning_rate": 9.979898174296941e-08, - "loss": 0.0157, - "reward": 0.2996651902794838, - "reward_std": 0.09658480668440461, - "rewards/accuracy_reward": 0.026785715948790312, + "grad_norm": 17.33260154724121, + "kl": 0.830078125, + "learning_rate": 4.989949087148471e-07, + "loss": 0.1055, + "reward": 0.4380580559372902, + "reward_std": 0.21490630134940147, + "rewards/accuracy_reward": 0.040178572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.3978794813156128, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 2004.6540832519531, + "completion_length": 1665.6630249023438, "epoch": 0.12605481293406018, - "grad_norm": 0.09907078742980957, - "kl": 0.0006356239318847656, - "learning_rate": 9.979428294737508e-08, - "loss": 0.0217, - "reward": 0.3504464477300644, - "reward_std": 0.1011499809101224, - "rewards/accuracy_reward": 0.06696428777649999, + "grad_norm": 4.674254417419434, + "kl": 0.55615234375, + "learning_rate": 4.989714147368754e-07, + "loss": 0.085, + "reward": 0.5323660969734192, + "reward_std": 0.23007908090949059, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2834821566939354, + "rewards/tag_count_reward": 0.4118303656578064, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 1998.7969360351562, + "completion_length": 1710.3326416015625, "epoch": 0.12635352102158165, - "grad_norm": 0.10583220422267914, - "kl": 0.0006189346313476562, - "learning_rate": 9.978952998048128e-08, - "loss": 0.0249, - "reward": 0.321428582072258, - "reward_std": 0.05969261517748237, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 9.157416343688965, + "kl": 0.62548828125, + "learning_rate": 4.989476499024064e-07, + "loss": 0.0786, + "reward": 0.483258955180645, + "reward_std": 0.18618686869740486, + "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214328289032, + "rewards/tag_count_reward": 0.4140625149011612, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 2036.5268859863281, + "completion_length": 1757.4308776855469, "epoch": 0.12665222910910312, - "grad_norm": 0.0817607045173645, - "kl": 0.0004839897155761719, - "learning_rate": 9.978472284745875e-08, - "loss": 0.0073, - "reward": 0.3498884066939354, - "reward_std": 0.054528879234567285, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 8.874431610107422, + "kl": 0.796875, + "learning_rate": 4.989236142372937e-07, + "loss": 0.1091, + "reward": 0.506138414144516, + "reward_std": 0.17440990172326565, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.4034598469734192, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 2046.77685546875, + "completion_length": 1796.5514526367188, "epoch": 0.1269509371966246, - "grad_norm": 0.057629916816949844, - "kl": 0.0003952980041503906, - "learning_rate": 9.977986155353719e-08, - "loss": 0.0019, - "reward": 0.2918526977300644, - "reward_std": 0.01456987950950861, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 37.65040969848633, + "kl": 0.8935546875, + "learning_rate": 4.98899307767686e-07, + "loss": 0.1022, + "reward": 0.4302455559372902, + "reward_std": 0.18125955015420914, + "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2561384066939354, + "rewards/tag_count_reward": 0.3811384066939354, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 2005.8594360351562, + "completion_length": 1762.6853332519531, "epoch": 0.12724964528414606, - "grad_norm": 0.0885755643248558, - "kl": 0.0006103515625, - "learning_rate": 9.977494610400522e-08, - "loss": 0.0201, - "reward": 0.3616071492433548, - "reward_std": 0.06948269950225949, - "rewards/accuracy_reward": 0.08928572107106447, + "grad_norm": 5.7325968742370605, + "kl": 0.62109375, + "learning_rate": 4.988747305200261e-07, + "loss": 0.0898, + "reward": 0.5234375298023224, + "reward_std": 0.19826872646808624, + "rewards/accuracy_reward": 0.12276786519214511, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.400669664144516, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 2018.5848999023438, + "completion_length": 1754.0156860351562, "epoch": 0.12754835337166753, - "grad_norm": 0.08414001017808914, - "kl": 0.0005559921264648438, - "learning_rate": 9.976997650421036e-08, - "loss": 0.0182, - "reward": 0.3208705484867096, - "reward_std": 0.07173275202512741, - "rewards/accuracy_reward": 0.053571430034935474, + "grad_norm": 9.744460105895996, + "kl": 0.6396484375, + "learning_rate": 4.988498825210518e-07, + "loss": 0.0931, + "reward": 0.4648437649011612, + "reward_std": 0.18562539666891098, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.4023437649011612, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 2016.7322387695312, + "completion_length": 1726.54248046875, "epoch": 0.127847061459189, - "grad_norm": 0.10149809718132019, - "kl": 0.0005965232849121094, - "learning_rate": 9.976495275955903e-08, - "loss": 0.0219, - "reward": 0.3470982313156128, - "reward_std": 0.05141337704844773, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 9.612737655639648, + "kl": 0.53271484375, + "learning_rate": 4.988247637977952e-07, + "loss": 0.098, + "reward": 0.4871651902794838, + "reward_std": 0.1740439496934414, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.400111623108387, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 1966.7478332519531, + "completion_length": 1671.3728942871094, "epoch": 0.12814576954671048, - "grad_norm": 0.11557989567518234, - "kl": 0.0008950233459472656, - "learning_rate": 9.97598748755166e-08, - "loss": 0.0278, - "reward": 0.329799123108387, - "reward_std": 0.07583593670278788, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 4.392724990844727, + "kl": 0.53369140625, + "learning_rate": 4.98799374377583e-07, + "loss": 0.1116, + "reward": 0.4564732387661934, + "reward_std": 0.1938146837055683, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2873884066939354, + "rewards/tag_count_reward": 0.3984375149011612, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 2016.8907165527344, + "completion_length": 1803.5402526855469, "epoch": 0.12844447763423195, - "grad_norm": 0.08738546818494797, - "kl": 0.0005898475646972656, - "learning_rate": 9.975474285760728e-08, - "loss": 0.0185, - "reward": 0.329799123108387, - "reward_std": 0.06958190468139946, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 2.9815316200256348, + "kl": 0.50830078125, + "learning_rate": 4.987737142880363e-07, + "loss": 0.0928, + "reward": 0.4575892984867096, + "reward_std": 0.21182601898908615, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2717634066939354, + "rewards/tag_count_reward": 0.3839285895228386, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 2004.0982971191406, + "completion_length": 1661.7545471191406, "epoch": 0.12874318572175342, - "grad_norm": 0.10727723687887192, - "kl": 0.0007190704345703125, - "learning_rate": 9.974955671141423e-08, - "loss": 0.0261, - "reward": 0.4017857387661934, - "reward_std": 0.07896770164370537, - "rewards/accuracy_reward": 0.12276786053553224, + "grad_norm": 11.794843673706055, + "kl": 0.41552734375, + "learning_rate": 4.987477835570711e-07, + "loss": 0.1018, + "reward": 0.5452009215950966, + "reward_std": 0.19809302687644958, + "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2790178656578064, + "rewards/tag_count_reward": 0.4023437649011612, "step": 431 }, { "clip_ratio": 0.0, - "completion_length": 2029.3795776367188, + "completion_length": 1773.05810546875, "epoch": 0.1290418938092749, - "grad_norm": 0.07993348687887192, - "kl": 0.0005331039428710938, - "learning_rate": 9.974431644257946e-08, - "loss": 0.0116, - "reward": 0.3404017984867096, - "reward_std": 0.04246059129945934, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 4.0646514892578125, + "kl": 0.47607421875, + "learning_rate": 4.987215822128973e-07, + "loss": 0.089, + "reward": 0.4849330633878708, + "reward_std": 0.19286515936255455, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622768059372902, + "rewards/tag_count_reward": 0.384486623108387, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 1998.7746276855469, + "completion_length": 1694.8148193359375, "epoch": 0.12934060189679636, - "grad_norm": 0.10198958963155746, - "kl": 0.0007638931274414062, - "learning_rate": 9.973902205680387e-08, - "loss": 0.0216, - "reward": 0.2818080559372902, - "reward_std": 0.04380830586887896, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 8.928903579711914, + "kl": 0.3759765625, + "learning_rate": 4.986951102840193e-07, + "loss": 0.0979, + "reward": 0.4235491380095482, + "reward_std": 0.16311455145478249, + "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.279575914144516, + "rewards/tag_count_reward": 0.4079241305589676, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 2016.7924499511719, + "completion_length": 1747.33935546875, "epoch": 0.12963930998431783, - "grad_norm": 0.09664202481508255, - "kl": 0.0006322860717773438, - "learning_rate": 9.973367355984724e-08, - "loss": 0.0158, - "reward": 0.299107164144516, - "reward_std": 0.06385443662293255, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 5.853902816772461, + "kl": 0.63525390625, + "learning_rate": 4.986683677992362e-07, + "loss": 0.0814, + "reward": 0.4525669887661934, + "reward_std": 0.24432093277573586, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.274553582072258, + "rewards/tag_count_reward": 0.3789062723517418, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 2018.2054748535156, + "completion_length": 1778.9063110351562, "epoch": 0.1299380180718393, - "grad_norm": 0.10235672444105148, - "kl": 0.0005998611450195312, - "learning_rate": 9.972827095752819e-08, - "loss": 0.0137, - "reward": 0.412946455180645, - "reward_std": 0.08672920241951942, - "rewards/accuracy_reward": 0.14732143376022577, + "grad_norm": 3.6149532794952393, + "kl": 0.56103515625, + "learning_rate": 4.986413547876409e-07, + "loss": 0.081, + "reward": 0.5602678805589676, + "reward_std": 0.2076813280582428, + "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250074505806, + "rewards/tag_count_reward": 0.3772321566939354, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 2046.1607360839844, + "completion_length": 1816.8304443359375, "epoch": 0.13023672615936077, - "grad_norm": 0.045758843421936035, - "kl": 0.0004029273986816406, - "learning_rate": 9.972281425572422e-08, - "loss": 0.0019, - "reward": 0.2533482238650322, - "reward_std": 0.008881053188815713, - "rewards/accuracy_reward": 0.0, + "grad_norm": 7.4070048332214355, + "kl": 0.705078125, + "learning_rate": 4.986140712786211e-07, + "loss": 0.0973, + "reward": 0.3945312723517418, + "reward_std": 0.21280937269330025, + "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2533482238650322, + "rewards/tag_count_reward": 0.361049123108387, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 1994.0781860351562, + "completion_length": 1725.3148193359375, "epoch": 0.13053543424688224, - "grad_norm": 0.1088603213429451, - "kl": 0.0008096694946289062, - "learning_rate": 9.97173034603717e-08, - "loss": 0.0116, - "reward": 0.2840401902794838, - "reward_std": 0.057210883125662804, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 4.981678485870361, + "kl": 0.58935546875, + "learning_rate": 4.985865173018585e-07, + "loss": 0.1003, + "reward": 0.385044664144516, + "reward_std": 0.1809232011437416, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.3716518059372902, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 2033.0380249023438, + "completion_length": 1792.0625915527344, "epoch": 0.1308341423344037, - "grad_norm": 0.09655492007732391, - "kl": 0.0005941390991210938, - "learning_rate": 9.971173857746584e-08, - "loss": 0.011, - "reward": 0.3833705559372902, - "reward_std": 0.05610431730747223, - "rewards/accuracy_reward": 0.11383929080329835, + "grad_norm": 5.515299320220947, + "kl": 0.64111328125, + "learning_rate": 4.985586928873292e-07, + "loss": 0.0907, + "reward": 0.4916294887661934, + "reward_std": 0.20357535406947136, + "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.3532366305589676, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 2025.1920166015625, + "completion_length": 1792.5179138183594, "epoch": 0.13113285042192518, - "grad_norm": 0.08139055222272873, - "kl": 0.0005731582641601562, - "learning_rate": 9.97061196130607e-08, - "loss": 0.009, - "reward": 0.321428582072258, - "reward_std": 0.06090939324349165, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 5.567059516906738, + "kl": 0.52294921875, + "learning_rate": 4.985305980653035e-07, + "loss": 0.0777, + "reward": 0.4430803880095482, + "reward_std": 0.2120881974697113, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2656250149011612, + "rewards/tag_count_reward": 0.369419664144516, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 1980.4577026367188, + "completion_length": 1654.5313110351562, "epoch": 0.13143155850944666, - "grad_norm": 0.12449940294027328, - "kl": 0.0009546279907226562, - "learning_rate": 9.970044657326912e-08, - "loss": 0.0258, - "reward": 0.384486623108387, - "reward_std": 0.039486483205109835, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 9.795083999633789, + "kl": 0.51708984375, + "learning_rate": 4.985022328663456e-07, + "loss": 0.0993, + "reward": 0.5228794813156128, + "reward_std": 0.20075957477092743, + "rewards/accuracy_reward": 0.1517857222352177, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.275111623108387, + "rewards/tag_count_reward": 0.3710937649011612, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 2009.6183471679688, + "completion_length": 1753.2255554199219, "epoch": 0.13173026659696813, - "grad_norm": 0.1023937463760376, - "kl": 0.0007214546203613281, - "learning_rate": 9.969471946426285e-08, - "loss": 0.0198, - "reward": 0.3085937574505806, - "reward_std": 0.04870673501864076, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 4.738426208496094, + "kl": 0.6240234375, + "learning_rate": 4.984735973213142e-07, + "loss": 0.1075, + "reward": 0.3978794813156128, + "reward_std": 0.21313538402318954, + "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830484867096, + "rewards/tag_count_reward": 0.3420759066939354, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 2046.9017944335938, + "completion_length": 1736.6562805175781, "epoch": 0.13202897468448957, - "grad_norm": 0.0649019256234169, - "kl": 0.0004949569702148438, - "learning_rate": 9.968893829227241e-08, - "loss": 0.002, - "reward": 0.325892873108387, - "reward_std": 0.015027947491034865, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 8.45821475982666, + "kl": 0.6142578125, + "learning_rate": 4.98444691461362e-07, + "loss": 0.0975, + "reward": 0.4408482387661934, + "reward_std": 0.164394099265337, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2544642984867096, + "rewards/tag_count_reward": 0.364955373108387, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 1993.2098999023438, + "completion_length": 1688.6786499023438, "epoch": 0.13232768277201104, - "grad_norm": 0.0911622866988182, - "kl": 0.0008769035339355469, - "learning_rate": 9.968310306358714e-08, - "loss": 0.0177, - "reward": 0.3437500223517418, - "reward_std": 0.026251389179378748, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 15.27545166015625, + "kl": 0.7021484375, + "learning_rate": 4.984155153179356e-07, + "loss": 0.1102, + "reward": 0.4319196715950966, + "reward_std": 0.17640457674860954, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214477300644, + "rewards/tag_count_reward": 0.3515625149011612, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 2035.868408203125, + "completion_length": 1777.5358276367188, "epoch": 0.1326263908595325, - "grad_norm": 0.08807260543107986, - "kl": 0.0005793571472167969, - "learning_rate": 9.96772137845552e-08, - "loss": 0.0076, - "reward": 0.3069196566939354, - "reward_std": 0.04468150855973363, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 4.756255626678467, + "kl": 0.7490234375, + "learning_rate": 4.98386068922776e-07, + "loss": 0.1202, + "reward": 0.3962053805589676, + "reward_std": 0.22278672456741333, + "rewards/accuracy_reward": 0.06026785844005644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.3359375149011612, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 1966.8728637695312, + "completion_length": 1681.3973693847656, "epoch": 0.13292509894705398, - "grad_norm": 0.10563986003398895, - "kl": 0.0010766983032226562, - "learning_rate": 9.967127046158357e-08, - "loss": 0.0258, - "reward": 0.4196428805589676, - "reward_std": 0.06975942570716143, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 4.871476650238037, + "kl": 0.54296875, + "learning_rate": 4.983563523079179e-07, + "loss": 0.0828, + "reward": 0.5066964477300644, + "reward_std": 0.23435239493846893, + "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2857142984867096, + "rewards/tag_count_reward": 0.3549107238650322, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 2028.6541137695312, + "completion_length": 1715.7322082519531, "epoch": 0.13322380703457545, - "grad_norm": 0.08977127075195312, - "kl": 0.0006647109985351562, - "learning_rate": 9.966527310113797e-08, - "loss": 0.0179, - "reward": 0.345424123108387, - "reward_std": 0.04856186546385288, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 7.476255893707275, + "kl": 0.6259765625, + "learning_rate": 4.983263655056899e-07, + "loss": 0.131, + "reward": 0.4542410895228386, + "reward_std": 0.1906590349972248, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.3694196566939354, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 2011.3818054199219, + "completion_length": 1725.649658203125, "epoch": 0.13352251512209692, - "grad_norm": 0.11246927082538605, - "kl": 0.0008296966552734375, - "learning_rate": 9.965922170974298e-08, - "loss": 0.0269, - "reward": 0.4302455559372902, - "reward_std": 0.05755465663969517, - "rewards/accuracy_reward": 0.1540178656578064, + "grad_norm": 6.511068820953369, + "kl": 0.63720703125, + "learning_rate": 4.982961085487149e-07, + "loss": 0.115, + "reward": 0.5312500298023224, + "reward_std": 0.22237225249409676, + "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.3526785895228386, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 2040.6473693847656, + "completion_length": 1748.44873046875, "epoch": 0.1338212232096184, - "grad_norm": 0.067986398935318, - "kl": 0.0005216598510742188, - "learning_rate": 9.965311629398185e-08, - "loss": 0.0077, - "reward": 0.255580373108387, - "reward_std": 0.015971238259226084, - "rewards/accuracy_reward": 0.0, + "grad_norm": 3.9968206882476807, + "kl": 0.642578125, + "learning_rate": 4.982655814699092e-07, + "loss": 0.1125, + "reward": 0.3683035895228386, + "reward_std": 0.21424997225403786, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.255580373108387, + "rewards/tag_count_reward": 0.3392857313156128, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 2031.6206665039062, + "completion_length": 1733.4643859863281, "epoch": 0.13411993129713987, - "grad_norm": 0.09235338866710663, - "kl": 0.00064849853515625, - "learning_rate": 9.964695686049675e-08, - "loss": 0.0123, - "reward": 0.3593750223517418, - "reward_std": 0.06194194406270981, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 4.968216896057129, + "kl": 0.5908203125, + "learning_rate": 4.982347843024837e-07, + "loss": 0.0865, + "reward": 0.447544664144516, + "reward_std": 0.2222447618842125, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.3448660895228386, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 2029.9286499023438, + "completion_length": 1665.49560546875, "epoch": 0.13441863938466134, - "grad_norm": 0.09111592918634415, - "kl": 0.0007061958312988281, - "learning_rate": 9.964074341598851e-08, - "loss": 0.0093, - "reward": 0.342633955180645, - "reward_std": 0.0696880214381963, - "rewards/accuracy_reward": 0.07589285750873387, + "grad_norm": 13.677940368652344, + "kl": 0.65478515625, + "learning_rate": 4.982037170799425e-07, + "loss": 0.1433, + "reward": 0.4453125298023224, + "reward_std": 0.2230478599667549, + "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2667410895228386, + "rewards/tag_count_reward": 0.3582589477300644, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 2036.1429138183594, + "completion_length": 1713.0982971191406, "epoch": 0.1347173474721828, - "grad_norm": 0.09050633758306503, - "kl": 0.0006337165832519531, - "learning_rate": 9.963447596721675e-08, - "loss": 0.0119, - "reward": 0.3052455484867096, - "reward_std": 0.04493393772281706, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 8.526785850524902, + "kl": 0.60205078125, + "learning_rate": 4.981723798360837e-07, + "loss": 0.1122, + "reward": 0.443080373108387, + "reward_std": 0.22774287313222885, + "rewards/accuracy_reward": 0.07812500325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.3649553805589676, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 2007.509033203125, + "completion_length": 1652.6563110351562, "epoch": 0.13501605555970428, - "grad_norm": 0.10028314590454102, - "kl": 0.0008487701416015625, - "learning_rate": 9.962815452099984e-08, - "loss": 0.0167, - "reward": 0.282924123108387, - "reward_std": 0.06159662688151002, - "rewards/accuracy_reward": 0.0133928582072258, + "grad_norm": 10.148331642150879, + "kl": 0.7939453125, + "learning_rate": 4.981407726049992e-07, + "loss": 0.1196, + "reward": 0.3761160895228386, + "reward_std": 0.20988183096051216, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.3470982313156128, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 1991.0804443359375, + "completion_length": 1689.5603637695312, "epoch": 0.13531476364722575, - "grad_norm": 0.1116897314786911, - "kl": 0.001041412353515625, - "learning_rate": 9.96217790842149e-08, - "loss": 0.0256, - "reward": 0.3777901902794838, - "reward_std": 0.05948487529531121, - "rewards/accuracy_reward": 0.10491072130389512, + "grad_norm": 17.340486526489258, + "kl": 0.966796875, + "learning_rate": 4.981088954210745e-07, + "loss": 0.1205, + "reward": 0.4380580484867096, + "reward_std": 0.21062801778316498, + "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.3309151977300644, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 2022.4599304199219, + "completion_length": 1721.3058776855469, "epoch": 0.13561347173474722, - "grad_norm": 0.10869956016540527, - "kl": 0.0007953643798828125, - "learning_rate": 9.961534966379776e-08, - "loss": 0.0193, - "reward": 0.3292410895228386, - "reward_std": 0.07412611343897879, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 5.392794132232666, + "kl": 0.814453125, + "learning_rate": 4.980767483189888e-07, + "loss": 0.1183, + "reward": 0.4425223395228386, + "reward_std": 0.25322138145565987, + "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.3353794738650322, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 2014.0759887695312, + "completion_length": 1699.13623046875, "epoch": 0.1359121798222687, - "grad_norm": 0.10937536507844925, - "kl": 0.00089263916015625, - "learning_rate": 9.9608866266743e-08, - "loss": 0.0183, - "reward": 0.4603794813156128, - "reward_std": 0.08188504865393043, - "rewards/accuracy_reward": 0.1830357201397419, + "grad_norm": 3.3406989574432373, + "kl": 0.609375, + "learning_rate": 4.98044331333715e-07, + "loss": 0.1061, + "reward": 0.5139509215950966, + "reward_std": 0.22931575030088425, + "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437574505806, + "rewards/tag_count_reward": 0.3197544738650322, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 2003.27685546875, + "completion_length": 1695.7523193359375, "epoch": 0.13621088790979016, - "grad_norm": 0.10883515328168869, - "kl": 0.00093841552734375, - "learning_rate": 9.960232890010396e-08, - "loss": 0.0189, - "reward": 0.318080373108387, - "reward_std": 0.08473952207714319, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 11.295357704162598, + "kl": 0.7421875, + "learning_rate": 4.980116445005198e-07, + "loss": 0.101, + "reward": 0.4095982387661934, + "reward_std": 0.2645513229072094, + "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.3292410895228386, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 2021.4889221191406, + "completion_length": 1688.6473999023438, "epoch": 0.13650959599731163, - "grad_norm": 0.10494883358478546, - "kl": 0.0008754730224609375, - "learning_rate": 9.959573757099263e-08, - "loss": 0.0154, - "reward": 0.3186384066939354, - "reward_std": 0.10167613998055458, - "rewards/accuracy_reward": 0.04464286006987095, + "grad_norm": 10.917984008789062, + "kl": 0.5927734375, + "learning_rate": 4.979786878549631e-07, + "loss": 0.117, + "reward": 0.415736623108387, + "reward_std": 0.26218926906585693, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.273995541036129, + "rewards/tag_count_reward": 0.3465401902794838, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 2008.6987609863281, + "completion_length": 1659.9978332519531, "epoch": 0.1368083040848331, - "grad_norm": 0.10564270615577698, - "kl": 0.0009031295776367188, - "learning_rate": 9.95890922865797e-08, - "loss": 0.0221, - "reward": 0.4017857313156128, - "reward_std": 0.07299068197607994, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 4.539723873138428, + "kl": 0.6806640625, + "learning_rate": 4.979454614328985e-07, + "loss": 0.1273, + "reward": 0.489955373108387, + "reward_std": 0.24744581058621407, + "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.3225446566939354, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 2030.0402221679688, + "completion_length": 1744.3438415527344, "epoch": 0.13710701217235458, - "grad_norm": 0.09711883217096329, - "kl": 0.0007486343383789062, - "learning_rate": 9.958239305409463e-08, - "loss": 0.0064, - "reward": 0.270647332072258, - "reward_std": 0.03938987897709012, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 13.337088584899902, + "kl": 0.994140625, + "learning_rate": 4.979119652704731e-07, + "loss": 0.128, + "reward": 0.3320312649011612, + "reward_std": 0.20739859715104103, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151828289032, + "rewards/tag_count_reward": 0.325334832072258, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 1973.1496276855469, + "completion_length": 1585.3639221191406, "epoch": 0.13740572025987605, - "grad_norm": 0.13835348188877106, - "kl": 0.0013666152954101562, - "learning_rate": 9.957563988082552e-08, - "loss": 0.0284, - "reward": 0.4034598469734192, - "reward_std": 0.09908473864197731, - "rewards/accuracy_reward": 0.11607143236324191, + "grad_norm": 13.52846622467041, + "kl": 0.8251953125, + "learning_rate": 4.978781994041276e-07, + "loss": 0.1287, + "reward": 0.4648437723517418, + "reward_std": 0.22151242196559906, + "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2873883992433548, + "rewards/tag_count_reward": 0.3510044887661934, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 1994.1764221191406, + "completion_length": 1607.2835693359375, "epoch": 0.13770442834739752, - "grad_norm": 0.11239714175462723, - "kl": 0.0011730194091796875, - "learning_rate": 9.956883277411914e-08, - "loss": 0.0283, - "reward": 0.5066964477300644, - "reward_std": 0.08024896681308746, - "rewards/accuracy_reward": 0.2276785746216774, + "grad_norm": 8.296908378601074, + "kl": 0.927734375, + "learning_rate": 4.978441638705957e-07, + "loss": 0.1619, + "reward": 0.558035746216774, + "reward_std": 0.2525344230234623, + "rewards/accuracy_reward": 0.2410714398138225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2790178656578064, + "rewards/tag_count_reward": 0.3169642984867096, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 1987.6563110351562, + "completion_length": 1613.9799499511719, "epoch": 0.138003136434919, - "grad_norm": 0.130323126912117, - "kl": 0.0012769699096679688, - "learning_rate": 9.956197174138098e-08, - "loss": 0.0257, - "reward": 0.3537946566939354, - "reward_std": 0.05281845876015723, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 5.828124523162842, + "kl": 0.7685546875, + "learning_rate": 4.978098587069049e-07, + "loss": 0.1357, + "reward": 0.4319196715950966, + "reward_std": 0.19434111565351486, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2779017984867096, + "rewards/tag_count_reward": 0.3470982238650322, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 2034.1741943359375, + "completion_length": 1638.2634582519531, "epoch": 0.13830184452244043, - "grad_norm": 0.1011432334780693, - "kl": 0.0008211135864257812, - "learning_rate": 9.955505679007515e-08, - "loss": 0.0094, - "reward": 0.3554687798023224, - "reward_std": 0.05928331008180976, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 10.005996704101562, + "kl": 0.5703125, + "learning_rate": 4.977752839503758e-07, + "loss": 0.1392, + "reward": 0.4235491380095482, + "reward_std": 0.21815768629312515, + "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2661830559372902, + "rewards/tag_count_reward": 0.318638414144516, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 1960.9620971679688, + "completion_length": 1568.3549499511719, "epoch": 0.1386005526099619, - "grad_norm": 0.1249510645866394, - "kl": 0.0015916824340820312, - "learning_rate": 9.954808792772447e-08, - "loss": 0.0278, - "reward": 0.395647332072258, - "reward_std": 0.1102598849684, - "rewards/accuracy_reward": 0.10937500558793545, + "grad_norm": 11.636214256286621, + "kl": 0.4970703125, + "learning_rate": 4.977404396386224e-07, + "loss": 0.1128, + "reward": 0.4564732387661934, + "reward_std": 0.2784469500184059, + "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286272332072258, + "rewards/tag_count_reward": 0.3270089402794838, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 2041.5603332519531, + "completion_length": 1568.1652526855469, "epoch": 0.13889926069748337, - "grad_norm": 0.12845703959465027, - "kl": 0.0007486343383789062, - "learning_rate": 9.954106516191034e-08, - "loss": 0.0076, - "reward": 0.3956473469734192, - "reward_std": 0.09426523465663195, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 6.901561737060547, + "kl": 0.517578125, + "learning_rate": 4.977053258095517e-07, + "loss": 0.1258, + "reward": 0.498325914144516, + "reward_std": 0.254276342689991, + "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2639508992433548, + "rewards/tag_count_reward": 0.317522332072258, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 2033.7411804199219, + "completion_length": 1642.2523193359375, "epoch": 0.13919796878500484, - "grad_norm": 0.10542953759431839, - "kl": 0.0007925033569335938, - "learning_rate": 9.953398850027287e-08, - "loss": 0.013, - "reward": 0.3431919887661934, - "reward_std": 0.05664242245256901, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 66.4725341796875, + "kl": 1.4423828125, + "learning_rate": 4.976699425013643e-07, + "loss": 0.1595, + "reward": 0.3939732387661934, + "reward_std": 0.2176070772111416, + "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2650669813156128, + "rewards/tag_count_reward": 0.3046875149011612, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 2008.0023193359375, + "completion_length": 1612.5692749023438, "epoch": 0.13949667687252632, - "grad_norm": 0.12088914960622787, - "kl": 0.00112152099609375, - "learning_rate": 9.952685795051077e-08, - "loss": 0.0208, - "reward": 0.318080373108387, - "reward_std": 0.07262597838416696, - "rewards/accuracy_reward": 0.046875003492459655, + "grad_norm": 11.518576622009277, + "kl": 0.53369140625, + "learning_rate": 4.976342897525538e-07, + "loss": 0.0957, + "reward": 0.3727678805589676, + "reward_std": 0.21623540669679642, + "rewards/accuracy_reward": 0.055803572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.271205373108387, + "rewards/tag_count_reward": 0.3169643059372902, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 2038.3125915527344, + "completion_length": 1647.4375915527344, "epoch": 0.13979538496004779, - "grad_norm": 0.10402955114841461, - "kl": 0.00070953369140625, - "learning_rate": 9.951967352038135e-08, - "loss": 0.0089, - "reward": 0.2979910895228386, - "reward_std": 0.03109037782996893, - "rewards/accuracy_reward": 0.0357142873108387, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2622767984867096, + "grad_norm": 38.45896911621094, + "kl": 1.1640625, + "learning_rate": 4.975983676019068e-07, + "loss": 0.1231, + "reward": 0.3498884066939354, + "reward_std": 0.22987286746501923, + "rewards/accuracy_reward": 0.05133928847499192, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2963169738650322, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 1982.90185546875, + "completion_length": 1556.4353332519531, "epoch": 0.14009409304756926, - "grad_norm": 0.13000476360321045, - "kl": 0.0015072822570800781, - "learning_rate": 9.951243521770061e-08, - "loss": 0.0238, - "reward": 0.2952008992433548, - "reward_std": 0.0706777386367321, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 24.09501838684082, + "kl": 1.166015625, + "learning_rate": 4.97562176088503e-07, + "loss": 0.1488, + "reward": 0.3281250149011612, + "reward_std": 0.22574611008167267, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2795758992433548, + "rewards/tag_count_reward": 0.2991071566939354, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 2007.700927734375, + "completion_length": 1565.5380249023438, "epoch": 0.14039280113509073, - "grad_norm": 0.10881378501653671, - "kl": 0.0011463165283203125, - "learning_rate": 9.950514305034309e-08, - "loss": 0.0213, - "reward": 0.349888414144516, - "reward_std": 0.07002316461876035, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 21.94568634033203, + "kl": 0.5517578125, + "learning_rate": 4.975257152517154e-07, + "loss": 0.1194, + "reward": 0.3839285969734192, + "reward_std": 0.23307275772094727, + "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.321428582072258, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 1992.0893859863281, + "completion_length": 1493.5625610351562, "epoch": 0.1406915092226122, - "grad_norm": 0.16518795490264893, - "kl": 0.0014886856079101562, - "learning_rate": 9.949779702624194e-08, - "loss": 0.0293, - "reward": 0.321986623108387, - "reward_std": 0.06505566649138927, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 12.730708122253418, + "kl": 1.01953125, + "learning_rate": 4.974889851312097e-07, + "loss": 0.1514, + "reward": 0.4006696566939354, + "reward_std": 0.24730517342686653, + "rewards/accuracy_reward": 0.08258929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437574505806, + "rewards/tag_count_reward": 0.318080373108387, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 2002.0357666015625, + "completion_length": 1497.8973693847656, "epoch": 0.14099021731013367, - "grad_norm": 0.11497579514980316, - "kl": 0.00133514404296875, - "learning_rate": 9.949039715338896e-08, - "loss": 0.0211, - "reward": 0.440848246216774, - "reward_std": 0.10331951454281807, - "rewards/accuracy_reward": 0.16517857927829027, + "grad_norm": 17.26862144470215, + "kl": 0.54248046875, + "learning_rate": 4.974519857669448e-07, + "loss": 0.1055, + "reward": 0.487165205180645, + "reward_std": 0.24169136211276054, + "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2756696492433548, + "rewards/tag_count_reward": 0.3041294738650322, "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 2016.1272888183594, + "completion_length": 1490.7210693359375, "epoch": 0.14128892539765514, - "grad_norm": 0.13661468029022217, - "kl": 0.0012063980102539062, - "learning_rate": 9.948294343983444e-08, - "loss": 0.0237, - "reward": 0.2929687649011612, - "reward_std": 0.06719244224950671, - "rewards/accuracy_reward": 0.022321429569274187, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473395228386, + "grad_norm": 5.557093620300293, + "kl": 0.8603515625, + "learning_rate": 4.974147171991722e-07, + "loss": 0.1112, + "reward": 0.3515625074505806, + "reward_std": 0.22987275943160057, + "rewards/accuracy_reward": 0.037946430034935474, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3113839402794838, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 2021.0781860351562, + "completion_length": 1503.5625610351562, "epoch": 0.1415876334851766, - "grad_norm": 0.13647322356700897, - "kl": 0.0011882781982421875, - "learning_rate": 9.947543589368732e-08, - "loss": 0.0155, - "reward": 0.3398437649011612, - "reward_std": 0.031427118461579084, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 25.97781753540039, + "kl": 1.28125, + "learning_rate": 4.973771794684367e-07, + "loss": 0.1219, + "reward": 0.3616071566939354, + "reward_std": 0.21219830960035324, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2684151977300644, + "rewards/tag_count_reward": 0.274553582072258, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 2016.9666137695312, + "completion_length": 1544.5871276855469, "epoch": 0.14188634157269808, - "grad_norm": 0.11926978081464767, - "kl": 0.0012521743774414062, - "learning_rate": 9.946787452311505e-08, - "loss": 0.0204, - "reward": 0.3325892984867096, - "reward_std": 0.10725545324385166, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 17.041881561279297, + "kl": 0.998046875, + "learning_rate": 4.973393726155752e-07, + "loss": 0.1317, + "reward": 0.3487723395228386, + "reward_std": 0.2655651904642582, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2767857238650322, + "rewards/tag_count_reward": 0.301897332072258, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 1992.4688415527344, + "completion_length": 1534.8348999023438, "epoch": 0.14218504966021955, - "grad_norm": 0.09288622438907623, - "kl": 0.0014619827270507812, - "learning_rate": 9.946025933634367e-08, - "loss": 0.0208, - "reward": 0.3013393059372902, - "reward_std": 0.03121967939659953, - "rewards/accuracy_reward": 0.03125, + "grad_norm": 20.357866287231445, + "kl": 1.12109375, + "learning_rate": 4.973012966817184e-07, + "loss": 0.1337, + "reward": 0.329241082072258, + "reward_std": 0.21531317383050919, + "rewards/accuracy_reward": 0.042410717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.2868303656578064, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 2006.01123046875, + "completion_length": 1507.9486999511719, "epoch": 0.14248375774774102, - "grad_norm": 0.12001644819974899, - "kl": 0.0014095306396484375, - "learning_rate": 9.945259034165776e-08, - "loss": 0.0247, - "reward": 0.3906250149011612, - "reward_std": 0.056298661045730114, - "rewards/accuracy_reward": 0.11383928963914514, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2767857238650322, + "grad_norm": 8.733418464660645, + "kl": 0.51806640625, + "learning_rate": 4.972629517082888e-07, + "loss": 0.0852, + "reward": 0.4693080633878708, + "reward_std": 0.24232514947652817, + "rewards/accuracy_reward": 0.14955357927829027, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.317522332072258, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 2010.46435546875, + "completion_length": 1479.0469360351562, "epoch": 0.1427824658352625, - "grad_norm": 0.11263782531023026, - "kl": 0.0014448165893554688, - "learning_rate": 9.944486754740044e-08, - "loss": 0.0206, - "reward": 0.3867187798023224, - "reward_std": 0.09748081141151488, - "rewards/accuracy_reward": 0.10937500558793545, + "grad_norm": 80.32256317138672, + "kl": 1.44921875, + "learning_rate": 4.972243377370022e-07, + "loss": 0.1608, + "reward": 0.407924123108387, + "reward_std": 0.2274959348142147, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437574505806, + "rewards/tag_count_reward": 0.3119419738650322, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 1964.2634582519531, + "completion_length": 1485.1986999511719, "epoch": 0.14308117392278397, - "grad_norm": 0.15227386355400085, - "kl": 0.002197265625, - "learning_rate": 9.943709096197333e-08, - "loss": 0.0399, - "reward": 0.2896205484867096, - "reward_std": 0.05594425182789564, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 14.516999244689941, + "kl": 1.1171875, + "learning_rate": 4.971854548098667e-07, + "loss": 0.1287, + "reward": 0.3030134066939354, + "reward_std": 0.19861429184675217, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2873884066939354, + "rewards/tag_count_reward": 0.2940848246216774, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 1984.3906860351562, + "completion_length": 1449.9375915527344, "epoch": 0.14337988201030544, - "grad_norm": 0.13435454666614532, - "kl": 0.0018062591552734375, - "learning_rate": 9.942926059383663e-08, - "loss": 0.0265, - "reward": 0.3733259066939354, - "reward_std": 0.04726517084054649, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 25.714197158813477, + "kl": 1.138671875, + "learning_rate": 4.971463029691832e-07, + "loss": 0.149, + "reward": 0.4045759066939354, + "reward_std": 0.22242727130651474, + "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794813156128, + "rewards/tag_count_reward": 0.2996651902794838, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 2023.5982971191406, + "completion_length": 1539.18310546875, "epoch": 0.1436785900978269, - "grad_norm": 0.11547597497701645, - "kl": 0.0011835098266601562, - "learning_rate": 9.942137645150898e-08, - "loss": 0.0199, - "reward": 0.348772332072258, - "reward_std": 0.05556344147771597, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 56.93959426879883, + "kl": 0.9384765625, + "learning_rate": 4.97106882257545e-07, + "loss": 0.1047, + "reward": 0.3939732313156128, + "reward_std": 0.21351967379450798, + "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473246216774, + "rewards/tag_count_reward": 0.3069196566939354, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 1933.3371276855469, + "completion_length": 1378.513427734375, "epoch": 0.14397729818534838, - "grad_norm": 0.11384446173906326, - "kl": 0.0028543472290039062, - "learning_rate": 9.941343854356756e-08, - "loss": 0.0264, - "reward": 0.4503348469734192, - "reward_std": 0.04934239503927529, - "rewards/accuracy_reward": 0.1651785783469677, + "grad_norm": 14.244688034057617, + "kl": 1.12109375, + "learning_rate": 4.970671927178378e-07, + "loss": 0.1364, + "reward": 0.4648437574505806, + "reward_std": 0.22593215480446815, + "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "rewards/tag_count_reward": 0.297433041036129, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 2014.4554138183594, + "completion_length": 1489.7545166015625, "epoch": 0.14427600627286985, - "grad_norm": 0.10993655771017075, - "kl": 0.0014295578002929688, - "learning_rate": 9.940544687864805e-08, - "loss": 0.0186, - "reward": 0.317522332072258, - "reward_std": 0.057752539636567235, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 17.129865646362305, + "kl": 1.19921875, + "learning_rate": 4.970272343932403e-07, + "loss": 0.1345, + "reward": 0.3431919813156128, + "reward_std": 0.21285830065608025, + "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794738650322, + "rewards/tag_count_reward": 0.2851562649011612, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 1966.7991943359375, + "completion_length": 1426.2032165527344, "epoch": 0.14457471436039132, - "grad_norm": 0.1139247938990593, - "kl": 0.0024251937866210938, - "learning_rate": 9.93974014654446e-08, - "loss": 0.0184, - "reward": 0.4268973395228386, - "reward_std": 0.10391270462423563, - "rewards/accuracy_reward": 0.13616072130389512, + "grad_norm": 23.218088150024414, + "kl": 0.94140625, + "learning_rate": 4.96987007327223e-07, + "loss": 0.1078, + "reward": 0.440290205180645, + "reward_std": 0.2618703097105026, + "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2907366156578064, + "rewards/tag_count_reward": 0.2952008992433548, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 2006.0625305175781, + "completion_length": 1483.5067443847656, "epoch": 0.14487342244791276, - "grad_norm": 0.10335541516542435, - "kl": 0.001544952392578125, - "learning_rate": 9.938930231270981e-08, - "loss": 0.0233, - "reward": 0.3097098395228386, - "reward_std": 0.04779530595988035, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 39.45608901977539, + "kl": 1.1689453125, + "learning_rate": 4.96946511563549e-07, + "loss": 0.0991, + "reward": 0.368861623108387, + "reward_std": 0.2229001373052597, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.2996651902794838, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 2040.71435546875, + "completion_length": 1526.5335693359375, "epoch": 0.14517213053543424, - "grad_norm": 0.08110660314559937, - "kl": 0.0009174346923828125, - "learning_rate": 9.938114942925477e-08, - "loss": 0.0087, - "reward": 0.2684151902794838, - "reward_std": 0.03587323520332575, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 47.41693115234375, + "kl": 1.34375, + "learning_rate": 4.969057471462739e-07, + "loss": 0.1216, + "reward": 0.286830373108387, + "reward_std": 0.20712244510650635, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2617187649011612, + "rewards/tag_count_reward": 0.2756696566939354, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 2028.0558776855469, + "completion_length": 1455.1317749023438, "epoch": 0.1454708386229557, - "grad_norm": 0.09722445160150528, - "kl": 0.001331329345703125, - "learning_rate": 9.937294282394904e-08, - "loss": 0.0177, - "reward": 0.3113839402794838, - "reward_std": 0.053774913772940636, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 22.495946884155273, + "kl": 1.2294921875, + "learning_rate": 4.968647141197452e-07, + "loss": 0.16, + "reward": 0.3275669738650322, + "reward_std": 0.19334674626588821, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.2806919738650322, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 1973.4152526855469, + "completion_length": 1488.3616943359375, "epoch": 0.14576954671047718, - "grad_norm": 0.14169614017009735, - "kl": 0.002410888671875, - "learning_rate": 9.936468250572059e-08, - "loss": 0.036, - "reward": 0.340401791036129, - "reward_std": 0.08181907795369625, - "rewards/accuracy_reward": 0.05357143026776612, + "grad_norm": 10.561921119689941, + "kl": 0.82421875, + "learning_rate": 4.968234125286029e-07, + "loss": 0.1156, + "reward": 0.3716517984867096, + "reward_std": 0.18082809075713158, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2868303656578064, + "rewards/tag_count_reward": 0.329241082072258, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 1992.1116943359375, + "completion_length": 1505.5224304199219, "epoch": 0.14606825479799865, - "grad_norm": 0.12222287058830261, - "kl": 0.0021533966064453125, - "learning_rate": 9.935636848355581e-08, - "loss": 0.0102, - "reward": 0.3325892984867096, - "reward_std": 0.10254515800625086, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 12.997093200683594, + "kl": 0.8125, + "learning_rate": 4.96781842417779e-07, + "loss": 0.0919, + "reward": 0.3677455484867096, + "reward_std": 0.22588007152080536, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2834821566939354, + "rewards/tag_count_reward": 0.298549123108387, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 1961.087158203125, + "completion_length": 1422.8973999023438, "epoch": 0.14636696288552012, - "grad_norm": 0.12215470522642136, - "kl": 0.002758026123046875, - "learning_rate": 9.934800076649958e-08, - "loss": 0.0303, - "reward": 0.2991071492433548, - "reward_std": 0.07340858597308397, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 29.225215911865234, + "kl": 0.5908203125, + "learning_rate": 4.967400038324979e-07, + "loss": 0.1163, + "reward": 0.3286830484867096, + "reward_std": 0.19791864231228828, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2879464328289032, + "rewards/tag_count_reward": 0.3152901902794838, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 2044.0000610351562, + "completion_length": 1534.2076416015625, "epoch": 0.1466656709730416, - "grad_norm": 0.08755538612604141, - "kl": 0.0009307861328125, - "learning_rate": 9.933957936365513e-08, - "loss": 0.0049, - "reward": 0.2996651902794838, - "reward_std": 0.03446268476545811, - "rewards/accuracy_reward": 0.04017857322469354, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2594866156578064, + "grad_norm": 20.25705337524414, + "kl": 0.7294921875, + "learning_rate": 4.966978968182757e-07, + "loss": 0.1147, + "reward": 0.3052455484867096, + "reward_std": 0.21112031117081642, + "rewards/accuracy_reward": 0.04910714505240321, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2539062686264515, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 1979.5693054199219, + "completion_length": 1450.1161499023438, "epoch": 0.14696437906056306, - "grad_norm": 0.11866500228643417, - "kl": 0.0024166107177734375, - "learning_rate": 9.933110428418414e-08, - "loss": 0.035, - "reward": 0.447544664144516, - "reward_std": 0.1235068216919899, - "rewards/accuracy_reward": 0.1607142947614193, + "grad_norm": 21.163368225097656, + "kl": 0.62109375, + "learning_rate": 4.966555214209207e-07, + "loss": 0.1054, + "reward": 0.4787946715950966, + "reward_std": 0.24121013283729553, + "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2868303582072258, + "rewards/tag_count_reward": 0.3002232275903225, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 2020.3103332519531, + "completion_length": 1477.8661193847656, "epoch": 0.14726308714808453, - "grad_norm": 0.09099187701940536, - "kl": 0.0014629364013671875, - "learning_rate": 9.932257553730666e-08, - "loss": 0.0195, - "reward": 0.349888414144516, - "reward_std": 0.056657119654119015, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 35.966796875, + "kl": 1.251953125, + "learning_rate": 4.966128776865333e-07, + "loss": 0.0999, + "reward": 0.3632812649011612, + "reward_std": 0.20130787789821625, + "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.267299123108387, + "rewards/tag_count_reward": 0.2762276828289032, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 1986.5380249023438, + "completion_length": 1399.7879943847656, "epoch": 0.147561795235606, - "grad_norm": 0.11789730936288834, - "kl": 0.0022249221801757812, - "learning_rate": 9.93139931323011e-08, - "loss": 0.0322, - "reward": 0.3883928656578064, - "reward_std": 0.05938239092938602, - "rewards/accuracy_reward": 0.1160714365541935, + "grad_norm": 28.23116111755371, + "kl": 0.736328125, + "learning_rate": 4.965699656615056e-07, + "loss": 0.1284, + "reward": 0.4363839477300644, + "reward_std": 0.22000691294670105, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.3069196566939354, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 1972.2456359863281, + "completion_length": 1441.05810546875, "epoch": 0.14786050332312747, - "grad_norm": 0.11901458352804184, - "kl": 0.002796173095703125, - "learning_rate": 9.930535707850432e-08, - "loss": 0.0151, - "reward": 0.3141741156578064, - "reward_std": 0.07408899907022715, - "rewards/accuracy_reward": 0.031250001629814506, + "grad_norm": 12.888809204101562, + "kl": 1.13671875, + "learning_rate": 4.965267853925216e-07, + "loss": 0.1038, + "reward": 0.3186384066939354, + "reward_std": 0.22457966953516006, + "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2829241156578064, + "rewards/tag_count_reward": 0.2851562574505806, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 2044.6228332519531, + "completion_length": 1453.5157165527344, "epoch": 0.14815921141064894, - "grad_norm": 0.08736791461706161, - "kl": 0.00101470947265625, - "learning_rate": 9.929666738531149e-08, - "loss": 0.0038, - "reward": 0.2963169813156128, - "reward_std": 0.025433843955397606, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 28.30711555480957, + "kl": 1.3046875, + "learning_rate": 4.964833369265575e-07, + "loss": 0.128, + "reward": 0.3331473395228386, + "reward_std": 0.20996088534593582, + "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2606026902794838, + "rewards/tag_count_reward": 0.2706473283469677, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 2008.0558471679688, + "completion_length": 1473.6942749023438, "epoch": 0.14845791949817042, - "grad_norm": 0.08000420778989792, - "kl": 0.0019245147705078125, - "learning_rate": 9.928792406217613e-08, - "loss": 0.0143, - "reward": 0.4436384066939354, - "reward_std": 0.052717901300638914, - "rewards/accuracy_reward": 0.1674107164144516, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276828289032, + "grad_norm": 15.71290111541748, + "kl": 1.07421875, + "learning_rate": 4.964396203108806e-07, + "loss": 0.1029, + "reward": 0.449776791036129, + "reward_std": 0.2157180830836296, + "rewards/accuracy_reward": 0.16517857555299997, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2823660783469677, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 2007.9107971191406, + "completion_length": 1457.9063110351562, "epoch": 0.1487566275856919, - "grad_norm": 0.10823286324739456, - "kl": 0.00196075439453125, - "learning_rate": 9.927912711861013e-08, - "loss": 0.0172, - "reward": 0.3152901902794838, - "reward_std": 0.11215580813586712, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 26.50455665588379, + "kl": 1.439453125, + "learning_rate": 4.963956355930506e-07, + "loss": 0.1227, + "reward": 0.3309151902794838, + "reward_std": 0.2338428609073162, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2795758992433548, + "rewards/tag_count_reward": 0.2929687574505806, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 1997.4777526855469, + "completion_length": 1450.2545471191406, "epoch": 0.14905533567321336, - "grad_norm": 0.13592801988124847, - "kl": 0.0021924972534179688, - "learning_rate": 9.92702765641837e-08, - "loss": 0.0263, - "reward": 0.3638393133878708, - "reward_std": 0.1138795530423522, - "rewards/accuracy_reward": 0.08035714412108064, + "grad_norm": 15.583983421325684, + "kl": 1.0234375, + "learning_rate": 4.963513828209184e-07, + "loss": 0.1333, + "reward": 0.3627232313156128, + "reward_std": 0.20912770181894302, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2834821566939354, + "rewards/tag_count_reward": 0.309151791036129, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 1981.5983276367188, + "completion_length": 1392.3393249511719, "epoch": 0.14935404376073483, - "grad_norm": 0.11549423635005951, - "kl": 0.0025272369384765625, - "learning_rate": 9.926137240852537e-08, - "loss": 0.0287, - "reward": 0.286830373108387, - "reward_std": 0.04992205323651433, - "rewards/accuracy_reward": 0.0066964291036129, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339402794838, + "grad_norm": 15.106719970703125, + "kl": 1.0703125, + "learning_rate": 4.963068620426269e-07, + "loss": 0.0984, + "reward": 0.2873884066939354, + "reward_std": 0.1699320189654827, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2829241193830967, "step": 500 }, { "clip_ratio": 0.0, - "completion_length": 2011.40185546875, + "completion_length": 1476.6139221191406, "epoch": 0.1496527518482563, - "grad_norm": 0.10822220891714096, - "kl": 0.0018548965454101562, - "learning_rate": 9.925241466132199e-08, - "loss": 0.0176, - "reward": 0.333147332072258, - "reward_std": 0.06433450477197766, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 20.554685592651367, + "kl": 1.041015625, + "learning_rate": 4.962620733066099e-07, + "loss": 0.1098, + "reward": 0.3404018059372902, + "reward_std": 0.18493600189685822, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "rewards/tag_count_reward": 0.2756696529686451, "step": 501 }, { "clip_ratio": 0.0, - "completion_length": 1963.6697387695312, + "completion_length": 1431.8750610351562, "epoch": 0.14995145993577777, - "grad_norm": 0.0986773744225502, - "kl": 0.0028858184814453125, - "learning_rate": 9.924340333231872e-08, - "loss": 0.0205, - "reward": 0.3007812649011612, - "reward_std": 0.07649564300663769, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 33.422760009765625, + "kl": 0.646484375, + "learning_rate": 4.962170166615936e-07, + "loss": 0.0978, + "reward": 0.3281250074505806, + "reward_std": 0.22296306490898132, + "rewards/accuracy_reward": 0.026785714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2806919813156128, + "rewards/tag_count_reward": 0.3013392984867096, "step": 502 }, { "clip_ratio": 0.0, - "completion_length": 2022.3728637695312, + "completion_length": 1513.4710388183594, "epoch": 0.15025016802329924, - "grad_norm": 0.07261163741350174, - "kl": 0.0016307830810546875, - "learning_rate": 9.923433843131899e-08, - "loss": 0.014, - "reward": 0.313616082072258, - "reward_std": 0.04675479349680245, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 16.91933822631836, + "kl": 0.8740234375, + "learning_rate": 4.96171692156595e-07, + "loss": 0.1201, + "reward": 0.3867187723517418, + "reward_std": 0.22634423896670341, + "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732313156128, + "rewards/tag_count_reward": 0.3085937649011612, "step": 503 }, { "clip_ratio": 0.0, - "completion_length": 1984.0380554199219, + "completion_length": 1416.993408203125, "epoch": 0.1505488761108207, - "grad_norm": 0.11427389830350876, - "kl": 0.002620697021484375, - "learning_rate": 9.922521996818454e-08, - "loss": 0.0202, - "reward": 0.353794664144516, - "reward_std": 0.06553467409685254, - "rewards/accuracy_reward": 0.06919643143191934, + "grad_norm": 19.677053451538086, + "kl": 0.68798828125, + "learning_rate": 4.961260998409227e-07, + "loss": 0.1043, + "reward": 0.3733259066939354, + "reward_std": 0.21079107746481895, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2845982313156128, + "rewards/tag_count_reward": 0.3063616156578064, "step": 504 }, { "clip_ratio": 0.0, - "completion_length": 1908.337158203125, + "completion_length": 1319.5335388183594, "epoch": 0.15084758419834218, - "grad_norm": 0.14047352969646454, - "kl": 0.00457763671875, - "learning_rate": 9.921604795283536e-08, - "loss": 0.0463, - "reward": 0.407924123108387, - "reward_std": 0.17381544411182404, - "rewards/accuracy_reward": 0.1049107217695564, + "grad_norm": 32.37833023071289, + "kl": 0.69921875, + "learning_rate": 4.960802397641768e-07, + "loss": 0.0922, + "reward": 0.4547991305589676, + "reward_std": 0.21234611794352531, + "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.345424123108387, "step": 505 }, { "clip_ratio": 0.0, - "completion_length": 1995.7411499023438, + "completion_length": 1444.8683776855469, "epoch": 0.15114629228586363, - "grad_norm": 0.12033507972955704, - "kl": 0.00232696533203125, - "learning_rate": 9.920682239524967e-08, - "loss": 0.0271, - "reward": 0.317522332072258, - "reward_std": 0.056899035815149546, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 20.803768157958984, + "kl": 1.267578125, + "learning_rate": 4.960341119762484e-07, + "loss": 0.1059, + "reward": 0.3459821566939354, + "reward_std": 0.1845408733934164, + "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437649011612, + "rewards/tag_count_reward": 0.2968750149011612, "step": 506 }, { "clip_ratio": 0.0, - "completion_length": 1967.9353637695312, + "completion_length": 1336.2634582519531, "epoch": 0.1514450003733851, - "grad_norm": 0.1303471028804779, - "kl": 0.003253936767578125, - "learning_rate": 9.919754330546403e-08, - "loss": 0.0346, - "reward": 0.3861607313156128, - "reward_std": 0.11430553626269102, - "rewards/accuracy_reward": 0.09375000232830644, + "grad_norm": 8.381172180175781, + "kl": 1.0302734375, + "learning_rate": 4.959877165273202e-07, + "loss": 0.0995, + "reward": 0.4040178805589676, + "reward_std": 0.17987819015979767, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107238650322, + "rewards/tag_count_reward": 0.3191964402794838, "step": 507 }, { "clip_ratio": 0.0, - "completion_length": 2019.4442749023438, + "completion_length": 1408.6786193847656, "epoch": 0.15174370846090657, - "grad_norm": 0.11302424222230911, - "kl": 0.0019130706787109375, - "learning_rate": 9.918821069357314e-08, - "loss": 0.0157, - "reward": 0.4631696566939354, - "reward_std": 0.06457453314214945, - "rewards/accuracy_reward": 0.1830357238650322, + "grad_norm": 22.949893951416016, + "kl": 0.9365234375, + "learning_rate": 4.959410534678657e-07, + "loss": 0.1031, + "reward": 0.517299123108387, + "reward_std": 0.2119394987821579, + "rewards/accuracy_reward": 0.1964285816065967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339328289032, + "rewards/tag_count_reward": 0.3208705559372902, "step": 508 }, { "clip_ratio": 0.0, - "completion_length": 2013.3460693359375, + "completion_length": 1461.1340026855469, "epoch": 0.15204241654842804, - "grad_norm": 0.1036442294716835, - "kl": 0.0019550323486328125, - "learning_rate": 9.917882456972998e-08, - "loss": 0.0216, - "reward": 0.325334832072258, - "reward_std": 0.06495755817741156, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 22.094566345214844, + "kl": 1.0107421875, + "learning_rate": 4.958941228486499e-07, + "loss": 0.0809, + "reward": 0.3465401977300644, + "reward_std": 0.2078564167022705, + "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2739955484867096, + "rewards/tag_count_reward": 0.290736623108387, "step": 509 }, { "clip_ratio": 0.0, - "completion_length": 2018.4308471679688, + "completion_length": 1457.2813110351562, "epoch": 0.1523411246359495, - "grad_norm": 0.10617455095052719, - "kl": 0.0017213821411132812, - "learning_rate": 9.916938494414573e-08, - "loss": 0.011, - "reward": 0.2695312574505806, - "reward_std": 0.032915799878537655, - "rewards/accuracy_reward": 0.0, + "grad_norm": 19.875404357910156, + "kl": 1.435546875, + "learning_rate": 4.958469247207286e-07, + "loss": 0.1254, + "reward": 0.3219866156578064, + "reward_std": 0.1806906796991825, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312574505806, + "rewards/tag_count_reward": 0.3041294738650322, "step": 510 }, { "clip_ratio": 0.0, - "completion_length": 2012.5871276855469, + "completion_length": 1412.7255554199219, "epoch": 0.15263983272347098, - "grad_norm": 0.10286625474691391, - "kl": 0.0020542144775390625, - "learning_rate": 9.915989182708977e-08, - "loss": 0.0219, - "reward": 0.4793527126312256, - "reward_std": 0.07745374087244272, - "rewards/accuracy_reward": 0.2031250074505806, + "grad_norm": 28.24956512451172, + "kl": 1.41015625, + "learning_rate": 4.957994591354489e-07, + "loss": 0.1272, + "reward": 0.5200893133878708, + "reward_std": 0.19330596551299095, + "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276828289032, + "rewards/tag_count_reward": 0.310267873108387, "step": 511 }, { "clip_ratio": 0.0, - "completion_length": 2022.6139221191406, + "completion_length": 1407.7679138183594, "epoch": 0.15293854081099245, - "grad_norm": 0.10444395244121552, - "kl": 0.0018253326416015625, - "learning_rate": 9.91503452288897e-08, - "loss": 0.0173, - "reward": 0.416852705180645, - "reward_std": 0.08516851486638188, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 990.5584716796875, + "kl": 1.849609375, + "learning_rate": 4.957517261444485e-07, + "loss": 0.1465, + "reward": 0.4665178880095482, + "reward_std": 0.24038105085492134, + "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2739955484867096, + "rewards/tag_count_reward": 0.2857142984867096, "step": 512 }, { "clip_ratio": 0.0, - "completion_length": 2014.4465026855469, + "completion_length": 1433.0223693847656, "epoch": 0.15323724889851392, - "grad_norm": 0.10827910155057907, - "kl": 0.0020599365234375, - "learning_rate": 9.914074515993127e-08, - "loss": 0.0163, - "reward": 0.3180803656578064, - "reward_std": 0.05008667195215821, - "rewards/accuracy_reward": 0.03794643026776612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339402794838, + "grad_norm": 30.94546890258789, + "kl": 1.681640625, + "learning_rate": 4.957037257996563e-07, + "loss": 0.1415, + "reward": 0.3465401902794838, + "reward_std": 0.19393234699964523, + "rewards/accuracy_reward": 0.04687500232830644, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2974330484867096, "step": 513 }, { "clip_ratio": 0.0, - "completion_length": 1951.4241943359375, + "completion_length": 1350.0893249511719, "epoch": 0.1535359569860354, - "grad_norm": 0.1326271891593933, - "kl": 0.003406524658203125, - "learning_rate": 9.913109163065842e-08, - "loss": 0.0372, - "reward": 0.349330373108387, - "reward_std": 0.08861260674893856, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 10.932440757751465, + "kl": 1.296875, + "learning_rate": 4.956554581532922e-07, + "loss": 0.1203, + "reward": 0.3671875149011612, + "reward_std": 0.19484785944223404, + "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589477300644, + "rewards/tag_count_reward": 0.318080373108387, "step": 514 }, { "clip_ratio": 0.0, - "completion_length": 1955.4576721191406, + "completion_length": 1375.40185546875, "epoch": 0.15383466507355686, - "grad_norm": 0.10871897637844086, - "kl": 0.003536224365234375, - "learning_rate": 9.912138465157323e-08, - "loss": 0.0304, - "reward": 0.3504464328289032, - "reward_std": 0.10707663092762232, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 51.63142776489258, + "kl": 1.654296875, + "learning_rate": 4.956069232578661e-07, + "loss": 0.127, + "reward": 0.317522332072258, + "reward_std": 0.19570493698120117, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2946428656578064, + "rewards/tag_count_reward": 0.2952009066939354, "step": 515 }, { "clip_ratio": 0.0, - "completion_length": 2002.90185546875, + "completion_length": 1448.38623046875, "epoch": 0.15413337316107834, - "grad_norm": 0.10621136426925659, - "kl": 0.0024318695068359375, - "learning_rate": 9.911162423323596e-08, - "loss": 0.0201, - "reward": 0.2801339477300644, - "reward_std": 0.05412868736311793, + "grad_norm": 32.530250549316406, + "kl": 1.572265625, + "learning_rate": 4.955581211661798e-07, + "loss": 0.1306, + "reward": 0.3080357313156128, + "reward_std": 0.16375143639743328, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2756696566939354, + "rewards/tag_count_reward": 0.3035714477300644, "step": 516 }, { "clip_ratio": 0.0, - "completion_length": 1986.7835693359375, + "completion_length": 1302.4665832519531, "epoch": 0.1544320812485998, - "grad_norm": 0.13218039274215698, - "kl": 0.0029239654541015625, - "learning_rate": 9.910181038626498e-08, - "loss": 0.0349, - "reward": 0.396763414144516, - "reward_std": 0.10193303320556879, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 22.07048225402832, + "kl": 1.30078125, + "learning_rate": 4.955090519313249e-07, + "loss": 0.1116, + "reward": 0.4453125149011612, + "reward_std": 0.2152816653251648, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562574505806, + "rewards/tag_count_reward": 0.3270089402794838, "step": 517 }, { "clip_ratio": 0.0, - "completion_length": 2024.1139221191406, + "completion_length": 1405.3014221191406, "epoch": 0.15473078933612128, - "grad_norm": 0.11076945066452026, - "kl": 0.0017910003662109375, - "learning_rate": 9.90919431213368e-08, - "loss": 0.0173, + "grad_norm": 9.912904739379883, + "kl": 1.556640625, + "learning_rate": 4.95459715606684e-07, + "loss": 0.1221, "reward": 0.3158482313156128, - "reward_std": 0.06501996563747525, - "rewards/accuracy_reward": 0.04687500232830644, + "reward_std": 0.19546522945165634, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2689732238650322, + "rewards/tag_count_reward": 0.2578125074505806, "step": 518 }, { "clip_ratio": 0.0, - "completion_length": 1966.2880554199219, + "completion_length": 1340.5246276855469, "epoch": 0.15502949742364275, - "grad_norm": 0.12313757836818695, - "kl": 0.00341033935546875, - "learning_rate": 9.908202244918602e-08, - "loss": 0.039, - "reward": 0.3816964477300644, - "reward_std": 0.11708412505686283, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 7.609091758728027, + "kl": 1.154296875, + "learning_rate": 4.954101122459301e-07, + "loss": 0.121, + "reward": 0.4235491380095482, + "reward_std": 0.23425734415650368, + "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294642873108387, + "rewards/tag_count_reward": 0.3119419813156128, "step": 519 }, { "clip_ratio": 0.0, - "completion_length": 2027.4241638183594, + "completion_length": 1503.0536499023438, "epoch": 0.15532820551116422, - "grad_norm": 0.0818467065691948, - "kl": 0.0015869140625, - "learning_rate": 9.907204838060539e-08, - "loss": 0.0093, - "reward": 0.329241082072258, - "reward_std": 0.04783848091028631, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 22.38677978515625, + "kl": 1.197265625, + "learning_rate": 4.953602419030269e-07, + "loss": 0.1032, + "reward": 0.3666294813156128, + "reward_std": 0.22174959629774094, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2645089402794838, + "rewards/tag_count_reward": 0.286272332072258, "step": 520 }, { "clip_ratio": 0.0, - "completion_length": 2038.4666137695312, + "completion_length": 1454.6674499511719, "epoch": 0.1556269135986857, - "grad_norm": 0.11458659172058105, - "kl": 0.001468658447265625, - "learning_rate": 9.906202092644567e-08, - "loss": 0.007, - "reward": 0.3041294738650322, - "reward_std": 0.0406738524325192, - "rewards/accuracy_reward": 0.03794643026776612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266183041036129, + "grad_norm": 27.314897537231445, + "kl": 1.3515625, + "learning_rate": 4.953101046322284e-07, + "loss": 0.1391, + "reward": 0.3443080559372902, + "reward_std": 0.2016202248632908, + "rewards/accuracy_reward": 0.051339289639145136, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2907366156578064, "step": 521 }, { "clip_ratio": 0.0, - "completion_length": 1987.8907165527344, + "completion_length": 1353.0625915527344, "epoch": 0.15592562168620716, - "grad_norm": 0.10158926993608475, - "kl": 0.0027713775634765625, - "learning_rate": 9.90519400976158e-08, - "loss": 0.0134, - "reward": 0.3638392984867096, - "reward_std": 0.06619240576401353, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 43.14972686767578, + "kl": 0.986328125, + "learning_rate": 4.95259700488079e-07, + "loss": 0.1288, + "reward": 0.3984375298023224, + "reward_std": 0.19918563589453697, + "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500223517418, + "rewards/tag_count_reward": 0.3091517984867096, "step": 522 }, { "clip_ratio": 0.0, - "completion_length": 1975.0715026855469, + "completion_length": 1430.7500610351562, "epoch": 0.15622432977372863, - "grad_norm": 0.11171112209558487, - "kl": 0.003253936767578125, - "learning_rate": 9.904180590508271e-08, - "loss": 0.0365, - "reward": 0.3816964477300644, - "reward_std": 0.07296849647536874, - "rewards/accuracy_reward": 0.09151786309666932, + "grad_norm": 23.774337768554688, + "kl": 1.201171875, + "learning_rate": 4.952090295254136e-07, + "loss": 0.1175, + "reward": 0.428013414144516, + "reward_std": 0.21897517517209053, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.290178582072258, + "rewards/tag_count_reward": 0.3208705559372902, "step": 523 }, { "clip_ratio": 0.0, - "completion_length": 1967.7143859863281, + "completion_length": 1433.8326721191406, "epoch": 0.1565230378612501, - "grad_norm": 0.12369764596223831, - "kl": 0.00325775146484375, - "learning_rate": 9.90316183598714e-08, - "loss": 0.0371, - "reward": 0.3917410969734192, - "reward_std": 0.09485725546255708, - "rewards/accuracy_reward": 0.09598214598372579, + "grad_norm": 26.849700927734375, + "kl": 1.06640625, + "learning_rate": 4.95158091799357e-07, + "loss": 0.1182, + "reward": 0.428571455180645, + "reward_std": 0.19215301051735878, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589402794838, + "rewards/tag_count_reward": 0.3125000149011612, "step": 524 }, { "clip_ratio": 0.0, - "completion_length": 2024.6139221191406, + "completion_length": 1522.0290832519531, "epoch": 0.15682174594877157, - "grad_norm": 0.09759242087602615, - "kl": 0.0019168853759765625, - "learning_rate": 9.902137747306492e-08, - "loss": 0.0155, - "reward": 0.3537946566939354, - "reward_std": 0.050408078357577324, - "rewards/accuracy_reward": 0.0781250037252903, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2756696566939354, + "grad_norm": 28.29598045349121, + "kl": 1.9296875, + "learning_rate": 4.951068873653246e-07, + "loss": 0.148, + "reward": 0.4095982313156128, + "reward_std": 0.21544695273041725, + "rewards/accuracy_reward": 0.11160714738070965, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2957589402794838, "step": 525 }, { "clip_ratio": 0.0, - "completion_length": 2031.9040832519531, + "completion_length": 1546.1942749023438, "epoch": 0.15712045403629304, - "grad_norm": 0.09665562212467194, - "kl": 0.0016384124755859375, - "learning_rate": 9.901108325580438e-08, - "loss": 0.0151, - "reward": 0.3102678805589676, - "reward_std": 0.05759762227535248, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 37.72453308105469, + "kl": 2.142578125, + "learning_rate": 4.95055416279022e-07, + "loss": 0.1554, + "reward": 0.3404017984867096, + "reward_std": 0.19783299416303635, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2678571492433548, + "rewards/tag_count_reward": 0.2912946566939354, "step": 526 }, { "clip_ratio": 0.0, - "completion_length": 2010.1005249023438, + "completion_length": 1500.4554443359375, "epoch": 0.15741916212381452, - "grad_norm": 0.12374105304479599, - "kl": 0.0023937225341796875, - "learning_rate": 9.900073571928886e-08, - "loss": 0.0239, - "reward": 0.3515625223517418, - "reward_std": 0.09123997669667006, - "rewards/accuracy_reward": 0.06473214644938707, + "grad_norm": 22.83420753479004, + "kl": 1.9140625, + "learning_rate": 4.950036785964443e-07, + "loss": 0.1622, + "reward": 0.3482143059372902, + "reward_std": 0.19142743572592735, + "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286830373108387, + "rewards/tag_count_reward": 0.2968750149011612, "step": 527 }, { "clip_ratio": 0.0, - "completion_length": 1992.8058776855469, + "completion_length": 1438.2054138183594, "epoch": 0.15771787021133596, - "grad_norm": 0.12261202931404114, - "kl": 0.002960205078125, - "learning_rate": 9.899033487477546e-08, - "loss": 0.0336, - "reward": 0.4280134066939354, - "reward_std": 0.11598779540508986, - "rewards/accuracy_reward": 0.13616072316654027, + "grad_norm": 21.790014266967773, + "kl": 1.798828125, + "learning_rate": 4.949516743738772e-07, + "loss": 0.1524, + "reward": 0.4815848469734192, + "reward_std": 0.24426716193556786, + "rewards/accuracy_reward": 0.16071429196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2918526902794838, + "rewards/tag_count_reward": 0.3208705484867096, "step": 528 }, { "clip_ratio": 0.0, - "completion_length": 2010.6273193359375, + "completion_length": 1436.12060546875, "epoch": 0.15801657829885743, - "grad_norm": 0.08361922204494476, - "kl": 0.0022640228271484375, - "learning_rate": 9.897988073357928e-08, - "loss": 0.0171, - "reward": 0.3387276902794838, - "reward_std": 0.058376674773171544, - "rewards/accuracy_reward": 0.06919643026776612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "grad_norm": 11.602551460266113, + "kl": 1.78125, + "learning_rate": 4.948994036678964e-07, + "loss": 0.1906, + "reward": 0.3867187649011612, + "reward_std": 0.2183646634221077, + "rewards/accuracy_reward": 0.08482143096625805, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2996651902794838, "step": 529 }, { "clip_ratio": 0.0, - "completion_length": 1985.5960693359375, + "completion_length": 1489.7902526855469, "epoch": 0.1583152863863789, - "grad_norm": 0.10705036669969559, - "kl": 0.0031280517578125, - "learning_rate": 9.89693733070734e-08, - "loss": 0.0231, - "reward": 0.344866082072258, - "reward_std": 0.08592876326292753, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 22.823286056518555, + "kl": 1.677734375, + "learning_rate": 4.94846866535367e-07, + "loss": 0.1272, + "reward": 0.3459821566939354, + "reward_std": 0.17157990112900734, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2868303656578064, + "rewards/tag_count_reward": 0.2991071492433548, "step": 530 }, { "clip_ratio": 0.0, - "completion_length": 1998.7679443359375, + "completion_length": 1460.2857666015625, "epoch": 0.15861399447390037, - "grad_norm": 0.10253927856683731, - "kl": 0.00251007080078125, - "learning_rate": 9.895881260668888e-08, - "loss": 0.0194, - "reward": 0.3231026902794838, - "reward_std": 0.06592066353186965, - "rewards/accuracy_reward": 0.046875000931322575, + "grad_norm": 15.884933471679688, + "kl": 1.68359375, + "learning_rate": 4.947940630334444e-07, + "loss": 0.1715, + "reward": 0.3945312574505806, + "reward_std": 0.17972556129097939, + "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.3164062574505806, "step": 531 }, { "clip_ratio": 0.0, - "completion_length": 2017.9866943359375, + "completion_length": 1517.0826416015625, "epoch": 0.15891270256142184, - "grad_norm": 0.09582048654556274, - "kl": 0.0022907257080078125, - "learning_rate": 9.894819864391473e-08, - "loss": 0.0176, - "reward": 0.3152901902794838, - "reward_std": 0.048831868916749954, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 11.859461784362793, + "kl": 2.005859375, + "learning_rate": 4.947409932195736e-07, + "loss": 0.1911, + "reward": 0.3214285895228386, + "reward_std": 0.17129936441779137, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "rewards/tag_count_reward": 0.2834821566939354, "step": 532 }, { "clip_ratio": 0.0, - "completion_length": 1999.6406860351562, + "completion_length": 1460.2590026855469, "epoch": 0.1592114106489433, - "grad_norm": 0.12411881238222122, - "kl": 0.0026836395263671875, - "learning_rate": 9.89375314302979e-08, - "loss": 0.0238, - "reward": 0.3699776902794838, - "reward_std": 0.09647283749654889, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 25.551998138427734, + "kl": 1.923828125, + "learning_rate": 4.946876571514895e-07, + "loss": 0.1592, + "reward": 0.3945312649011612, + "reward_std": 0.18359490856528282, + "rewards/accuracy_reward": 0.10044643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2829241156578064, + "rewards/tag_count_reward": 0.294084832072258, "step": 533 }, { "clip_ratio": 0.0, - "completion_length": 2019.7656860351562, + "completion_length": 1549.4286193847656, "epoch": 0.15951011873646478, - "grad_norm": 0.08935979753732681, - "kl": 0.002048492431640625, - "learning_rate": 9.892681097744326e-08, - "loss": 0.0111, - "reward": 0.3392857313156128, - "reward_std": 0.061589392367750406, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 22.828031539916992, + "kl": 1.845703125, + "learning_rate": 4.946340548872164e-07, + "loss": 0.1384, + "reward": 0.3510044738650322, + "reward_std": 0.18982669338583946, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.274553582072258, + "rewards/tag_count_reward": 0.2751116119325161, "step": 534 }, { "clip_ratio": 0.0, - "completion_length": 1978.7857971191406, + "completion_length": 1493.1451416015625, "epoch": 0.15980882682398626, - "grad_norm": 0.1042909175157547, - "kl": 0.00307464599609375, - "learning_rate": 9.891603729701362e-08, - "loss": 0.0173, - "reward": 0.2935268059372902, - "reward_std": 0.06222202442586422, - "rewards/accuracy_reward": 0.0066964291036129, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2868303805589676, + "grad_norm": 20.755876541137695, + "kl": 1.3623046875, + "learning_rate": 4.945801864850681e-07, + "loss": 0.1415, + "reward": 0.337611623108387, + "reward_std": 0.20516351610422134, + "rewards/accuracy_reward": 0.029017859371379018, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.306361623108387, "step": 535 }, { "clip_ratio": 0.0, - "completion_length": 1996.0224304199219, + "completion_length": 1485.9063415527344, "epoch": 0.16010753491150773, - "grad_norm": 0.11627961695194244, - "kl": 0.002918243408203125, - "learning_rate": 9.890521040072969e-08, - "loss": 0.0335, - "reward": 0.4570312649011612, - "reward_std": 0.10716890543699265, - "rewards/accuracy_reward": 0.1696428619325161, + "grad_norm": 53.59226989746094, + "kl": 1.203125, + "learning_rate": 4.945260520036484e-07, + "loss": 0.1705, + "reward": 0.5251116305589676, + "reward_std": 0.22791927307844162, + "rewards/accuracy_reward": 0.20535714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.287388414144516, + "rewards/tag_count_reward": 0.3197544813156128, "step": 536 }, { "clip_ratio": 0.0, - "completion_length": 2022.0558776855469, + "completion_length": 1546.279052734375, "epoch": 0.1604062429990292, - "grad_norm": 0.10945910960435867, - "kl": 0.0022430419921875, - "learning_rate": 9.889433030037006e-08, - "loss": 0.0199, - "reward": 0.3599330484867096, - "reward_std": 0.07208667695522308, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 32.348567962646484, + "kl": 1.568359375, + "learning_rate": 4.944716515018504e-07, + "loss": 0.1693, + "reward": 0.3705357313156128, + "reward_std": 0.17491189762949944, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437574505806, + "rewards/tag_count_reward": 0.285714291036129, "step": 537 }, { "clip_ratio": 0.0, - "completion_length": 2000.5804443359375, + "completion_length": 1533.1920166015625, "epoch": 0.16070495108655067, - "grad_norm": 0.10426013171672821, - "kl": 0.002864837646484375, - "learning_rate": 9.888339700777125e-08, - "loss": 0.0248, - "reward": 0.337611623108387, - "reward_std": 0.07486195396631956, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 41.89881896972656, + "kl": 1.3671875, + "learning_rate": 4.944169850388562e-07, + "loss": 0.1411, + "reward": 0.341517873108387, + "reward_std": 0.20202600583434105, + "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401977300644, + "rewards/tag_count_reward": 0.2879464402794838, "step": 538 }, { "clip_ratio": 0.0, - "completion_length": 1990.2880554199219, + "completion_length": 1529.1295166015625, "epoch": 0.16100365917407214, - "grad_norm": 0.10494105517864227, - "kl": 0.0028228759765625, - "learning_rate": 9.887241053482755e-08, - "loss": 0.0179, - "reward": 0.3816964402794838, - "reward_std": 0.040914720855653286, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 31.795007705688477, + "kl": 1.5390625, + "learning_rate": 4.943620526741378e-07, + "loss": 0.158, + "reward": 0.4492187649011612, + "reward_std": 0.2079201228916645, + "rewards/accuracy_reward": 0.1316964311990887, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214402794838, + "rewards/tag_count_reward": 0.3175223395228386, "step": 539 }, { "clip_ratio": 0.0, - "completion_length": 2023.3683776855469, + "completion_length": 1533.3929138183594, "epoch": 0.1613023672615936, - "grad_norm": 0.1039937436580658, - "kl": 0.0022296905517578125, - "learning_rate": 9.886137089349122e-08, - "loss": 0.0147, - "reward": 0.3253348395228386, - "reward_std": 0.0844221068546176, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 16.42098045349121, + "kl": 1.88671875, + "learning_rate": 4.94306854467456e-07, + "loss": 0.1687, + "reward": 0.3677455559372902, + "reward_std": 0.20785117149353027, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.3052455484867096, "step": 540 }, { "clip_ratio": 0.0, - "completion_length": 1918.3036804199219, + "completion_length": 1448.6965026855469, "epoch": 0.16160107534911508, - "grad_norm": 0.1283424198627472, - "kl": 0.004730224609375, - "learning_rate": 9.885027809577226e-08, - "loss": 0.0482, - "reward": 0.3443080484867096, - "reward_std": 0.09204641170799732, - "rewards/accuracy_reward": 0.037946430034935474, + "grad_norm": 15.577332496643066, + "kl": 1.65625, + "learning_rate": 4.942513904788613e-07, + "loss": 0.1797, + "reward": 0.3560268059372902, + "reward_std": 0.1952895149588585, + "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.3203125149011612, "step": 541 }, { "clip_ratio": 0.0, - "completion_length": 1993.5781860351562, + "completion_length": 1505.7210388183594, "epoch": 0.16189978343663655, - "grad_norm": 0.08630579710006714, - "kl": 0.0029964447021484375, - "learning_rate": 9.88391321537386e-08, - "loss": 0.0097, - "reward": 0.3175223395228386, - "reward_std": 0.04236319661140442, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 16.7977294921875, + "kl": 1.857421875, + "learning_rate": 4.94195660768693e-07, + "loss": 0.1521, + "reward": 0.3783482238650322, + "reward_std": 0.1823890469968319, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2773437649011612, + "rewards/tag_count_reward": 0.3113839477300644, "step": 542 }, { "clip_ratio": 0.0, - "completion_length": 2026.8438110351562, + "completion_length": 1512.1429443359375, "epoch": 0.16219849152415802, - "grad_norm": 0.09318391233682632, - "kl": 0.0019474029541015625, - "learning_rate": 9.88279330795159e-08, - "loss": 0.0099, - "reward": 0.302455373108387, - "reward_std": 0.031894957181066275, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 25.266433715820312, + "kl": 2.20703125, + "learning_rate": 4.941396653975795e-07, + "loss": 0.1702, + "reward": 0.314174123108387, + "reward_std": 0.16804957389831543, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.266741082072258, + "rewards/tag_count_reward": 0.2717634066939354, "step": 543 }, { "clip_ratio": 0.0, - "completion_length": 1953.7702026367188, + "completion_length": 1451.107177734375, "epoch": 0.1624971996116795, - "grad_norm": 0.12675946950912476, - "kl": 0.004093170166015625, - "learning_rate": 9.881668088528766e-08, - "loss": 0.0383, - "reward": 0.459821455180645, - "reward_std": 0.14292337000370026, - "rewards/accuracy_reward": 0.15848215389996767, + "grad_norm": 11.879313468933105, + "kl": 1.896484375, + "learning_rate": 4.940834044264383e-07, + "loss": 0.152, + "reward": 0.466517873108387, + "reward_std": 0.20944871753454208, + "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.301339291036129, + "rewards/tag_count_reward": 0.3191964402794838, "step": 544 }, { "clip_ratio": 0.0, - "completion_length": 1972.6630249023438, + "completion_length": 1511.9040832519531, "epoch": 0.16279590769920096, - "grad_norm": 0.09444686770439148, - "kl": 0.00342559814453125, - "learning_rate": 9.880537558329517e-08, - "loss": 0.0326, - "reward": 0.3203125223517418, - "reward_std": 0.04379560542292893, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 10.97879409790039, + "kl": 1.787109375, + "learning_rate": 4.940268779164758e-07, + "loss": 0.1443, + "reward": 0.334263414144516, + "reward_std": 0.17915084585547447, + "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2823660895228386, + "rewards/tag_count_reward": 0.2806919738650322, "step": 545 }, { "clip_ratio": 0.0, - "completion_length": 1972.4130249023438, + "completion_length": 1536.2300109863281, "epoch": 0.16309461578672244, - "grad_norm": 0.12117256224155426, - "kl": 0.0035343170166015625, - "learning_rate": 9.879401718583748e-08, - "loss": 0.0262, - "reward": 0.318080373108387, - "reward_std": 0.0746199581772089, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 15.401771545410156, + "kl": 2.119140625, + "learning_rate": 4.939700859291875e-07, + "loss": 0.209, + "reward": 0.3292410895228386, + "reward_std": 0.18664926663041115, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2912946566939354, + "rewards/tag_count_reward": 0.306919664144516, "step": 546 }, { "clip_ratio": 0.0, - "completion_length": 2034.1206359863281, + "completion_length": 1569.24560546875, "epoch": 0.1633933238742439, - "grad_norm": 0.11382097750902176, - "kl": 0.0019435882568359375, - "learning_rate": 9.878260570527142e-08, - "loss": 0.0111, - "reward": 0.3231026902794838, - "reward_std": 0.07708001974970102, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 11.521446228027344, + "kl": 1.966796875, + "learning_rate": 4.939130285263572e-07, + "loss": 0.1948, + "reward": 0.3699776977300644, + "reward_std": 0.2225136049091816, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312574505806, + "rewards/tag_count_reward": 0.294084832072258, "step": 547 }, { "clip_ratio": 0.0, - "completion_length": 2034.2590026855469, + "completion_length": 1644.4263916015625, "epoch": 0.16369203196176538, - "grad_norm": 0.10137426108121872, - "kl": 0.0018768310546875, - "learning_rate": 9.877114115401157e-08, - "loss": 0.0127, - "reward": 0.3191964402794838, - "reward_std": 0.0633086166344583, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 15.928544998168945, + "kl": 2.09765625, + "learning_rate": 4.938557057700579e-07, + "loss": 0.1666, + "reward": 0.3125000111758709, + "reward_std": 0.1819944977760315, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2678571492433548, + "rewards/tag_count_reward": 0.2633928693830967, "step": 548 }, { "clip_ratio": 0.0, - "completion_length": 1969.5335693359375, + "completion_length": 1513.60498046875, "epoch": 0.16399074004928682, - "grad_norm": 0.09129921346902847, - "kl": 0.00339508056640625, - "learning_rate": 9.875962354453022e-08, - "loss": 0.0234, - "reward": 0.330357164144516, - "reward_std": 0.05987348733469844, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 26.027103424072266, + "kl": 1.626953125, + "learning_rate": 4.937981177226511e-07, + "loss": 0.1506, + "reward": 0.4308035895228386, + "reward_std": 0.2502877898514271, + "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2723214477300644, + "rewards/tag_count_reward": 0.3236607238650322, "step": 549 }, { "clip_ratio": 0.0, - "completion_length": 2019.3170471191406, + "completion_length": 1564.0491943359375, "epoch": 0.1642894481368083, - "grad_norm": 0.11515054106712341, - "kl": 0.0025177001953125, - "learning_rate": 9.874805288935742e-08, - "loss": 0.0217, - "reward": 0.361049123108387, - "reward_std": 0.08037207182496786, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 14.689818382263184, + "kl": 1.7265625, + "learning_rate": 4.937402644467871e-07, + "loss": 0.1784, + "reward": 0.3878348395228386, + "reward_std": 0.18370545655488968, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.278459832072258, + "rewards/tag_count_reward": 0.3007812649011612, "step": 550 }, { "clip_ratio": 0.0, - "completion_length": 2022.8058776855469, + "completion_length": 1587.7277526855469, "epoch": 0.16458815622432976, - "grad_norm": 0.10380702465772629, - "kl": 0.0024852752685546875, - "learning_rate": 9.87364292010809e-08, - "loss": 0.023, - "reward": 0.3800223395228386, - "reward_std": 0.046228845370933414, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 27.457515716552734, + "kl": 1.79296875, + "learning_rate": 4.936821460054045e-07, + "loss": 0.1951, + "reward": 0.4056919738650322, + "reward_std": 0.16137132421135902, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2728794813156128, + "rewards/tag_count_reward": 0.2963169738650322, "step": 551 }, { "clip_ratio": 0.0, - "completion_length": 1968.8773193359375, + "completion_length": 1511.7902526855469, "epoch": 0.16488686431185123, - "grad_norm": 0.135184183716774, - "kl": 0.0039272308349609375, - "learning_rate": 9.872475249234607e-08, - "loss": 0.0348, - "reward": 0.3727678656578064, - "reward_std": 0.08429369609802961, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 13.077816009521484, + "kl": 1.66015625, + "learning_rate": 4.936237624617304e-07, + "loss": 0.1805, + "reward": 0.423549123108387, + "reward_std": 0.1953897513449192, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294642873108387, + "rewards/tag_count_reward": 0.3208705484867096, "step": 552 }, { "clip_ratio": 0.0, - "completion_length": 1958.5000915527344, + "completion_length": 1540.5268249511719, "epoch": 0.1651855723993727, - "grad_norm": 0.11828085780143738, - "kl": 0.00400543212890625, - "learning_rate": 9.871302277585609e-08, - "loss": 0.0315, - "reward": 0.353236623108387, - "reward_std": 0.06766171101480722, - "rewards/accuracy_reward": 0.06919643026776612, + "grad_norm": 9.963757514953613, + "kl": 1.8359375, + "learning_rate": 4.935651138792804e-07, + "loss": 0.192, + "reward": 0.3694196566939354, + "reward_std": 0.1632193885743618, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401977300644, + "rewards/tag_count_reward": 0.2979910895228386, "step": 553 }, { "clip_ratio": 0.0, - "completion_length": 2006.0715026855469, + "completion_length": 1568.3036193847656, "epoch": 0.16548428048689418, - "grad_norm": 0.1360340118408203, - "kl": 0.0028057098388671875, - "learning_rate": 9.87012400643717e-08, - "loss": 0.0214, - "reward": 0.3203125149011612, - "reward_std": 0.12398355081677437, - "rewards/accuracy_reward": 0.03348214412108064, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286830373108387, + "grad_norm": 32.14925003051758, + "kl": 1.638671875, + "learning_rate": 4.935062003218585e-07, + "loss": 0.1768, + "reward": 0.3660714402794838, + "reward_std": 0.22965658456087112, + "rewards/accuracy_reward": 0.042410716880112886, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3214285895228386, "step": 554 }, { "clip_ratio": 0.0, - "completion_length": 2027.5179443359375, + "completion_length": 1624.5581359863281, "epoch": 0.16578298857441565, - "grad_norm": 0.10568525642156601, - "kl": 0.0022525787353515625, - "learning_rate": 9.868940437071138e-08, - "loss": 0.0215, - "reward": 0.381138414144516, - "reward_std": 0.05473257787525654, - "rewards/accuracy_reward": 0.1138392873108387, + "grad_norm": 15.839299201965332, + "kl": 2.44140625, + "learning_rate": 4.934470218535569e-07, + "loss": 0.1964, + "reward": 0.4101562649011612, + "reward_std": 0.1923549398779869, + "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2672991156578064, + "rewards/tag_count_reward": 0.2829241156578064, "step": 555 }, { "clip_ratio": 0.0, - "completion_length": 2017.5693054199219, + "completion_length": 1590.352783203125, "epoch": 0.16608169666193712, - "grad_norm": 0.13017070293426514, - "kl": 0.00257110595703125, - "learning_rate": 9.867751570775116e-08, - "loss": 0.0201, - "reward": 0.3325893059372902, - "reward_std": 0.09291958250105381, - "rewards/accuracy_reward": 0.0491071455180645, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.283482164144516, + "grad_norm": 10.260693550109863, + "kl": 2.029296875, + "learning_rate": 4.933875785387558e-07, + "loss": 0.2125, + "reward": 0.353236623108387, + "reward_std": 0.2079397663474083, + "rewards/accuracy_reward": 0.06696428917348385, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2840401902794838, "step": 556 }, { "clip_ratio": 0.0, - "completion_length": 2026.2857666015625, + "completion_length": 1621.4598999023438, "epoch": 0.1663804047494586, - "grad_norm": 0.09314581006765366, - "kl": 0.0022792816162109375, - "learning_rate": 9.866557408842477e-08, - "loss": 0.0181, - "reward": 0.3275669738650322, - "reward_std": 0.07067036838270724, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 13.340351104736328, + "kl": 2.34765625, + "learning_rate": 4.933278704421239e-07, + "loss": 0.2067, + "reward": 0.3225446566939354, + "reward_std": 0.20206910744309425, + "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2695312649011612, + "rewards/tag_count_reward": 0.266741082072258, "step": 557 }, { "clip_ratio": 0.0, - "completion_length": 1965.1072387695312, + "completion_length": 1574.0134582519531, "epoch": 0.16667911283698006, - "grad_norm": 0.11105509847402573, - "kl": 0.0037364959716796875, - "learning_rate": 9.86535795257235e-08, - "loss": 0.0259, - "reward": 0.3152901902794838, - "reward_std": 0.07662903377786279, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 8.78235149383545, + "kl": 2.28125, + "learning_rate": 4.932678976286176e-07, + "loss": 0.216, + "reward": 0.321986623108387, + "reward_std": 0.20052982866764069, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952009066939354, + "rewards/tag_count_reward": 0.2996651902794838, "step": 558 }, { "clip_ratio": 0.0, - "completion_length": 2019.76123046875, + "completion_length": 1611.2545471191406, "epoch": 0.16697782092450153, - "grad_norm": 0.1128140315413475, - "kl": 0.0024566650390625, - "learning_rate": 9.864153203269628e-08, - "loss": 0.0196, - "reward": 0.2851562574505806, - "reward_std": 0.07469536643475294, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 10.4319486618042, + "kl": 2.61328125, + "learning_rate": 4.932076601634814e-07, + "loss": 0.2176, + "reward": 0.2912946566939354, + "reward_std": 0.1844063363969326, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2762276902794838, + "rewards/tag_count_reward": 0.2801339477300644, "step": 559 }, { "clip_ratio": 0.0, - "completion_length": 2030.8772888183594, + "completion_length": 1638.82373046875, "epoch": 0.167276529012023, - "grad_norm": 0.10619811713695526, - "kl": 0.0024242401123046875, - "learning_rate": 9.862943162244959e-08, - "loss": 0.0134, - "reward": 0.3532366305589676, - "reward_std": 0.09729917207732797, - "rewards/accuracy_reward": 0.07812500558793545, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "grad_norm": 32.21158218383789, + "kl": 2.63671875, + "learning_rate": 4.931471581122479e-07, + "loss": 0.2147, + "reward": 0.3593750149011612, + "reward_std": 0.23315341398119926, + "rewards/accuracy_reward": 0.07589286006987095, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2812500074505806, "step": 560 }, { "clip_ratio": 0.0, - "completion_length": 2009.3884887695312, + "completion_length": 1587.6473999023438, "epoch": 0.16757523709954447, - "grad_norm": 0.113200344145298, - "kl": 0.0028514862060546875, - "learning_rate": 9.86172783081475e-08, - "loss": 0.0186, - "reward": 0.338169664144516, - "reward_std": 0.10625433223322034, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 13.846587181091309, + "kl": 2.36328125, + "learning_rate": 4.930863915407374e-07, + "loss": 0.2594, + "reward": 0.353236623108387, + "reward_std": 0.2137494459748268, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2823660895228386, + "rewards/tag_count_reward": 0.2952009066939354, "step": 561 }, { "clip_ratio": 0.0, - "completion_length": 2011.9331359863281, + "completion_length": 1594.10498046875, "epoch": 0.16787394518706594, - "grad_norm": 0.11562783271074295, - "kl": 0.002918243408203125, - "learning_rate": 9.860507210301159e-08, - "loss": 0.014, - "reward": 0.2840401902794838, - "reward_std": 0.06174400355666876, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 15.19684886932373, + "kl": 1.755859375, + "learning_rate": 4.930253605150579e-07, + "loss": 0.1738, + "reward": 0.318638414144516, + "reward_std": 0.2119026742875576, + "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2818080484867096, + "rewards/tag_count_reward": 0.2963169738650322, "step": 562 }, { "clip_ratio": 0.0, - "completion_length": 1998.7813110351562, + "completion_length": 1627.1652526855469, "epoch": 0.1681726532745874, - "grad_norm": 0.11453171074390411, - "kl": 0.003299713134765625, - "learning_rate": 9.859281302032105e-08, - "loss": 0.0135, - "reward": 0.3867187723517418, - "reward_std": 0.08820157870650291, - "rewards/accuracy_reward": 0.1049107164144516, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2818080559372902, + "grad_norm": 21.628976821899414, + "kl": 2.6171875, + "learning_rate": 4.929640651016053e-07, + "loss": 0.2196, + "reward": 0.3722098395228386, + "reward_std": 0.2102942168712616, + "rewards/accuracy_reward": 0.1004464328289032, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2695312649011612, "step": 563 }, { "clip_ratio": 0.0, - "completion_length": 2012.9264221191406, + "completion_length": 1618.1697387695312, "epoch": 0.16847136136210888, - "grad_norm": 0.12546952068805695, - "kl": 0.003047943115234375, - "learning_rate": 9.858050107341256e-08, - "loss": 0.0243, - "reward": 0.3214285895228386, - "reward_std": 0.0652003800496459, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 10.55736255645752, + "kl": 2.32421875, + "learning_rate": 4.929025053670627e-07, + "loss": 0.2267, + "reward": 0.3381696566939354, + "reward_std": 0.16556742787361145, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2834821566939354, + "rewards/tag_count_reward": 0.297991082072258, "step": 564 }, { "clip_ratio": 0.0, - "completion_length": 2010.4577026367188, + "completion_length": 1565.60498046875, "epoch": 0.16877006944963036, - "grad_norm": 0.12110298871994019, - "kl": 0.003231048583984375, - "learning_rate": 9.85681362756803e-08, - "loss": 0.0257, - "reward": 0.3733259066939354, - "reward_std": 0.08449325524270535, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 6.295408248901367, + "kl": 2.40625, + "learning_rate": 4.928406813784015e-07, + "loss": 0.2379, + "reward": 0.3850446566939354, + "reward_std": 0.19622733816504478, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401902794838, + "rewards/tag_count_reward": 0.297991082072258, "step": 565 }, { "clip_ratio": 0.0, - "completion_length": 2014.3505554199219, + "completion_length": 1597.0022888183594, "epoch": 0.16906877753715183, - "grad_norm": 0.1144041195511818, - "kl": 0.00292205810546875, - "learning_rate": 9.855571864057597e-08, - "loss": 0.0267, - "reward": 0.3108258992433548, - "reward_std": 0.0482783536426723, - "rewards/accuracy_reward": 0.0357142873108387, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2751116156578064, + "grad_norm": 20.413482666015625, + "kl": 1.958984375, + "learning_rate": 4.927785932028799e-07, + "loss": 0.2254, + "reward": 0.3504464402794838, + "reward_std": 0.20354238897562027, + "rewards/accuracy_reward": 0.04687500209547579, + "rewards/format_reward": 0.004464285913854837, + "rewards/tag_count_reward": 0.2991071566939354, "step": 566 }, { "clip_ratio": 0.0, - "completion_length": 1992.4978332519531, + "completion_length": 1602.435302734375, "epoch": 0.1693674856246733, - "grad_norm": 0.11330489069223404, - "kl": 0.003406524658203125, - "learning_rate": 9.85432481816087e-08, - "loss": 0.0229, - "reward": 0.3325892984867096, - "reward_std": 0.10895239654928446, - "rewards/accuracy_reward": 0.04241071501746774, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2901785895228386, + "grad_norm": 10.642688751220703, + "kl": 2.298828125, + "learning_rate": 4.927162409080436e-07, + "loss": 0.2131, + "reward": 0.3493303656578064, + "reward_std": 0.22611487284302711, + "rewards/accuracy_reward": 0.042410716181620955, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3046875223517418, "step": 567 }, { "clip_ratio": 0.0, - "completion_length": 2011.3817749023438, + "completion_length": 1595.0491638183594, "epoch": 0.16966619371219477, - "grad_norm": 0.11798050999641418, - "kl": 0.002880096435546875, - "learning_rate": 9.85307249123452e-08, - "loss": 0.017, - "reward": 0.341517873108387, - "reward_std": 0.10057830903679132, - "rewards/accuracy_reward": 0.060267860535532236, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500074505806, + "grad_norm": 17.878570556640625, + "kl": 2.2890625, + "learning_rate": 4.92653624561726e-07, + "loss": 0.2408, + "reward": 0.3582589328289032, + "reward_std": 0.19988222047686577, + "rewards/accuracy_reward": 0.06919643026776612, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2868303656578064, "step": 568 }, { "clip_ratio": 0.0, - "completion_length": 2005.4107971191406, + "completion_length": 1564.5067443847656, "epoch": 0.16996490179971624, - "grad_norm": 0.10531098395586014, - "kl": 0.00315093994140625, - "learning_rate": 9.851814884640948e-08, - "loss": 0.0312, - "reward": 0.2963169738650322, - "reward_std": 0.08241367992013693, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 8.563328742980957, + "kl": 2.42578125, + "learning_rate": 4.925907442320475e-07, + "loss": 0.2268, + "reward": 0.364955373108387, + "reward_std": 0.2186395600438118, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.278459832072258, + "rewards/tag_count_reward": 0.3069196492433548, "step": 569 }, { "clip_ratio": 0.0, - "completion_length": 1929.8125915527344, + "completion_length": 1514.6339721679688, "epoch": 0.1702636098872377, - "grad_norm": 0.12639302015304565, - "kl": 0.00513458251953125, - "learning_rate": 9.850551999748314e-08, - "loss": 0.038, - "reward": 0.4659598395228386, - "reward_std": 0.10572483949363232, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 14.028228759765625, + "kl": 2.359375, + "learning_rate": 4.925275999874156e-07, + "loss": 0.2459, + "reward": 0.4676339477300644, + "reward_std": 0.22803118452429771, + "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.3069196566939354, "step": 570 }, { "clip_ratio": 0.0, - "completion_length": 1972.0670471191406, + "completion_length": 1577.91748046875, "epoch": 0.17056231797475915, - "grad_norm": 0.1237359419465065, - "kl": 0.004032135009765625, - "learning_rate": 9.849283837930506e-08, - "loss": 0.0357, - "reward": 0.3108259066939354, - "reward_std": 0.09739127475768328, - "rewards/accuracy_reward": 0.02008928661234677, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2907366156578064, + "grad_norm": 10.996620178222656, + "kl": 2.82421875, + "learning_rate": 4.924641918965254e-07, + "loss": 0.2594, + "reward": 0.3091517984867096, + "reward_std": 0.19352185726165771, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.297991082072258, "step": 571 }, { "clip_ratio": 0.0, - "completion_length": 1924.5514526367188, + "completion_length": 1526.7612609863281, "epoch": 0.17086102606228062, - "grad_norm": 0.11692798882722855, - "kl": 0.005344390869140625, - "learning_rate": 9.848010400567167e-08, - "loss": 0.0284, - "reward": 0.3861607238650322, - "reward_std": 0.09627113118767738, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 25.230566024780273, + "kl": 2.8671875, + "learning_rate": 4.924005200283583e-07, + "loss": 0.236, + "reward": 0.377232164144516, + "reward_std": 0.20791489630937576, + "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2946428656578064, + "rewards/tag_count_reward": 0.2924107238650322, "step": 572 }, { "clip_ratio": 0.0, - "completion_length": 1989.6161499023438, + "completion_length": 1561.0089721679688, "epoch": 0.1711597341498021, - "grad_norm": 0.12468663603067398, - "kl": 0.00377655029296875, - "learning_rate": 9.846731689043665e-08, - "loss": 0.0269, - "reward": 0.3510044813156128, - "reward_std": 0.09678738750517368, - "rewards/accuracy_reward": 0.06250000349245965, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2885044738650322, + "grad_norm": 7.374900817871094, + "kl": 2.72265625, + "learning_rate": 4.923365844521832e-07, + "loss": 0.2421, + "reward": 0.3560267984867096, + "reward_std": 0.2072165422141552, + "rewards/accuracy_reward": 0.06473214598372579, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2890625149011612, "step": 573 }, { "clip_ratio": 0.0, - "completion_length": 2001.5558776855469, + "completion_length": 1622.1072082519531, "epoch": 0.17145844223732357, - "grad_norm": 0.12387003004550934, - "kl": 0.003448486328125, - "learning_rate": 9.845447704751118e-08, - "loss": 0.0326, - "reward": 0.3309151977300644, - "reward_std": 0.07470415346324444, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 41.591552734375, + "kl": 3.40234375, + "learning_rate": 4.922723852375559e-07, + "loss": 0.2648, + "reward": 0.314174123108387, + "reward_std": 0.17675819993019104, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286272332072258, + "rewards/tag_count_reward": 0.2739955484867096, "step": 574 }, { "clip_ratio": 0.0, - "completion_length": 2003.8282165527344, + "completion_length": 1616.3795471191406, "epoch": 0.17175715032484504, - "grad_norm": 0.1290106177330017, - "kl": 0.00359344482421875, - "learning_rate": 9.844158449086371e-08, - "loss": 0.0198, - "reward": 0.330357164144516, - "reward_std": 0.11546876188367605, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 14.09976577758789, + "kl": 2.2109375, + "learning_rate": 4.922079224543185e-07, + "loss": 0.2336, + "reward": 0.3431919813156128, + "reward_std": 0.212439626455307, + "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.299107164144516, + "rewards/tag_count_reward": 0.3074776902794838, "step": 575 }, { "clip_ratio": 0.0, - "completion_length": 1934.2054138183594, + "completion_length": 1580.3371276855469, "epoch": 0.1720558584123665, - "grad_norm": 0.10726863145828247, - "kl": 0.004932403564453125, - "learning_rate": 9.842863923452012e-08, - "loss": 0.0168, - "reward": 0.3666294887661934, - "reward_std": 0.08673475962132215, - "rewards/accuracy_reward": 0.06919643026776612, + "grad_norm": 15.591771125793457, + "kl": 2.064453125, + "learning_rate": 4.921431961726006e-07, + "loss": 0.2128, + "reward": 0.3459821566939354, + "reward_std": 0.18646401166915894, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330559372902, + "rewards/tag_count_reward": 0.290178582072258, "step": 576 }, { "clip_ratio": 0.0, - "completion_length": 2012.7121276855469, + "completion_length": 1602.5134582519531, "epoch": 0.17235456649988798, - "grad_norm": 0.1227695494890213, - "kl": 0.00347137451171875, - "learning_rate": 9.841564129256355e-08, - "loss": 0.0246, - "reward": 0.3074776977300644, - "reward_std": 0.09626431949436665, - "rewards/accuracy_reward": 0.022321430034935474, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "grad_norm": 11.33595085144043, + "kl": 2.130859375, + "learning_rate": 4.920782064628177e-07, + "loss": 0.2221, + "reward": 0.344866082072258, + "reward_std": 0.2150227315723896, + "rewards/accuracy_reward": 0.040178574388846755, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.302455373108387, "step": 577 }, { "clip_ratio": 0.0, - "completion_length": 2006.4219665527344, + "completion_length": 1604.3371276855469, "epoch": 0.17265327458740945, - "grad_norm": 0.09959421306848526, - "kl": 0.00327301025390625, - "learning_rate": 9.84025906791345e-08, - "loss": 0.0242, - "reward": 0.3359375149011612, - "reward_std": 0.08665119390934706, - "rewards/accuracy_reward": 0.05580357299186289, + "grad_norm": 8.615790367126465, + "kl": 2.71484375, + "learning_rate": 4.920129533956725e-07, + "loss": 0.2498, + "reward": 0.356584832072258, + "reward_std": 0.2187850959599018, + "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339402794838, + "rewards/tag_count_reward": 0.2963169887661934, "step": 578 }, { "clip_ratio": 0.0, - "completion_length": 2004.0223693847656, + "completion_length": 1670.72998046875, "epoch": 0.17295198267493092, - "grad_norm": 0.12541188299655914, - "kl": 0.00341033935546875, - "learning_rate": 9.838948740843075e-08, - "loss": 0.025, - "reward": 0.3286830484867096, - "reward_std": 0.11020229663699865, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 13.563003540039062, + "kl": 2.765625, + "learning_rate": 4.919474370421538e-07, + "loss": 0.2533, + "reward": 0.3364955559372902, + "reward_std": 0.22764960676431656, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.290736623108387, + "rewards/tag_count_reward": 0.294084832072258, "step": 579 }, { "clip_ratio": 0.0, - "completion_length": 1985.6987609863281, + "completion_length": 1630.2411499023438, "epoch": 0.1732506907624524, - "grad_norm": 0.13118359446525574, - "kl": 0.003879547119140625, - "learning_rate": 9.837633149470739e-08, - "loss": 0.0246, - "reward": 0.3537946566939354, - "reward_std": 0.15141483955085278, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 13.145462989807129, + "kl": 2.67578125, + "learning_rate": 4.918816574735369e-07, + "loss": 0.2387, + "reward": 0.3616071566939354, + "reward_std": 0.22869007661938667, + "rewards/accuracy_reward": 0.05580357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2912946566939354, + "rewards/tag_count_reward": 0.3058035746216774, "step": 580 }, { "clip_ratio": 0.0, - "completion_length": 2033.6406860351562, + "completion_length": 1694.3906860351562, "epoch": 0.17354939884997386, - "grad_norm": 0.10745615512132645, - "kl": 0.00254058837890625, - "learning_rate": 9.836312295227674e-08, - "loss": 0.0128, - "reward": 0.2779018059372902, - "reward_std": 0.06986631080508232, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 20.640827178955078, + "kl": 3.359375, + "learning_rate": 4.918156147613837e-07, + "loss": 0.2687, + "reward": 0.290736623108387, + "reward_std": 0.20415988191962242, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2667410895228386, + "rewards/tag_count_reward": 0.2728794813156128, "step": 581 }, { "clip_ratio": 0.0, - "completion_length": 2012.3773498535156, + "completion_length": 1678.6875610351562, "epoch": 0.17384810693749533, - "grad_norm": 0.1199922114610672, - "kl": 0.0033416748046875, - "learning_rate": 9.834986179550841e-08, - "loss": 0.0249, - "reward": 0.3716518059372902, - "reward_std": 0.09465650934726, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 7.124320983886719, + "kl": 2.53125, + "learning_rate": 4.917493089775421e-07, + "loss": 0.2339, + "reward": 0.3549107313156128, + "reward_std": 0.19793832302093506, + "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2845982238650322, + "rewards/tag_count_reward": 0.2678571492433548, "step": 582 }, { "clip_ratio": 0.0, - "completion_length": 2013.7902526855469, + "completion_length": 1692.9108276367188, "epoch": 0.1741468150250168, - "grad_norm": 0.10348664969205856, - "kl": 0.002899169921875, - "learning_rate": 9.833654803882926e-08, - "loss": 0.0171, - "reward": 0.3822544813156128, - "reward_std": 0.05764681473374367, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 13.515953063964844, + "kl": 2.8515625, + "learning_rate": 4.916827401941464e-07, + "loss": 0.245, + "reward": 0.3733259066939354, + "reward_std": 0.2015364058315754, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2818080559372902, + "rewards/tag_count_reward": 0.2773437611758709, "step": 583 }, { "clip_ratio": 0.0, - "completion_length": 1956.774658203125, + "completion_length": 1643.9442749023438, "epoch": 0.17444552311253828, - "grad_norm": 0.12020695954561234, - "kl": 0.004619598388671875, - "learning_rate": 9.832318169672333e-08, - "loss": 0.0391, - "reward": 0.4023437649011612, - "reward_std": 0.06500465422868729, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 13.006391525268555, + "kl": 2.79296875, + "learning_rate": 4.916159084836166e-07, + "loss": 0.2416, + "reward": 0.3789062723517418, + "reward_std": 0.16606170311570168, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2907366156578064, + "rewards/tag_count_reward": 0.2695312649011612, "step": 584 }, { "clip_ratio": 0.0, - "completion_length": 2004.5513916015625, + "completion_length": 1644.3840026855469, "epoch": 0.17474423120005975, - "grad_norm": 0.10759938508272171, - "kl": 0.003490447998046875, - "learning_rate": 9.830976278373188e-08, - "loss": 0.0225, - "reward": 0.3203125074505806, - "reward_std": 0.046354315243661404, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 8.40479850769043, + "kl": 2.8515625, + "learning_rate": 4.915488139186594e-07, + "loss": 0.2629, + "reward": 0.349330373108387, + "reward_std": 0.2074294127523899, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2845982238650322, + "rewards/tag_count_reward": 0.302455373108387, "step": 585 }, { "clip_ratio": 0.0, - "completion_length": 1964.5916137695312, + "completion_length": 1646.1540832519531, "epoch": 0.17504293928758122, - "grad_norm": 0.12332329899072647, - "kl": 0.004634857177734375, - "learning_rate": 9.82962913144534e-08, - "loss": 0.0255, - "reward": 0.3398437649011612, - "reward_std": 0.0976981595158577, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 18.074647903442383, + "kl": 3.30078125, + "learning_rate": 4.91481456572267e-07, + "loss": 0.2596, + "reward": 0.3225446492433548, + "reward_std": 0.20551162213087082, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.290736623108387, + "rewards/tag_count_reward": 0.2912946492433548, "step": 586 }, { "clip_ratio": 0.0, - "completion_length": 1986.0111999511719, + "completion_length": 1656.9577026367188, "epoch": 0.1753416473751027, - "grad_norm": 0.11486637592315674, - "kl": 0.003993988037109375, - "learning_rate": 9.828276730354352e-08, - "loss": 0.0293, - "reward": 0.3225446566939354, - "reward_std": 0.10905128717422485, - "rewards/accuracy_reward": 0.04017857299186289, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.282366082072258, + "grad_norm": 6.897797107696533, + "kl": 2.8515625, + "learning_rate": 4.914138365177176e-07, + "loss": 0.26, + "reward": 0.3141741193830967, + "reward_std": 0.22537293657660484, + "rewards/accuracy_reward": 0.03125000139698386, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2806919775903225, "step": 587 }, { "clip_ratio": 0.0, - "completion_length": 1965.6942749023438, + "completion_length": 1574.5960693359375, "epoch": 0.17564035546262416, - "grad_norm": 0.13012564182281494, - "kl": 0.004749298095703125, - "learning_rate": 9.826919076571502e-08, - "loss": 0.0199, - "reward": 0.3722098469734192, - "reward_std": 0.12214506138116121, - "rewards/accuracy_reward": 0.0781250037252903, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2940848395228386, + "grad_norm": 13.853618621826172, + "kl": 2.349609375, + "learning_rate": 4.913459538285751e-07, + "loss": 0.2598, + "reward": 0.3722098395228386, + "reward_std": 0.23297493532299995, + "rewards/accuracy_reward": 0.06919643376022577, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3007812649011612, "step": 588 }, { "clip_ratio": 0.0, - "completion_length": 2014.2835693359375, + "completion_length": 1621.83935546875, "epoch": 0.17593906355014563, - "grad_norm": 0.1154482290148735, - "kl": 0.0034942626953125, - "learning_rate": 9.825556171573788e-08, - "loss": 0.0151, - "reward": 0.329241082072258, - "reward_std": 0.07242862042039633, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 16.838857650756836, + "kl": 2.328125, + "learning_rate": 4.912778085786893e-07, + "loss": 0.2318, + "reward": 0.3448660969734192, + "reward_std": 0.20028283074498177, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2845982238650322, + "rewards/tag_count_reward": 0.293526791036129, "step": 589 }, { "clip_ratio": 0.0, - "completion_length": 2013.1675109863281, + "completion_length": 1692.7076721191406, "epoch": 0.1762377716376671, - "grad_norm": 0.10010701417922974, - "kl": 0.0034027099609375, - "learning_rate": 9.824188016843913e-08, - "loss": 0.0243, - "reward": 0.313616082072258, - "reward_std": 0.04989237990230322, + "grad_norm": 17.532466888427734, + "kl": 3.38671875, + "learning_rate": 4.912094008421956e-07, + "loss": 0.2709, + "reward": 0.317522332072258, + "reward_std": 0.17356320843100548, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2756696566939354, + "rewards/tag_count_reward": 0.2795758992433548, "step": 590 }, { "clip_ratio": 0.0, - "completion_length": 1960.0826721191406, + "completion_length": 1581.6116638183594, "epoch": 0.17653647972518857, - "grad_norm": 0.0984894409775734, - "kl": 0.00476837158203125, - "learning_rate": 9.822814613870297e-08, - "loss": 0.0289, - "reward": 0.329799123108387, - "reward_std": 0.05980819556862116, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 11.020123481750488, + "kl": 2.76171875, + "learning_rate": 4.911407306935148e-07, + "loss": 0.2541, + "reward": 0.3537946566939354, + "reward_std": 0.17900799587368965, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "rewards/tag_count_reward": 0.3091518059372902, "step": 591 }, { "clip_ratio": 0.0, - "completion_length": 1998.7657165527344, + "completion_length": 1668.1473999023438, "epoch": 0.17683518781271004, - "grad_norm": 0.11400695890188217, - "kl": 0.00365447998046875, - "learning_rate": 9.821435964147066e-08, - "loss": 0.0297, - "reward": 0.298549123108387, - "reward_std": 0.08373365364968777, - "rewards/accuracy_reward": 0.013392857508733869, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "grad_norm": 37.67497253417969, + "kl": 1.87890625, + "learning_rate": 4.910717982073534e-07, + "loss": 0.2106, + "reward": 0.2991071566939354, + "reward_std": 0.1814970299601555, + "rewards/accuracy_reward": 0.0022321429569274187, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.294642873108387, "step": 592 }, { "clip_ratio": 0.0, - "completion_length": 2012.4688415527344, + "completion_length": 1638.9375915527344, "epoch": 0.17713389590023149, - "grad_norm": 0.10547696799039841, - "kl": 0.003528594970703125, - "learning_rate": 9.820052069174061e-08, - "loss": 0.0175, - "reward": 0.3203125149011612, - "reward_std": 0.06063377484679222, - "rewards/accuracy_reward": 0.04017857322469354, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339402794838, + "grad_norm": 25.699308395385742, + "kl": 2.04296875, + "learning_rate": 4.91002603458703e-07, + "loss": 0.231, + "reward": 0.3549107238650322, + "reward_std": 0.22091063484549522, + "rewards/accuracy_reward": 0.05580357555299997, + "rewards/format_reward": 0.006696428870782256, + "rewards/tag_count_reward": 0.2924107238650322, "step": 593 }, { "clip_ratio": 0.0, - "completion_length": 2000.6675109863281, + "completion_length": 1667.4554138183594, "epoch": 0.17743260398775296, - "grad_norm": 0.11068642884492874, - "kl": 0.003757476806640625, - "learning_rate": 9.818662930456817e-08, - "loss": 0.0236, - "reward": 0.439174123108387, - "reward_std": 0.12220688536763191, - "rewards/accuracy_reward": 0.15178571757860482, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2873883992433548, + "grad_norm": 6.941884994506836, + "kl": 2.3984375, + "learning_rate": 4.909331465228409e-07, + "loss": 0.2225, + "reward": 0.4179687649011612, + "reward_std": 0.2182648368179798, + "rewards/accuracy_reward": 0.13616071827709675, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2795759066939354, "step": 594 }, { "clip_ratio": 0.0, - "completion_length": 1993.6764221191406, + "completion_length": 1666.7701721191406, "epoch": 0.17773131207527443, - "grad_norm": 0.11685801297426224, - "kl": 0.00400543212890625, - "learning_rate": 9.817268549506587e-08, - "loss": 0.026, - "reward": 0.3309151902794838, - "reward_std": 0.06439430266618729, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 52.901390075683594, + "kl": 4.2734375, + "learning_rate": 4.908634274753294e-07, + "loss": 0.3053, + "reward": 0.3236607313156128, + "reward_std": 0.18835443258285522, + "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401902794838, + "rewards/tag_count_reward": 0.2723214402794838, "step": 595 }, { "clip_ratio": 0.0, - "completion_length": 1888.6741638183594, + "completion_length": 1513.2567749023438, "epoch": 0.1780300201627959, - "grad_norm": 0.12414352595806122, - "kl": 0.00638580322265625, - "learning_rate": 9.815868927840318e-08, - "loss": 0.0335, - "reward": 0.4045759215950966, - "reward_std": 0.09588562604039907, - "rewards/accuracy_reward": 0.0959821455180645, + "grad_norm": 10.335769653320312, + "kl": 2.78125, + "learning_rate": 4.907934463920159e-07, + "loss": 0.2762, + "reward": 0.404017873108387, + "reward_std": 0.2034577988088131, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937649011612, + "rewards/tag_count_reward": 0.3191964477300644, "step": 596 }, { "clip_ratio": 0.0, - "completion_length": 2015.4063415527344, + "completion_length": 1700.5335388183594, "epoch": 0.17832872825031737, - "grad_norm": 0.09391847997903824, - "kl": 0.003330230712890625, - "learning_rate": 9.814464066980662e-08, - "loss": 0.018, - "reward": 0.3448660895228386, - "reward_std": 0.03479075152426958, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 28.970849990844727, + "kl": 3.8515625, + "learning_rate": 4.907232033490331e-07, + "loss": 0.3015, + "reward": 0.3510044813156128, + "reward_std": 0.17583762481808662, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375149011612, + "rewards/tag_count_reward": 0.2728794738650322, "step": 597 }, { "clip_ratio": 0.0, - "completion_length": 2008.1205749511719, + "completion_length": 1686.16748046875, "epoch": 0.17862743633783884, - "grad_norm": 0.1289839744567871, - "kl": 0.003635406494140625, - "learning_rate": 9.813053968455967e-08, - "loss": 0.0258, - "reward": 0.3906250223517418, - "reward_std": 0.12252834253013134, - "rewards/accuracy_reward": 0.10937500605359674, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500149011612, + "grad_norm": 22.58493423461914, + "kl": 3.79296875, + "learning_rate": 4.906526984227984e-07, + "loss": 0.3228, + "reward": 0.3738839402794838, + "reward_std": 0.22883356735110283, + "rewards/accuracy_reward": 0.08705357578583062, + "rewards/format_reward": 0.006696428870782256, + "rewards/tag_count_reward": 0.2801339440047741, "step": 598 }, { "clip_ratio": 0.0, - "completion_length": 1945.7032165527344, + "completion_length": 1569.5826416015625, "epoch": 0.1789261444253603, - "grad_norm": 0.13418006896972656, - "kl": 0.00540924072265625, - "learning_rate": 9.811638633800286e-08, - "loss": 0.0248, - "reward": 0.3309151902794838, - "reward_std": 0.1132102683186531, - "rewards/accuracy_reward": 0.03125000232830644, + "grad_norm": 14.223552703857422, + "kl": 2.9765625, + "learning_rate": 4.905819316900142e-07, + "loss": 0.3145, + "reward": 0.337611623108387, + "reward_std": 0.2109643742442131, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651902794838, + "rewards/tag_count_reward": 0.3152901977300644, "step": 599 }, { "clip_ratio": 0.0, - "completion_length": 2003.9242248535156, + "completion_length": 1622.274658203125, "epoch": 0.17922485251288178, - "grad_norm": 0.11640671640634537, - "kl": 0.004062652587890625, - "learning_rate": 9.810218064553362e-08, - "loss": 0.0179, - "reward": 0.3716518059372902, - "reward_std": 0.07612411770969629, - "rewards/accuracy_reward": 0.08482143119908869, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2868303656578064, + "grad_norm": 9.300459861755371, + "kl": 2.63671875, + "learning_rate": 4.905109032276681e-07, + "loss": 0.2732, + "reward": 0.372767873108387, + "reward_std": 0.21699827164411545, + "rewards/accuracy_reward": 0.0848214328289032, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.285714291036129, "step": 600 }, { "clip_ratio": 0.0, - "completion_length": 1950.0447082519531, + "completion_length": 1651.21435546875, "epoch": 0.17952356060040325, - "grad_norm": 0.14411337673664093, - "kl": 0.005458831787109375, - "learning_rate": 9.808792262260635e-08, - "loss": 0.0245, - "reward": 0.3353794813156128, - "reward_std": 0.12459973804652691, - "rewards/accuracy_reward": 0.03125000139698386, + "grad_norm": 51.11439895629883, + "kl": 5.03125, + "learning_rate": 4.904396131130317e-07, + "loss": 0.3929, + "reward": 0.317522332072258, + "reward_std": 0.23021812364459038, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.2818080484867096, "step": 601 }, { "clip_ratio": 0.0, - "completion_length": 1996.9085693359375, + "completion_length": 1674.1719360351562, "epoch": 0.17982226868792472, - "grad_norm": 0.11018816381692886, - "kl": 0.0038299560546875, - "learning_rate": 9.807361228473239e-08, - "loss": 0.0237, - "reward": 0.3833705633878708, - "reward_std": 0.08146911580115557, - "rewards/accuracy_reward": 0.09821428824216127, + "grad_norm": 20.548410415649414, + "kl": 3.171875, + "learning_rate": 4.90368061423662e-07, + "loss": 0.2202, + "reward": 0.3766741305589676, + "reward_std": 0.1925434172153473, + "rewards/accuracy_reward": 0.08928571501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "rewards/tag_count_reward": 0.2873884066939354, "step": 602 }, { "clip_ratio": 0.0, - "completion_length": 1938.0313110351562, + "completion_length": 1580.935302734375, "epoch": 0.1801209767754462, - "grad_norm": 0.13691875338554382, - "kl": 0.005584716796875, - "learning_rate": 9.805924964748e-08, - "loss": 0.0353, - "reward": 0.3381696566939354, - "reward_std": 0.10988021083176136, - "rewards/accuracy_reward": 0.029017857741564512, + "grad_norm": 16.438804626464844, + "kl": 2.796875, + "learning_rate": 4.902962482374e-07, + "loss": 0.2856, + "reward": 0.3582589402794838, + "reward_std": 0.21278730779886246, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.322544664144516, "step": 603 }, { "clip_ratio": 0.0, - "completion_length": 1919.4889221191406, + "completion_length": 1576.2232971191406, "epoch": 0.18041968486296767, - "grad_norm": 0.1195433959364891, - "kl": 0.00608062744140625, - "learning_rate": 9.804483472647431e-08, - "loss": 0.026, - "reward": 0.3989955559372902, - "reward_std": 0.07496745884418488, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 21.912410736083984, + "kl": 2.5234375, + "learning_rate": 4.902241736323715e-07, + "loss": 0.2472, + "reward": 0.4162946566939354, + "reward_std": 0.18398118019104004, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "rewards/tag_count_reward": 0.3203125074505806, "step": 604 }, { "clip_ratio": 0.0, - "completion_length": 1934.134033203125, + "completion_length": 1618.6072387695312, "epoch": 0.18071839295048914, - "grad_norm": 0.13177490234375, - "kl": 0.005645751953125, - "learning_rate": 9.803036753739732e-08, - "loss": 0.0325, - "reward": 0.3582589402794838, - "reward_std": 0.09373020008206367, - "rewards/accuracy_reward": 0.06473214528523386, + "grad_norm": 10.333484649658203, + "kl": 3.0390625, + "learning_rate": 4.901518376869866e-07, + "loss": 0.2857, + "reward": 0.3565848395228386, + "reward_std": 0.18648212775588036, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2935267984867096, + "rewards/tag_count_reward": 0.3030134066939354, "step": 605 }, { "clip_ratio": 0.0, - "completion_length": 2024.0045166015625, + "completion_length": 1692.0357971191406, "epoch": 0.1810171010380106, - "grad_norm": 0.10388819873332977, - "kl": 0.00323486328125, - "learning_rate": 9.801584809598793e-08, - "loss": 0.0122, - "reward": 0.275111623108387, - "reward_std": 0.06108877854421735, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 39.94902420043945, + "kl": 4.359375, + "learning_rate": 4.900792404799396e-07, + "loss": 0.3473, + "reward": 0.2929687649011612, + "reward_std": 0.20128410309553146, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2706473395228386, + "rewards/tag_count_reward": 0.2773437574505806, "step": 606 }, { "clip_ratio": 0.0, - "completion_length": 1970.9554443359375, + "completion_length": 1629.6027526855469, "epoch": 0.18131580912553208, - "grad_norm": 0.13596095144748688, - "kl": 0.004852294921875, - "learning_rate": 9.800127641804188e-08, - "loss": 0.0311, - "reward": 0.4386161044239998, - "reward_std": 0.11893527396023273, - "rewards/accuracy_reward": 0.1316964328289032, + "grad_norm": 26.49481201171875, + "kl": 3.83203125, + "learning_rate": 4.900063820902094e-07, + "loss": 0.3042, + "reward": 0.436383955180645, + "reward_std": 0.2070818915963173, + "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.3091517984867096, "step": 607 }, { "clip_ratio": 0.0, - "completion_length": 1993.1161804199219, + "completion_length": 1623.35498046875, "epoch": 0.18161451721305355, - "grad_norm": 0.12552356719970703, - "kl": 0.004589080810546875, - "learning_rate": 9.798665251941172e-08, - "loss": 0.0304, - "reward": 0.3197544813156128, - "reward_std": 0.1136146280914545, - "rewards/accuracy_reward": 0.026785715483129025, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2929687649011612, + "grad_norm": 14.277666091918945, + "kl": 4.11328125, + "learning_rate": 4.899332625970586e-07, + "loss": 0.3485, + "reward": 0.3074776902794838, + "reward_std": 0.2183593437075615, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2896205484867096, "step": 608 }, { "clip_ratio": 0.0, - "completion_length": 2023.2969360351562, + "completion_length": 1687.8460693359375, "epoch": 0.18191322530057502, - "grad_norm": 0.10934681445360184, - "kl": 0.00348663330078125, - "learning_rate": 9.79719764160068e-08, - "loss": 0.0169, - "reward": 0.290178582072258, - "reward_std": 0.08074933756142855, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 5.5822553634643555, + "kl": 3.25, + "learning_rate": 4.89859882080034e-07, + "loss": 0.2597, + "reward": 0.3030133992433548, + "reward_std": 0.22221703827381134, + "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500149011612, + "rewards/tag_count_reward": 0.2806919813156128, "step": 609 }, { "clip_ratio": 0.0, - "completion_length": 1990.2210388183594, + "completion_length": 1618.9397888183594, "epoch": 0.1822119333880965, - "grad_norm": 0.12029717862606049, - "kl": 0.004619598388671875, - "learning_rate": 9.795724812379333e-08, - "loss": 0.0329, - "reward": 0.3130580559372902, - "reward_std": 0.11420746706426144, - "rewards/accuracy_reward": 0.022321430034935474, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.290736623108387, + "grad_norm": 10.38758373260498, + "kl": 3.109375, + "learning_rate": 4.897862406189667e-07, + "loss": 0.3041, + "reward": 0.3392857238650322, + "reward_std": 0.24260738864541054, + "rewards/accuracy_reward": 0.04910714365541935, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2879464477300644, "step": 610 }, { "clip_ratio": 0.0, - "completion_length": 1921.7813110351562, + "completion_length": 1577.9286499023438, "epoch": 0.18251064147561796, - "grad_norm": 0.1621660590171814, - "kl": 0.00629425048828125, - "learning_rate": 9.794246765879419e-08, - "loss": 0.0554, - "reward": 0.5150669887661934, - "reward_std": 0.15306039806455374, - "rewards/accuracy_reward": 0.1986607238650322, + "grad_norm": 9.135944366455078, + "kl": 2.833984375, + "learning_rate": 4.89712338293971e-07, + "loss": 0.2745, + "reward": 0.4447544887661934, + "reward_std": 0.21125075966119766, + "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.3041294738650322, "step": 611 }, { "clip_ratio": 0.0, - "completion_length": 1986.4264221191406, + "completion_length": 1580.3103332519531, "epoch": 0.18280934956313943, - "grad_norm": 0.13141745328903198, - "kl": 0.004734039306640625, - "learning_rate": 9.792763503708911e-08, - "loss": 0.0403, - "reward": 0.294084832072258, - "reward_std": 0.06998394895344973, - "rewards/accuracy_reward": 0.0, + "grad_norm": 29.207996368408203, + "kl": 2.73828125, + "learning_rate": 4.896381751854456e-07, + "loss": 0.2872, + "reward": 0.3175223395228386, + "reward_std": 0.2046714834868908, + "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294084832072258, + "rewards/tag_count_reward": 0.3041294813156128, "step": 612 }, { "clip_ratio": 0.0, - "completion_length": 1936.6987609863281, + "completion_length": 1483.1540832519531, "epoch": 0.1831080576506609, - "grad_norm": 0.13934257626533508, - "kl": 0.00586700439453125, - "learning_rate": 9.791275027481454e-08, - "loss": 0.0484, - "reward": 0.4062500149011612, - "reward_std": 0.09853017795830965, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 26.977313995361328, + "kl": 2.46484375, + "learning_rate": 4.895637513740727e-07, + "loss": 0.2681, + "reward": 0.4207589477300644, + "reward_std": 0.17794334888458252, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.333705373108387, "step": 613 }, { "clip_ratio": 0.0, - "completion_length": 1982.7835693359375, + "completion_length": 1615.3906860351562, "epoch": 0.18340676573818235, - "grad_norm": 0.13111437857151031, - "kl": 0.00485992431640625, - "learning_rate": 9.789781338816361e-08, - "loss": 0.0371, - "reward": 0.3582589477300644, - "reward_std": 0.11459081061184406, - "rewards/accuracy_reward": 0.05803571594879031, + "grad_norm": 10.343697547912598, + "kl": 2.9921875, + "learning_rate": 4.89489066940818e-07, + "loss": 0.2736, + "reward": 0.3794643059372902, + "reward_std": 0.20134438574314117, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3002232313156128, + "rewards/tag_count_reward": 0.3169643059372902, "step": 614 }, { "clip_ratio": 0.0, - "completion_length": 1997.21435546875, + "completion_length": 1681.935302734375, "epoch": 0.18370547382570382, - "grad_norm": 0.12100611627101898, - "kl": 0.00435638427734375, - "learning_rate": 9.78828243933862e-08, - "loss": 0.0264, - "reward": 0.3286830484867096, - "reward_std": 0.11570020485669374, - "rewards/accuracy_reward": 0.033482144586741924, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952009066939354, + "grad_norm": 14.852068901062012, + "kl": 3.83203125, + "learning_rate": 4.89414121966931e-07, + "loss": 0.3302, + "reward": 0.299107164144516, + "reward_std": 0.2102036066353321, + "rewards/accuracy_reward": 0.01562500116415322, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.2812500223517418, "step": 615 }, { "clip_ratio": 0.0, - "completion_length": 1973.6518859863281, + "completion_length": 1658.2188415527344, "epoch": 0.1840041819132253, - "grad_norm": 0.13030511140823364, - "kl": 0.004985809326171875, - "learning_rate": 9.786778330678887e-08, - "loss": 0.0198, - "reward": 0.3292410895228386, - "reward_std": 0.07312548719346523, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 16.310871124267578, + "kl": 3.6484375, + "learning_rate": 4.893389165339443e-07, + "loss": 0.2814, + "reward": 0.3348214477300644, + "reward_std": 0.19258644804358482, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2890625149011612, + "rewards/tag_count_reward": 0.2879464402794838, "step": 616 }, { "clip_ratio": 0.0, - "completion_length": 1938.46435546875, + "completion_length": 1569.5804138183594, "epoch": 0.18430289000074676, - "grad_norm": 0.13947314023971558, - "kl": 0.00605010986328125, - "learning_rate": 9.785269014473485e-08, - "loss": 0.0353, - "reward": 0.3850446715950966, - "reward_std": 0.0762511370703578, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 21.803630828857422, + "kl": 3.1484375, + "learning_rate": 4.892634507236742e-07, + "loss": 0.3106, + "reward": 0.3822544813156128, + "reward_std": 0.1858588345348835, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.306361623108387, "step": 617 }, { "clip_ratio": 0.0, - "completion_length": 2010.7746276855469, + "completion_length": 1668.4822387695312, "epoch": 0.18460159808826823, - "grad_norm": 0.12764765322208405, - "kl": 0.00437164306640625, - "learning_rate": 9.783754492364404e-08, - "loss": 0.0266, - "reward": 0.368861623108387, - "reward_std": 0.08611817471683025, - "rewards/accuracy_reward": 0.08258929220028222, + "grad_norm": 14.831948280334473, + "kl": 3.86328125, + "learning_rate": 4.891877246182202e-07, + "loss": 0.3119, + "reward": 0.3783482164144516, + "reward_std": 0.1814921796321869, + "rewards/accuracy_reward": 0.0803571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286272332072258, + "rewards/tag_count_reward": 0.2979910746216774, "step": 618 }, { "clip_ratio": 0.0, - "completion_length": 1940.1407165527344, + "completion_length": 1591.1339721679688, "epoch": 0.1849003061757897, - "grad_norm": 0.1284300982952118, - "kl": 0.00598907470703125, - "learning_rate": 9.782234765999291e-08, - "loss": 0.0184, - "reward": 0.3186384066939354, - "reward_std": 0.08755614422261715, - "rewards/accuracy_reward": 0.024553572526201606, + "grad_norm": 39.02629852294922, + "kl": 4.34375, + "learning_rate": 4.891117382999646e-07, + "loss": 0.3558, + "reward": 0.3275669813156128, + "reward_std": 0.2049858383834362, + "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.294084832072258, "step": 619 }, { "clip_ratio": 0.0, - "completion_length": 1949.1407165527344, + "completion_length": 1618.7143859863281, "epoch": 0.18519901426331117, - "grad_norm": 0.16001001000404358, - "kl": 0.005634307861328125, - "learning_rate": 9.780709837031463e-08, - "loss": 0.0411, - "reward": 0.3789062649011612, - "reward_std": 0.07969588413834572, - "rewards/accuracy_reward": 0.08705357392318547, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2918526902794838, + "grad_norm": 26.222776412963867, + "kl": 3.7421875, + "learning_rate": 4.890354918515731e-07, + "loss": 0.3142, + "reward": 0.3967634066939354, + "reward_std": 0.21178822219371796, + "rewards/accuracy_reward": 0.09151786309666932, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3030134066939354, "step": 620 }, { "clip_ratio": 0.0, - "completion_length": 2002.2880554199219, + "completion_length": 1662.9130249023438, "epoch": 0.18549772235083264, - "grad_norm": 0.09649185836315155, - "kl": 0.004505157470703125, - "learning_rate": 9.779179707119893e-08, - "loss": 0.0284, - "reward": 0.2901785895228386, - "reward_std": 0.056237153708934784, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 8.32137680053711, + "kl": 3.53125, + "learning_rate": 4.889589853559946e-07, + "loss": 0.2963, + "reward": 0.3125000149011612, + "reward_std": 0.20667875930666924, + "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2812500149011612, + "rewards/tag_count_reward": 0.2946428656578064, "step": 621 }, { "clip_ratio": 0.0, - "completion_length": 1883.4710693359375, + "completion_length": 1503.6317749023438, "epoch": 0.18579643043835412, - "grad_norm": 0.14180447161197662, - "kl": 0.00768280029296875, - "learning_rate": 9.77764437792921e-08, - "loss": 0.0543, - "reward": 0.4268973395228386, - "reward_std": 0.15743717178702354, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 43.870338439941406, + "kl": 1.9375, + "learning_rate": 4.888822188964606e-07, + "loss": 0.2487, + "reward": 0.4508928805589676, + "reward_std": 0.2510574646294117, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.3348214402794838, "step": 622 }, { "clip_ratio": 0.0, - "completion_length": 1833.0692443847656, + "completion_length": 1484.8259887695312, "epoch": 0.1860951385258756, - "grad_norm": 0.1496400684118271, - "kl": 0.0090789794921875, - "learning_rate": 9.776103851129704e-08, - "loss": 0.0456, - "reward": 0.4525669813156128, - "reward_std": 0.17151938192546368, - "rewards/accuracy_reward": 0.12500000605359674, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "grad_norm": 33.6615104675293, + "kl": 2.12109375, + "learning_rate": 4.888051925564853e-07, + "loss": 0.2328, + "reward": 0.440848246216774, + "reward_std": 0.23657935112714767, + "rewards/accuracy_reward": 0.10937500605359674, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.329241082072258, "step": 623 }, { "clip_ratio": 0.0, - "completion_length": 1939.5112609863281, + "completion_length": 1610.1005249023438, "epoch": 0.18639384661339706, - "grad_norm": 0.12245795130729675, - "kl": 0.006389617919921875, - "learning_rate": 9.774558128397316e-08, - "loss": 0.0325, - "reward": 0.4023437723517418, - "reward_std": 0.10083308909088373, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 32.655677795410156, + "kl": 2.302734375, + "learning_rate": 4.887279064198659e-07, + "loss": 0.2492, + "reward": 0.4090401977300644, + "reward_std": 0.2252359464764595, + "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.301897332072258, + "rewards/tag_count_reward": 0.3018973395228386, "step": 624 }, { "clip_ratio": 0.0, - "completion_length": 2005.9755249023438, + "completion_length": 1678.9241638183594, "epoch": 0.18669255470091853, - "grad_norm": 0.11727248877286911, - "kl": 0.00455474853515625, - "learning_rate": 9.773007211413643e-08, - "loss": 0.0229, - "reward": 0.2818080484867096, - "reward_std": 0.05028681270778179, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 22.373123168945312, + "kl": 2.83203125, + "learning_rate": 4.886503605706821e-07, + "loss": 0.2782, + "reward": 0.325334832072258, + "reward_std": 0.20584102347493172, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2795759066939354, + "rewards/tag_count_reward": 0.2985491156578064, "step": 625 }, { "clip_ratio": 0.0, - "completion_length": 1993.5134887695312, + "completion_length": 1638.8482971191406, "epoch": 0.18699126278844, - "grad_norm": 0.11104948073625565, - "kl": 0.004993438720703125, - "learning_rate": 9.771451101865929e-08, - "loss": 0.0255, - "reward": 0.3543526902794838, - "reward_std": 0.09674243349581957, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 15.191483497619629, + "kl": 2.65234375, + "learning_rate": 4.885725550932964e-07, + "loss": 0.2436, + "reward": 0.3655134066939354, + "reward_std": 0.21288607269525528, + "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.282924123108387, + "rewards/tag_count_reward": 0.3052455484867096, "step": 626 }, { "clip_ratio": 0.0, - "completion_length": 1900.0313415527344, + "completion_length": 1627.3973999023438, "epoch": 0.18728997087596147, - "grad_norm": 0.12716974318027496, - "kl": 0.00748443603515625, - "learning_rate": 9.769889801447067e-08, - "loss": 0.0363, - "reward": 0.4202009066939354, - "reward_std": 0.07411943282932043, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 15.465039253234863, + "kl": 3.78515625, + "learning_rate": 4.884944900723533e-07, + "loss": 0.3171, + "reward": 0.3872768059372902, + "reward_std": 0.19754140451550484, + "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.3091517984867096, "step": 627 }, { "clip_ratio": 0.0, - "completion_length": 1974.7947387695312, + "completion_length": 1637.3370971679688, "epoch": 0.18758867896348294, - "grad_norm": 0.1381523311138153, - "kl": 0.0054779052734375, - "learning_rate": 9.768323311855601e-08, - "loss": 0.0396, - "reward": 0.318080373108387, - "reward_std": 0.08953116834163666, - "rewards/accuracy_reward": 0.022321430267766118, + "grad_norm": 33.3397216796875, + "kl": 4.16796875, + "learning_rate": 4.884161655927801e-07, + "loss": 0.3018, + "reward": 0.3085937574505806, + "reward_std": 0.20259520784020424, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589328289032, + "rewards/tag_count_reward": 0.2885044738650322, "step": 628 }, { "clip_ratio": 0.0, - "completion_length": 1979.4063110351562, + "completion_length": 1671.247802734375, "epoch": 0.1878873870510044, - "grad_norm": 0.12769055366516113, - "kl": 0.00524139404296875, - "learning_rate": 9.766751634795718e-08, - "loss": 0.0248, - "reward": 0.4425223395228386, - "reward_std": 0.11111154034733772, - "rewards/accuracy_reward": 0.14285714738070965, + "grad_norm": 37.147525787353516, + "kl": 4.47265625, + "learning_rate": 4.883375817397859e-07, + "loss": 0.3346, + "reward": 0.4190848395228386, + "reward_std": 0.21098532527685165, + "rewards/accuracy_reward": 0.13169643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651902794838, + "rewards/tag_count_reward": 0.2873883992433548, "step": 629 }, { "clip_ratio": 0.0, - "completion_length": 1975.8482971191406, + "completion_length": 1632.8840026855469, "epoch": 0.18818609513852588, - "grad_norm": 0.1448988914489746, - "kl": 0.00536346435546875, - "learning_rate": 9.765174771977247e-08, - "loss": 0.0365, - "reward": 0.325892873108387, - "reward_std": 0.09200314059853554, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 27.872949600219727, + "kl": 4.00390625, + "learning_rate": 4.882587385988623e-07, + "loss": 0.3139, + "reward": 0.318080373108387, + "reward_std": 0.2043987177312374, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107313156128, + "rewards/tag_count_reward": 0.2912946492433548, "step": 630 }, { "clip_ratio": 0.0, - "completion_length": 1919.3125610351562, + "completion_length": 1567.22998046875, "epoch": 0.18848480322604735, - "grad_norm": 0.1447986364364624, - "kl": 0.007572174072265625, - "learning_rate": 9.763592725115663e-08, - "loss": 0.0436, - "reward": 0.3169642984867096, - "reward_std": 0.09114566072821617, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 10.563283920288086, + "kl": 3.765625, + "learning_rate": 4.881796362557832e-07, + "loss": 0.3254, + "reward": 0.3247767984867096, + "reward_std": 0.20603583008050919, + "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3058035746216774, + "rewards/tag_count_reward": 0.2979910746216774, "step": 631 }, { "clip_ratio": 0.0, - "completion_length": 1866.2545471191406, + "completion_length": 1497.3326721191406, "epoch": 0.18878351131356882, - "grad_norm": 0.17159882187843323, - "kl": 0.008819580078125, - "learning_rate": 9.762005495932075e-08, - "loss": 0.0637, - "reward": 0.3934151977300644, - "reward_std": 0.10838056728243828, - "rewards/accuracy_reward": 0.07142857648432255, + "grad_norm": 13.469141006469727, + "kl": 2.61328125, + "learning_rate": 4.881002747966038e-07, + "loss": 0.2605, + "reward": 0.4218750149011612, + "reward_std": 0.19058503955602646, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.3392857238650322, "step": 632 }, { "clip_ratio": 0.0, - "completion_length": 1937.2344665527344, + "completion_length": 1568.7969360351562, "epoch": 0.1890822194010903, - "grad_norm": 0.1189238429069519, - "kl": 0.006683349609375, - "learning_rate": 9.760413086153234e-08, - "loss": 0.0343, - "reward": 0.3616071566939354, - "reward_std": 0.11600067280232906, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 25.033424377441406, + "kl": 2.59765625, + "learning_rate": 4.880206543076617e-07, + "loss": 0.26, + "reward": 0.3420759066939354, + "reward_std": 0.20988080650568008, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3102678656578064, + "rewards/tag_count_reward": 0.3242187649011612, "step": 633 }, { "clip_ratio": 0.0, - "completion_length": 1930.6295471191406, + "completion_length": 1613.1407165527344, "epoch": 0.18938092748861177, - "grad_norm": 0.14091913402080536, - "kl": 0.00701904296875, - "learning_rate": 9.758815497511528e-08, - "loss": 0.043, - "reward": 0.3638392984867096, - "reward_std": 0.09348317421972752, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 13.932612419128418, + "kl": 3.41796875, + "learning_rate": 4.879407748755764e-07, + "loss": 0.2831, + "reward": 0.3872768059372902, + "reward_std": 0.20878805965185165, + "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.302455373108387, "step": 634 }, { "clip_ratio": 0.0, - "completion_length": 1984.2389221191406, + "completion_length": 1649.4308776855469, "epoch": 0.18967963557613324, - "grad_norm": 0.12655022740364075, - "kl": 0.00551605224609375, - "learning_rate": 9.757212731744973e-08, - "loss": 0.0333, - "reward": 0.3041294664144516, - "reward_std": 0.08104340638965368, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 4.663081645965576, + "kl": 3.28515625, + "learning_rate": 4.878606365872486e-07, + "loss": 0.2992, + "reward": 0.3030134066939354, + "reward_std": 0.19142171368002892, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297433041036129, + "rewards/tag_count_reward": 0.287388414144516, "step": 635 }, { "clip_ratio": 0.0, - "completion_length": 1991.4063415527344, + "completion_length": 1674.6920471191406, "epoch": 0.18997834366365468, - "grad_norm": 0.13040976226329803, - "kl": 0.0052947998046875, - "learning_rate": 9.755604790597223e-08, - "loss": 0.0285, - "reward": 0.324776791036129, - "reward_std": 0.10170167870819569, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 44.26906204223633, + "kl": 2.3515625, + "learning_rate": 4.877802395298612e-07, + "loss": 0.2654, + "reward": 0.3281250223517418, + "reward_std": 0.22746697440743446, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2935267984867096, + "rewards/tag_count_reward": 0.290178582072258, "step": 636 }, { "clip_ratio": 0.0, - "completion_length": 1937.7545471191406, + "completion_length": 1646.3996276855469, "epoch": 0.19027705175117615, - "grad_norm": 0.1462779939174652, - "kl": 0.00658416748046875, - "learning_rate": 9.753991675817562e-08, - "loss": 0.0419, - "reward": 0.3392857313156128, - "reward_std": 0.1091129332780838, - "rewards/accuracy_reward": 0.03348214295692742, + "grad_norm": 42.17460632324219, + "kl": 2.333984375, + "learning_rate": 4.87699583790878e-07, + "loss": 0.2495, + "reward": 0.3515625223517418, + "reward_std": 0.19890738278627396, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.3091517984867096, "step": 637 }, { "clip_ratio": 0.0, - "completion_length": 1973.0246276855469, + "completion_length": 1622.7098999023438, "epoch": 0.19057575983869762, - "grad_norm": 0.12442576140165329, - "kl": 0.00574493408203125, - "learning_rate": 9.752373389160895e-08, - "loss": 0.0228, - "reward": 0.3705357238650322, - "reward_std": 0.07348818611353636, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 12.488162994384766, + "kl": 2.77734375, + "learning_rate": 4.876186694580448e-07, + "loss": 0.2436, + "reward": 0.401227705180645, + "reward_std": 0.17957303300499916, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107238650322, + "rewards/tag_count_reward": 0.314174123108387, "step": 638 }, { "clip_ratio": 0.0, - "completion_length": 1979.4264221191406, + "completion_length": 1588.1897888183594, "epoch": 0.1908744679262191, - "grad_norm": 0.14625002443790436, - "kl": 0.00603485107421875, - "learning_rate": 9.750749932387766e-08, - "loss": 0.0305, - "reward": 0.4101562574505806, - "reward_std": 0.10721490532159805, - "rewards/accuracy_reward": 0.1071428582072258, + "grad_norm": 19.67913246154785, + "kl": 3.0703125, + "learning_rate": 4.875374966193883e-07, + "loss": 0.3204, + "reward": 0.4252232238650322, + "reward_std": 0.20518175140023232, + "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.313616082072258, "step": 639 }, { "clip_ratio": 0.0, - "completion_length": 1972.8371276855469, + "completion_length": 1665.0648193359375, "epoch": 0.19117317601374056, - "grad_norm": 0.13007235527038574, - "kl": 0.00605010986328125, - "learning_rate": 9.749121307264335e-08, - "loss": 0.0395, - "reward": 0.3588169813156128, - "reward_std": 0.10217902436852455, - "rewards/accuracy_reward": 0.06473214644938707, + "grad_norm": 32.708717346191406, + "kl": 4.12109375, + "learning_rate": 4.874560653632167e-07, + "loss": 0.3251, + "reward": 0.3593750223517418, + "reward_std": 0.23873838409781456, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294084832072258, + "rewards/tag_count_reward": 0.2834821492433548, "step": 640 }, { "clip_ratio": 0.0, - "completion_length": 1972.5781860351562, + "completion_length": 1676.6630249023438, "epoch": 0.19147188410126204, - "grad_norm": 0.12897583842277527, - "kl": 0.00569915771484375, - "learning_rate": 9.747487515562383e-08, - "loss": 0.0302, - "reward": 0.3437500223517418, - "reward_std": 0.07652648631483316, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 32.92763137817383, + "kl": 4.4765625, + "learning_rate": 4.873743757781191e-07, + "loss": 0.3681, + "reward": 0.3420759066939354, + "reward_std": 0.21803581342101097, + "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2968750149011612, + "rewards/tag_count_reward": 0.2773437649011612, "step": 641 }, { "clip_ratio": 0.0, - "completion_length": 1988.7210693359375, + "completion_length": 1685.747802734375, "epoch": 0.1917705921887835, - "grad_norm": 0.118667833507061, - "kl": 0.00554656982421875, - "learning_rate": 9.74584855905932e-08, - "loss": 0.0349, - "reward": 0.345424123108387, - "reward_std": 0.08713644836097956, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 27.858640670776367, + "kl": 4.21875, + "learning_rate": 4.87292427952966e-07, + "loss": 0.3191, + "reward": 0.3543526977300644, + "reward_std": 0.19386287406086922, + "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2963169738650322, + "rewards/tag_count_reward": 0.3030134066939354, "step": 642 }, { "clip_ratio": 0.0, - "completion_length": 1951.1764221191406, + "completion_length": 1563.3170166015625, "epoch": 0.19206930027630498, - "grad_norm": 0.13661813735961914, - "kl": 0.0067291259765625, - "learning_rate": 9.744204439538166e-08, - "loss": 0.0433, - "reward": 0.4034598469734192, - "reward_std": 0.07205959968268871, - "rewards/accuracy_reward": 0.10937500861473382, + "grad_norm": 19.176326751708984, + "kl": 4.1171875, + "learning_rate": 4.872102219769083e-07, + "loss": 0.3384, + "reward": 0.432477705180645, + "reward_std": 0.16325363144278526, + "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294084832072258, + "rewards/tag_count_reward": 0.3208705559372902, "step": 643 }, { "clip_ratio": 0.0, - "completion_length": 1889.305908203125, + "completion_length": 1591.0246276855469, "epoch": 0.19236800836382645, - "grad_norm": 0.1544024795293808, - "kl": 0.00814056396484375, - "learning_rate": 9.742555158787565e-08, - "loss": 0.0502, - "reward": 0.4296875298023224, - "reward_std": 0.10808515921235085, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 5.66936731338501, + "kl": 3.38671875, + "learning_rate": 4.871277579393782e-07, + "loss": 0.3103, + "reward": 0.4196428880095482, + "reward_std": 0.19601045176386833, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.316964291036129, "step": 644 }, { "clip_ratio": 0.0, - "completion_length": 1983.0134887695312, + "completion_length": 1606.3951721191406, "epoch": 0.19266671645134792, - "grad_norm": 0.12702025473117828, - "kl": 0.0057830810546875, - "learning_rate": 9.740900718601772e-08, - "loss": 0.0267, - "reward": 0.3956473395228386, - "reward_std": 0.1493928311392665, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 14.642088890075684, + "kl": 3.49609375, + "learning_rate": 4.870450359300886e-07, + "loss": 0.2979, + "reward": 0.3861607313156128, + "reward_std": 0.22472280263900757, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651902794838, + "rewards/tag_count_reward": 0.3147321566939354, "step": 645 }, { "clip_ratio": 0.0, - "completion_length": 2024.4465637207031, + "completion_length": 1710.2188110351562, "epoch": 0.1929654245388694, - "grad_norm": 0.1097978726029396, - "kl": 0.00447845458984375, - "learning_rate": 9.739241120780655e-08, - "loss": 0.0164, - "reward": 0.369419664144516, - "reward_std": 0.09350718557834625, - "rewards/accuracy_reward": 0.08928571594879031, + "grad_norm": 6.783075332641602, + "kl": 3.484375, + "learning_rate": 4.869620560390327e-07, + "loss": 0.2848, + "reward": 0.424107164144516, + "reward_std": 0.2475946806371212, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2801339328289032, + "rewards/tag_count_reward": 0.3013392984867096, "step": 646 }, { "clip_ratio": 0.0, - "completion_length": 1961.4219665527344, + "completion_length": 1600.3973693847656, "epoch": 0.19326413262639086, - "grad_norm": 0.11555801331996918, - "kl": 0.0064239501953125, - "learning_rate": 9.737576367129694e-08, - "loss": 0.0274, - "reward": 0.3275669813156128, - "reward_std": 0.05937607679516077, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 12.36570930480957, + "kl": 3.5859375, + "learning_rate": 4.868788183564847e-07, + "loss": 0.3079, + "reward": 0.3443080484867096, + "reward_std": 0.15416168048977852, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2896205484867096, + "rewards/tag_count_reward": 0.3085937574505806, "step": 647 }, { "clip_ratio": 0.0, - "completion_length": 1926.2746276855469, + "completion_length": 1593.4040832519531, "epoch": 0.19356284071391233, - "grad_norm": 0.13968753814697266, - "kl": 0.0074005126953125, - "learning_rate": 9.735906459459978e-08, - "loss": 0.0429, - "reward": 0.4029017984867096, - "reward_std": 0.08927657082676888, - "rewards/accuracy_reward": 0.09151786123402417, + "grad_norm": 22.541234970092773, + "kl": 2.82421875, + "learning_rate": 4.867953229729988e-07, + "loss": 0.2725, + "reward": 0.388950914144516, + "reward_std": 0.1738101914525032, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.3085937574505806, "step": 648 }, { "clip_ratio": 0.0, - "completion_length": 1962.2009887695312, + "completion_length": 1596.3839721679688, "epoch": 0.1938615488014338, - "grad_norm": 0.15233954787254333, - "kl": 0.0069122314453125, - "learning_rate": 9.734231399588202e-08, - "loss": 0.0403, - "reward": 0.4146205633878708, - "reward_std": 0.1370495930314064, - "rewards/accuracy_reward": 0.10937500186264515, + "grad_norm": 29.291837692260742, + "kl": 2.734375, + "learning_rate": 4.867115699794101e-07, + "loss": 0.2901, + "reward": 0.4068080484867096, + "reward_std": 0.1978348344564438, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.3152901977300644, "step": 649 }, { "clip_ratio": 0.0, - "completion_length": 1955.8393859863281, + "completion_length": 1603.6340026855469, "epoch": 0.19416025688895527, - "grad_norm": 0.13606108725070953, - "kl": 0.006744384765625, - "learning_rate": 9.732551189336669e-08, - "loss": 0.0451, - "reward": 0.4229910969734192, - "reward_std": 0.13023904897272587, - "rewards/accuracy_reward": 0.11607143515720963, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "grad_norm": 32.39169692993164, + "kl": 2.275390625, + "learning_rate": 4.866275594668335e-07, + "loss": 0.237, + "reward": 0.451450914144516, + "reward_std": 0.2406710982322693, + "rewards/accuracy_reward": 0.12276786146685481, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.326450914144516, "step": 650 }, { "clip_ratio": 0.0, - "completion_length": 1979.62060546875, + "completion_length": 1638.8013916015625, "epoch": 0.19445896497647674, - "grad_norm": 0.1434536725282669, - "kl": 0.00583648681640625, - "learning_rate": 9.730865830533282e-08, - "loss": 0.0388, - "reward": 0.3677455484867096, - "reward_std": 0.0747321154922247, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 26.57655143737793, + "kl": 2.890625, + "learning_rate": 4.865432915266641e-07, + "loss": 0.2803, + "reward": 0.3962053656578064, + "reward_std": 0.21351991221308708, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294084832072258, + "rewards/tag_count_reward": 0.3024553656578064, "step": 651 }, { "clip_ratio": 0.0, - "completion_length": 1980.5759887695312, + "completion_length": 1655.0938415527344, "epoch": 0.19475767306399822, - "grad_norm": 0.1353389322757721, - "kl": 0.0061187744140625, - "learning_rate": 9.729175325011545e-08, - "loss": 0.0354, - "reward": 0.388950914144516, - "reward_std": 0.09270811825990677, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 5.260843753814697, + "kl": 3.4765625, + "learning_rate": 4.864587662505773e-07, + "loss": 0.2902, + "reward": 0.388392873108387, + "reward_std": 0.18621479347348213, + "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.305803582072258, "step": 652 }, { "clip_ratio": 0.0, - "completion_length": 1943.1608276367188, + "completion_length": 1654.3817749023438, "epoch": 0.1950563811515197, - "grad_norm": 0.13152237236499786, - "kl": 0.00705718994140625, - "learning_rate": 9.727479674610564e-08, - "loss": 0.0218, - "reward": 0.3861607313156128, - "reward_std": 0.1109043387696147, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 13.082962036132812, + "kl": 3.69921875, + "learning_rate": 4.863739837305282e-07, + "loss": 0.2962, + "reward": 0.3750000149011612, + "reward_std": 0.2190452516078949, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107313156128, + "rewards/tag_count_reward": 0.290178582072258, "step": 653 }, { "clip_ratio": 0.0, - "completion_length": 1937.2857971191406, + "completion_length": 1635.0402526855469, "epoch": 0.19535508923904116, - "grad_norm": 0.13761569559574127, - "kl": 0.00719451904296875, - "learning_rate": 9.72577888117504e-08, - "loss": 0.0441, - "reward": 0.3314732238650322, - "reward_std": 0.09597261343151331, - "rewards/accuracy_reward": 0.033482144586741924, + "grad_norm": 4.247363090515137, + "kl": 3.17578125, + "learning_rate": 4.86288944058752e-07, + "loss": 0.2875, + "reward": 0.330357164144516, + "reward_std": 0.2030586116015911, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297991082072258, + "rewards/tag_count_reward": 0.3035714402794838, "step": 654 }, { "clip_ratio": 0.0, - "completion_length": 2010.7411804199219, + "completion_length": 1734.6116943359375, "epoch": 0.19565379732656263, - "grad_norm": 0.12892284989356995, - "kl": 0.0052490234375, - "learning_rate": 9.724072946555268e-08, - "loss": 0.0242, - "reward": 0.3147321566939354, - "reward_std": 0.10169387608766556, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 23.231365203857422, + "kl": 4.1328125, + "learning_rate": 4.862036473277634e-07, + "loss": 0.3105, + "reward": 0.2890625149011612, + "reward_std": 0.1819249652326107, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2946428656578064, + "rewards/tag_count_reward": 0.2845982313156128, "step": 655 }, { "clip_ratio": 0.0, - "completion_length": 1980.41748046875, + "completion_length": 1635.8572082519531, "epoch": 0.1959525054140841, - "grad_norm": 0.14201684296131134, - "kl": 0.006500244140625, - "learning_rate": 9.722361872607142e-08, - "loss": 0.0334, - "reward": 0.3872767984867096, - "reward_std": 0.0901622474193573, - "rewards/accuracy_reward": 0.0959821455180645, + "grad_norm": 8.652819633483887, + "kl": 3.58984375, + "learning_rate": 4.86118093630357e-07, + "loss": 0.3167, + "reward": 0.3833705559372902, + "reward_std": 0.2126331329345703, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2912946492433548, + "rewards/tag_count_reward": 0.2918526902794838, "step": 656 }, { "clip_ratio": 0.0, - "completion_length": 1883.3148193359375, + "completion_length": 1560.5915832519531, "epoch": 0.19625121350160554, - "grad_norm": 0.1382894515991211, - "kl": 0.00887298583984375, - "learning_rate": 9.720645661192137e-08, - "loss": 0.0364, - "reward": 0.4017857238650322, - "reward_std": 0.1300662774592638, - "rewards/accuracy_reward": 0.08035714412108064, + "grad_norm": 16.258974075317383, + "kl": 3.546875, + "learning_rate": 4.860322830596069e-07, + "loss": 0.3565, + "reward": 0.3895089477300644, + "reward_std": 0.21800580248236656, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.3180803656578064, "step": 657 }, { "clip_ratio": 0.0, - "completion_length": 2020.7389221191406, + "completion_length": 1796.0581359863281, "epoch": 0.196549921589127, - "grad_norm": 0.1297629475593567, - "kl": 0.00482940673828125, - "learning_rate": 9.718924314177326e-08, - "loss": 0.031, - "reward": 0.344866082072258, - "reward_std": 0.06059699505567551, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 66.2879638671875, + "kl": 4.84375, + "learning_rate": 4.859462157088663e-07, + "loss": 0.3193, + "reward": 0.349330373108387, + "reward_std": 0.17484188824892044, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2734375074505806, + "rewards/tag_count_reward": 0.2734375223517418, "step": 658 }, { "clip_ratio": 0.0, - "completion_length": 1946.0112609863281, + "completion_length": 1577.77685546875, "epoch": 0.19684862967664848, - "grad_norm": 0.13999773561954498, - "kl": 0.00728607177734375, - "learning_rate": 9.717197833435366e-08, - "loss": 0.0497, - "reward": 0.3861607387661934, - "reward_std": 0.08751245215535164, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 11.260113716125488, + "kl": 3.58984375, + "learning_rate": 4.858598916717683e-07, + "loss": 0.34, + "reward": 0.4112723395228386, + "reward_std": 0.17002077400684357, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.3309151902794838, "step": 659 }, { "clip_ratio": 0.0, - "completion_length": 1988.2523193359375, + "completion_length": 1647.8951721191406, "epoch": 0.19714733776416996, - "grad_norm": 0.138148695230484, - "kl": 0.00673675537109375, - "learning_rate": 9.715466220844498e-08, - "loss": 0.0278, - "reward": 0.3348214477300644, - "reward_std": 0.12342043220996857, - "rewards/accuracy_reward": 0.03348214365541935, + "grad_norm": 33.16048812866211, + "kl": 4.39453125, + "learning_rate": 4.857733110422249e-07, + "loss": 0.3565, + "reward": 0.3270089402794838, + "reward_std": 0.2113390676677227, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3013393059372902, + "rewards/tag_count_reward": 0.3046875074505806, "step": 660 }, { "clip_ratio": 0.0, - "completion_length": 1930.5938415527344, + "completion_length": 1621.7232666015625, "epoch": 0.19744604585169143, - "grad_norm": 0.11217966675758362, - "kl": 0.00748443603515625, - "learning_rate": 9.713729478288549e-08, - "loss": 0.0157, - "reward": 0.4944196715950966, - "reward_std": 0.05636673327535391, - "rewards/accuracy_reward": 0.2008928656578064, + "grad_norm": 30.13826560974121, + "kl": 4.03515625, + "learning_rate": 4.856864739144274e-07, + "loss": 0.3086, + "reward": 0.498325914144516, + "reward_std": 0.16964975371956825, + "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2935267984867096, + "rewards/tag_count_reward": 0.3085937574505806, "step": 661 }, { "clip_ratio": 0.0, - "completion_length": 2024.69873046875, + "completion_length": 1749.1563415527344, "epoch": 0.1977447539392129, - "grad_norm": 0.13185878098011017, - "kl": 0.005126953125, - "learning_rate": 9.71198760765692e-08, - "loss": 0.0184, - "reward": 0.3191964402794838, - "reward_std": 0.12865397706627846, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 16.775165557861328, + "kl": 3.8671875, + "learning_rate": 4.85599380382846e-07, + "loss": 0.3028, + "reward": 0.2957589477300644, + "reward_std": 0.21433276683092117, + "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.285714291036129, + "rewards/tag_count_reward": 0.2712053656578064, "step": 662 }, { "clip_ratio": 0.0, - "completion_length": 1991.3326721191406, + "completion_length": 1697.16748046875, "epoch": 0.19804346202673437, - "grad_norm": 0.12862788140773773, - "kl": 0.0062255859375, - "learning_rate": 9.7102406108446e-08, - "loss": 0.0292, - "reward": 0.2935268059372902, - "reward_std": 0.06713220570236444, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 4.305462837219238, + "kl": 3.37890625, + "learning_rate": 4.8551203054223e-07, + "loss": 0.2979, + "reward": 0.3007812723517418, + "reward_std": 0.18942279741168022, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2890625149011612, + "rewards/tag_count_reward": 0.2896205484867096, "step": 663 }, { "clip_ratio": 0.0, - "completion_length": 1936.7723999023438, + "completion_length": 1588.7388916015625, "epoch": 0.19834217011425584, - "grad_norm": 0.16430030763149261, - "kl": 0.0078582763671875, - "learning_rate": 9.708488489752147e-08, - "loss": 0.0486, - "reward": 0.3476562649011612, - "reward_std": 0.09446763806045055, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 77.33695983886719, + "kl": 2.1015625, + "learning_rate": 4.854244244876073e-07, + "loss": 0.2773, + "reward": 0.3671875149011612, + "reward_std": 0.18020589649677277, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.3247767984867096, "step": 664 }, { "clip_ratio": 0.0, - "completion_length": 1928.5246276855469, + "completion_length": 1656.0246276855469, "epoch": 0.1986408782017773, - "grad_norm": 0.14416755735874176, - "kl": 0.00763702392578125, - "learning_rate": 9.7067312462857e-08, - "loss": 0.057, - "reward": 0.361607164144516, - "reward_std": 0.14083919767290354, - "rewards/accuracy_reward": 0.05133928940631449, + "grad_norm": 4.391027450561523, + "kl": 3.1796875, + "learning_rate": 4.85336562314285e-07, + "loss": 0.2796, + "reward": 0.372209832072258, + "reward_std": 0.20364606007933617, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3102678656578064, + "rewards/tag_count_reward": 0.309709832072258, "step": 665 }, { "clip_ratio": 0.0, - "completion_length": 1969.8884887695312, + "completion_length": 1601.0491943359375, "epoch": 0.19893958628929878, - "grad_norm": 0.1333206593990326, - "kl": 0.0070648193359375, - "learning_rate": 9.704968882356965e-08, - "loss": 0.0397, - "reward": 0.396763414144516, - "reward_std": 0.10984595492482185, - "rewards/accuracy_reward": 0.09598214598372579, + "grad_norm": 10.950960159301758, + "kl": 3.20703125, + "learning_rate": 4.852484441178482e-07, + "loss": 0.2908, + "reward": 0.4263393059372902, + "reward_std": 0.2214193306863308, + "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3007812649011612, + "rewards/tag_count_reward": 0.3125000149011612, "step": 666 }, { "clip_ratio": 0.0, - "completion_length": 1970.3014221191406, + "completion_length": 1626.7120971679688, "epoch": 0.19923829437682025, - "grad_norm": 0.13563406467437744, - "kl": 0.0067901611328125, - "learning_rate": 9.703201399883225e-08, - "loss": 0.0429, - "reward": 0.3152901902794838, - "reward_std": 0.08633553609251976, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 20.771196365356445, + "kl": 3.421875, + "learning_rate": 4.851600699941612e-07, + "loss": 0.3336, + "reward": 0.3309151977300644, + "reward_std": 0.19198953732848167, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.3175223395228386, "step": 667 }, { "clip_ratio": 0.0, - "completion_length": 1939.9554748535156, + "completion_length": 1625.7991638183594, "epoch": 0.19953700246434172, - "grad_norm": 0.14902889728546143, - "kl": 0.00806427001953125, - "learning_rate": 9.701428800787323e-08, - "loss": 0.0406, - "reward": 0.4720982387661934, - "reward_std": 0.11096477136015892, - "rewards/accuracy_reward": 0.1674107201397419, + "grad_norm": 6.68268346786499, + "kl": 3.83984375, + "learning_rate": 4.850714400393661e-07, + "loss": 0.3348, + "reward": 0.4687500298023224, + "reward_std": 0.2117708921432495, + "rewards/accuracy_reward": 0.16517858067527413, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.3035714402794838, "step": 668 }, { "clip_ratio": 0.0, - "completion_length": 1956.9911804199219, + "completion_length": 1598.1273193359375, "epoch": 0.1998357105518632, - "grad_norm": 0.1412150263786316, - "kl": 0.00757598876953125, - "learning_rate": 9.699651086997675e-08, - "loss": 0.0338, - "reward": 0.4531250149011612, - "reward_std": 0.0823222417384386, - "rewards/accuracy_reward": 0.1473214328289032, + "grad_norm": 21.330608367919922, + "kl": 3.3515625, + "learning_rate": 4.849825543498837e-07, + "loss": 0.3204, + "reward": 0.487723246216774, + "reward_std": 0.18235694989562035, + "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3058035895228386, + "rewards/tag_count_reward": 0.329241082072258, "step": 669 }, { "clip_ratio": 0.0, - "completion_length": 1988.0871276855469, + "completion_length": 1643.3594665527344, "epoch": 0.20013441863938466, - "grad_norm": 0.13425354659557343, - "kl": 0.00676727294921875, - "learning_rate": 9.697868260448262e-08, - "loss": 0.0349, - "reward": 0.3303571566939354, - "reward_std": 0.11756924726068974, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 9.666836738586426, + "kl": 3.4453125, + "learning_rate": 4.848934130224131e-07, + "loss": 0.3204, + "reward": 0.3459821566939354, + "reward_std": 0.2026408426463604, + "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.3169643059372902, "step": 670 }, { "clip_ratio": 0.0, - "completion_length": 1913.2165832519531, + "completion_length": 1564.1072082519531, "epoch": 0.20043312672690614, - "grad_norm": 0.15712763369083405, - "kl": 0.0086517333984375, - "learning_rate": 9.69608032307862e-08, - "loss": 0.045, - "reward": 0.446428582072258, - "reward_std": 0.11719497106969357, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 27.48702049255371, + "kl": 4.5, + "learning_rate": 4.84804016153931e-07, + "loss": 0.3625, + "reward": 0.444196455180645, + "reward_std": 0.20088427513837814, + "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.3303571566939354, "step": 671 }, { "clip_ratio": 0.0, - "completion_length": 1905.321533203125, + "completion_length": 1523.6585693359375, "epoch": 0.2007318348144276, - "grad_norm": 0.14520163834095, - "kl": 0.00911712646484375, - "learning_rate": 9.694287276833854e-08, - "loss": 0.0367, - "reward": 0.4280134066939354, - "reward_std": 0.09762746747583151, - "rewards/accuracy_reward": 0.11383928847499192, + "grad_norm": 14.57810115814209, + "kl": 3.83984375, + "learning_rate": 4.847143638416927e-07, + "loss": 0.3373, + "reward": 0.477678582072258, + "reward_std": 0.19053140096366405, + "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.3459821566939354, "step": 672 }, { "clip_ratio": 0.0, - "completion_length": 1939.3170471191406, + "completion_length": 1587.7165832519531, "epoch": 0.20103054290194908, - "grad_norm": 0.1469992995262146, - "kl": 0.00789642333984375, - "learning_rate": 9.692489123664617e-08, - "loss": 0.0379, - "reward": 0.400111623108387, - "reward_std": 0.16923784837126732, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 31.174837112426758, + "kl": 4.59375, + "learning_rate": 4.846244561832309e-07, + "loss": 0.3924, + "reward": 0.4017857387661934, + "reward_std": 0.2314198613166809, + "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.3236607238650322, "step": 673 }, { "clip_ratio": 0.0, - "completion_length": 1966.8616943359375, + "completion_length": 1596.8505249023438, "epoch": 0.20132925098947055, - "grad_norm": 0.1376555860042572, - "kl": 0.00765228271484375, - "learning_rate": 9.69068586552713e-08, - "loss": 0.0376, - "reward": 0.3599330484867096, - "reward_std": 0.1457563079893589, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 23.666088104248047, + "kl": 4.76953125, + "learning_rate": 4.845342932763565e-07, + "loss": 0.4063, + "reward": 0.3593750149011612, + "reward_std": 0.24428743869066238, + "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.3125000149011612, "step": 674 }, { "clip_ratio": 0.0, - "completion_length": 1893.446533203125, + "completion_length": 1566.3929443359375, "epoch": 0.20162795907699202, - "grad_norm": 0.16009575128555298, - "kl": 0.0097503662109375, - "learning_rate": 9.688877504383158e-08, - "loss": 0.0597, - "reward": 0.3437500149011612, - "reward_std": 0.11406720336526632, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 12.397934913635254, + "kl": 4.01953125, + "learning_rate": 4.844438752191579e-07, + "loss": 0.3537, + "reward": 0.341517873108387, + "reward_std": 0.1769193708896637, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.332589291036129, "step": 675 }, { "clip_ratio": 0.0, - "completion_length": 1976.7366943359375, + "completion_length": 1613.3416137695312, "epoch": 0.2019266671645135, - "grad_norm": 0.13228686153888702, - "kl": 0.0074310302734375, - "learning_rate": 9.687064042200018e-08, - "loss": 0.0394, - "reward": 0.3861607238650322, - "reward_std": 0.12244551815092564, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 8.899201393127441, + "kl": 3.98046875, + "learning_rate": 4.843532021100009e-07, + "loss": 0.3494, + "reward": 0.3967634066939354, + "reward_std": 0.22016541659832, + "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.3164062574505806, "step": 676 }, { "clip_ratio": 0.0, - "completion_length": 1882.0156860351562, + "completion_length": 1548.21435546875, "epoch": 0.20222537525203496, - "grad_norm": 0.14761832356452942, - "kl": 0.0101165771484375, - "learning_rate": 9.685245480950583e-08, - "loss": 0.055, - "reward": 0.419084832072258, - "reward_std": 0.11319888569414616, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 27.794336318969727, + "kl": 3.22265625, + "learning_rate": 4.842622740475291e-07, + "loss": 0.3359, + "reward": 0.428013414144516, + "reward_std": 0.19770359247922897, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.3342633992433548, "step": 677 }, { "clip_ratio": 0.0, - "completion_length": 1937.2165832519531, + "completion_length": 1570.52685546875, "epoch": 0.20252408333955643, - "grad_norm": 0.15670983493328094, - "kl": 0.00852203369140625, - "learning_rate": 9.683421822613269e-08, - "loss": 0.0488, - "reward": 0.4190848395228386, - "reward_std": 0.08014841563999653, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 25.658205032348633, + "kl": 3.5703125, + "learning_rate": 4.841710911306634e-07, + "loss": 0.3606, + "reward": 0.4564732313156128, + "reward_std": 0.16606799513101578, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419887661934, + "rewards/tag_count_reward": 0.3359375149011612, "step": 678 }, { "clip_ratio": 0.0, - "completion_length": 1886.5514221191406, + "completion_length": 1532.40185546875, "epoch": 0.20282279142707788, - "grad_norm": 0.15836626291275024, - "kl": 0.0098876953125, - "learning_rate": 9.681593069172036e-08, - "loss": 0.064, - "reward": 0.4001116305589676, - "reward_std": 0.14575566537678242, - "rewards/accuracy_reward": 0.06696429033763707, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "grad_norm": 25.947763442993164, + "kl": 3.12109375, + "learning_rate": 4.840796534586018e-07, + "loss": 0.3106, + "reward": 0.3978794887661934, + "reward_std": 0.19667642191052437, + "rewards/accuracy_reward": 0.05133928917348385, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3443080484867096, "step": 679 }, { "clip_ratio": 0.0, - "completion_length": 1991.0737609863281, + "completion_length": 1696.8773193359375, "epoch": 0.20312149951459935, - "grad_norm": 0.11908315122127533, - "kl": 0.0069732666015625, - "learning_rate": 9.679759222616388e-08, - "loss": 0.0294, - "reward": 0.411830373108387, - "reward_std": 0.10152447409927845, - "rewards/accuracy_reward": 0.12276785937137902, + "grad_norm": 25.272611618041992, + "kl": 4.109375, + "learning_rate": 4.839879611308194e-07, + "loss": 0.3037, + "reward": 0.419642873108387, + "reward_std": 0.2102460116147995, + "rewards/accuracy_reward": 0.12500000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2890625074505806, + "rewards/tag_count_reward": 0.294642873108387, "step": 680 }, { "clip_ratio": 0.0, - "completion_length": 1869.7902526855469, + "completion_length": 1451.6630249023438, "epoch": 0.20342020760212082, - "grad_norm": 0.15809400379657745, - "kl": 0.0102996826171875, - "learning_rate": 9.677920284941373e-08, - "loss": 0.0515, - "reward": 0.4464285969734192, - "reward_std": 0.1107041947543621, - "rewards/accuracy_reward": 0.12500000488944352, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "grad_norm": 26.330211639404297, + "kl": 3.33984375, + "learning_rate": 4.838960142470687e-07, + "loss": 0.3343, + "reward": 0.4827009215950966, + "reward_std": 0.22188091650605202, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3554687649011612, "step": 681 }, { "clip_ratio": 0.0, - "completion_length": 1923.5290832519531, + "completion_length": 1607.9978332519531, "epoch": 0.2037189156896423, - "grad_norm": 0.1474037915468216, - "kl": 0.00894927978515625, - "learning_rate": 9.676076258147574e-08, - "loss": 0.0295, - "reward": 0.3722098395228386, - "reward_std": 0.1123335249722004, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 9.8863525390625, + "kl": 3.77734375, + "learning_rate": 4.838038129073787e-07, + "loss": 0.3168, + "reward": 0.3995535895228386, + "reward_std": 0.19469749927520752, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.3392857313156128, "step": 682 }, { "clip_ratio": 0.0, - "completion_length": 1909.1473999023438, - "epoch": 0.20401762377716376, - "grad_norm": 0.14916008710861206, - "kl": 0.00896453857421875, - "learning_rate": 9.674227144241109e-08, - "loss": 0.0379, - "reward": 0.4296875149011612, - "reward_std": 0.10114308074116707, - "rewards/accuracy_reward": 0.11607143469154835, + "completion_length": 1570.2254943847656, + "epoch": 0.20401762377716376, + "grad_norm": 11.120509147644043, + "kl": 3.73046875, + "learning_rate": 4.837113572120555e-07, + "loss": 0.3472, + "reward": 0.4202009066939354, + "reward_std": 0.19817324727773666, + "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.3286830559372902, "step": 683 }, { "clip_ratio": 0.0, - "completion_length": 1879.9978637695312, + "completion_length": 1587.0603637695312, "epoch": 0.20431633186468523, - "grad_norm": 0.1369178742170334, - "kl": 0.0101318359375, - "learning_rate": 9.672372945233636e-08, - "loss": 0.0511, - "reward": 0.4481026902794838, - "reward_std": 0.13854299299418926, - "rewards/accuracy_reward": 0.11160714738070965, + "grad_norm": 49.873897552490234, + "kl": 4.390625, + "learning_rate": 4.836186472616818e-07, + "loss": 0.3218, + "reward": 0.3984375223517418, + "reward_std": 0.19624651595950127, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.3270089477300644, "step": 684 }, { "clip_ratio": 0.0, - "completion_length": 1892.2857971191406, + "completion_length": 1582.4800109863281, "epoch": 0.2046150399522067, - "grad_norm": 0.1601635366678238, - "kl": 0.009796142578125, - "learning_rate": 9.67051366314234e-08, - "loss": 0.0479, - "reward": 0.3878348395228386, - "reward_std": 0.11176582984626293, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 16.45689582824707, + "kl": 4.1796875, + "learning_rate": 4.83525683157117e-07, + "loss": 0.3553, + "reward": 0.3911830484867096, + "reward_std": 0.22156401351094246, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.3309151902794838, "step": 685 }, { "clip_ratio": 0.0, - "completion_length": 1941.7679443359375, + "completion_length": 1588.60498046875, "epoch": 0.20491374803972817, - "grad_norm": 0.14801359176635742, - "kl": 0.009063720703125, - "learning_rate": 9.668649299989937e-08, - "loss": 0.0322, - "reward": 0.4034598395228386, - "reward_std": 0.1313863079994917, - "rewards/accuracy_reward": 0.08705357392318547, + "grad_norm": 19.330488204956055, + "kl": 4.28515625, + "learning_rate": 4.834324649994969e-07, + "loss": 0.3524, + "reward": 0.4330357387661934, + "reward_std": 0.2463468685746193, + "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.3348214328289032, "step": 686 }, { "clip_ratio": 0.0, - "completion_length": 1977.2098999023438, + "completion_length": 1684.2857971191406, "epoch": 0.20521245612724964, - "grad_norm": 0.1267000436782837, - "kl": 0.0077972412109375, - "learning_rate": 9.666779857804674e-08, - "loss": 0.0417, - "reward": 0.3242187649011612, - "reward_std": 0.12428969703614712, - "rewards/accuracy_reward": 0.026785715948790312, + "grad_norm": 31.813735961914062, + "kl": 4.96875, + "learning_rate": 4.833389928902337e-07, + "loss": 0.391, + "reward": 0.3314732313156128, + "reward_std": 0.22441529482603073, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.3046875149011612, "step": 687 }, { "clip_ratio": 0.0, - "completion_length": 1905.6139221191406, + "completion_length": 1545.7232666015625, "epoch": 0.2055111642147711, - "grad_norm": 0.16158601641654968, - "kl": 0.01020050048828125, - "learning_rate": 9.664905338620319e-08, - "loss": 0.0584, - "reward": 0.4263392984867096, - "reward_std": 0.1407697070389986, - "rewards/accuracy_reward": 0.10491071501746774, + "grad_norm": 7.393466472625732, + "kl": 4.18359375, + "learning_rate": 4.83245266931016e-07, + "loss": 0.3733, + "reward": 0.435267873108387, + "reward_std": 0.21350396052002907, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.3392857313156128, "step": 688 }, { "clip_ratio": 0.0, - "completion_length": 1970.3795471191406, + "completion_length": 1604.4398193359375, "epoch": 0.20580987230229258, - "grad_norm": 0.13838672637939453, - "kl": 0.0083160400390625, - "learning_rate": 9.663025744476165e-08, - "loss": 0.0441, - "reward": 0.3816964402794838, - "reward_std": 0.13476444222033024, - "rewards/accuracy_reward": 0.0736607201397419, + "grad_norm": 5.005248069763184, + "kl": 4.1328125, + "learning_rate": 4.831512872238082e-07, + "loss": 0.3578, + "reward": 0.4095982387661934, + "reward_std": 0.2491498589515686, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.3203125149011612, "step": 689 }, { "clip_ratio": 0.0, - "completion_length": 1947.1920471191406, + "completion_length": 1585.0938110351562, "epoch": 0.20610858038981406, - "grad_norm": 0.1403801292181015, - "kl": 0.008880615234375, - "learning_rate": 9.661141077417027e-08, - "loss": 0.0358, - "reward": 0.3593750149011612, - "reward_std": 0.11175479553639889, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 14.290268898010254, + "kl": 3.97265625, + "learning_rate": 4.830570538708513e-07, + "loss": 0.3358, + "reward": 0.3822544738650322, + "reward_std": 0.22422278299927711, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3058035746216774, + "rewards/tag_count_reward": 0.3108259066939354, "step": 690 }, { "clip_ratio": 0.0, - "completion_length": 1980.6541137695312, + "completion_length": 1676.305908203125, "epoch": 0.20640728847733553, - "grad_norm": 0.13505926728248596, - "kl": 0.0077972412109375, - "learning_rate": 9.659251339493237e-08, - "loss": 0.0315, - "reward": 0.4670759066939354, - "reward_std": 0.1113192681223154, - "rewards/accuracy_reward": 0.16294643515720963, + "grad_norm": 22.375356674194336, + "kl": 3.16015625, + "learning_rate": 4.829625669746619e-07, + "loss": 0.2748, + "reward": 0.4799107313156128, + "reward_std": 0.1814117282629013, + "rewards/accuracy_reward": 0.15178571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.3281250149011612, "step": 691 }, { "clip_ratio": 0.0, - "completion_length": 1869.1228637695312, + "completion_length": 1566.1317443847656, "epoch": 0.206705996564857, - "grad_norm": 0.1687944531440735, - "kl": 0.01105499267578125, - "learning_rate": 9.657356532760646e-08, - "loss": 0.068, - "reward": 0.380022332072258, - "reward_std": 0.12900681607425213, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 36.52425765991211, + "kl": 2.884765625, + "learning_rate": 4.828678266380323e-07, + "loss": 0.2878, + "reward": 0.3738839402794838, + "reward_std": 0.19423465430736542, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.3426339402794838, "step": 692 }, { "clip_ratio": 0.0, - "completion_length": 1961.55810546875, + "completion_length": 1623.6406860351562, "epoch": 0.20700470465237847, - "grad_norm": 0.14809079468250275, - "kl": 0.0087127685546875, - "learning_rate": 9.655456659280617e-08, - "loss": 0.0361, - "reward": 0.4123884066939354, - "reward_std": 0.1616346761584282, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 43.555179595947266, + "kl": 2.8203125, + "learning_rate": 4.827728329640308e-07, + "loss": 0.2763, + "reward": 0.3906250149011612, + "reward_std": 0.211230106651783, + "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.3325893059372902, "step": 693 }, { "clip_ratio": 0.0, - "completion_length": 1967.6563110351562, + "completion_length": 1623.8393249511719, "epoch": 0.20730341273989994, - "grad_norm": 0.15619973838329315, - "kl": 0.0086517333984375, - "learning_rate": 9.653551721120026e-08, - "loss": 0.0337, - "reward": 0.3984375223517418, - "reward_std": 0.11172406189143658, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 27.840177536010742, + "kl": 3.28515625, + "learning_rate": 4.826775860560013e-07, + "loss": 0.3078, + "reward": 0.415736623108387, + "reward_std": 0.20240634679794312, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3180803656578064, + "rewards/tag_count_reward": 0.3197544738650322, "step": 694 }, { "clip_ratio": 0.0, - "completion_length": 1925.6317749023438, + "completion_length": 1613.8817749023438, "epoch": 0.2076021208274214, - "grad_norm": 0.1588163822889328, - "kl": 0.01007080078125, - "learning_rate": 9.65164172035126e-08, - "loss": 0.0448, - "reward": 0.4693080633878708, - "reward_std": 0.10378284379839897, + "grad_norm": 6.394109725952148, + "kl": 4.0078125, + "learning_rate": 4.82582086017563e-07, + "loss": 0.3677, + "reward": 0.4581473469734192, + "reward_std": 0.17779334634542465, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.3063616156578064, "step": 695 }, { "clip_ratio": 0.0, - "completion_length": 1976.3750610351562, + "completion_length": 1609.3996276855469, "epoch": 0.20790082891494288, - "grad_norm": 0.1494244784116745, - "kl": 0.0084381103515625, - "learning_rate": 9.649726659052213e-08, - "loss": 0.0363, - "reward": 0.3158482238650322, - "reward_std": 0.11128064803779125, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 15.590879440307617, + "kl": 3.41796875, + "learning_rate": 4.824863329526106e-07, + "loss": 0.3195, + "reward": 0.3231026977300644, + "reward_std": 0.1993384137749672, + "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.3074776902794838, "step": 696 }, { "clip_ratio": 0.0, - "completion_length": 1944.0915832519531, + "completion_length": 1616.2388916015625, "epoch": 0.20819953700246435, - "grad_norm": 0.13358892500400543, - "kl": 0.0092315673828125, - "learning_rate": 9.647806539306283e-08, - "loss": 0.0309, - "reward": 0.4174107313156128, - "reward_std": 0.10140101984143257, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 14.615315437316895, + "kl": 3.7578125, + "learning_rate": 4.823903269653141e-07, + "loss": 0.306, + "reward": 0.419642873108387, + "reward_std": 0.18257762491703033, + "rewards/accuracy_reward": 0.09598215040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.3236607238650322, "step": 697 }, { "clip_ratio": 0.0, - "completion_length": 1956.5603332519531, + "completion_length": 1609.6072082519531, "epoch": 0.20849824508998582, - "grad_norm": 0.14027139544487, - "kl": 0.00894927978515625, - "learning_rate": 9.64588136320237e-08, - "loss": 0.042, - "reward": 0.3482142984867096, - "reward_std": 0.08856059517711401, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 5.701258659362793, + "kl": 3.609375, + "learning_rate": 4.822940681601186e-07, + "loss": 0.3122, + "reward": 0.3744419887661934, + "reward_std": 0.1936628483235836, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714402794838, + "rewards/tag_count_reward": 0.3208705484867096, "step": 698 }, { "clip_ratio": 0.0, - "completion_length": 1914.1005249023438, + "completion_length": 1593.5781860351562, "epoch": 0.2087969531775073, - "grad_norm": 0.1414783000946045, - "kl": 0.01039886474609375, - "learning_rate": 9.643951132834881e-08, - "loss": 0.0394, - "reward": 0.4017857313156128, - "reward_std": 0.09522906877100468, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 27.374202728271484, + "kl": 3.2734375, + "learning_rate": 4.821975566417441e-07, + "loss": 0.3038, + "reward": 0.389508955180645, + "reward_std": 0.20435847714543343, + "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.3314732238650322, "step": 699 }, { "clip_ratio": 0.0, - "completion_length": 1988.5000610351562, + "completion_length": 1679.4107666015625, "epoch": 0.20909566126502874, - "grad_norm": 0.13901452720165253, - "kl": 0.0078277587890625, - "learning_rate": 9.642015850303715e-08, - "loss": 0.0329, - "reward": 0.3504464477300644, - "reward_std": 0.10275771282613277, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 9.274701118469238, + "kl": 3.80078125, + "learning_rate": 4.821007925151858e-07, + "loss": 0.3045, + "reward": 0.3861607313156128, + "reward_std": 0.2269614152610302, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3013392984867096, + "rewards/tag_count_reward": 0.321428582072258, "step": 700 }, { "clip_ratio": 0.0, - "completion_length": 1893.0692749023438, + "completion_length": 1566.6161193847656, "epoch": 0.2093943693525502, - "grad_norm": 0.15658767521381378, - "kl": 0.0107574462890625, - "learning_rate": 9.640075517714272e-08, - "loss": 0.0524, - "reward": 0.4285714477300644, - "reward_std": 0.08054602704942226, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 5.112265586853027, + "kl": 3.99609375, + "learning_rate": 4.820037758857136e-07, + "loss": 0.3589, + "reward": 0.4408482238650322, + "reward_std": 0.19334249570965767, + "rewards/accuracy_reward": 0.11160714761354029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.329241082072258, "step": 701 }, { "clip_ratio": 0.0, - "completion_length": 1944.6273498535156, + "completion_length": 1665.2366943359375, "epoch": 0.20969307744007168, - "grad_norm": 0.13902033865451813, - "kl": 0.00927734375, - "learning_rate": 9.638130137177441e-08, - "loss": 0.0312, - "reward": 0.310267873108387, - "reward_std": 0.08007166720926762, + "grad_norm": 31.22503089904785, + "kl": 4.40625, + "learning_rate": 4.81906506858872e-07, + "loss": 0.3442, + "reward": 0.309709832072258, + "reward_std": 0.168434988707304, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.3052455484867096, "step": 702 }, { "clip_ratio": 0.0, - "completion_length": 1924.0514221191406, + "completion_length": 1616.1585388183594, "epoch": 0.20999178552759315, - "grad_norm": 0.15202951431274414, - "kl": 0.010009765625, - "learning_rate": 9.636179710809605e-08, - "loss": 0.0363, - "reward": 0.4285714402794838, - "reward_std": 0.17391288094222546, - "rewards/accuracy_reward": 0.09821429336443543, + "grad_norm": 48.63649368286133, + "kl": 5.0234375, + "learning_rate": 4.818089855404803e-07, + "loss": 0.3901, + "reward": 0.4174107313156128, + "reward_std": 0.24856522306799889, + "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.3191964477300644, "step": 703 }, { "clip_ratio": 0.0, - "completion_length": 1976.4264221191406, + "completion_length": 1623.3817749023438, "epoch": 0.21029049361511462, - "grad_norm": 0.13702863454818726, - "kl": 0.00899505615234375, - "learning_rate": 9.634224240732639e-08, - "loss": 0.0354, - "reward": 0.3426339477300644, - "reward_std": 0.09945535659790039, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 63.81969451904297, + "kl": 5.32421875, + "learning_rate": 4.817112120366319e-07, + "loss": 0.4024, + "reward": 0.353236623108387, + "reward_std": 0.20233870670199394, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2935268059372902, + "rewards/tag_count_reward": 0.3018973395228386, "step": 704 }, { "clip_ratio": 0.0, - "completion_length": 1960.3438415527344, + "completion_length": 1628.1719665527344, "epoch": 0.2105892017026361, - "grad_norm": 0.13541404902935028, - "kl": 0.00921630859375, - "learning_rate": 9.632263729073902e-08, - "loss": 0.0356, - "reward": 0.3119419738650322, - "reward_std": 0.08641701284796, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 5.314508438110352, + "kl": 3.8984375, + "learning_rate": 4.81613186453695e-07, + "loss": 0.3341, + "reward": 0.338169664144516, + "reward_std": 0.19756008312106133, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3007812574505806, + "rewards/tag_count_reward": 0.3203125074505806, "step": 705 }, { "clip_ratio": 0.0, - "completion_length": 1919.4978637695312, + "completion_length": 1602.12060546875, "epoch": 0.21088790979015756, - "grad_norm": 0.1585385799407959, - "kl": 0.01013946533203125, - "learning_rate": 9.630298177966238e-08, - "loss": 0.0404, - "reward": 0.4308035969734192, - "reward_std": 0.12948151864111423, - "rewards/accuracy_reward": 0.12053571757860482, + "grad_norm": 30.735994338989258, + "kl": 4.5078125, + "learning_rate": 4.815149088983119e-07, + "loss": 0.3567, + "reward": 0.4520089477300644, + "reward_std": 0.2243431620299816, + "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310267873108387, + "rewards/tag_count_reward": 0.329241082072258, "step": 706 }, { "clip_ratio": 0.0, - "completion_length": 2001.2478637695312, + "completion_length": 1666.4665832519531, "epoch": 0.21118661787767903, - "grad_norm": 0.13002656400203705, - "kl": 0.00806427001953125, - "learning_rate": 9.628327589547976e-08, - "loss": 0.025, - "reward": 0.2974330484867096, - "reward_std": 0.06562062911689281, - "rewards/accuracy_reward": 0.0, + "grad_norm": 13.175734519958496, + "kl": 4.37109375, + "learning_rate": 4.814163794773988e-07, + "loss": 0.3431, + "reward": 0.3203125074505806, + "reward_std": 0.1819184236228466, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.3158482238650322, "step": 707 }, { "clip_ratio": 0.0, - "completion_length": 1887.3861999511719, + "completion_length": 1579.6585388183594, "epoch": 0.2114853259652005, - "grad_norm": 0.15324050188064575, - "kl": 0.0118865966796875, - "learning_rate": 9.62635196596292e-08, - "loss": 0.0549, - "reward": 0.3878348395228386, - "reward_std": 0.15100394003093243, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 48.19032669067383, + "kl": 3.0, + "learning_rate": 4.81317598298146e-07, + "loss": 0.3184, + "reward": 0.3705357387661934, + "reward_std": 0.20747651532292366, + "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.332589291036129, "step": 708 }, { "clip_ratio": 0.0, - "completion_length": 1951.2567443847656, + "completion_length": 1580.1719360351562, "epoch": 0.21178403405272198, - "grad_norm": 0.16710920631885529, - "kl": 0.0102386474609375, - "learning_rate": 9.624371309360356e-08, - "loss": 0.0438, - "reward": 0.4051339477300644, - "reward_std": 0.12181063927710056, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 53.444732666015625, + "kl": 2.94921875, + "learning_rate": 4.812185654680178e-07, + "loss": 0.3213, + "reward": 0.424665205180645, + "reward_std": 0.2092951312661171, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.3286830484867096, "step": 709 }, { "clip_ratio": 0.0, - "completion_length": 1993.4710693359375, + "completion_length": 1668.0871276855469, "epoch": 0.21208274214024345, - "grad_norm": 0.13168954849243164, - "kl": 0.00859832763671875, - "learning_rate": 9.622385621895046e-08, - "loss": 0.0296, - "reward": 0.3783482238650322, - "reward_std": 0.08798561058938503, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 22.1900577545166, + "kl": 3.0703125, + "learning_rate": 4.811192810947523e-07, + "loss": 0.2699, + "reward": 0.4073660895228386, + "reward_std": 0.19444474577903748, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.3247767984867096, "step": 710 }, { "clip_ratio": 0.0, - "completion_length": 1927.7277526855469, + "completion_length": 1588.9754943847656, "epoch": 0.21238145022776492, - "grad_norm": 0.16136197745800018, - "kl": 0.0108642578125, - "learning_rate": 9.62039490572722e-08, - "loss": 0.0579, - "reward": 0.3922991305589676, - "reward_std": 0.14450719486922026, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 9.09578800201416, + "kl": 3.46875, + "learning_rate": 4.810197452863611e-07, + "loss": 0.3286, + "reward": 0.3856026902794838, + "reward_std": 0.1920848786830902, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.3342634066939354, "step": 711 }, { "clip_ratio": 0.0, - "completion_length": 1841.9866638183594, + "completion_length": 1494.9420166015625, "epoch": 0.2126801583152864, - "grad_norm": 0.1863638311624527, - "kl": 0.0133056640625, - "learning_rate": 9.618399163022584e-08, - "loss": 0.0808, - "reward": 0.4492187723517418, - "reward_std": 0.13888874650001526, - "rewards/accuracy_reward": 0.10937500721774995, + "grad_norm": 19.644140243530273, + "kl": 3.8671875, + "learning_rate": 4.809199581511292e-07, + "loss": 0.3953, + "reward": 0.4508928805589676, + "reward_std": 0.22199459746479988, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437723517418, + "rewards/tag_count_reward": 0.3415178656578064, "step": 712 }, { "clip_ratio": 0.0, - "completion_length": 1895.430908203125, + "completion_length": 1640.3125610351562, "epoch": 0.21297886640280786, - "grad_norm": 0.1620858758687973, - "kl": 0.0109100341796875, - "learning_rate": 9.616398395952312e-08, - "loss": 0.0442, - "reward": 0.4414062723517418, - "reward_std": 0.08976492844522, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 30.95254898071289, + "kl": 4.4453125, + "learning_rate": 4.808199197976156e-07, + "loss": 0.3397, + "reward": 0.4257812723517418, + "reward_std": 0.1747324801981449, + "rewards/accuracy_reward": 0.11383929080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.3119419664144516, "step": 713 }, { "clip_ratio": 0.0, - "completion_length": 1928.2790832519531, + "completion_length": 1628.8103332519531, "epoch": 0.21327757449032933, - "grad_norm": 0.14622561633586884, - "kl": 0.0109100341796875, - "learning_rate": 9.61439260669304e-08, - "loss": 0.0437, - "reward": 0.3616071566939354, - "reward_std": 0.09847544319927692, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 148.57635498046875, + "kl": 9.671875, + "learning_rate": 4.80719630334652e-07, + "loss": 0.5103, + "reward": 0.3643973395228386, + "reward_std": 0.19513047859072685, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.3175223395228386, "step": 714 }, { "clip_ratio": 0.0, - "completion_length": 1902.1585693359375, + "completion_length": 1615.200927734375, "epoch": 0.2135762825778508, - "grad_norm": 0.1599535197019577, - "kl": 0.0112762451171875, - "learning_rate": 9.612381797426872e-08, - "loss": 0.0405, - "reward": 0.3822544813156128, - "reward_std": 0.13052287325263023, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 53.69288635253906, + "kl": 5.2109375, + "learning_rate": 4.806190898713435e-07, + "loss": 0.3921, + "reward": 0.3510044813156128, + "reward_std": 0.22202418744564056, + "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.3108258992433548, "step": 715 }, { "clip_ratio": 0.0, - "completion_length": 1949.3728637695312, + "completion_length": 1625.6541137695312, "epoch": 0.21387499066537227, - "grad_norm": 0.1509225219488144, - "kl": 0.0102081298828125, - "learning_rate": 9.610365970341369e-08, - "loss": 0.0473, - "reward": 0.4408482387661934, - "reward_std": 0.14623294584453106, - "rewards/accuracy_reward": 0.11830357857979834, + "grad_norm": 46.037628173828125, + "kl": 5.1953125, + "learning_rate": 4.805182985170684e-07, + "loss": 0.4016, + "reward": 0.415736623108387, + "reward_std": 0.20253733545541763, + "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.3085937649011612, "step": 716 }, { "clip_ratio": 0.0, - "completion_length": 1959.1965026855469, + "completion_length": 1598.2835693359375, "epoch": 0.21417369875289374, - "grad_norm": 0.14496546983718872, - "kl": 0.010101318359375, - "learning_rate": 9.608345127629555e-08, - "loss": 0.0372, - "reward": 0.3599330484867096, - "reward_std": 0.10279363207519054, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 7.247920989990234, + "kl": 4.4140625, + "learning_rate": 4.804172563814777e-07, + "loss": 0.379, + "reward": 0.3733259066939354, + "reward_std": 0.19638732820749283, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3108258992433548, + "rewards/tag_count_reward": 0.3264509066939354, "step": 717 }, { "clip_ratio": 0.0, - "completion_length": 1921.2590026855469, + "completion_length": 1609.94873046875, "epoch": 0.21447240684041521, - "grad_norm": 0.1490282416343689, - "kl": 0.0111083984375, - "learning_rate": 9.606319271489909e-08, - "loss": 0.0416, - "reward": 0.3744419738650322, - "reward_std": 0.12004970572888851, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 23.84544563293457, + "kl": 4.33203125, + "learning_rate": 4.803159635744954e-07, + "loss": 0.349, + "reward": 0.392857164144516, + "reward_std": 0.21716271713376045, + "rewards/accuracy_reward": 0.06696428684517741, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.325892873108387, "step": 718 }, { "clip_ratio": 0.0, - "completion_length": 1949.5447082519531, + "completion_length": 1632.18310546875, "epoch": 0.21477111492793668, - "grad_norm": 0.14894434809684753, - "kl": 0.0108489990234375, - "learning_rate": 9.604288404126362e-08, - "loss": 0.0377, - "reward": 0.4040178805589676, - "reward_std": 0.12434626929461956, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 45.51645278930664, + "kl": 4.6953125, + "learning_rate": 4.80214420206318e-07, + "loss": 0.3727, + "reward": 0.3839285895228386, + "reward_std": 0.17488805204629898, + "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310267873108387, + "rewards/tag_count_reward": 0.3058035895228386, "step": 719 }, { "clip_ratio": 0.0, - "completion_length": 1958.1072692871094, + "completion_length": 1620.3371276855469, "epoch": 0.21506982301545816, - "grad_norm": 0.154600590467453, - "kl": 0.0106658935546875, - "learning_rate": 9.602252527748301e-08, - "loss": 0.0389, - "reward": 0.353794664144516, - "reward_std": 0.16105252876877785, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 5.110437870025635, + "kl": 4.0, + "learning_rate": 4.801126263874151e-07, + "loss": 0.3379, + "reward": 0.3526785895228386, + "reward_std": 0.21703342348337173, + "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.3147321566939354, "step": 720 }, { "clip_ratio": 0.0, - "completion_length": 1976.0514526367188, + "completion_length": 1612.5826721191406, "epoch": 0.21536853110297963, - "grad_norm": 0.17134681344032288, - "kl": 0.0101165771484375, - "learning_rate": 9.60021164457056e-08, - "loss": 0.0493, - "reward": 0.3867187649011612, - "reward_std": 0.10954906046390533, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 32.55705261230469, + "kl": 3.30078125, + "learning_rate": 4.80010582228528e-07, + "loss": 0.3226, + "reward": 0.3978794738650322, + "reward_std": 0.1938558891415596, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.3175223395228386, "step": 721 }, { "clip_ratio": 0.0, - "completion_length": 1918.9375915527344, + "completion_length": 1530.5402526855469, "epoch": 0.21566723919050107, - "grad_norm": 0.14087891578674316, - "kl": 0.011322021484375, - "learning_rate": 9.598165756813417e-08, - "loss": 0.0351, - "reward": 0.3577009066939354, - "reward_std": 0.1359529010951519, - "rewards/accuracy_reward": 0.035714288242161274, + "grad_norm": 48.4517936706543, + "kl": 2.59765625, + "learning_rate": 4.799082878406709e-07, + "loss": 0.2687, + "reward": 0.3537946566939354, + "reward_std": 0.19409164786338806, + "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.3314732313156128, "step": 722 }, { "clip_ratio": 0.0, - "completion_length": 1905.71435546875, + "completion_length": 1551.0871276855469, "epoch": 0.21596594727802254, - "grad_norm": 0.17439456284046173, - "kl": 0.0123748779296875, - "learning_rate": 9.596114866702601e-08, - "loss": 0.061, - "reward": 0.4988839477300644, - "reward_std": 0.159921046346426, - "rewards/accuracy_reward": 0.16741072572767735, + "grad_norm": 52.26213455200195, + "kl": 2.74609375, + "learning_rate": 4.7980574333513e-07, + "loss": 0.2981, + "reward": 0.4709821566939354, + "reward_std": 0.18524416908621788, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.3348214477300644, "step": 723 }, { "clip_ratio": 0.0, - "completion_length": 1915.9911804199219, + "completion_length": 1634.4308471679688, "epoch": 0.216264655365544, - "grad_norm": 0.16557441651821136, - "kl": 0.0117645263671875, - "learning_rate": 9.594058976469277e-08, - "loss": 0.0545, - "reward": 0.4017857313156128, - "reward_std": 0.16062949411571026, - "rewards/accuracy_reward": 0.07812500558793545, + "grad_norm": 24.10224723815918, + "kl": 3.08984375, + "learning_rate": 4.797029488234639e-07, + "loss": 0.2747, + "reward": 0.376116082072258, + "reward_std": 0.19443616643548012, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.3225446492433548, "step": 724 }, { "clip_ratio": 0.0, - "completion_length": 1906.3014221191406, + "completion_length": 1652.0201721191406, "epoch": 0.21656336345306548, - "grad_norm": 0.1331939399242401, - "kl": 0.011505126953125, - "learning_rate": 9.591998088350053e-08, - "loss": 0.0354, - "reward": 0.3967634066939354, - "reward_std": 0.06791420956142247, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 28.628019332885742, + "kl": 2.87890625, + "learning_rate": 4.795999044175026e-07, + "loss": 0.2707, + "reward": 0.3978794813156128, + "reward_std": 0.18203013390302658, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.3108259066939354, "step": 725 }, { "clip_ratio": 0.0, - "completion_length": 1978.8594665527344, + "completion_length": 1641.5246276855469, "epoch": 0.21686207154058695, - "grad_norm": 0.14819641411304474, - "kl": 0.0107879638671875, - "learning_rate": 9.589932204586973e-08, - "loss": 0.0401, - "reward": 0.3973214477300644, - "reward_std": 0.11096329241991043, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 14.411606788635254, + "kl": 3.625, + "learning_rate": 4.794966102293486e-07, + "loss": 0.3369, + "reward": 0.3984375223517418, + "reward_std": 0.2092544436454773, + "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3058035895228386, + "rewards/tag_count_reward": 0.3002232238650322, "step": 726 }, { "clip_ratio": 0.0, - "completion_length": 1969.2433776855469, + "completion_length": 1644.1763916015625, "epoch": 0.21716077962810842, - "grad_norm": 0.1402166485786438, - "kl": 0.0105438232421875, - "learning_rate": 9.587861327427515e-08, - "loss": 0.0331, - "reward": 0.3392857238650322, - "reward_std": 0.11252451315522194, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 22.025117874145508, + "kl": 3.9765625, + "learning_rate": 4.793930663713757e-07, + "loss": 0.3312, + "reward": 0.3532366156578064, + "reward_std": 0.20541147887706757, + "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964328289032, + "rewards/tag_count_reward": 0.3242187649011612, "step": 727 }, { "clip_ratio": 0.0, - "completion_length": 1933.3504943847656, + "completion_length": 1612.0201721191406, "epoch": 0.2174594877156299, - "grad_norm": 0.13643842935562134, - "kl": 0.0109405517578125, - "learning_rate": 9.585785459124593e-08, - "loss": 0.0407, - "reward": 0.3655134066939354, - "reward_std": 0.10520149581134319, - "rewards/accuracy_reward": 0.05357143026776612, + "grad_norm": 54.84812545776367, + "kl": 2.1015625, + "learning_rate": 4.792892729562296e-07, + "loss": 0.2401, + "reward": 0.3895089477300644, + "reward_std": 0.1836816817522049, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419738650322, + "rewards/tag_count_reward": 0.3404018059372902, "step": 728 }, { "clip_ratio": 0.0, - "completion_length": 1822.3996276855469, + "completion_length": 1542.7656860351562, "epoch": 0.21775819580315137, - "grad_norm": 0.16552171111106873, - "kl": 0.0139312744140625, - "learning_rate": 9.583704601936546e-08, - "loss": 0.0518, - "reward": 0.4726562723517418, - "reward_std": 0.160709535703063, - "rewards/accuracy_reward": 0.12946429336443543, + "grad_norm": 23.361364364624023, + "kl": 3.265625, + "learning_rate": 4.791852300968273e-07, + "loss": 0.2968, + "reward": 0.4564732313156128, + "reward_std": 0.19759487733244896, + "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.3515625223517418, "step": 729 }, { "clip_ratio": 0.0, - "completion_length": 1885.6831359863281, + "completion_length": 1552.6875305175781, "epoch": 0.21805690389067284, - "grad_norm": 0.15367430448532104, - "kl": 0.0125732421875, - "learning_rate": 9.581618758127144e-08, - "loss": 0.0426, - "reward": 0.4174107313156128, - "reward_std": 0.14263391215354204, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 9.52374267578125, + "kl": 3.7265625, + "learning_rate": 4.790809379063572e-07, + "loss": 0.3255, + "reward": 0.4369419887661934, + "reward_std": 0.19534089788794518, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250223517418, + "rewards/tag_count_reward": 0.345424123108387, "step": 730 }, { "clip_ratio": 0.0, - "completion_length": 1921.9197082519531, + "completion_length": 1571.6161499023438, "epoch": 0.2183556119781943, - "grad_norm": 0.14948955178260803, - "kl": 0.0123443603515625, - "learning_rate": 9.579527929965581e-08, - "loss": 0.0351, - "reward": 0.3638392984867096, - "reward_std": 0.09835939109325409, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 29.042966842651367, + "kl": 5.3984375, + "learning_rate": 4.78976396498279e-07, + "loss": 0.4659, + "reward": 0.3856026977300644, + "reward_std": 0.18712127581238747, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.3320312574505806, "step": 731 }, { "clip_ratio": 0.0, - "completion_length": 1913.7723999023438, + "completion_length": 1587.2880249023438, "epoch": 0.21865432006571578, - "grad_norm": 0.14623035490512848, - "kl": 0.012420654296875, - "learning_rate": 9.577432119726469e-08, - "loss": 0.047, - "reward": 0.4017857238650322, - "reward_std": 0.13954560086131096, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 53.375308990478516, + "kl": 5.578125, + "learning_rate": 4.788716059863235e-07, + "loss": 0.4411, + "reward": 0.399553582072258, + "reward_std": 0.21364906057715416, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.3281250074505806, "step": 732 }, { "clip_ratio": 0.0, - "completion_length": 1885.634033203125, + "completion_length": 1539.9375915527344, "epoch": 0.21895302815323725, - "grad_norm": 0.18024763464927673, - "kl": 0.0133514404296875, - "learning_rate": 9.57533132968985e-08, - "loss": 0.0676, - "reward": 0.4408482313156128, - "reward_std": 0.16392276249825954, - "rewards/accuracy_reward": 0.09821429289877415, + "grad_norm": 48.08573913574219, + "kl": 4.80859375, + "learning_rate": 4.787665664844925e-07, + "loss": 0.3709, + "reward": 0.4419643059372902, + "reward_std": 0.2109740674495697, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.357142873108387, "step": 733 }, { "clip_ratio": 0.0, - "completion_length": 1942.8572082519531, + "completion_length": 1576.7277221679688, "epoch": 0.21925173624075872, - "grad_norm": 0.14323367178440094, - "kl": 0.0120391845703125, - "learning_rate": 9.573225562141174e-08, - "loss": 0.0336, - "reward": 0.365513414144516, - "reward_std": 0.11247957497835159, - "rewards/accuracy_reward": 0.058035718044266105, + "grad_norm": 22.21114158630371, + "kl": 4.765625, + "learning_rate": 4.786612781070587e-07, + "loss": 0.4177, + "reward": 0.3794642984867096, + "reward_std": 0.19743217527866364, + "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.3281250149011612, "step": 734 }, { "clip_ratio": 0.0, - "completion_length": 1979.6518859863281, + "completion_length": 1646.8014221191406, "epoch": 0.2195504443282802, - "grad_norm": 0.13991086184978485, - "kl": 0.0112762451171875, - "learning_rate": 9.571114819371311e-08, - "loss": 0.0327, - "reward": 0.4369419738650322, - "reward_std": 0.16199872083961964, - "rewards/accuracy_reward": 0.1272321529686451, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "grad_norm": 89.7652816772461, + "kl": 5.9375, + "learning_rate": 4.785557409685656e-07, + "loss": 0.4095, + "reward": 0.4319196715950966, + "reward_std": 0.24269666522741318, + "rewards/accuracy_reward": 0.12500000861473382, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3046875149011612, "step": 735 }, { "clip_ratio": 0.0, - "completion_length": 1915.05810546875, + "completion_length": 1596.9152526855469, "epoch": 0.21984915241580166, - "grad_norm": 0.163910374045372, - "kl": 0.013092041015625, - "learning_rate": 9.568999103676542e-08, - "loss": 0.0255, - "reward": 0.3878348395228386, - "reward_std": 0.13606654107570648, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 18.36240005493164, + "kl": 4.35546875, + "learning_rate": 4.784499551838271e-07, + "loss": 0.3542, + "reward": 0.3956473395228386, + "reward_std": 0.20697414502501488, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.3331473395228386, "step": 736 }, { "clip_ratio": 0.0, - "completion_length": 1919.6831359863281, + "completion_length": 1573.8416137695312, "epoch": 0.22014786050332313, - "grad_norm": 0.1626885086297989, - "kl": 0.01312255859375, - "learning_rate": 9.566878417358558e-08, - "loss": 0.0475, - "reward": 0.4107143059372902, - "reward_std": 0.13858931697905064, - "rewards/accuracy_reward": 0.07812500465661287, + "grad_norm": 19.221790313720703, + "kl": 4.40234375, + "learning_rate": 4.783439208679279e-07, + "loss": 0.3774, + "reward": 0.3638393059372902, + "reward_std": 0.1808967962861061, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325892984867096, + "rewards/tag_count_reward": 0.3214285895228386, "step": 737 }, { "clip_ratio": 0.0, - "completion_length": 1909.6317749023438, + "completion_length": 1573.5603332519531, "epoch": 0.2204465685908446, - "grad_norm": 0.15074266493320465, - "kl": 0.0129547119140625, - "learning_rate": 9.564752762724458e-08, - "loss": 0.0385, - "reward": 0.3275669813156128, - "reward_std": 0.09000955987721682, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 13.014540672302246, + "kl": 3.96875, + "learning_rate": 4.782376381362229e-07, + "loss": 0.3688, + "reward": 0.3364955559372902, + "reward_std": 0.19221926480531693, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.3231026977300644, "step": 738 }, { "clip_ratio": 0.0, - "completion_length": 1897.2857971191406, + "completion_length": 1618.7344360351562, "epoch": 0.22074527667836608, - "grad_norm": 0.1702752411365509, - "kl": 0.0135955810546875, - "learning_rate": 9.562622142086746e-08, - "loss": 0.0594, - "reward": 0.4079241380095482, - "reward_std": 0.15723763592541218, - "rewards/accuracy_reward": 0.07142857438884676, + "grad_norm": 9.605168342590332, + "kl": 4.51953125, + "learning_rate": 4.781311071043373e-07, + "loss": 0.3848, + "reward": 0.3777901977300644, + "reward_std": 0.24306915700435638, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.336495541036129, + "rewards/tag_count_reward": 0.3108259066939354, "step": 739 }, { "clip_ratio": 0.0, - "completion_length": 1912.2657165527344, + "completion_length": 1596.5603332519531, "epoch": 0.22104398476588755, - "grad_norm": 0.16827943921089172, - "kl": 0.0131683349609375, - "learning_rate": 9.560486557763329e-08, - "loss": 0.0605, - "reward": 0.455357164144516, - "reward_std": 0.11586533114314079, - "rewards/accuracy_reward": 0.129464291036129, + "grad_norm": 39.075279235839844, + "kl": 3.22265625, + "learning_rate": 4.780243278881664e-07, + "loss": 0.3355, + "reward": 0.465959832072258, + "reward_std": 0.17779749631881714, + "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928805589676, + "rewards/tag_count_reward": 0.3431919887661934, "step": 740 }, { "clip_ratio": 0.0, - "completion_length": 1913.0625915527344, + "completion_length": 1542.259033203125, "epoch": 0.22134269285340902, - "grad_norm": 0.16229446232318878, - "kl": 0.013458251953125, - "learning_rate": 9.558346012077512e-08, - "loss": 0.0543, - "reward": 0.3945312649011612, - "reward_std": 0.16771608218550682, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 57.15321350097656, + "kl": 2.97265625, + "learning_rate": 4.779173006038756e-07, + "loss": 0.3336, + "reward": 0.3856026977300644, + "reward_std": 0.21280289441347122, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562723517418, + "rewards/tag_count_reward": 0.3498884066939354, "step": 741 }, { "clip_ratio": 0.0, - "completion_length": 1898.1094665527344, + "completion_length": 1579.43310546875, "epoch": 0.2216414009409305, - "grad_norm": 0.17017200589179993, - "kl": 0.014068603515625, - "learning_rate": 9.556200507358002e-08, - "loss": 0.0651, - "reward": 0.372767873108387, - "reward_std": 0.11340212263166904, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 42.650882720947266, + "kl": 3.0234375, + "learning_rate": 4.778100253679e-07, + "loss": 0.3209, + "reward": 0.3872768059372902, + "reward_std": 0.2000235915184021, + "rewards/accuracy_reward": 0.05580357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.3314732313156128, "step": 742 }, { "clip_ratio": 0.0, - "completion_length": 1962.4688415527344, + "completion_length": 1604.0491943359375, "epoch": 0.22194010902845193, - "grad_norm": 0.1501195877790451, - "kl": 0.012115478515625, - "learning_rate": 9.554050045938892e-08, - "loss": 0.0393, - "reward": 0.3113839402794838, - "reward_std": 0.0878664143383503, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 29.784175872802734, + "kl": 3.078125, + "learning_rate": 4.777025022969446e-07, + "loss": 0.295, + "reward": 0.3325893059372902, + "reward_std": 0.18082531541585922, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.3236607313156128, "step": 743 }, { "clip_ratio": 0.0, - "completion_length": 1885.7232971191406, + "completion_length": 1539.9911193847656, "epoch": 0.2222388171159734, - "grad_norm": 0.17050085961818695, - "kl": 0.0147552490234375, - "learning_rate": 9.551894630159679e-08, - "loss": 0.0486, - "reward": 0.5156250223517418, - "reward_std": 0.20103923231363297, - "rewards/accuracy_reward": 0.1584821562282741, + "grad_norm": 48.358585357666016, + "kl": 3.24609375, + "learning_rate": 4.775947315079839e-07, + "loss": 0.3469, + "reward": 0.498325914144516, + "reward_std": 0.2129516378045082, + "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357142873108387, + "rewards/tag_count_reward": 0.357700914144516, "step": 744 }, { "clip_ratio": 0.0, - "completion_length": 1882.44873046875, + "completion_length": 1528.5045166015625, "epoch": 0.22253752520349487, - "grad_norm": 0.13707450032234192, - "kl": 0.014312744140625, - "learning_rate": 9.549734262365241e-08, - "loss": 0.0361, - "reward": 0.3861607313156128, - "reward_std": 0.13406977429986, - "rewards/accuracy_reward": 0.05803571501746774, + "grad_norm": 29.147829055786133, + "kl": 3.35546875, + "learning_rate": 4.774867131182621e-07, + "loss": 0.3464, + "reward": 0.3800223395228386, + "reward_std": 0.2092142403125763, + "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.3353794738650322, "step": 745 }, { "clip_ratio": 0.0, - "completion_length": 1931.24560546875, + "completion_length": 1569.8348999023438, "epoch": 0.22283623329101634, - "grad_norm": 0.1548091024160385, - "kl": 0.01409912109375, - "learning_rate": 9.547568944905849e-08, - "loss": 0.0524, - "reward": 0.3671875223517418, - "reward_std": 0.1326606720685959, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 10.879770278930664, + "kl": 4.31640625, + "learning_rate": 4.773784472452924e-07, + "loss": 0.3961, + "reward": 0.3750000149011612, + "reward_std": 0.24804991483688354, + "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.3348214402794838, "step": 746 }, { "clip_ratio": 0.0, - "completion_length": 1916.9598999023438, + "completion_length": 1629.0268859863281, "epoch": 0.22313494137853782, - "grad_norm": 0.14044201374053955, - "kl": 0.012939453125, - "learning_rate": 9.545398680137154e-08, - "loss": 0.0311, - "reward": 0.436383955180645, - "reward_std": 0.12197289802134037, - "rewards/accuracy_reward": 0.11160714738070965, + "grad_norm": 36.54946517944336, + "kl": 3.03125, + "learning_rate": 4.772699340068576e-07, + "loss": 0.2981, + "reward": 0.4241071566939354, + "reward_std": 0.209950502961874, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247768059372902, + "rewards/tag_count_reward": 0.3236607238650322, "step": 747 }, { "clip_ratio": 0.0, - "completion_length": 1959.0916137695312, + "completion_length": 1612.2768249511719, "epoch": 0.2234336494660593, - "grad_norm": 0.15776781737804413, - "kl": 0.0129852294921875, - "learning_rate": 9.543223470420192e-08, - "loss": 0.0391, - "reward": 0.4168526977300644, - "reward_std": 0.10020343400537968, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 35.734275817871094, + "kl": 5.3828125, + "learning_rate": 4.771611735210096e-07, + "loss": 0.4463, + "reward": 0.4123884066939354, + "reward_std": 0.1869749054312706, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.3231026902794838, "step": 748 }, { "clip_ratio": 0.0, - "completion_length": 1867.6474304199219, + "completion_length": 1578.7120971679688, "epoch": 0.22373235755358076, - "grad_norm": 0.16402949392795563, - "kl": 0.015228271484375, - "learning_rate": 9.541043318121378e-08, - "loss": 0.0503, - "reward": 0.3816964477300644, - "reward_std": 0.1383823435753584, - "rewards/accuracy_reward": 0.040178573690354824, + "grad_norm": 59.725196838378906, + "kl": 5.7421875, + "learning_rate": 4.770521659060689e-07, + "loss": 0.4366, + "reward": 0.3699776977300644, + "reward_std": 0.23239155858755112, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.3231026902794838, "step": 749 }, { "clip_ratio": 0.0, - "completion_length": 1845.0224304199219, + "completion_length": 1533.1741638183594, "epoch": 0.22403106564110223, - "grad_norm": 0.16689199209213257, - "kl": 0.0157470703125, - "learning_rate": 9.538858225612503e-08, - "loss": 0.0598, - "reward": 0.4179687798023224, - "reward_std": 0.14432229101657867, - "rewards/accuracy_reward": 0.08482143562287092, + "grad_norm": 119.00836181640625, + "kl": 6.765625, + "learning_rate": 4.769429112806251e-07, + "loss": 0.4908, + "reward": 0.4118303805589676, + "reward_std": 0.24525899812579155, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.3225446566939354, "step": 750 }, { "clip_ratio": 0.0, - "completion_length": 1928.0513916015625, + "completion_length": 1605.9933776855469, "epoch": 0.2243297737286237, - "grad_norm": 0.15471433103084564, - "kl": 0.0134124755859375, - "learning_rate": 9.536668195270735e-08, - "loss": 0.0413, - "reward": 0.3699776977300644, - "reward_std": 0.16720163263380527, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 89.06428527832031, + "kl": 6.15625, + "learning_rate": 4.768334097635368e-07, + "loss": 0.4508, + "reward": 0.3353794813156128, + "reward_std": 0.21304966509342194, + "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.3085937649011612, "step": 751 }, { "clip_ratio": 0.0, - "completion_length": 1912.2411804199219, + "completion_length": 1620.4665832519531, "epoch": 0.22462848181614517, - "grad_norm": 0.14503803849220276, - "kl": 0.01416015625, - "learning_rate": 9.534473229478612e-08, - "loss": 0.0484, - "reward": 0.3956473469734192, - "reward_std": 0.13132288865745068, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 49.671043395996094, + "kl": 5.2890625, + "learning_rate": 4.767236614739306e-07, + "loss": 0.4134, + "reward": 0.3805803656578064, + "reward_std": 0.1868327260017395, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.3270089477300644, "step": 752 }, { "clip_ratio": 0.0, - "completion_length": 1925.6786804199219, + "completion_length": 1662.2634582519531, "epoch": 0.22492718990366664, - "grad_norm": 0.16114585101604462, - "kl": 0.013885498046875, - "learning_rate": 9.53227333062404e-08, - "loss": 0.0467, - "reward": 0.4514509066939354, - "reward_std": 0.13774323649704456, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 112.0594253540039, + "kl": 6.609375, + "learning_rate": 4.7661366653120204e-07, + "loss": 0.4643, + "reward": 0.4129464402794838, + "reward_std": 0.1965065486729145, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.3080357313156128, "step": 753 }, { "clip_ratio": 0.0, - "completion_length": 1890.0202026367188, + "completion_length": 1588.6473693847656, "epoch": 0.2252258979911881, - "grad_norm": 0.16277985274791718, - "kl": 0.0148468017578125, - "learning_rate": 9.530068501100297e-08, - "loss": 0.0479, - "reward": 0.4927455559372902, - "reward_std": 0.1664425954222679, - "rewards/accuracy_reward": 0.14508929336443543, + "grad_norm": 62.234649658203125, + "kl": 5.09375, + "learning_rate": 4.7650342505501484e-07, + "loss": 0.372, + "reward": 0.4559151977300644, + "reward_std": 0.21867735683918, + "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.3309151902794838, "step": 754 }, { "clip_ratio": 0.0, - "completion_length": 1982.930908203125, + "completion_length": 1623.1004943847656, "epoch": 0.22552460607870958, - "grad_norm": 0.15087684988975525, - "kl": 0.012451171875, - "learning_rate": 9.527858743306019e-08, - "loss": 0.0391, - "reward": 0.3069196566939354, - "reward_std": 0.09960964694619179, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 38.875701904296875, + "kl": 3.05859375, + "learning_rate": 4.763929371653009e-07, + "loss": 0.3064, + "reward": 0.3130580484867096, + "reward_std": 0.1689031682908535, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553656578064, + "rewards/tag_count_reward": 0.3108259066939354, "step": 755 }, { "clip_ratio": 0.0, - "completion_length": 1914.3348999023438, + "completion_length": 1610.5848999023438, "epoch": 0.22582331416623105, - "grad_norm": 0.1570606827735901, - "kl": 0.015045166015625, - "learning_rate": 9.525644059645206e-08, - "loss": 0.0539, - "reward": 0.384486623108387, - "reward_std": 0.14787418209016323, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 45.475440979003906, + "kl": 2.921875, + "learning_rate": 4.762822029822603e-07, + "loss": 0.2885, + "reward": 0.3738839402794838, + "reward_std": 0.20382528752088547, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.329241082072258, "step": 756 }, { "clip_ratio": 0.0, - "completion_length": 1975.72998046875, + "completion_length": 1690.5603332519531, "epoch": 0.22612202225375252, - "grad_norm": 0.14908480644226074, - "kl": 0.012359619140625, - "learning_rate": 9.523424452527216e-08, - "loss": 0.0356, - "reward": 0.3705357313156128, - "reward_std": 0.13654364831745625, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 32.88844299316406, + "kl": 3.09765625, + "learning_rate": 4.761712226263608e-07, + "loss": 0.2894, + "reward": 0.3738839477300644, + "reward_std": 0.23021108284592628, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3102678656578064, + "rewards/tag_count_reward": 0.297991082072258, "step": 757 }, { "clip_ratio": 0.0, - "completion_length": 1983.7657165527344, + "completion_length": 1685.0938110351562, "epoch": 0.226420730341274, - "grad_norm": 0.14534436166286469, - "kl": 0.0126800537109375, - "learning_rate": 9.521199924366765e-08, - "loss": 0.0399, - "reward": 0.4084821566939354, - "reward_std": 0.11202424392104149, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 42.43009567260742, + "kl": 4.52734375, + "learning_rate": 4.760599962183383e-07, + "loss": 0.3366, + "reward": 0.4017857387661934, + "reward_std": 0.1825690008699894, + "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.310267873108387, "step": 758 }, { "clip_ratio": 0.0, - "completion_length": 1893.0201416015625, + "completion_length": 1657.6317443847656, "epoch": 0.22671943842879547, - "grad_norm": 0.16674858331680298, - "kl": 0.014617919921875, - "learning_rate": 9.518970477583922e-08, - "loss": 0.0456, - "reward": 0.3292410895228386, - "reward_std": 0.08513031527400017, + "grad_norm": 26.399612426757812, + "kl": 4.33984375, + "learning_rate": 4.7594852387919607e-07, + "loss": 0.317, + "reward": 0.3169643059372902, + "reward_std": 0.16340947151184082, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.3125000149011612, "step": 759 }, { "clip_ratio": 0.0, - "completion_length": 1945.6943054199219, + "completion_length": 1637.7210388183594, "epoch": 0.22701814651631694, - "grad_norm": 0.16265098750591278, - "kl": 0.01446533203125, - "learning_rate": 9.516736114604106e-08, - "loss": 0.0367, - "reward": 0.4681919813156128, - "reward_std": 0.09287049062550068, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 3.753678560256958, + "kl": 3.84375, + "learning_rate": 4.758368057302053e-07, + "loss": 0.3228, + "reward": 0.4821428805589676, + "reward_std": 0.18982116132974625, + "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.3147321492433548, "step": 760 }, { "clip_ratio": 0.0, - "completion_length": 1900.6831359863281, + "completion_length": 1633.2210388183594, "epoch": 0.2273168546038384, - "grad_norm": 0.15729431807994843, - "kl": 0.01458740234375, - "learning_rate": 9.514496837858085e-08, - "loss": 0.05, - "reward": 0.3577009066939354, - "reward_std": 0.11257094610482454, - "rewards/accuracy_reward": 0.03125000232830644, + "grad_norm": 7.221460819244385, + "kl": 3.71875, + "learning_rate": 4.7572484189290424e-07, + "loss": 0.3145, + "reward": 0.3404017984867096, + "reward_std": 0.2006298042833805, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.3203125149011612, "step": 761 }, { "clip_ratio": 0.0, - "completion_length": 1903.4978942871094, + "completion_length": 1611.0067443847656, "epoch": 0.22761556269135988, - "grad_norm": 0.1401359587907791, - "kl": 0.014984130859375, - "learning_rate": 9.512252649781974e-08, - "loss": 0.0307, - "reward": 0.4659598395228386, - "reward_std": 0.1262490190565586, - "rewards/accuracy_reward": 0.13616072107106447, + "grad_norm": 6.448947906494141, + "kl": 3.70703125, + "learning_rate": 4.7561263248909866e-07, + "loss": 0.326, + "reward": 0.4648437723517418, + "reward_std": 0.1962817832827568, + "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.3242187649011612, "step": 762 }, { "clip_ratio": 0.0, - "completion_length": 1824.2322387695312, + "completion_length": 1521.7723999023438, "epoch": 0.22791427077888135, - "grad_norm": 0.16714246571063995, - "kl": 0.0176544189453125, - "learning_rate": 9.510003552817226e-08, - "loss": 0.0586, - "reward": 0.4508928805589676, - "reward_std": 0.10878060199320316, - "rewards/accuracy_reward": 0.1071428619325161, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500223517418, + "grad_norm": 17.45954704284668, + "kl": 3.74609375, + "learning_rate": 4.7550017764086126e-07, + "loss": 0.3607, + "reward": 0.4581473395228386, + "reward_std": 0.21560953557491302, + "rewards/accuracy_reward": 0.1160714365541935, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3398437723517418, "step": 763 }, { "clip_ratio": 0.0, - "completion_length": 1835.0157165527344, + "completion_length": 1551.5045166015625, "epoch": 0.22821297886640282, - "grad_norm": 0.1735662817955017, - "kl": 0.0171661376953125, - "learning_rate": 9.507749549410641e-08, - "loss": 0.0638, - "reward": 0.4933035969734192, - "reward_std": 0.14368967898190022, - "rewards/accuracy_reward": 0.13839286239817739, + "grad_norm": 9.536385536193848, + "kl": 4.9765625, + "learning_rate": 4.75387477470532e-07, + "loss": 0.4627, + "reward": 0.4469866305589676, + "reward_std": 0.16817675530910492, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.3376116305589676, "step": 764 }, { "clip_ratio": 0.0, - "completion_length": 1910.5000915527344, + "completion_length": 1631.1429138183594, "epoch": 0.22851168695392426, - "grad_norm": 0.16192205250263214, - "kl": 0.0157928466796875, - "learning_rate": 9.505490642014354e-08, - "loss": 0.043, - "reward": 0.432477705180645, - "reward_std": 0.15335247665643692, - "rewards/accuracy_reward": 0.09821429220028222, + "grad_norm": 30.501157760620117, + "kl": 4.90625, + "learning_rate": 4.7527453210071764e-07, + "loss": 0.3982, + "reward": 0.3850446566939354, + "reward_std": 0.20481175929307938, + "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.3136160895228386, "step": 765 }, { "clip_ratio": 0.0, - "completion_length": 1822.5201721191406, + "completion_length": 1486.0826416015625, "epoch": 0.22881039504144574, - "grad_norm": 0.15976662933826447, - "kl": 0.017333984375, - "learning_rate": 9.503226833085833e-08, - "loss": 0.0529, - "reward": 0.4933035895228386, - "reward_std": 0.1544018853455782, - "rewards/accuracy_reward": 0.1495535783469677, + "grad_norm": 49.96992874145508, + "kl": 5.0625, + "learning_rate": 4.751613416542917e-07, + "loss": 0.4095, + "reward": 0.487723246216774, + "reward_std": 0.25005093216896057, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.3515625149011612, "step": 766 }, { "clip_ratio": 0.0, - "completion_length": 1754.9397888183594, + "completion_length": 1432.9822082519531, "epoch": 0.2291091031289672, - "grad_norm": 0.17232824862003326, - "kl": 0.0195465087890625, - "learning_rate": 9.500958125087881e-08, - "loss": 0.0761, - "reward": 0.4966517984867096, - "reward_std": 0.16333625465631485, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 9.327000617980957, + "kl": 4.14453125, + "learning_rate": 4.7504790625439405e-07, + "loss": 0.3766, + "reward": 0.5000000074505806, + "reward_std": 0.21503648161888123, + "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716517984867096, + "rewards/tag_count_reward": 0.3616071566939354, "step": 767 }, { "clip_ratio": 0.0, - "completion_length": 1912.8036804199219, + "completion_length": 1548.3125915527344, "epoch": 0.22940781121648868, - "grad_norm": 0.17851951718330383, - "kl": 0.016448974609375, - "learning_rate": 9.498684520488631e-08, - "loss": 0.0541, - "reward": 0.3922991305589676, - "reward_std": 0.12506997771561146, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 29.45963478088379, + "kl": 3.85546875, + "learning_rate": 4.7493422602443156e-07, + "loss": 0.3849, + "reward": 0.3934151977300644, + "reward_std": 0.18705883249640465, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.342075914144516, "step": 768 }, { "clip_ratio": 0.0, - "completion_length": 1968.5759887695312, + "completion_length": 1619.2947082519531, "epoch": 0.22970651930401015, - "grad_norm": 0.1634359508752823, - "kl": 0.0141448974609375, - "learning_rate": 9.496406021761543e-08, - "loss": 0.0393, - "reward": 0.4503348395228386, - "reward_std": 0.1665748655796051, - "rewards/accuracy_reward": 0.13392857951112092, + "grad_norm": 13.68345832824707, + "kl": 4.23046875, + "learning_rate": 4.748203010880771e-07, + "loss": 0.3538, + "reward": 0.4804687798023224, + "reward_std": 0.21931473538279533, + "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.337611623108387, "step": 769 }, { "clip_ratio": 0.0, - "completion_length": 1870.6541137695312, + "completion_length": 1569.4398193359375, "epoch": 0.23000522739153162, - "grad_norm": 0.1872042715549469, - "kl": 0.017120361328125, - "learning_rate": 9.494122631385397e-08, - "loss": 0.0581, - "reward": 0.4665178805589676, - "reward_std": 0.12145893275737762, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 11.466922760009766, + "kl": 4.3203125, + "learning_rate": 4.747061315692698e-07, + "loss": 0.3548, + "reward": 0.4296875223517418, + "reward_std": 0.19274571537971497, + "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482142984867096, + "rewards/tag_count_reward": 0.3270089402794838, "step": 770 }, { "clip_ratio": 0.0, - "completion_length": 1976.0380554199219, + "completion_length": 1649.4241638183594, "epoch": 0.2303039354790531, - "grad_norm": 0.15355834364891052, - "kl": 0.0137939453125, - "learning_rate": 9.491834351844298e-08, - "loss": 0.0374, - "reward": 0.3666294813156128, - "reward_std": 0.12688155472278595, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 8.567787170410156, + "kl": 3.92578125, + "learning_rate": 4.745917175922149e-07, + "loss": 0.3335, + "reward": 0.3660714402794838, + "reward_std": 0.20628343150019646, + "rewards/accuracy_reward": 0.05803571594879031, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3108259066939354, + "rewards/tag_count_reward": 0.3080357313156128, "step": 771 }, { "clip_ratio": 0.0, - "completion_length": 1916.9465026855469, + "completion_length": 1638.1473999023438, "epoch": 0.23060264356657456, - "grad_norm": 0.15234388411045074, - "kl": 0.0149078369140625, - "learning_rate": 9.489541185627672e-08, - "loss": 0.0363, - "reward": 0.4034598395228386, - "reward_std": 0.09831984713673592, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 11.818560600280762, + "kl": 4.28125, + "learning_rate": 4.744770592813836e-07, + "loss": 0.3427, + "reward": 0.3995535895228386, + "reward_std": 0.18462025746703148, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.3191964402794838, "step": 772 }, { - "clip_ratio": 0.0, - "completion_length": 1936.5715026855469, - "epoch": 0.23090135165409603, - "grad_norm": 0.16363662481307983, - "kl": 0.01605224609375, - "learning_rate": 9.487243135230258e-08, - "loss": 0.0486, - "reward": 0.3989955559372902, - "reward_std": 0.17795461416244507, - "rewards/accuracy_reward": 0.06696429057046771, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "clip_ratio": 0.0, + "completion_length": 1551.8348999023438, + "epoch": 0.23090135165409603, + "grad_norm": 22.21766471862793, + "kl": 3.640625, + "learning_rate": 4.743621567615129e-07, + "loss": 0.3363, + "reward": 0.3671875223517418, + "reward_std": 0.21998000144958496, + "rewards/accuracy_reward": 0.03348214412108064, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3314732313156128, "step": 773 }, { "clip_ratio": 0.0, - "completion_length": 1901.3326721191406, + "completion_length": 1607.6272888183594, "epoch": 0.2312000597416175, - "grad_norm": 0.17654725909233093, - "kl": 0.016693115234375, - "learning_rate": 9.484940203152112e-08, - "loss": 0.0609, - "reward": 0.4084821566939354, - "reward_std": 0.08826135098934174, + "grad_norm": 21.037748336791992, + "kl": 4.09765625, + "learning_rate": 4.742470101576056e-07, + "loss": 0.3941, + "reward": 0.381138414144516, + "reward_std": 0.16437776759266853, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337053582072258, + "rewards/tag_count_reward": 0.309709832072258, "step": 774 }, { "clip_ratio": 0.0, - "completion_length": 1885.0335693359375, + "completion_length": 1606.3371276855469, "epoch": 0.23149876782913897, - "grad_norm": 0.18974317610263824, - "kl": 0.017730712890625, - "learning_rate": 9.482632391898595e-08, - "loss": 0.0484, - "reward": 0.463169664144516, - "reward_std": 0.13314994610846043, - "rewards/accuracy_reward": 0.12053571967408061, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "grad_norm": 4.347177028656006, + "kl": 4.0859375, + "learning_rate": 4.7413161959492976e-07, + "loss": 0.3588, + "reward": 0.4536830633878708, + "reward_std": 0.19590357691049576, + "rewards/accuracy_reward": 0.1272321492433548, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3242187649011612, "step": 775 }, { "clip_ratio": 0.0, - "completion_length": 1820.8817749023438, + "completion_length": 1500.8683471679688, "epoch": 0.23179747591666044, - "grad_norm": 0.16740970313549042, - "kl": 0.017791748046875, - "learning_rate": 9.48031970398038e-08, - "loss": 0.0452, - "reward": 0.470982164144516, - "reward_std": 0.17542513087391853, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 18.262115478515625, + "kl": 3.125, + "learning_rate": 4.74015985199019e-07, + "loss": 0.2909, + "reward": 0.4386160969734192, + "reward_std": 0.23202602565288544, + "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3593750223517418, + "rewards/tag_count_reward": 0.3537946566939354, "step": 776 }, { "clip_ratio": 0.0, - "completion_length": 1898.8639526367188, + "completion_length": 1602.9085693359375, "epoch": 0.23209618400418192, - "grad_norm": 0.15560893714427948, - "kl": 0.01678466796875, - "learning_rate": 9.478002141913447e-08, - "loss": 0.0548, - "reward": 0.372767873108387, - "reward_std": 0.10796140879392624, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 29.736433029174805, + "kl": 3.7265625, + "learning_rate": 4.7390010709567236e-07, + "loss": 0.3683, + "reward": 0.3660714477300644, + "reward_std": 0.1861266754567623, + "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.321428582072258, "step": 777 }, { "clip_ratio": 0.0, - "completion_length": 1800.9576416015625, + "completion_length": 1493.3973693847656, "epoch": 0.2323948920917034, - "grad_norm": 0.1693618893623352, - "kl": 0.019317626953125, - "learning_rate": 9.475679708219075e-08, - "loss": 0.0711, - "reward": 0.4799107387661934, - "reward_std": 0.18279476650059223, - "rewards/accuracy_reward": 0.11830357927829027, + "grad_norm": 9.16287612915039, + "kl": 4.00390625, + "learning_rate": 4.7378398541095376e-07, + "loss": 0.3557, + "reward": 0.4475446566939354, + "reward_std": 0.21805253997445107, + "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3616071566939354, + "rewards/tag_count_reward": 0.3404017984867096, "step": 778 }, { "clip_ratio": 0.0, - "completion_length": 1819.4487609863281, + "completion_length": 1479.3304138183594, "epoch": 0.23269360017922486, - "grad_norm": 0.15779538452625275, - "kl": 0.01873779296875, - "learning_rate": 9.473352405423843e-08, - "loss": 0.0705, + "grad_norm": 11.847085952758789, + "kl": 3.7890625, + "learning_rate": 4.736676202711922e-07, + "loss": 0.3868, "reward": 0.411830373108387, - "reward_std": 0.16860179230570793, - "rewards/accuracy_reward": 0.06473214505240321, + "reward_std": 0.2492023967206478, + "rewards/accuracy_reward": 0.06473214691504836, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982164144516, + "rewards/tag_count_reward": 0.3470982313156128, "step": 779 }, { "clip_ratio": 0.0, - "completion_length": 1913.0715026855469, + "completion_length": 1568.9041137695312, "epoch": 0.23299230826674633, - "grad_norm": 0.1515500694513321, - "kl": 0.017181396484375, - "learning_rate": 9.471020236059631e-08, - "loss": 0.046, - "reward": 0.4218750223517418, - "reward_std": 0.1323515810072422, - "rewards/accuracy_reward": 0.0758928582072258, + "grad_norm": 36.21589660644531, + "kl": 5.4921875, + "learning_rate": 4.7355101180298153e-07, + "loss": 0.4737, + "reward": 0.4107142984867096, + "reward_std": 0.22869988530874252, + "rewards/accuracy_reward": 0.08035715040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821566939354, + "rewards/tag_count_reward": 0.3303571566939354, "step": 780 }, { "clip_ratio": 0.0, - "completion_length": 1920.2545776367188, + "completion_length": 1622.7523193359375, "epoch": 0.2332910163542678, - "grad_norm": 0.166532501578331, - "kl": 0.016845703125, - "learning_rate": 9.468683202663608e-08, - "loss": 0.0535, - "reward": 0.407924123108387, - "reward_std": 0.11253480613231659, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 40.842498779296875, + "kl": 5.203125, + "learning_rate": 4.734341601331804e-07, + "loss": 0.4187, + "reward": 0.3945312723517418, + "reward_std": 0.19413265585899353, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.3119419813156128, "step": 781 }, { "clip_ratio": 0.0, - "completion_length": 1889.8036499023438, + "completion_length": 1591.3236999511719, "epoch": 0.23358972444178927, - "grad_norm": 0.1539001613855362, - "kl": 0.0167999267578125, - "learning_rate": 9.466341307778238e-08, - "loss": 0.0471, - "reward": 0.443638414144516, - "reward_std": 0.10448970831930637, - "rewards/accuracy_reward": 0.11383928847499192, + "grad_norm": 11.995438575744629, + "kl": 4.34765625, + "learning_rate": 4.733170653889119e-07, + "loss": 0.3508, + "reward": 0.4469866156578064, + "reward_std": 0.19743777066469193, + "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.3398437649011612, "step": 782 }, { "clip_ratio": 0.0, - "completion_length": 1869.0848999023438, + "completion_length": 1600.5603332519531, "epoch": 0.23388843252931074, - "grad_norm": 0.17222349345684052, - "kl": 0.0185546875, - "learning_rate": 9.463994553951276e-08, - "loss": 0.0526, - "reward": 0.4352678805589676, - "reward_std": 0.14167924225330353, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 32.80662155151367, + "kl": 4.9140625, + "learning_rate": 4.7319972769756376e-07, + "loss": 0.387, + "reward": 0.4051339477300644, + "reward_std": 0.20813065022230148, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821492433548, + "rewards/tag_count_reward": 0.3203125149011612, "step": 783 }, { "clip_ratio": 0.0, - "completion_length": 1894.0067749023438, + "completion_length": 1533.3884582519531, "epoch": 0.2341871406168322, - "grad_norm": 0.16152217984199524, - "kl": 0.018463134765625, - "learning_rate": 9.461642943735756e-08, - "loss": 0.0532, - "reward": 0.3738839402794838, - "reward_std": 0.1260356307029724, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 49.4031867980957, + "kl": 3.25390625, + "learning_rate": 4.730821471867878e-07, + "loss": 0.3438, + "reward": 0.3973214477300644, + "reward_std": 0.20391527190804482, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.352678582072258, "step": 784 }, { "clip_ratio": 0.0, - "completion_length": 1905.6317749023438, + "completion_length": 1595.3460693359375, "epoch": 0.23448584870435368, - "grad_norm": 0.1783648580312729, - "kl": 0.018157958984375, - "learning_rate": 9.459286479690002e-08, - "loss": 0.0686, - "reward": 0.4302455559372902, - "reward_std": 0.14437528885900974, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 23.638885498046875, + "kl": 3.921875, + "learning_rate": 4.729643239845001e-07, + "loss": 0.385, + "reward": 0.4196428805589676, + "reward_std": 0.1922941543161869, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276828289032, + "rewards/tag_count_reward": 0.3370535895228386, "step": 785 }, { "clip_ratio": 0.0, - "completion_length": 1933.0447387695312, + "completion_length": 1651.57373046875, "epoch": 0.23478455679187513, - "grad_norm": 0.14699284732341766, - "kl": 0.017333984375, - "learning_rate": 9.456925164377614e-08, - "loss": 0.0424, - "reward": 0.358258955180645, - "reward_std": 0.08102561999112368, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 17.6804141998291, + "kl": 4.65625, + "learning_rate": 4.728462582188807e-07, + "loss": 0.3864, + "reward": 0.348772332072258, + "reward_std": 0.17659789323806763, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.306361623108387, "step": 786 }, { "clip_ratio": 0.0, - "completion_length": 1849.5625610351562, + "completion_length": 1538.9264221191406, "epoch": 0.2350832648793966, - "grad_norm": 0.17708241939544678, - "kl": 0.018890380859375, - "learning_rate": 9.454559000367475e-08, - "loss": 0.0561, - "reward": 0.3861607313156128, - "reward_std": 0.11902189441025257, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 10.762480735778809, + "kl": 4.3046875, + "learning_rate": 4.7272795001837373e-07, + "loss": 0.3689, + "reward": 0.3677455559372902, + "reward_std": 0.2079564332962036, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3660714477300644, + "rewards/tag_count_reward": 0.3342634066939354, "step": 787 }, { "clip_ratio": 0.0, - "completion_length": 1817.5536499023438, + "completion_length": 1501.1272888183594, "epoch": 0.23538197296691807, - "grad_norm": 0.16661857068538666, - "kl": 0.02166748046875, - "learning_rate": 9.452187990233736e-08, - "loss": 0.0505, - "reward": 0.5524553954601288, - "reward_std": 0.16325197368860245, - "rewards/accuracy_reward": 0.196428582072258, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "grad_norm": 18.453758239746094, + "kl": 4.10546875, + "learning_rate": 4.7260939951168677e-07, + "loss": 0.383, + "reward": 0.5106026977300644, + "reward_std": 0.21142082661390305, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3320312574505806, "step": 788 }, { "clip_ratio": 0.0, - "completion_length": 1904.7567443847656, + "completion_length": 1579.0647888183594, "epoch": 0.23568068105443954, - "grad_norm": 0.1623302549123764, - "kl": 0.01800537109375, - "learning_rate": 9.449812136555824e-08, - "loss": 0.055, - "reward": 0.4140625223517418, - "reward_std": 0.160513149574399, - "rewards/accuracy_reward": 0.07366071664728224, + "grad_norm": 15.09395980834961, + "kl": 4.5546875, + "learning_rate": 4.7249060682779116e-07, + "loss": 0.3899, + "reward": 0.3839285895228386, + "reward_std": 0.22301950678229332, + "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.3214285895228386, "step": 789 }, { "clip_ratio": 0.0, - "completion_length": 1831.1318054199219, + "completion_length": 1492.9911193847656, "epoch": 0.235979389141961, - "grad_norm": 0.19437535107135773, - "kl": 0.021148681640625, - "learning_rate": 9.447431441918437e-08, - "loss": 0.0675, - "reward": 0.4754464477300644, - "reward_std": 0.1628873459994793, - "rewards/accuracy_reward": 0.1026785746216774, + "grad_norm": 35.8751220703125, + "kl": 3.74609375, + "learning_rate": 4.7237157209592185e-07, + "loss": 0.4094, + "reward": 0.4553571566939354, + "reward_std": 0.2220921292901039, + "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.3370535895228386, "step": 790 }, { "clip_ratio": 0.0, - "completion_length": 1932.3550109863281, + "completion_length": 1661.5804138183594, "epoch": 0.23627809722948248, - "grad_norm": 0.17474594712257385, - "kl": 0.01837158203125, - "learning_rate": 9.445045908911536e-08, - "loss": 0.0502, - "reward": 0.4972098544239998, - "reward_std": 0.1429674345999956, - "rewards/accuracy_reward": 0.1651785783469677, + "grad_norm": 7.326234817504883, + "kl": 4.40234375, + "learning_rate": 4.722522954455768e-07, + "loss": 0.3756, + "reward": 0.451450914144516, + "reward_std": 0.1889878623187542, + "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.301897332072258, "step": 791 }, { "clip_ratio": 0.0, - "completion_length": 1986.16748046875, + "completion_length": 1653.5648193359375, "epoch": 0.23657680531700395, - "grad_norm": 0.1449783891439438, - "kl": 0.0169525146484375, - "learning_rate": 9.442655540130346e-08, - "loss": 0.0399, - "reward": 0.3588169738650322, - "reward_std": 0.11751631461083889, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 22.318750381469727, + "kl": 5.0234375, + "learning_rate": 4.7213277700651733e-07, + "loss": 0.4096, + "reward": 0.3549107313156128, + "reward_std": 0.21945121884346008, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.3013392984867096, "step": 792 }, { "clip_ratio": 0.0, - "completion_length": 1961.6072387695312, + "completion_length": 1680.6986999511719, "epoch": 0.23687551340452542, - "grad_norm": 0.15609724819660187, - "kl": 0.0163726806640625, - "learning_rate": 9.440260338175357e-08, - "loss": 0.0336, - "reward": 0.3487723469734192, - "reward_std": 0.11693353205919266, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 46.658878326416016, + "kl": 4.9921875, + "learning_rate": 4.7201301690876787e-07, + "loss": 0.355, + "reward": 0.3325892984867096, + "reward_std": 0.2030538022518158, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187723517418, + "rewards/tag_count_reward": 0.305803582072258, "step": 793 }, { "clip_ratio": 0.0, - "completion_length": 1781.30810546875, + "completion_length": 1473.0782165527344, "epoch": 0.2371742214920469, - "grad_norm": 0.17595504224300385, - "kl": 0.021453857421875, - "learning_rate": 9.437860305652312e-08, - "loss": 0.0659, - "reward": 0.5558036044239998, - "reward_std": 0.22024289146065712, - "rewards/accuracy_reward": 0.1808035783469677, + "grad_norm": 12.9006986618042, + "kl": 3.78515625, + "learning_rate": 4.718930152826156e-07, + "loss": 0.3483, + "reward": 0.5150669813156128, + "reward_std": 0.25594663619995117, + "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000223517418, + "rewards/tag_count_reward": 0.3588169738650322, "step": 794 }, { "clip_ratio": 0.0, - "completion_length": 1888.6139221191406, + "completion_length": 1606.0491638183594, "epoch": 0.23747292957956836, - "grad_norm": 0.18436789512634277, - "kl": 0.01953125, - "learning_rate": 9.435455445172213e-08, - "loss": 0.0669, - "reward": 0.4564732387661934, - "reward_std": 0.15487120300531387, + "grad_norm": 17.206375122070312, + "kl": 4.75, + "learning_rate": 4.7177277225861064e-07, + "loss": 0.3999, + "reward": 0.4123884215950966, + "reward_std": 0.21441031619906425, "rewards/accuracy_reward": 0.10491072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625149011612, + "rewards/tag_count_reward": 0.3074776902794838, "step": 795 }, { "clip_ratio": 0.0, - "completion_length": 1793.4688415527344, + "completion_length": 1437.5514221191406, "epoch": 0.23777163766708984, - "grad_norm": 0.1718894988298416, - "kl": 0.02227783203125, - "learning_rate": 9.433045759351309e-08, - "loss": 0.0769, - "reward": 0.4319196566939354, - "reward_std": 0.15367234125733376, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 60.66862487792969, + "kl": 2.91796875, + "learning_rate": 4.7165228796756547e-07, + "loss": 0.3661, + "reward": 0.4101562723517418, + "reward_std": 0.20885229483246803, + "rewards/accuracy_reward": 0.053571432596072555, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3694196566939354, + "rewards/tag_count_reward": 0.3565848469734192, "step": 796 }, { "clip_ratio": 0.0, - "completion_length": 1817.7277526855469, + "completion_length": 1512.5826416015625, "epoch": 0.2380703457546113, - "grad_norm": 0.16404883563518524, - "kl": 0.021453857421875, - "learning_rate": 9.430631250811105e-08, - "loss": 0.0503, - "reward": 0.3694196492433548, - "reward_std": 0.11146962456405163, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 56.03329086303711, + "kl": 3.19921875, + "learning_rate": 4.7153156254055524e-07, + "loss": 0.3607, + "reward": 0.3459821566939354, + "reward_std": 0.1770428754389286, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3582589328289032, + "rewards/tag_count_reward": 0.3370535895228386, "step": 797 }, { "clip_ratio": 0.0, - "completion_length": 1891.2813110351562, + "completion_length": 1539.58935546875, "epoch": 0.23836905384213278, - "grad_norm": 0.1728137582540512, - "kl": 0.019683837890625, - "learning_rate": 9.428211922178349e-08, - "loss": 0.0398, - "reward": 0.4135044813156128, - "reward_std": 0.18281488679349422, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 40.79336929321289, + "kl": 3.14453125, + "learning_rate": 4.7141059610891743e-07, + "loss": 0.3168, + "reward": 0.3989955484867096, + "reward_std": 0.244227334856987, + "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3554687649011612, + "rewards/tag_count_reward": 0.340959832072258, "step": 798 }, { "clip_ratio": 0.0, - "completion_length": 1852.8125915527344, + "completion_length": 1579.1719360351562, "epoch": 0.23866776192965425, - "grad_norm": 0.1748778373003006, - "kl": 0.020416259765625, - "learning_rate": 9.425787776085031e-08, - "loss": 0.0598, - "reward": 0.494977705180645, - "reward_std": 0.1865651085972786, - "rewards/accuracy_reward": 0.1294642947614193, + "grad_norm": 11.844035148620605, + "kl": 4.140625, + "learning_rate": 4.7128938880425157e-07, + "loss": 0.3777, + "reward": 0.4402901977300644, + "reward_std": 0.23507649451494217, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3655134066939354, + "rewards/tag_count_reward": 0.3219866305589676, "step": 799 }, { "clip_ratio": 0.0, - "completion_length": 1782.1138916015625, + "completion_length": 1451.9308776855469, "epoch": 0.23896647001717572, - "grad_norm": 0.16633865237236023, - "kl": 0.023712158203125, - "learning_rate": 9.423358815168389e-08, - "loss": 0.058, - "reward": 0.4732143208384514, - "reward_std": 0.15156977344304323, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 20.141048431396484, + "kl": 3.82421875, + "learning_rate": 4.711679407584194e-07, + "loss": 0.3857, + "reward": 0.439732164144516, + "reward_std": 0.20673994347453117, + "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3705357387661934, + "rewards/tag_count_reward": 0.3415178805589676, "step": 800 }, { "clip_ratio": 0.0, - "completion_length": 1864.4286499023438, + "completion_length": 1562.0291137695312, "epoch": 0.2392651781046972, - "grad_norm": 0.1759883016347885, - "kl": 0.02056884765625, - "learning_rate": 9.42092504207089e-08, - "loss": 0.0665, - "reward": 0.4185268133878708, - "reward_std": 0.14582183957099915, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 19.112764358520508, + "kl": 3.69140625, + "learning_rate": 4.710462521035445e-07, + "loss": 0.3362, + "reward": 0.3878348395228386, + "reward_std": 0.17721455916762352, + "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3537946566939354, + "rewards/tag_count_reward": 0.3409598395228386, "step": 801 }, { "clip_ratio": 0.0, - "completion_length": 1796.555908203125, + "completion_length": 1535.1808776855469, "epoch": 0.23956388619221866, - "grad_norm": 0.2071204036474228, - "kl": 0.02325439453125, - "learning_rate": 9.418486459440243e-08, - "loss": 0.066, - "reward": 0.409040205180645, - "reward_std": 0.15016416274011135, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 34.41617202758789, + "kl": 4.96875, + "learning_rate": 4.709243229720122e-07, + "loss": 0.4204, + "reward": 0.3705357313156128, + "reward_std": 0.2245902381837368, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357700914144516, + "rewards/tag_count_reward": 0.3214285895228386, "step": 802 }, { "clip_ratio": 0.0, - "completion_length": 1810.4598999023438, + "completion_length": 1520.9219665527344, "epoch": 0.23986259427974013, - "grad_norm": 0.18300241231918335, - "kl": 0.0223388671875, - "learning_rate": 9.416043069929387e-08, - "loss": 0.0652, - "reward": 0.4905134066939354, - "reward_std": 0.1593169290572405, - "rewards/accuracy_reward": 0.11383929033763707, + "grad_norm": 18.671527862548828, + "kl": 4.44921875, + "learning_rate": 4.7080215349646934e-07, + "loss": 0.4096, + "reward": 0.4335937723517418, + "reward_std": 0.18695341795682907, + "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376674123108387, + "rewards/tag_count_reward": 0.3443080559372902, "step": 803 }, { "clip_ratio": 0.0, - "completion_length": 1882.8661499023438, + "completion_length": 1535.9554748535156, "epoch": 0.2401613023672616, - "grad_norm": 0.1619580090045929, - "kl": 0.020416259765625, - "learning_rate": 9.413594876196489e-08, - "loss": 0.0447, - "reward": 0.3973214477300644, - "reward_std": 0.1499914899468422, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 21.111595153808594, + "kl": 4.30859375, + "learning_rate": 4.7067974380982443e-07, + "loss": 0.359, + "reward": 0.3632812649011612, + "reward_std": 0.22077381238341331, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.3364955484867096, "step": 804 }, { "clip_ratio": 0.0, - "completion_length": 1835.9733276367188, + "completion_length": 1561.138427734375, "epoch": 0.24046001045478307, - "grad_norm": 0.1756931096315384, - "kl": 0.022125244140625, - "learning_rate": 9.411141880904944e-08, - "loss": 0.064, - "reward": 0.5145089402794838, - "reward_std": 0.13991664163768291, - "rewards/accuracy_reward": 0.15401786379516125, + "grad_norm": 31.330413818359375, + "kl": 5.0390625, + "learning_rate": 4.705570940452472e-07, + "loss": 0.4185, + "reward": 0.4620535969734192, + "reward_std": 0.22796199470758438, + "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3604910895228386, + "rewards/tag_count_reward": 0.3191964328289032, "step": 805 }, { "clip_ratio": 0.0, - "completion_length": 1776.8818054199219, + "completion_length": 1521.6630249023438, "epoch": 0.24075871854230455, - "grad_norm": 0.17932775616645813, - "kl": 0.023193359375, - "learning_rate": 9.408684086723373e-08, - "loss": 0.0677, - "reward": 0.5251116380095482, - "reward_std": 0.1644109096378088, - "rewards/accuracy_reward": 0.1495535746216774, + "grad_norm": 34.99658203125, + "kl": 5.0234375, + "learning_rate": 4.7043420433616865e-07, + "loss": 0.4141, + "reward": 0.4787946790456772, + "reward_std": 0.25611385703086853, + "rewards/accuracy_reward": 0.15401786682195961, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580484867096, + "rewards/tag_count_reward": 0.3247767984867096, "step": 806 }, { "clip_ratio": 0.0, - "completion_length": 1790.1942749023438, + "completion_length": 1560.6741638183594, "epoch": 0.24105742662982602, - "grad_norm": 0.18193674087524414, - "kl": 0.0228271484375, - "learning_rate": 9.40622149632561e-08, - "loss": 0.0675, - "reward": 0.4525669887661934, - "reward_std": 0.15398069471120834, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 29.833375930786133, + "kl": 4.984375, + "learning_rate": 4.7031107481628053e-07, + "loss": 0.4151, + "reward": 0.416294664144516, + "reward_std": 0.22185399010777473, + "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455484867096, + "rewards/tag_count_reward": 0.3270089402794838, "step": 807 }, { "clip_ratio": 0.0, - "completion_length": 1849.1831359863281, + "completion_length": 1542.5402526855469, "epoch": 0.24135613471734746, - "grad_norm": 0.1792202591896057, - "kl": 0.02197265625, - "learning_rate": 9.403754112390718e-08, - "loss": 0.0565, - "reward": 0.381696455180645, - "reward_std": 0.12337894923985004, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 24.508716583251953, + "kl": 5.1171875, + "learning_rate": 4.7018770561953593e-07, + "loss": 0.4476, + "reward": 0.3554687723517418, + "reward_std": 0.1981598436832428, + "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107387661934, + "rewards/tag_count_reward": 0.3309151977300644, "step": 808 }, { "clip_ratio": 0.0, - "completion_length": 1917.7947082519531, + "completion_length": 1612.4085388183594, "epoch": 0.24165484280486893, - "grad_norm": 0.17700213193893433, - "kl": 0.02142333984375, - "learning_rate": 9.401281937602965e-08, - "loss": 0.0599, - "reward": 0.3588169738650322, - "reward_std": 0.12942556850612164, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 25.000417709350586, + "kl": 5.125, + "learning_rate": 4.7006409688014823e-07, + "loss": 0.4508, + "reward": 0.3320312649011612, + "reward_std": 0.21699140593409538, + "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3498884066939354, + "rewards/tag_count_reward": 0.3007812649011612, "step": 809 }, { "clip_ratio": 0.0, - "completion_length": 1878.0648193359375, + "completion_length": 1585.2880249023438, "epoch": 0.2419535508923904, - "grad_norm": 0.18423734605312347, - "kl": 0.022491455078125, - "learning_rate": 9.398804974651835e-08, - "loss": 0.0762, - "reward": 0.428013414144516, - "reward_std": 0.1332500334829092, - "rewards/accuracy_reward": 0.08705357555299997, + "grad_norm": 19.80357551574707, + "kl": 3.87109375, + "learning_rate": 4.6994024873259174e-07, + "loss": 0.346, + "reward": 0.3911830484867096, + "reward_std": 0.1907978244125843, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.3108259066939354, "step": 810 }, { "clip_ratio": 0.0, - "completion_length": 1788.8393859863281, + "completion_length": 1495.6741638183594, "epoch": 0.24225225897991187, - "grad_norm": 0.20034590363502502, - "kl": 0.024749755859375, - "learning_rate": 9.396323226232024e-08, - "loss": 0.073, - "reward": 0.451450914144516, - "reward_std": 0.1380604412406683, - "rewards/accuracy_reward": 0.08705357275903225, + "grad_norm": 53.10260009765625, + "kl": 3.66796875, + "learning_rate": 4.698161613116012e-07, + "loss": 0.3968, + "reward": 0.423549123108387, + "reward_std": 0.21475359424948692, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3643973395228386, + "rewards/tag_count_reward": 0.3342634066939354, "step": 811 }, { "clip_ratio": 0.0, - "completion_length": 1843.1116943359375, + "completion_length": 1572.0246276855469, "epoch": 0.24255096706743334, - "grad_norm": 0.16509340703487396, - "kl": 0.021942138671875, - "learning_rate": 9.393836695043426e-08, - "loss": 0.0611, - "reward": 0.3950893059372902, - "reward_std": 0.12594765424728394, - "rewards/accuracy_reward": 0.03348214412108064, + "grad_norm": 38.460811614990234, + "kl": 3.2109375, + "learning_rate": 4.6969183475217135e-07, + "loss": 0.3144, + "reward": 0.356584832072258, + "reward_std": 0.1807586532086134, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.361607164144516, + "rewards/tag_count_reward": 0.3297991156578064, "step": 812 }, { "clip_ratio": 0.0, - "completion_length": 1852.1072692871094, + "completion_length": 1575.4688415527344, "epoch": 0.2428496751549548, - "grad_norm": 0.1772996038198471, - "kl": 0.022186279296875, - "learning_rate": 9.391345383791147e-08, - "loss": 0.0489, - "reward": 0.4229910969734192, - "reward_std": 0.13520164601504803, - "rewards/accuracy_reward": 0.06696428591385484, + "grad_norm": 25.007333755493164, + "kl": 3.671875, + "learning_rate": 4.695672691895574e-07, + "loss": 0.3311, + "reward": 0.3867187798023224, + "reward_std": 0.20576754212379456, + "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560268059372902, + "rewards/tag_count_reward": 0.3242187649011612, "step": 813 }, { "clip_ratio": 0.0, - "completion_length": 1893.5581359863281, + "completion_length": 1569.9554138183594, "epoch": 0.24314838324247628, - "grad_norm": 0.17084595561027527, - "kl": 0.022491455078125, - "learning_rate": 9.38884929518549e-08, - "loss": 0.0624, - "reward": 0.3872768059372902, - "reward_std": 0.16214704513549805, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 31.91322135925293, + "kl": 3.52734375, + "learning_rate": 4.694424647592745e-07, + "loss": 0.3552, + "reward": 0.3543526902794838, + "reward_std": 0.20612715929746628, + "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.3231026902794838, "step": 814 }, { "clip_ratio": 0.0, - "completion_length": 1899.884033203125, + "completion_length": 1570.6808471679688, "epoch": 0.24344709132999776, - "grad_norm": 0.18310651183128357, - "kl": 0.0224609375, - "learning_rate": 9.386348431941952e-08, - "loss": 0.0604, - "reward": 0.3967634066939354, - "reward_std": 0.14015235472470522, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 17.008459091186523, + "kl": 4.4609375, + "learning_rate": 4.693174215970976e-07, + "loss": 0.3895, + "reward": 0.368861623108387, + "reward_std": 0.2210695967078209, + "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.340959832072258, + "rewards/tag_count_reward": 0.3085937574505806, "step": 815 }, { "clip_ratio": 0.0, - "completion_length": 1869.0022888183594, + "completion_length": 1614.9130249023438, "epoch": 0.24374579941751923, - "grad_norm": 0.17780816555023193, - "kl": 0.023590087890625, - "learning_rate": 9.383842796781229e-08, - "loss": 0.0658, - "reward": 0.4581473469734192, - "reward_std": 0.1482618935406208, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 47.531349182128906, + "kl": 5.21875, + "learning_rate": 4.691921398390614e-07, + "loss": 0.4317, + "reward": 0.405133955180645, + "reward_std": 0.21911616995930672, + "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3376116156578064, + "rewards/tag_count_reward": 0.2979910895228386, "step": 816 }, { "clip_ratio": 0.0, - "completion_length": 1898.5692749023438, + "completion_length": 1596.9085693359375, "epoch": 0.2440445075050407, - "grad_norm": 0.17975354194641113, - "kl": 0.022247314453125, - "learning_rate": 9.381332392429204e-08, - "loss": 0.0488, - "reward": 0.4101562649011612, - "reward_std": 0.1346494872123003, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 27.729637145996094, + "kl": 3.40234375, + "learning_rate": 4.690666196214602e-07, + "loss": 0.3009, + "reward": 0.407924123108387, + "reward_std": 0.1909848153591156, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.3231026902794838, "step": 817 }, { "clip_ratio": 0.0, - "completion_length": 1990.1451721191406, + "completion_length": 1739.05810546875, "epoch": 0.24434321559256217, - "grad_norm": 0.1394050121307373, - "kl": 0.020172119140625, - "learning_rate": 9.378817221616955e-08, - "loss": 0.0323, - "reward": 0.310267873108387, - "reward_std": 0.09824738837778568, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 47.85274887084961, + "kl": 4.9453125, + "learning_rate": 4.689408610808477e-07, + "loss": 0.3431, + "reward": 0.294642873108387, + "reward_std": 0.20507322624325752, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3013392984867096, + "rewards/tag_count_reward": 0.2723214402794838, "step": 818 }, { "clip_ratio": 0.0, - "completion_length": 1922.7210693359375, + "completion_length": 1556.3772888183594, "epoch": 0.24464192368008364, - "grad_norm": 0.18408608436584473, - "kl": 0.022979736328125, - "learning_rate": 9.376297287080738e-08, - "loss": 0.056, - "reward": 0.4715401977300644, - "reward_std": 0.14331724867224693, - "rewards/accuracy_reward": 0.12053571850992739, + "grad_norm": 32.593231201171875, + "kl": 3.8984375, + "learning_rate": 4.6881486435403694e-07, + "loss": 0.3749, + "reward": 0.466517873108387, + "reward_std": 0.22306020930409431, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044813156128, + "rewards/tag_count_reward": 0.3281250074505806, "step": 819 }, { "clip_ratio": 0.0, - "completion_length": 1868.8416137695312, + "completion_length": 1637.7746276855469, "epoch": 0.2449406317676051, - "grad_norm": 0.153935506939888, - "kl": 0.022613525390625, - "learning_rate": 9.373772591561997e-08, - "loss": 0.0452, - "reward": 0.3978794813156128, - "reward_std": 0.1478666178882122, - "rewards/accuracy_reward": 0.058035718044266105, + "grad_norm": 37.256160736083984, + "kl": 4.6796875, + "learning_rate": 4.6868862957809983e-07, + "loss": 0.358, + "reward": 0.3677455484867096, + "reward_std": 0.24929990246891975, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.314174123108387, "step": 820 }, { "clip_ratio": 0.0, - "completion_length": 1886.6473999023438, + "completion_length": 1587.24560546875, "epoch": 0.24523933985512658, - "grad_norm": 0.16428199410438538, - "kl": 0.024078369140625, - "learning_rate": 9.371243137807351e-08, - "loss": 0.0597, - "reward": 0.3816964477300644, - "reward_std": 0.14197412505745888, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 10.461759567260742, + "kl": 3.78515625, + "learning_rate": 4.685621568903676e-07, + "loss": 0.3312, + "reward": 0.3325893133878708, + "reward_std": 0.22354182228446007, + "rewards/accuracy_reward": 0.033482143422588706, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337053582072258, + "rewards/tag_count_reward": 0.2991071492433548, "step": 821 }, { "clip_ratio": 0.0, - "completion_length": 1840.4844665527344, + "completion_length": 1577.0647888183594, "epoch": 0.24553804794264805, - "grad_norm": 0.16401003301143646, - "kl": 0.024627685546875, - "learning_rate": 9.3687089285686e-08, - "loss": 0.0618, - "reward": 0.4168526902794838, - "reward_std": 0.1097372304648161, - "rewards/accuracy_reward": 0.06919643026776612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562574505806, + "grad_norm": 5.627130508422852, + "kl": 4.0078125, + "learning_rate": 4.6843544642843003e-07, + "loss": 0.3336, + "reward": 0.4001116305589676, + "reward_std": 0.18443744257092476, + "rewards/accuracy_reward": 0.058035717345774174, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3398437574505806, "step": 822 }, { "clip_ratio": 0.0, - "completion_length": 1822.2523193359375, + "completion_length": 1540.18310546875, "epoch": 0.24583675603016952, - "grad_norm": 0.1873084455728531, - "kl": 0.0252685546875, - "learning_rate": 9.366169966602714e-08, - "loss": 0.0637, - "reward": 0.4453125223517418, - "reward_std": 0.12569161131978035, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 26.076204299926758, + "kl": 5.05078125, + "learning_rate": 4.683084983301357e-07, + "loss": 0.4337, + "reward": 0.391741082072258, + "reward_std": 0.2098359428346157, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "rewards/tag_count_reward": 0.3203125149011612, "step": 823 }, { "clip_ratio": 0.0, - "completion_length": 1883.8103637695312, + "completion_length": 1632.0960388183594, "epoch": 0.246135464117691, - "grad_norm": 0.18223102390766144, - "kl": 0.024261474609375, - "learning_rate": 9.363626254671835e-08, - "loss": 0.0511, - "reward": 0.4162946715950966, - "reward_std": 0.1677427627146244, + "grad_norm": 59.55546951293945, + "kl": 5.28125, + "learning_rate": 4.681813127335917e-07, + "loss": 0.3974, + "reward": 0.376116082072258, + "reward_std": 0.24109845235943794, "rewards/accuracy_reward": 0.06696428707800806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.3091517984867096, "step": 824 }, { "clip_ratio": 0.0, - "completion_length": 1870.7858276367188, + "completion_length": 1578.1384582519531, "epoch": 0.24643417220521247, - "grad_norm": 0.17161820828914642, - "kl": 0.0238037109375, - "learning_rate": 9.36107779554327e-08, - "loss": 0.0657, - "reward": 0.3833705484867096, - "reward_std": 0.14049156941473484, - "rewards/accuracy_reward": 0.029017857741564512, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526902794838, + "grad_norm": 6.068005561828613, + "kl": 4.609375, + "learning_rate": 4.680538897771635e-07, + "loss": 0.414, + "reward": 0.3219866156578064, + "reward_std": 0.1817588433623314, + "rewards/accuracy_reward": 0.008928572060540318, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.313058041036129, "step": 825 }, { "clip_ratio": 0.0, - "completion_length": 1883.3125915527344, + "completion_length": 1615.4978332519531, "epoch": 0.24673288029273394, - "grad_norm": 0.1845966875553131, - "kl": 0.0252685546875, - "learning_rate": 9.358524591989499e-08, - "loss": 0.058, - "reward": 0.5111607387661934, - "reward_std": 0.1314643044024706, - "rewards/accuracy_reward": 0.15625000838190317, + "grad_norm": 36.44014358520508, + "kl": 4.6875, + "learning_rate": 4.6792622959947493e-07, + "loss": 0.3683, + "reward": 0.4648437723517418, + "reward_std": 0.18514490500092506, + "rewards/accuracy_reward": 0.15178572246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.3130580484867096, "step": 826 }, { "clip_ratio": 0.0, - "completion_length": 1891.0335693359375, + "completion_length": 1645.665283203125, "epoch": 0.2470315883802554, - "grad_norm": 0.17926642298698425, - "kl": 0.02447509765625, - "learning_rate": 9.355966646788151e-08, - "loss": 0.0417, - "reward": 0.4419643059372902, - "reward_std": 0.15141993761062622, - "rewards/accuracy_reward": 0.09598214598372579, + "grad_norm": 36.95512008666992, + "kl": 3.15625, + "learning_rate": 4.677983323394075e-07, + "loss": 0.2974, + "reward": 0.385044664144516, + "reward_std": 0.20364565402269363, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345982164144516, + "rewards/tag_count_reward": 0.2957589402794838, "step": 827 }, { "clip_ratio": 0.0, - "completion_length": 1789.4465026855469, + "completion_length": 1528.716552734375, "epoch": 0.24733029646777688, - "grad_norm": 0.1916741579771042, - "kl": 0.02642822265625, - "learning_rate": 9.353403962722021e-08, - "loss": 0.0651, - "reward": 0.4291294887661934, - "reward_std": 0.14656298235058784, - "rewards/accuracy_reward": 0.05357143213041127, + "grad_norm": 79.9479751586914, + "kl": 2.57421875, + "learning_rate": 4.676701981361011e-07, + "loss": 0.3113, + "reward": 0.3811383992433548, + "reward_std": 0.19384190067648888, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580484867096, + "rewards/tag_count_reward": 0.3409598395228386, "step": 828 }, { "clip_ratio": 0.0, - "completion_length": 1859.2723999023438, + "completion_length": 1628.4219360351562, "epoch": 0.24762900455529832, - "grad_norm": 0.17177750170230865, - "kl": 0.025604248046875, - "learning_rate": 9.350836542579059e-08, - "loss": 0.0412, - "reward": 0.4525669813156128, - "reward_std": 0.13601325172930956, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 13.317482948303223, + "kl": 4.09765625, + "learning_rate": 4.6754182712895296e-07, + "loss": 0.3446, + "reward": 0.4034598395228386, + "reward_std": 0.17611131817102432, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3521205484867096, + "rewards/tag_count_reward": 0.3208705484867096, "step": 829 }, { "clip_ratio": 0.0, - "completion_length": 1862.0759887695312, + "completion_length": 1558.9397583007812, "epoch": 0.2479277126428198, - "grad_norm": 0.19035989046096802, - "kl": 0.025543212890625, - "learning_rate": 9.348264389152369e-08, - "loss": 0.075, - "reward": 0.4051339477300644, - "reward_std": 0.11940881423652172, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 64.40067291259766, + "kl": 2.71484375, + "learning_rate": 4.6741321945761845e-07, + "loss": 0.3161, + "reward": 0.3699776902794838, + "reward_std": 0.18701285868883133, + "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.3186383992433548, "step": 830 }, { "clip_ratio": 0.0, - "completion_length": 1833.8951721191406, + "completion_length": 1620.02685546875, "epoch": 0.24822642073034126, - "grad_norm": 0.20269694924354553, - "kl": 0.025787353515625, - "learning_rate": 9.345687505240197e-08, - "loss": 0.0673, - "reward": 0.424665205180645, - "reward_std": 0.13641116209328175, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 11.651524543762207, + "kl": 4.09375, + "learning_rate": 4.6728437526200987e-07, + "loss": 0.3476, + "reward": 0.3705357313156128, + "reward_std": 0.20570750162005424, + "rewards/accuracy_reward": 0.0625000016298145, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.3080357313156128, "step": 831 }, { "clip_ratio": 0.0, - "completion_length": 1863.6786804199219, + "completion_length": 1573.74560546875, "epoch": 0.24852512881786273, - "grad_norm": 0.15887288749217987, - "kl": 0.025665283203125, - "learning_rate": 9.343105893645944e-08, - "loss": 0.0485, - "reward": 0.3984375149011612, - "reward_std": 0.11792740970849991, - "rewards/accuracy_reward": 0.0602678619325161, + "grad_norm": 12.349356651306152, + "kl": 3.87109375, + "learning_rate": 4.671552946822972e-07, + "loss": 0.3305, + "reward": 0.3906250149011612, + "reward_std": 0.17414593696594238, + "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.3303571566939354, "step": 832 }, { "clip_ratio": 0.0, - "completion_length": 1882.7322387695312, + "completion_length": 1655.5514221191406, "epoch": 0.2488238369053842, - "grad_norm": 0.1699819713830948, - "kl": 0.02520751953125, - "learning_rate": 9.340519557178148e-08, - "loss": 0.056, - "reward": 0.392299123108387, - "reward_std": 0.14569388516247272, - "rewards/accuracy_reward": 0.03348214412108064, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169813156128, + "grad_norm": 16.102136611938477, + "kl": 3.734375, + "learning_rate": 4.670259778589074e-07, + "loss": 0.3285, + "reward": 0.3398437649011612, + "reward_std": 0.1950475201010704, + "rewards/accuracy_reward": 0.015625000931322575, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3219866305589676, "step": 833 }, { "clip_ratio": 0.0, - "completion_length": 1852.6920776367188, + "completion_length": 1604.5425109863281, "epoch": 0.24912254499290568, - "grad_norm": 0.18929535150527954, - "kl": 0.0267333984375, - "learning_rate": 9.33792849865049e-08, - "loss": 0.0791, - "reward": 0.4453125298023224, - "reward_std": 0.11823303624987602, - "rewards/accuracy_reward": 0.09151786006987095, + "grad_norm": 22.57285499572754, + "kl": 4.8828125, + "learning_rate": 4.668964249325245e-07, + "loss": 0.3996, + "reward": 0.3811384066939354, + "reward_std": 0.1889372430741787, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3537946566939354, + "rewards/tag_count_reward": 0.3186384066939354, "step": 834 }, { "clip_ratio": 0.0, - "completion_length": 1892.3527526855469, + "completion_length": 1655.0111999511719, "epoch": 0.24942125308042715, - "grad_norm": 0.19037695229053497, - "kl": 0.025543212890625, - "learning_rate": 9.335332720881789e-08, - "loss": 0.0651, - "reward": 0.4040178805589676, - "reward_std": 0.14025508426129818, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 98.17493438720703, + "kl": 6.484375, + "learning_rate": 4.6676663604408946e-07, + "loss": 0.4829, + "reward": 0.3582589477300644, + "reward_std": 0.1953287236392498, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482142984867096, + "rewards/tag_count_reward": 0.3002232313156128, "step": 835 }, { "clip_ratio": 0.0, - "completion_length": 1766.0759887695312, + "completion_length": 1461.2500610351562, "epoch": 0.24971996116794862, - "grad_norm": 0.16499406099319458, - "kl": 0.029296875, - "learning_rate": 9.332732226695995e-08, - "loss": 0.042, - "reward": 0.4330357387661934, - "reward_std": 0.12663850001990795, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 93.57233428955078, + "kl": 2.65234375, + "learning_rate": 4.6663661133479977e-07, + "loss": 0.3361, + "reward": 0.4034598469734192, + "reward_std": 0.17028777301311493, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3816964402794838, + "rewards/tag_count_reward": 0.3632812723517418, "step": 836 }, { "clip_ratio": 0.0, - "completion_length": 1770.37060546875, + "completion_length": 1485.4978332519531, "epoch": 0.2500186692554701, - "grad_norm": 0.1797592043876648, - "kl": 0.028533935546875, - "learning_rate": 9.330127018922194e-08, - "loss": 0.0586, - "reward": 0.4709821566939354, - "reward_std": 0.15466641634702682, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 20.224430084228516, + "kl": 4.08203125, + "learning_rate": 4.6650635094610966e-07, + "loss": 0.3676, + "reward": 0.4224330484867096, + "reward_std": 0.19755249843001366, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000149011612, + "rewards/tag_count_reward": 0.3510044813156128, "step": 837 }, { "clip_ratio": 0.0, - "completion_length": 1819.1809387207031, + "completion_length": 1581.3728332519531, "epoch": 0.2503173773429916, - "grad_norm": 0.1748066246509552, - "kl": 0.02752685546875, - "learning_rate": 9.327517100394592e-08, - "loss": 0.0619, - "reward": 0.5312500149011612, - "reward_std": 0.17322255671024323, - "rewards/accuracy_reward": 0.1607142947614193, + "grad_norm": 26.901081085205078, + "kl": 4.82421875, + "learning_rate": 4.663758550197296e-07, + "loss": 0.4069, + "reward": 0.4570312649011612, + "reward_std": 0.20708920061588287, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3705357238650322, + "rewards/tag_count_reward": 0.329799123108387, "step": 838 }, { "clip_ratio": 0.0, - "completion_length": 1825.0045471191406, + "completion_length": 1548.9308776855469, "epoch": 0.25061608543051306, - "grad_norm": 0.18635889887809753, - "kl": 0.027069091796875, - "learning_rate": 9.324902473952527e-08, - "loss": 0.0563, - "reward": 0.4095982387661934, - "reward_std": 0.10329004004597664, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 26.57581329345703, + "kl": 4.27734375, + "learning_rate": 4.6624512369762637e-07, + "loss": 0.4071, + "reward": 0.376674123108387, + "reward_std": 0.1624312475323677, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716518059372902, + "rewards/tag_count_reward": 0.3364955559372902, "step": 839 }, { "clip_ratio": 0.0, - "completion_length": 1759.6139221191406, + "completion_length": 1545.5000610351562, "epoch": 0.2509147935180345, - "grad_norm": 0.19692619144916534, - "kl": 0.02972412109375, - "learning_rate": 9.322283142440458e-08, - "loss": 0.0821, - "reward": 0.5212053805589676, - "reward_std": 0.12248857878148556, - "rewards/accuracy_reward": 0.1361607201397419, + "grad_norm": 23.26570701599121, + "kl": 4.6875, + "learning_rate": 4.661141571220229e-07, + "loss": 0.3903, + "reward": 0.455357164144516, + "reward_std": 0.18889540433883667, + "rewards/accuracy_reward": 0.12053571967408061, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.385044664144516, + "rewards/tag_count_reward": 0.3348214477300644, "step": 840 }, { "clip_ratio": 0.0, - "completion_length": 1832.1451416015625, + "completion_length": 1576.0848999023438, "epoch": 0.25121350160555594, - "grad_norm": 0.17748260498046875, - "kl": 0.0269775390625, - "learning_rate": 9.319659108707958e-08, - "loss": 0.0556, - "reward": 0.4229910895228386, - "reward_std": 0.13180895894765854, - "rewards/accuracy_reward": 0.05580357555299997, + "grad_norm": 10.82313346862793, + "kl": 4.078125, + "learning_rate": 4.659829554353979e-07, + "loss": 0.3409, + "reward": 0.3755580484867096, + "reward_std": 0.18060410022735596, + "rewards/accuracy_reward": 0.037946428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3671875149011612, + "rewards/tag_count_reward": 0.3376116156578064, "step": 841 }, { "clip_ratio": 0.0, - "completion_length": 1864.6697082519531, + "completion_length": 1595.7389221191406, "epoch": 0.2515122096930774, - "grad_norm": 0.20205603539943695, - "kl": 0.02789306640625, - "learning_rate": 9.31703037560972e-08, - "loss": 0.0816, - "reward": 0.4291294738650322, - "reward_std": 0.16667439602315426, - "rewards/accuracy_reward": 0.06696428591385484, + "grad_norm": 61.55778503417969, + "kl": 5.9921875, + "learning_rate": 4.65851518780486e-07, + "loss": 0.4848, + "reward": 0.398995541036129, + "reward_std": 0.22408530116081238, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3621651977300644, + "rewards/tag_count_reward": 0.3164062649011612, "step": 842 }, { "clip_ratio": 0.0, - "completion_length": 1768.8817749023438, + "completion_length": 1510.1272888183594, "epoch": 0.2518109177805989, - "grad_norm": 0.1749260127544403, - "kl": 0.02874755859375, - "learning_rate": 9.31439694600555e-08, - "loss": 0.0631, - "reward": 0.4229910895228386, - "reward_std": 0.10713176801800728, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 31.55916976928711, + "kl": 5.5625, + "learning_rate": 4.6571984730027746e-07, + "loss": 0.4475, + "reward": 0.385044664144516, + "reward_std": 0.1869351491332054, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3761160895228386, + "rewards/tag_count_reward": 0.3314732238650322, "step": 843 }, { "clip_ratio": 0.0, - "completion_length": 1814.5447387695312, + "completion_length": 1548.4710388183594, "epoch": 0.25210962586812036, - "grad_norm": 0.17627635598182678, - "kl": 0.028778076171875, - "learning_rate": 9.311758822760356e-08, - "loss": 0.0633, - "reward": 0.4268973395228386, - "reward_std": 0.11982248164713383, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 25.668895721435547, + "kl": 5.0859375, + "learning_rate": 4.655879411380178e-07, + "loss": 0.4342, + "reward": 0.3989955484867096, + "reward_std": 0.2041613571345806, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.364397332072258, + "rewards/tag_count_reward": 0.3320312649011612, "step": 844 }, { "clip_ratio": 0.0, - "completion_length": 1752.5045166015625, + "completion_length": 1518.5960693359375, "epoch": 0.25240833395564183, - "grad_norm": 0.21078191697597504, - "kl": 0.0306396484375, - "learning_rate": 9.309116008744163e-08, - "loss": 0.0727, - "reward": 0.5150669887661934, - "reward_std": 0.12844160199165344, - "rewards/accuracy_reward": 0.13839286309666932, + "grad_norm": 18.992769241333008, + "kl": 4.625, + "learning_rate": 4.654558004372081e-07, + "loss": 0.431, + "reward": 0.4559151977300644, + "reward_std": 0.16763852536678314, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376674123108387, + "rewards/tag_count_reward": 0.337611623108387, "step": 845 }, { "clip_ratio": 0.0, - "completion_length": 1804.3773193359375, + "completion_length": 1540.0514221191406, "epoch": 0.2527070420431633, - "grad_norm": 0.1890835016965866, - "kl": 0.02978515625, - "learning_rate": 9.306468506832089e-08, - "loss": 0.0657, - "reward": 0.4308035969734192, - "reward_std": 0.17982682585716248, - "rewards/accuracy_reward": 0.06473214505240321, + "grad_norm": 11.999451637268066, + "kl": 4.16015625, + "learning_rate": 4.6532342534160444e-07, + "loss": 0.3618, + "reward": 0.4263393059372902, + "reward_std": 0.22428957745432854, + "rewards/accuracy_reward": 0.07142857601866126, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3660714402794838, + "rewards/tag_count_reward": 0.3549107238650322, "step": 846 }, { "clip_ratio": 0.0, - "completion_length": 1836.4621276855469, + "completion_length": 1566.72998046875, "epoch": 0.25300575013068477, - "grad_norm": 0.2003653645515442, - "kl": 0.0294189453125, - "learning_rate": 9.303816319904361e-08, - "loss": 0.074, - "reward": 0.3789062723517418, - "reward_std": 0.11367790587246418, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 10.131855964660645, + "kl": 4.37109375, + "learning_rate": 4.65190815995218e-07, + "loss": 0.3703, + "reward": 0.3643973395228386, + "reward_std": 0.1941681168973446, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3454241305589676, + "rewards/tag_count_reward": 0.3242187649011612, "step": 847 }, { "clip_ratio": 0.0, - "completion_length": 1905.2925109863281, + "completion_length": 1625.2857666015625, "epoch": 0.25330445821820624, - "grad_norm": 0.16319864988327026, - "kl": 0.027069091796875, - "learning_rate": 9.301159450846294e-08, - "loss": 0.0502, - "reward": 0.4363839477300644, - "reward_std": 0.11594457551836967, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 31.050949096679688, + "kl": 3.9375, + "learning_rate": 4.650579725423147e-07, + "loss": 0.3689, + "reward": 0.417410746216774, + "reward_std": 0.18708643317222595, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.3169643059372902, "step": 848 }, { "clip_ratio": 0.0, - "completion_length": 1834.0201721191406, + "completion_length": 1606.1965026855469, "epoch": 0.2536031663057277, - "grad_norm": 0.17172440886497498, - "kl": 0.029693603515625, - "learning_rate": 9.298497902548305e-08, - "loss": 0.0577, - "reward": 0.4056919887661934, - "reward_std": 0.15666073374450207, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 28.903181076049805, + "kl": 4.58984375, + "learning_rate": 4.649248951274153e-07, + "loss": 0.3544, + "reward": 0.361607164144516, + "reward_std": 0.20924041792750359, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812649011612, + "rewards/tag_count_reward": 0.325892873108387, "step": 849 }, { "clip_ratio": 0.0, - "completion_length": 1883.9375915527344, + "completion_length": 1578.0179443359375, "epoch": 0.2539018743932492, - "grad_norm": 0.18048697710037231, - "kl": 0.0277099609375, - "learning_rate": 9.295831677905896e-08, - "loss": 0.0519, - "reward": 0.4575893133878708, - "reward_std": 0.15983187407255173, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 28.99771499633789, + "kl": 3.41796875, + "learning_rate": 4.647915838952948e-07, + "loss": 0.3073, + "reward": 0.4202009066939354, + "reward_std": 0.19332637637853622, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107387661934, + "rewards/tag_count_reward": 0.3309151828289032, "step": 850 }, { "clip_ratio": 0.0, - "completion_length": 1807.4889221191406, + "completion_length": 1596.38623046875, "epoch": 0.25420058248077065, - "grad_norm": 0.21416670083999634, - "kl": 0.030792236328125, - "learning_rate": 9.293160779819658e-08, - "loss": 0.0537, - "reward": 0.5351562723517418, - "reward_std": 0.17235197499394417, - "rewards/accuracy_reward": 0.16071429569274187, + "grad_norm": 47.24482727050781, + "kl": 3.6875, + "learning_rate": 4.6465803899098287e-07, + "loss": 0.379, + "reward": 0.479910746216774, + "reward_std": 0.21462104842066765, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419738650322, + "rewards/tag_count_reward": 0.3370535895228386, "step": 851 }, { "clip_ratio": 0.0, - "completion_length": 1795.8103637695312, + "completion_length": 1608.7567749023438, "epoch": 0.2544992905682921, - "grad_norm": 0.21063025295734406, - "kl": 0.0303955078125, - "learning_rate": 9.290485211195265e-08, - "loss": 0.0733, - "reward": 0.4994419887661934, - "reward_std": 0.16337705217301846, - "rewards/accuracy_reward": 0.10937500791624188, + "grad_norm": 45.97939682006836, + "kl": 3.7578125, + "learning_rate": 4.645242605597633e-07, + "loss": 0.3593, + "reward": 0.4369419738650322, + "reward_std": 0.20487766340374947, + "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669738650322, + "rewards/tag_count_reward": 0.3342634066939354, "step": 852 }, { "clip_ratio": 0.0, - "completion_length": 1802.9598999023438, + "completion_length": 1534.0915832519531, "epoch": 0.2547979986558136, - "grad_norm": 0.18265041708946228, - "kl": 0.031890869140625, - "learning_rate": 9.287804974943478e-08, - "loss": 0.0591, - "reward": 0.4927455708384514, - "reward_std": 0.16988563165068626, - "rewards/accuracy_reward": 0.11830357741564512, + "grad_norm": 51.74606704711914, + "kl": 3.6640625, + "learning_rate": 4.6439024874717384e-07, + "loss": 0.3804, + "reward": 0.4854911044239998, + "reward_std": 0.24209454283118248, + "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419887661934, + "rewards/tag_count_reward": 0.3426339402794838, "step": 853 }, { "clip_ratio": 0.0, - "completion_length": 1839.22998046875, + "completion_length": 1588.6920471191406, "epoch": 0.25509670674333507, - "grad_norm": 0.21398518979549408, - "kl": 0.03167724609375, - "learning_rate": 9.285120073980126e-08, - "loss": 0.0745, - "reward": 0.401227705180645, - "reward_std": 0.126412658020854, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 27.400096893310547, + "kl": 4.8671875, + "learning_rate": 4.642560036990063e-07, + "loss": 0.3819, + "reward": 0.376674123108387, + "reward_std": 0.18695980310440063, + "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3521205559372902, + "rewards/tag_count_reward": 0.3253348395228386, "step": 854 }, { "clip_ratio": 0.0, - "completion_length": 1833.7233276367188, + "completion_length": 1560.6741943359375, "epoch": 0.25539541483085654, - "grad_norm": 0.18987785279750824, - "kl": 0.03082275390625, - "learning_rate": 9.28243051122612e-08, - "loss": 0.0678, - "reward": 0.4185267984867096, - "reward_std": 0.16909685358405113, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 14.810881614685059, + "kl": 4.5859375, + "learning_rate": 4.64121525561306e-07, + "loss": 0.4016, + "reward": 0.3878348469734192, + "reward_std": 0.21267437189817429, + "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3649553805589676, + "rewards/tag_count_reward": 0.336495541036129, "step": 855 }, { "clip_ratio": 0.0, - "completion_length": 1869.8348999023438, + "completion_length": 1642.3103637695312, "epoch": 0.255694122918378, - "grad_norm": 0.17657481133937836, - "kl": 0.030303955078125, - "learning_rate": 9.279736289607443e-08, - "loss": 0.0558, - "reward": 0.3822544887661934, - "reward_std": 0.1507671568542719, - "rewards/accuracy_reward": 0.029017859138548374, + "grad_norm": 35.78523635864258, + "kl": 5.2734375, + "learning_rate": 4.6398681448037213e-07, + "loss": 0.4151, + "reward": 0.3236607313156128, + "reward_std": 0.1806083656847477, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.353236623108387, + "rewards/tag_count_reward": 0.3147321566939354, "step": 856 }, { "clip_ratio": 0.0, - "completion_length": 1748.2656860351562, + "completion_length": 1486.2032165527344, "epoch": 0.2559928310058995, - "grad_norm": 0.21069137752056122, - "kl": 0.03338623046875, - "learning_rate": 9.277037412055142e-08, - "loss": 0.0882, - "reward": 0.474330373108387, - "reward_std": 0.16893328353762627, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 9.741594314575195, + "kl": 4.6640625, + "learning_rate": 4.638518706027571e-07, + "loss": 0.4249, + "reward": 0.4285714477300644, + "reward_std": 0.22246531024575233, + "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3939732313156128, + "rewards/tag_count_reward": 0.3504464402794838, "step": 857 }, { "clip_ratio": 0.0, - "completion_length": 1863.5179443359375, + "completion_length": 1597.0982971191406, "epoch": 0.25629153909342095, - "grad_norm": 0.18640393018722534, - "kl": 0.031219482421875, - "learning_rate": 9.27433388150533e-08, - "loss": 0.0652, - "reward": 0.4112723469734192, - "reward_std": 0.13642830960452557, - "rewards/accuracy_reward": 0.05133928940631449, + "grad_norm": 38.505401611328125, + "kl": 5.3125, + "learning_rate": 4.6371669407526655e-07, + "loss": 0.4239, + "reward": 0.3777901977300644, + "reward_std": 0.2057284191250801, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3599330559372902, + "rewards/tag_count_reward": 0.317522332072258, "step": 858 }, { "clip_ratio": 0.0, - "completion_length": 1874.6429138183594, + "completion_length": 1642.8661193847656, "epoch": 0.2565902471809424, - "grad_norm": 0.1816374659538269, - "kl": 0.03192138671875, - "learning_rate": 9.271625700899186e-08, - "loss": 0.0639, - "reward": 0.4129464402794838, - "reward_std": 0.1370825096964836, - "rewards/accuracy_reward": 0.060267859138548374, + "grad_norm": 4.1782379150390625, + "kl": 4.48046875, + "learning_rate": 4.635812850449593e-07, + "loss": 0.384, + "reward": 0.3727678656578064, + "reward_std": 0.18589527904987335, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3526785895228386, + "rewards/tag_count_reward": 0.3236607313156128, "step": 859 }, { "clip_ratio": 0.0, - "completion_length": 1824.3817749023438, + "completion_length": 1523.4353637695312, "epoch": 0.2568889552684639, - "grad_norm": 0.20100770890712738, - "kl": 0.032501220703125, - "learning_rate": 9.268912873182943e-08, - "loss": 0.0707, - "reward": 0.4905134066939354, - "reward_std": 0.1772986352443695, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 9.808791160583496, + "kl": 4.60546875, + "learning_rate": 4.634456436591472e-07, + "loss": 0.4167, + "reward": 0.4179687649011612, + "reward_std": 0.19432710483670235, + "rewards/accuracy_reward": 0.08928571967408061, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455484867096, + "rewards/tag_count_reward": 0.3286830559372902, "step": 860 }, { "clip_ratio": 0.0, - "completion_length": 1893.1719665527344, + "completion_length": 1614.134033203125, "epoch": 0.25718766335598536, - "grad_norm": 0.19283203780651093, - "kl": 0.03106689453125, - "learning_rate": 9.266195401307892e-08, - "loss": 0.0643, - "reward": 0.4380580633878708, - "reward_std": 0.12389553152024746, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 8.961434364318848, + "kl": 4.58984375, + "learning_rate": 4.6330977006539465e-07, + "loss": 0.4032, + "reward": 0.3973214477300644, + "reward_std": 0.16868189722299576, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357700914144516, + "rewards/tag_count_reward": 0.321428582072258, "step": 861 }, { "clip_ratio": 0.0, - "completion_length": 1811.0648498535156, + "completion_length": 1535.04248046875, "epoch": 0.25748637144350683, - "grad_norm": 0.17668937146663666, - "kl": 0.0330810546875, - "learning_rate": 9.263473288230377e-08, - "loss": 0.0643, - "reward": 0.5251116305589676, - "reward_std": 0.14564711973071098, - "rewards/accuracy_reward": 0.1540178619325161, + "grad_norm": 13.457070350646973, + "kl": 4.6640625, + "learning_rate": 4.631736644115189e-07, + "loss": 0.4222, + "reward": 0.471540205180645, + "reward_std": 0.2019900269806385, + "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937723517418, + "rewards/tag_count_reward": 0.3376116156578064, "step": 862 }, { "clip_ratio": 0.0, - "completion_length": 1779.7411499023438, + "completion_length": 1526.7433776855469, "epoch": 0.2577850795310283, - "grad_norm": 0.19020162522792816, - "kl": 0.03363037109375, - "learning_rate": 9.26074653691179e-08, - "loss": 0.0738, - "reward": 0.4765625298023224, - "reward_std": 0.16820980794727802, - "rewards/accuracy_reward": 0.1026785783469677, + "grad_norm": 103.91754913330078, + "kl": 7.1796875, + "learning_rate": 4.630373268455895e-07, + "loss": 0.5891, + "reward": 0.400669664144516, + "reward_std": 0.21284732222557068, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839402794838, + "rewards/tag_count_reward": 0.3247767984867096, "step": 863 }, { "clip_ratio": 0.0, - "completion_length": 1835.9174499511719, + "completion_length": 1574.8840026855469, "epoch": 0.2580837876185498, - "grad_norm": 0.20799416303634644, - "kl": 0.03497314453125, - "learning_rate": 9.25801515031857e-08, - "loss": 0.0645, - "reward": 0.439732164144516, - "reward_std": 0.14835952408611774, - "rewards/accuracy_reward": 0.07589286006987095, + "grad_norm": 143.74099731445312, + "kl": 7.5859375, + "learning_rate": 4.629007575159285e-07, + "loss": 0.5454, + "reward": 0.3978794813156128, + "reward_std": 0.22335519641637802, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3638392984867096, + "rewards/tag_count_reward": 0.3197544813156128, "step": 864 }, { "clip_ratio": 0.0, - "completion_length": 1845.5737609863281, + "completion_length": 1615.5736999511719, "epoch": 0.25838249570607125, - "grad_norm": 0.18319500982761383, - "kl": 0.0313720703125, - "learning_rate": 9.255279131422195e-08, - "loss": 0.0555, - "reward": 0.4564732238650322, - "reward_std": 0.16063355095684528, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 49.08283233642578, + "kl": 5.359375, + "learning_rate": 4.6276395657110974e-07, + "loss": 0.3929, + "reward": 0.3978794813156128, + "reward_std": 0.19036248326301575, + "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716517984867096, + "rewards/tag_count_reward": 0.3331473395228386, "step": 865 }, { "clip_ratio": 0.0, - "completion_length": 1842.7947692871094, + "completion_length": 1633.6027526855469, "epoch": 0.2586812037935927, - "grad_norm": 0.1910918951034546, - "kl": 0.031036376953125, - "learning_rate": 9.252538483199189e-08, - "loss": 0.0715, - "reward": 0.4536830559372902, - "reward_std": 0.19996736757457256, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 92.3703384399414, + "kl": 6.4375, + "learning_rate": 4.626269241599594e-07, + "loss": 0.4543, + "reward": 0.3995535895228386, + "reward_std": 0.2442575879395008, + "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3688616305589676, + "rewards/tag_count_reward": 0.3303571566939354, "step": 866 }, { "clip_ratio": 0.0, - "completion_length": 1791.9130249023438, + "completion_length": 1547.4844360351562, "epoch": 0.2589799118811142, - "grad_norm": 0.19795557856559753, - "kl": 0.035125732421875, - "learning_rate": 9.249793208631103e-08, - "loss": 0.0638, - "reward": 0.5597098544239998, - "reward_std": 0.18015828169882298, - "rewards/accuracy_reward": 0.1897321529686451, + "grad_norm": 67.17633056640625, + "kl": 5.8671875, + "learning_rate": 4.624896604315552e-07, + "loss": 0.449, + "reward": 0.509486623108387, + "reward_std": 0.20164956897497177, + "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3699776977300644, + "rewards/tag_count_reward": 0.3398437649011612, "step": 867 }, { "clip_ratio": 0.0, - "completion_length": 1838.3952026367188, + "completion_length": 1594.7389221191406, "epoch": 0.25927861996863566, - "grad_norm": 0.19782795011997223, - "kl": 0.03472900390625, - "learning_rate": 9.247043310704534e-08, - "loss": 0.0795, - "reward": 0.420758955180645, - "reward_std": 0.1800556257367134, - "rewards/accuracy_reward": 0.058035718742758036, + "grad_norm": 9.927915573120117, + "kl": 4.63671875, + "learning_rate": 4.623521655352267e-07, + "loss": 0.3897, + "reward": 0.3917410895228386, + "reward_std": 0.22729899361729622, + "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232238650322, + "rewards/tag_count_reward": 0.3314732313156128, "step": 868 }, { "clip_ratio": 0.0, - "completion_length": 1806.8550109863281, + "completion_length": 1534.8929443359375, "epoch": 0.25957732805615713, - "grad_norm": 0.19063548743724823, - "kl": 0.035125732421875, - "learning_rate": 9.244288792411096e-08, - "loss": 0.0649, - "reward": 0.491629496216774, - "reward_std": 0.14356574602425098, - "rewards/accuracy_reward": 0.12500000302679837, + "grad_norm": 65.99718475341797, + "kl": 3.15625, + "learning_rate": 4.6221443962055483e-07, + "loss": 0.347, + "reward": 0.4620535895228386, + "reward_std": 0.18996809795498848, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.352678582072258, "step": 869 }, { "clip_ratio": 0.0, - "completion_length": 1831.7210693359375, + "completion_length": 1532.1942749023438, "epoch": 0.2598760361436786, - "grad_norm": 0.20112796127796173, - "kl": 0.03582763671875, - "learning_rate": 9.241529656747438e-08, - "loss": 0.0802, - "reward": 0.4218750223517418, - "reward_std": 0.16582637280225754, - "rewards/accuracy_reward": 0.042410717345774174, + "grad_norm": 57.06461715698242, + "kl": 3.23046875, + "learning_rate": 4.6207648283737186e-07, + "loss": 0.3392, + "reward": 0.3565848395228386, + "reward_std": 0.20425190404057503, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3794642984867096, + "rewards/tag_count_reward": 0.3320312649011612, "step": 870 }, { "clip_ratio": 0.0, - "completion_length": 1789.2724304199219, + "completion_length": 1577.2032165527344, "epoch": 0.2601747442312001, - "grad_norm": 0.1935015171766281, - "kl": 0.034912109375, - "learning_rate": 9.23876590671523e-08, - "loss": 0.0706, - "reward": 0.5184151977300644, - "reward_std": 0.16638948768377304, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 9.386937141418457, + "kl": 4.19140625, + "learning_rate": 4.6193829533576145e-07, + "loss": 0.3738, + "reward": 0.4620535969734192, + "reward_std": 0.22620641440153122, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.380022332072258, + "rewards/tag_count_reward": 0.3437500149011612, "step": 871 }, { "clip_ratio": 0.0, - "completion_length": 1752.4822387695312, + "completion_length": 1563.1161193847656, "epoch": 0.26047345231872154, - "grad_norm": 0.21605625748634338, - "kl": 0.03802490234375, - "learning_rate": 9.235997545321155e-08, - "loss": 0.0797, - "reward": 0.4213169813156128, - "reward_std": 0.15868264995515347, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 10.678582191467285, + "kl": 4.2421875, + "learning_rate": 4.6179987726605776e-07, + "loss": 0.3757, + "reward": 0.3616071566939354, + "reward_std": 0.18918341398239136, + "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3789062649011612, + "rewards/tag_count_reward": 0.3370535895228386, "step": 872 }, { "clip_ratio": 0.0, - "completion_length": 1801.4509887695312, + "completion_length": 1536.9107971191406, "epoch": 0.260772160406243, - "grad_norm": 0.21496638655662537, - "kl": 0.03717041015625, - "learning_rate": 9.233224575576926e-08, - "loss": 0.0761, - "reward": 0.4559151977300644, - "reward_std": 0.11859410256147385, + "grad_norm": 57.1290283203125, + "kl": 3.24609375, + "learning_rate": 4.616612287788463e-07, + "loss": 0.3475, + "reward": 0.4335937649011612, + "reward_std": 0.17472489550709724, "rewards/accuracy_reward": 0.0848214328289032, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937649011612, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3465401977300644, "step": 873 }, { "clip_ratio": 0.0, - "completion_length": 1887.4041137695312, + "completion_length": 1681.0692749023438, "epoch": 0.2610708684937645, - "grad_norm": 0.1795157790184021, - "kl": 0.0347900390625, - "learning_rate": 9.230447000499257e-08, - "loss": 0.0528, - "reward": 0.3722098469734192, - "reward_std": 0.1340169757604599, - "rewards/accuracy_reward": 0.020089286845177412, + "grad_norm": 17.58644676208496, + "kl": 3.8046875, + "learning_rate": 4.615223500249629e-07, + "loss": 0.3113, + "reward": 0.322544664144516, + "reward_std": 0.17516427487134933, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3521205484867096, + "rewards/tag_count_reward": 0.3091518059372902, "step": 874 }, { "clip_ratio": 0.0, - "completion_length": 1777.41748046875, + "completion_length": 1536.540283203125, "epoch": 0.26136957658128596, - "grad_norm": 0.19216486811637878, - "kl": 0.0364990234375, - "learning_rate": 9.227664823109882e-08, - "loss": 0.0671, - "reward": 0.4776785895228386, - "reward_std": 0.1579104047268629, - "rewards/accuracy_reward": 0.10044643236324191, + "grad_norm": 15.848076820373535, + "kl": 4.09765625, + "learning_rate": 4.613832411554941e-07, + "loss": 0.3865, + "reward": 0.434709832072258, + "reward_std": 0.18973276391625404, + "rewards/accuracy_reward": 0.0937500074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3772321566939354, + "rewards/tag_count_reward": 0.340959832072258, "step": 875 }, { "clip_ratio": 0.0, - "completion_length": 1872.165283203125, + "completion_length": 1649.4331359863281, "epoch": 0.2616682846688074, - "grad_norm": 0.18274374306201935, - "kl": 0.0362548828125, - "learning_rate": 9.224878046435535e-08, - "loss": 0.0669, - "reward": 0.4492187649011612, - "reward_std": 0.1743379384279251, - "rewards/accuracy_reward": 0.10267857555299997, + "grad_norm": 26.35906219482422, + "kl": 5.1484375, + "learning_rate": 4.6124390232177674e-07, + "loss": 0.4202, + "reward": 0.4095982313156128, + "reward_std": 0.2281733825802803, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401902794838, + "rewards/tag_count_reward": 0.313616082072258, "step": 876 }, { "clip_ratio": 0.0, - "completion_length": 1867.30810546875, + "completion_length": 1639.0670166015625, "epoch": 0.2619669927563289, - "grad_norm": 0.16516731679439545, - "kl": 0.0352783203125, - "learning_rate": 9.222086673507955e-08, - "loss": 0.0426, - "reward": 0.502232164144516, - "reward_std": 0.1619853377342224, - "rewards/accuracy_reward": 0.1339285783469677, + "grad_norm": 49.567283630371094, + "kl": 5.046875, + "learning_rate": 4.611043336753978e-07, + "loss": 0.3583, + "reward": 0.4458705559372902, + "reward_std": 0.2183237224817276, + "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3683035895228386, + "rewards/tag_count_reward": 0.325334832072258, "step": 877 }, { "clip_ratio": 0.0, - "completion_length": 1748.727783203125, + "completion_length": 1454.5491638183594, "epoch": 0.26226570084385037, - "grad_norm": 0.1914760172367096, - "kl": 0.03887939453125, - "learning_rate": 9.219290707363884e-08, - "loss": 0.0605, - "reward": 0.4832589477300644, - "reward_std": 0.19367793202400208, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 35.591224670410156, + "kl": 3.9453125, + "learning_rate": 4.6096453536819416e-07, + "loss": 0.3849, + "reward": 0.4542410895228386, + "reward_std": 0.22669217735528946, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3850446566939354, + "rewards/tag_count_reward": 0.364955373108387, "step": 878 }, { "clip_ratio": 0.0, - "completion_length": 1768.5692749023438, + "completion_length": 1553.9554138183594, "epoch": 0.26256440893137184, - "grad_norm": 0.22004805505275726, - "kl": 0.0377197265625, - "learning_rate": 9.216490151045058e-08, - "loss": 0.0775, - "reward": 0.5362723395228386, - "reward_std": 0.13144957646727562, - "rewards/accuracy_reward": 0.1562500037252903, + "grad_norm": 12.461967468261719, + "kl": 4.625, + "learning_rate": 4.608245075522529e-07, + "loss": 0.4208, + "reward": 0.498325914144516, + "reward_std": 0.2063368298113346, + "rewards/accuracy_reward": 0.1540178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3800223395228386, + "rewards/tag_count_reward": 0.3443080559372902, "step": 879 }, { "clip_ratio": 0.0, - "completion_length": 1825.3438415527344, + "completion_length": 1671.4197082519531, "epoch": 0.2628631170188933, - "grad_norm": 0.19383002817630768, - "kl": 0.03643798828125, - "learning_rate": 9.213685007598207e-08, - "loss": 0.071, - "reward": 0.434151791036129, - "reward_std": 0.1629512943327427, - "rewards/accuracy_reward": 0.06026786030270159, + "grad_norm": 39.97450637817383, + "kl": 5.4375, + "learning_rate": 4.606842503799104e-07, + "loss": 0.4136, + "reward": 0.3733259066939354, + "reward_std": 0.20198510587215424, + "rewards/accuracy_reward": 0.05357142980210483, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "rewards/tag_count_reward": 0.3197544813156128, "step": 880 }, { "clip_ratio": 0.0, - "completion_length": 1792.7991638183594, + "completion_length": 1590.0759582519531, "epoch": 0.2631618251064148, - "grad_norm": 0.17686040699481964, - "kl": 0.037109375, - "learning_rate": 9.210875280075055e-08, - "loss": 0.0498, - "reward": 0.4492187723517418, - "reward_std": 0.1314806416630745, - "rewards/accuracy_reward": 0.08258929196745157, + "grad_norm": 29.42139434814453, + "kl": 4.5859375, + "learning_rate": 4.6054376400375274e-07, + "loss": 0.3478, + "reward": 0.4241071566939354, + "reward_std": 0.19159143790602684, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294887661934, + "rewards/tag_count_reward": 0.341517873108387, "step": 881 }, { "clip_ratio": 0.0, - "completion_length": 1843.2567749023438, + "completion_length": 1672.7232971191406, "epoch": 0.26346053319393625, - "grad_norm": 0.18367107212543488, - "kl": 0.03839111328125, - "learning_rate": 9.208060971532308e-08, - "loss": 0.0534, - "reward": 0.455357164144516, - "reward_std": 0.13395571894943714, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 46.092708587646484, + "kl": 5.05078125, + "learning_rate": 4.6040304857661536e-07, + "loss": 0.3656, + "reward": 0.3828125149011612, + "reward_std": 0.17567341774702072, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3593750223517418, + "rewards/tag_count_reward": 0.302455373108387, "step": 882 }, { "clip_ratio": 0.0, - "completion_length": 1800.2076416015625, + "completion_length": 1562.0715026855469, "epoch": 0.26375924128145767, - "grad_norm": 0.21271038055419922, - "kl": 0.0411376953125, - "learning_rate": 9.205242085031657e-08, - "loss": 0.0703, - "reward": 0.4564732313156128, - "reward_std": 0.16668525710701942, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 13.574013710021973, + "kl": 4.14453125, + "learning_rate": 4.6026210425158284e-07, + "loss": 0.379, + "reward": 0.4185267984867096, + "reward_std": 0.2103034295141697, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3805803656578064, + "rewards/tag_count_reward": 0.3314732238650322, "step": 883 }, { "clip_ratio": 0.0, - "completion_length": 1837.1161804199219, + "completion_length": 1569.2188110351562, "epoch": 0.26405794936897914, - "grad_norm": 0.21935541927814484, - "kl": 0.039306640625, - "learning_rate": 9.202418623639777e-08, - "loss": 0.0747, - "reward": 0.4001116305589676, - "reward_std": 0.148781668394804, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 51.06179428100586, + "kl": 3.78515625, + "learning_rate": 4.6012093118198885e-07, + "loss": 0.3718, + "reward": 0.357142873108387, + "reward_std": 0.2059067003428936, + "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3844866305589676, + "rewards/tag_count_reward": 0.3392857238650322, "step": 884 }, { "clip_ratio": 0.0, - "completion_length": 1746.9465026855469, + "completion_length": 1576.3572082519531, "epoch": 0.2643566574565006, - "grad_norm": 0.21078231930732727, - "kl": 0.03961181640625, - "learning_rate": 9.199590590428316e-08, - "loss": 0.0672, - "reward": 0.6082589626312256, - "reward_std": 0.19026564620435238, - "rewards/accuracy_reward": 0.2031250074505806, + "grad_norm": 27.725021362304688, + "kl": 4.0234375, + "learning_rate": 4.5997952952141575e-07, + "loss": 0.3647, + "reward": 0.494977705180645, + "reward_std": 0.2317986637353897, + "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4051339402794838, + "rewards/tag_count_reward": 0.3409598395228386, "step": 885 }, { "clip_ratio": 0.0, - "completion_length": 1843.3594360351562, + "completion_length": 1621.7411499023438, "epoch": 0.2646553655440221, - "grad_norm": 0.19052492082118988, - "kl": 0.03961181640625, - "learning_rate": 9.196757988473895e-08, - "loss": 0.0682, - "reward": 0.5011160895228386, - "reward_std": 0.1339202430099249, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 19.939455032348633, + "kl": 4.20703125, + "learning_rate": 4.598378994236948e-07, + "loss": 0.3771, + "reward": 0.447544664144516, + "reward_std": 0.20751018449664116, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3671875149011612, + "rewards/tag_count_reward": 0.3180803656578064, "step": 886 }, { "clip_ratio": 0.0, - "completion_length": 1751.9822387695312, + "completion_length": 1546.3259887695312, "epoch": 0.26495407363154355, - "grad_norm": 0.21186694502830505, - "kl": 0.0391845703125, - "learning_rate": 9.193920820858111e-08, - "loss": 0.0642, - "reward": 0.5089285969734192, - "reward_std": 0.14370287582278252, - "rewards/accuracy_reward": 0.11160715040750802, + "grad_norm": 29.666196823120117, + "kl": 4.23046875, + "learning_rate": 4.5969604104290556e-07, + "loss": 0.409, + "reward": 0.4408482387661934, + "reward_std": 0.1940433271229267, + "rewards/accuracy_reward": 0.09375000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3973214402794838, + "rewards/tag_count_reward": 0.3470982313156128, "step": 887 }, { "clip_ratio": 0.0, - "completion_length": 1807.3795166015625, + "completion_length": 1641.8058776855469, "epoch": 0.265252781719065, - "grad_norm": 0.19040346145629883, - "kl": 0.03985595703125, - "learning_rate": 9.191079090667521e-08, - "loss": 0.0512, - "reward": 0.4793526977300644, - "reward_std": 0.16538347862660885, - "rewards/accuracy_reward": 0.1138392873108387, + "grad_norm": 21.547897338867188, + "kl": 4.7578125, + "learning_rate": 4.5955395453337607e-07, + "loss": 0.3839, + "reward": 0.4335937723517418, + "reward_std": 0.23218850046396255, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3655134066939354, + "rewards/tag_count_reward": 0.3242187574505806, "step": 888 }, { "clip_ratio": 0.0, - "completion_length": 1836.9889221191406, + "completion_length": 1607.3616943359375, "epoch": 0.2655514898065865, - "grad_norm": 0.1903780847787857, - "kl": 0.03857421875, - "learning_rate": 9.18823280099365e-08, - "loss": 0.0629, - "reward": 0.4135044813156128, - "reward_std": 0.17045453190803528, + "grad_norm": 14.991734504699707, + "kl": 4.1953125, + "learning_rate": 4.5941164004968253e-07, + "loss": 0.3399, + "reward": 0.3783482313156128, + "reward_std": 0.21754376962780952, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580559372902, + "rewards/tag_count_reward": 0.3404018059372902, "step": 889 }, { "clip_ratio": 0.0, - "completion_length": 1719.1474304199219, + "completion_length": 1503.9353637695312, "epoch": 0.26585019789410796, - "grad_norm": 0.21338827908039093, - "kl": 0.042236328125, - "learning_rate": 9.185381954932984e-08, - "loss": 0.0755, - "reward": 0.513392873108387, - "reward_std": 0.16522199660539627, - "rewards/accuracy_reward": 0.11607143096625805, + "grad_norm": 32.39583206176758, + "kl": 5.28125, + "learning_rate": 4.592690977466492e-07, + "loss": 0.4288, + "reward": 0.4631696715950966, + "reward_std": 0.206858791410923, + "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3973214477300644, + "rewards/tag_count_reward": 0.353794664144516, "step": 890 }, { "clip_ratio": 0.0, - "completion_length": 1742.5224304199219, + "completion_length": 1602.7545471191406, "epoch": 0.26614890598162944, - "grad_norm": 0.19454915821552277, - "kl": 0.0430908203125, - "learning_rate": 9.182526555586962e-08, - "loss": 0.0698, - "reward": 0.5228794813156128, - "reward_std": 0.12405900843441486, - "rewards/accuracy_reward": 0.1517857201397419, + "grad_norm": 28.859947204589844, + "kl": 5.0703125, + "learning_rate": 4.5912632777934805e-07, + "loss": 0.4043, + "reward": 0.4581473544239998, + "reward_std": 0.18166309595108032, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937649011612, + "rewards/tag_count_reward": 0.321986623108387, "step": 891 }, { "clip_ratio": 0.0, - "completion_length": 1794.8103637695312, + "completion_length": 1550.5603637695312, "epoch": 0.2664476140691509, - "grad_norm": 0.21492701768875122, - "kl": 0.043212890625, - "learning_rate": 9.179666606061977e-08, - "loss": 0.087, - "reward": 0.5011161044239998, - "reward_std": 0.16608821041882038, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 16.414947509765625, + "kl": 4.2734375, + "learning_rate": 4.589833303030989e-07, + "loss": 0.3909, + "reward": 0.4871651977300644, + "reward_std": 0.22048326209187508, + "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232238650322, + "rewards/tag_count_reward": 0.3465401902794838, "step": 892 }, { "clip_ratio": 0.0, - "completion_length": 1833.3014526367188, + "completion_length": 1603.2947082519531, "epoch": 0.2667463221566724, - "grad_norm": 0.2123369723558426, - "kl": 0.04229736328125, - "learning_rate": 9.17680210946938e-08, - "loss": 0.0691, - "reward": 0.497767873108387, - "reward_std": 0.13076888024806976, - "rewards/accuracy_reward": 0.12500000558793545, + "grad_norm": 27.078393936157227, + "kl": 4.734375, + "learning_rate": 4.58840105473469e-07, + "loss": 0.3638, + "reward": 0.4637276977300644, + "reward_std": 0.1959126926958561, + "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3727678805589676, + "rewards/tag_count_reward": 0.334263414144516, "step": 893 }, { "clip_ratio": 0.0, - "completion_length": 1793.0826721191406, + "completion_length": 1569.8438110351562, "epoch": 0.26704503024419385, - "grad_norm": 0.20086079835891724, - "kl": 0.04302978515625, - "learning_rate": 9.173933068925455e-08, - "loss": 0.0687, - "reward": 0.4268973395228386, - "reward_std": 0.1206517182290554, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 27.020221710205078, + "kl": 3.64453125, + "learning_rate": 4.5869665344627273e-07, + "loss": 0.3384, + "reward": 0.3911830559372902, + "reward_std": 0.1636234149336815, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3777901902794838, + "rewards/tag_count_reward": 0.348772332072258, "step": 894 }, { "clip_ratio": 0.0, - "completion_length": 1825.5916137695312, + "completion_length": 1563.2947082519531, "epoch": 0.2673437383317153, - "grad_norm": 0.20222163200378418, - "kl": 0.04364013671875, - "learning_rate": 9.171059487551442e-08, - "loss": 0.0838, - "reward": 0.412388414144516, - "reward_std": 0.16380810365080833, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 22.678203582763672, + "kl": 4.40234375, + "learning_rate": 4.585529743775721e-07, + "loss": 0.4158, + "reward": 0.3761160895228386, + "reward_std": 0.2076995000243187, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376674123108387, + "rewards/tag_count_reward": 0.3470982313156128, "step": 895 }, { "clip_ratio": 0.0, - "completion_length": 1705.6853332519531, + "completion_length": 1473.7098999023438, "epoch": 0.2676424464192368, - "grad_norm": 0.217881441116333, - "kl": 0.04376220703125, - "learning_rate": 9.168181368473513e-08, - "loss": 0.0863, - "reward": 0.4888393208384514, - "reward_std": 0.17327825911343098, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 40.3590087890625, + "kl": 3.9453125, + "learning_rate": 4.5840906842367566e-07, + "loss": 0.3781, + "reward": 0.448660746216774, + "reward_std": 0.2113383449614048, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3816964477300644, + "rewards/tag_count_reward": 0.3660714477300644, "step": 896 }, { "clip_ratio": 0.0, - "completion_length": 1830.74560546875, + "completion_length": 1625.2366943359375, "epoch": 0.26794115450675826, - "grad_norm": 0.1887834221124649, - "kl": 0.04681396484375, - "learning_rate": 9.165298714822783e-08, - "loss": 0.0475, - "reward": 0.4614955484867096, - "reward_std": 0.12558943964540958, - "rewards/accuracy_reward": 0.11383928940631449, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "grad_norm": 46.94992446899414, + "kl": 5.31640625, + "learning_rate": 4.582649357411391e-07, + "loss": 0.3932, + "reward": 0.4308035895228386, + "reward_std": 0.17740712314844131, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3169643059372902, "step": 897 }, { "clip_ratio": 0.0, - "completion_length": 1791.0469360351562, + "completion_length": 1626.8415832519531, "epoch": 0.26823986259427973, - "grad_norm": 0.19919659197330475, - "kl": 0.04302978515625, - "learning_rate": 9.162411529735292e-08, - "loss": 0.0629, - "reward": 0.4598214477300644, - "reward_std": 0.1326950304210186, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 59.473388671875, + "kl": 5.9453125, + "learning_rate": 4.581205764867646e-07, + "loss": 0.4386, + "reward": 0.4302455484867096, + "reward_std": 0.21160991489887238, + "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3727678805589676, + "rewards/tag_count_reward": 0.334263414144516, "step": 898 }, { "clip_ratio": 0.0, - "completion_length": 1646.30810546875, + "completion_length": 1446.087158203125, "epoch": 0.2685385706818012, - "grad_norm": 0.18986748158931732, - "kl": 0.04522705078125, - "learning_rate": 9.15951981635202e-08, - "loss": 0.0719, - "reward": 0.5100446715950966, - "reward_std": 0.1206699088215828, - "rewards/accuracy_reward": 0.11383928963914514, + "grad_norm": 9.765243530273438, + "kl": 4.890625, + "learning_rate": 4.57975990817601e-07, + "loss": 0.4404, + "reward": 0.4726562723517418, + "reward_std": 0.19033697247505188, + "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.396205373108387, + "rewards/tag_count_reward": 0.3655134066939354, "step": 899 }, { "clip_ratio": 0.0, - "completion_length": 1813.6987609863281, + "completion_length": 1605.4286193847656, "epoch": 0.2688372787693227, - "grad_norm": 0.21242138743400574, - "kl": 0.04473876953125, - "learning_rate": 9.156623577818862e-08, - "loss": 0.0789, - "reward": 0.447544664144516, - "reward_std": 0.19115923717617989, - "rewards/accuracy_reward": 0.05803571781143546, + "grad_norm": 23.97719383239746, + "kl": 4.8828125, + "learning_rate": 4.578311788909431e-07, + "loss": 0.4086, + "reward": 0.3995535969734192, + "reward_std": 0.2671196758747101, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3895089477300644, + "rewards/tag_count_reward": 0.3303571566939354, "step": 900 }, { "clip_ratio": 0.0, - "completion_length": 1806.5112609863281, + "completion_length": 1597.0313110351562, "epoch": 0.26913598685684414, - "grad_norm": 0.184341698884964, - "kl": 0.043701171875, - "learning_rate": 9.153722817286644e-08, - "loss": 0.0506, - "reward": 0.4525669887661934, - "reward_std": 0.15082189813256264, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 5.647151470184326, + "kl": 4.50390625, + "learning_rate": 4.576861408643322e-07, + "loss": 0.3944, + "reward": 0.3984375223517418, + "reward_std": 0.20241311565041542, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3811384066939354, + "rewards/tag_count_reward": 0.3359375149011612, "step": 901 }, { "clip_ratio": 0.0, - "completion_length": 1695.4152526855469, + "completion_length": 1469.3415832519531, "epoch": 0.2694346949443656, - "grad_norm": 0.18538391590118408, - "kl": 0.0467529296875, - "learning_rate": 9.15081753791111e-08, - "loss": 0.0518, - "reward": 0.4296875223517418, - "reward_std": 0.14206168428063393, - "rewards/accuracy_reward": 0.024553572991862893, + "grad_norm": 45.46286392211914, + "kl": 3.4453125, + "learning_rate": 4.5754087689555555e-07, + "loss": 0.367, + "reward": 0.3777901977300644, + "reward_std": 0.18921225145459175, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4051339477300644, + "rewards/tag_count_reward": 0.357700914144516, "step": 902 }, { "clip_ratio": 0.0, - "completion_length": 1819.977783203125, + "completion_length": 1589.77685546875, "epoch": 0.2697334030318871, - "grad_norm": 0.20707178115844727, - "kl": 0.047119140625, - "learning_rate": 9.14790774285292e-08, - "loss": 0.0614, - "reward": 0.455357164144516, - "reward_std": 0.16760529205203056, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 39.711246490478516, + "kl": 3.41015625, + "learning_rate": 4.5739538714264596e-07, + "loss": 0.3196, + "reward": 0.3934151977300644, + "reward_std": 0.19973937422037125, + "rewards/accuracy_reward": 0.051339286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3839285895228386, + "rewards/tag_count_reward": 0.3420758992433548, "step": 903 }, { "clip_ratio": 0.0, - "completion_length": 1753.2076721191406, + "completion_length": 1503.4286193847656, "epoch": 0.27003211111940856, - "grad_norm": 0.18187332153320312, - "kl": 0.04486083984375, - "learning_rate": 9.144993435277642e-08, - "loss": 0.0477, - "reward": 0.5128348544239998, - "reward_std": 0.12007712014019489, - "rewards/accuracy_reward": 0.11607143515720963, + "grad_norm": 32.525455474853516, + "kl": 4.7578125, + "learning_rate": 4.572496717638821e-07, + "loss": 0.4639, + "reward": 0.4765625149011612, + "reward_std": 0.18457292020320892, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.396763414144516, + "rewards/tag_count_reward": 0.3515625149011612, "step": 904 }, { "clip_ratio": 0.0, - "completion_length": 1764.8772888183594, + "completion_length": 1551.21435546875, "epoch": 0.27033081920693003, - "grad_norm": 0.2107110619544983, - "kl": 0.04571533203125, - "learning_rate": 9.142074618355761e-08, - "loss": 0.0734, - "reward": 0.4146205559372902, - "reward_std": 0.14428993314504623, - "rewards/accuracy_reward": 0.029017858440056443, + "grad_norm": 44.244903564453125, + "kl": 4.0078125, + "learning_rate": 4.571037309177881e-07, + "loss": 0.3775, + "reward": 0.3588169738650322, + "reward_std": 0.17804919928312302, + "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026902794838, + "rewards/tag_count_reward": 0.345424123108387, "step": 905 }, { "clip_ratio": 0.0, - "completion_length": 1785.3840026855469, + "completion_length": 1622.2322387695312, "epoch": 0.2706295272944515, - "grad_norm": 0.1986534148454666, - "kl": 0.044921875, - "learning_rate": 9.139151295262662e-08, - "loss": 0.0616, - "reward": 0.4481026977300644, - "reward_std": 0.13415024057030678, - "rewards/accuracy_reward": 0.06473214598372579, + "grad_norm": 43.21623992919922, + "kl": 6.0234375, + "learning_rate": 4.5695756476313305e-07, + "loss": 0.4662, + "reward": 0.4045759215950966, + "reward_std": 0.22162174433469772, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3833705484867096, + "rewards/tag_count_reward": 0.3264509066939354, "step": 906 }, { "clip_ratio": 0.0, - "completion_length": 1867.7835693359375, + "completion_length": 1575.3817749023438, "epoch": 0.27092823538197297, - "grad_norm": 0.21816878020763397, - "kl": 0.04669189453125, - "learning_rate": 9.136223469178635e-08, - "loss": 0.0753, - "reward": 0.4408482313156128, - "reward_std": 0.13498619571328163, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 37.05295181274414, + "kl": 5.8125, + "learning_rate": 4.568111734589317e-07, + "loss": 0.486, + "reward": 0.4168526977300644, + "reward_std": 0.19942563772201538, + "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3694196566939354, + "rewards/tag_count_reward": 0.334263414144516, "step": 907 }, { "clip_ratio": 0.0, - "completion_length": 1797.5960388183594, + "completion_length": 1552.71435546875, "epoch": 0.27122694346949444, - "grad_norm": 0.2230449765920639, - "kl": 0.04742431640625, - "learning_rate": 9.133291143288865e-08, - "loss": 0.0893, - "reward": 0.4787946566939354, - "reward_std": 0.1821923851966858, - "rewards/accuracy_reward": 0.08705357508733869, + "grad_norm": 26.789337158203125, + "kl": 5.3671875, + "learning_rate": 4.5666455716444327e-07, + "loss": 0.4578, + "reward": 0.4285714477300644, + "reward_std": 0.2162286713719368, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3917410895228386, + "rewards/tag_count_reward": 0.3459821566939354, "step": 908 }, { "clip_ratio": 0.0, - "completion_length": 1803.0491638183594, + "completion_length": 1542.8907165527344, "epoch": 0.2715256515570159, - "grad_norm": 0.17275847494602203, - "kl": 0.04705810546875, - "learning_rate": 9.130354320783437e-08, - "loss": 0.0492, - "reward": 0.4737723395228386, - "reward_std": 0.1692373137921095, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 20.8460636138916, + "kl": 5.0703125, + "learning_rate": 4.5651771603917187e-07, + "loss": 0.4099, + "reward": 0.3989955559372902, + "reward_std": 0.20633529499173164, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.380022332072258, + "rewards/tag_count_reward": 0.3364955484867096, "step": 909 }, { "clip_ratio": 0.0, - "completion_length": 1874.5737915039062, + "completion_length": 1619.6987609863281, "epoch": 0.2718243596445374, - "grad_norm": 0.202819362282753, - "kl": 0.04620361328125, - "learning_rate": 9.12741300485733e-08, - "loss": 0.0587, - "reward": 0.4291294887661934, - "reward_std": 0.16978992894291878, - "rewards/accuracy_reward": 0.06473214575089514, + "grad_norm": 33.90433120727539, + "kl": 5.4921875, + "learning_rate": 4.5637065024286645e-07, + "loss": 0.4124, + "reward": 0.3872767984867096, + "reward_std": 0.20532633736729622, + "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.364397332072258, + "rewards/tag_count_reward": 0.3292410895228386, "step": 910 }, { "clip_ratio": 0.0, - "completion_length": 1639.41748046875, + "completion_length": 1465.5067443847656, "epoch": 0.27212306773205885, - "grad_norm": 0.2068469226360321, - "kl": 0.0484619140625, - "learning_rate": 9.1244671987104e-08, - "loss": 0.0718, - "reward": 0.4882812723517418, - "reward_std": 0.17074638791382313, - "rewards/accuracy_reward": 0.08928571944124997, + "grad_norm": 11.435953140258789, + "kl": 4.171875, + "learning_rate": 4.5622335993552e-07, + "loss": 0.3541, + "reward": 0.4375000223517418, + "reward_std": 0.21598805859684944, + "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3989955559372902, + "rewards/tag_count_reward": 0.3683035895228386, "step": 911 }, { "clip_ratio": 0.0, - "completion_length": 1808.7009582519531, + "completion_length": 1635.4152526855469, "epoch": 0.2724217758195803, - "grad_norm": 0.20608308911323547, - "kl": 0.0494384765625, - "learning_rate": 9.121516905547399e-08, - "loss": 0.069, - "reward": 0.4570312723517418, - "reward_std": 0.1597371455281973, - "rewards/accuracy_reward": 0.08258929033763707, + "grad_norm": 32.17258071899414, + "kl": 4.8125, + "learning_rate": 4.5607584527737e-07, + "loss": 0.3754, + "reward": 0.4146205559372902, + "reward_std": 0.22238923609256744, + "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419887661934, + "rewards/tag_count_reward": 0.325334832072258, "step": 912 }, { "clip_ratio": 0.0, - "completion_length": 1716.6875915527344, + "completion_length": 1571.7232666015625, "epoch": 0.2727204839071018, - "grad_norm": 0.17646154761314392, - "kl": 0.04779052734375, - "learning_rate": 9.118562128577958e-08, - "loss": 0.0466, - "reward": 0.466517873108387, - "reward_std": 0.1296781087294221, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 7.548865795135498, + "kl": 4.2109375, + "learning_rate": 4.559281064288979e-07, + "loss": 0.3535, + "reward": 0.4062500149011612, + "reward_std": 0.184102401137352, + "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.388392873108387, + "rewards/tag_count_reward": 0.3459821566939354, "step": 913 }, { "clip_ratio": 0.0, - "completion_length": 1727.05810546875, + "completion_length": 1542.5067749023438, "epoch": 0.27301919199462327, - "grad_norm": 0.20391173660755157, - "kl": 0.04864501953125, - "learning_rate": 9.115602871016584e-08, - "loss": 0.0581, - "reward": 0.5055803805589676, - "reward_std": 0.13740618713200092, - "rewards/accuracy_reward": 0.10267857741564512, + "grad_norm": 49.508846282958984, + "kl": 3.5546875, + "learning_rate": 4.557801435508292e-07, + "loss": 0.3603, + "reward": 0.4447544813156128, + "reward_std": 0.18034479394555092, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029017984867096, + "rewards/tag_count_reward": 0.3510044738650322, "step": 914 }, { "clip_ratio": 0.0, - "completion_length": 1825.8996276855469, + "completion_length": 1548.79248046875, "epoch": 0.27331790008214474, - "grad_norm": 0.20569054782390594, - "kl": 0.04833984375, - "learning_rate": 9.112639136082658e-08, - "loss": 0.0591, - "reward": 0.4380580559372902, - "reward_std": 0.1333206370472908, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 52.294124603271484, + "kl": 3.66015625, + "learning_rate": 4.556319568041329e-07, + "loss": 0.3422, + "reward": 0.399553582072258, + "reward_std": 0.17013169452548027, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3777901977300644, + "rewards/tag_count_reward": 0.3504464477300644, "step": 915 }, { "clip_ratio": 0.0, - "completion_length": 1787.6228332519531, + "completion_length": 1589.1920471191406, "epoch": 0.2736166081696662, - "grad_norm": 0.1910785436630249, - "kl": 0.04888916015625, - "learning_rate": 9.109670927000434e-08, - "loss": 0.0561, - "reward": 0.4693080559372902, - "reward_std": 0.14594096690416336, - "rewards/accuracy_reward": 0.0758928598370403, + "grad_norm": 31.779356002807617, + "kl": 3.73046875, + "learning_rate": 4.5548354635002175e-07, + "loss": 0.3366, + "reward": 0.4224330559372902, + "reward_std": 0.2041812315583229, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3934151977300644, + "rewards/tag_count_reward": 0.3465401902794838, "step": 916 }, { "clip_ratio": 0.0, - "completion_length": 1806.71435546875, + "completion_length": 1648.5581359863281, "epoch": 0.2739153162571877, - "grad_norm": 0.22103440761566162, - "kl": 0.05084228515625, - "learning_rate": 9.106698246999034e-08, - "loss": 0.0751, - "reward": 0.4771205559372902, - "reward_std": 0.14362798817455769, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 10.567296028137207, + "kl": 4.5703125, + "learning_rate": 4.553349123499517e-07, + "loss": 0.3667, + "reward": 0.408482164144516, + "reward_std": 0.192521333694458, + "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.3169642984867096, "step": 917 }, { "clip_ratio": 0.0, - "completion_length": 1807.6541137695312, + "completion_length": 1586.5469360351562, "epoch": 0.27421402434470915, - "grad_norm": 0.19028374552726746, - "kl": 0.0533447265625, - "learning_rate": 9.103721099312439e-08, - "loss": 0.0641, - "reward": 0.3794643059372902, - "reward_std": 0.13141991384327412, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 33.52521514892578, + "kl": 3.65625, + "learning_rate": 4.55186054965622e-07, + "loss": 0.317, + "reward": 0.3532366305589676, + "reward_std": 0.18983694538474083, + "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3616071566939354, + "rewards/tag_count_reward": 0.3286830559372902, "step": 918 }, { "clip_ratio": 0.0, - "completion_length": 1633.8884887695312, + "completion_length": 1386.97998046875, "epoch": 0.2745127324322306, - "grad_norm": 0.22644905745983124, - "kl": 0.054443359375, - "learning_rate": 9.100739487179496e-08, - "loss": 0.0977, - "reward": 0.5340402126312256, - "reward_std": 0.15496957674622536, - "rewards/accuracy_reward": 0.13616071757860482, + "grad_norm": 72.76332092285156, + "kl": 3.46484375, + "learning_rate": 4.550369743589748e-07, + "loss": 0.3988, + "reward": 0.4960937723517418, + "reward_std": 0.19807007908821106, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3978794813156128, + "rewards/tag_count_reward": 0.3777901977300644, "step": 919 }, { "clip_ratio": 0.0, - "completion_length": 1787.0715026855469, + "completion_length": 1574.7165832519531, "epoch": 0.2748114405197521, - "grad_norm": 0.2312707006931305, - "kl": 0.05267333984375, - "learning_rate": 9.097753413843908e-08, - "loss": 0.0861, - "reward": 0.553013414144516, - "reward_std": 0.18805299699306488, - "rewards/accuracy_reward": 0.1674107201397419, + "grad_norm": 6.297797679901123, + "kl": 4.5234375, + "learning_rate": 4.5488767069219536e-07, + "loss": 0.3779, + "reward": 0.4821428805589676, + "reward_std": 0.19668077304959297, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.3526785895228386, "step": 920 }, { "clip_ratio": 0.0, - "completion_length": 1765.3683776855469, + "completion_length": 1551.185302734375, "epoch": 0.27511014860727356, - "grad_norm": 0.2218387871980667, - "kl": 0.0550537109375, - "learning_rate": 9.094762882554228e-08, - "loss": 0.0907, - "reward": 0.4782366380095482, - "reward_std": 0.119849544018507, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 38.58011245727539, + "kl": 4.43359375, + "learning_rate": 4.547381441277114e-07, + "loss": 0.4124, + "reward": 0.4330357313156128, + "reward_std": 0.17036039009690285, + "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187649011612, + "rewards/tag_count_reward": 0.3526785895228386, "step": 921 }, { "clip_ratio": 0.0, - "completion_length": 1893.7835693359375, + "completion_length": 1682.8773193359375, "epoch": 0.27540885669479503, - "grad_norm": 0.21561160683631897, - "kl": 0.05364990234375, - "learning_rate": 9.09176789656386e-08, - "loss": 0.0528, - "reward": 0.4213169813156128, - "reward_std": 0.1403740532696247, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 26.623886108398438, + "kl": 5.04296875, + "learning_rate": 4.54588394828193e-07, + "loss": 0.3817, + "reward": 0.3593750149011612, + "reward_std": 0.18209074065089226, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3722098395228386, + "rewards/tag_count_reward": 0.3125000149011612, "step": 922 }, { "clip_ratio": 0.0, - "completion_length": 1625.82373046875, + "completion_length": 1448.2054138183594, "epoch": 0.2757075647823165, - "grad_norm": 0.2404583990573883, - "kl": 0.054443359375, - "learning_rate": 9.08876845913106e-08, - "loss": 0.094, - "reward": 0.5128348469734192, - "reward_std": 0.12331418320536613, - "rewards/accuracy_reward": 0.10491071757860482, + "grad_norm": 6.805716514587402, + "kl": 4.765625, + "learning_rate": 4.54438422956553e-07, + "loss": 0.4422, + "reward": 0.4447544887661934, + "reward_std": 0.17976408079266548, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.407924123108387, + "rewards/tag_count_reward": 0.3577009066939354, "step": 923 }, { "clip_ratio": 0.0, - "completion_length": 1798.5491943359375, + "completion_length": 1560.5692749023438, "epoch": 0.276006272869838, - "grad_norm": 0.2079465240240097, - "kl": 0.0535888671875, - "learning_rate": 9.085764573518917e-08, - "loss": 0.0552, - "reward": 0.4983259066939354, - "reward_std": 0.17235327884554863, - "rewards/accuracy_reward": 0.11607143213041127, + "grad_norm": 30.562122344970703, + "kl": 5.203125, + "learning_rate": 4.5428822867594585e-07, + "loss": 0.4148, + "reward": 0.4324776977300644, + "reward_std": 0.21806598454713821, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3822544813156128, + "rewards/tag_count_reward": 0.329799123108387, "step": 924 }, { "clip_ratio": 0.0, - "completion_length": 1756.2589721679688, + "completion_length": 1539.1920166015625, "epoch": 0.27630498095735945, - "grad_norm": 0.21783027052879333, - "kl": 0.05682373046875, - "learning_rate": 9.082756242995366e-08, - "loss": 0.0617, - "reward": 0.3978794738650322, - "reward_std": 0.11321617476642132, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 7.877579689025879, + "kl": 4.8828125, + "learning_rate": 4.5413781214976826e-07, + "loss": 0.4179, + "reward": 0.3582589402794838, + "reward_std": 0.15835804492235184, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187649011612, + "rewards/tag_count_reward": 0.3515625074505806, "step": 925 }, { "clip_ratio": 0.0, - "completion_length": 1849.8750915527344, + "completion_length": 1645.4130249023438, "epoch": 0.27660368904488086, - "grad_norm": 0.21648912131786346, - "kl": 0.05682373046875, - "learning_rate": 9.079743470833175e-08, - "loss": 0.0682, - "reward": 0.4034598395228386, - "reward_std": 0.1652013212442398, - "rewards/accuracy_reward": 0.035714287078008056, + "grad_norm": 42.64602279663086, + "kl": 5.6875, + "learning_rate": 4.539871735416588e-07, + "loss": 0.4129, + "reward": 0.3560267984867096, + "reward_std": 0.19773318618535995, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455484867096, + "rewards/tag_count_reward": 0.3270089402794838, "step": 926 }, { "clip_ratio": 0.0, - "completion_length": 1776.5670776367188, + "completion_length": 1555.7188110351562, "epoch": 0.27690239713240233, - "grad_norm": 0.23289348185062408, - "kl": 0.0565185546875, - "learning_rate": 9.076726260309947e-08, - "loss": 0.0788, - "reward": 0.4760044887661934, - "reward_std": 0.15980072878301144, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 12.59932804107666, + "kl": 5.4765625, + "learning_rate": 4.5383631301549733e-07, + "loss": 0.4738, + "reward": 0.4129464402794838, + "reward_std": 0.2063601128757, + "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830484867096, + "rewards/tag_count_reward": 0.3459821566939354, "step": 927 }, { "clip_ratio": 0.0, - "completion_length": 1742.47998046875, + "completion_length": 1579.6273193359375, "epoch": 0.2772011052199238, - "grad_norm": 0.21850818395614624, - "kl": 0.0596923828125, - "learning_rate": 9.073704614708109e-08, - "loss": 0.0839, - "reward": 0.5000000223517418, - "reward_std": 0.1983421966433525, - "rewards/accuracy_reward": 0.11383929196745157, + "grad_norm": 32.66411590576172, + "kl": 5.34375, + "learning_rate": 4.5368523073540543e-07, + "loss": 0.4364, + "reward": 0.4185268133878708, + "reward_std": 0.2288465015590191, + "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3861607313156128, + "rewards/tag_count_reward": 0.3359375149011612, "step": 928 }, { "clip_ratio": 0.0, - "completion_length": 1787.2567749023438, + "completion_length": 1590.2634582519531, "epoch": 0.2774998133074453, - "grad_norm": 0.24788573384284973, - "kl": 0.05755615234375, - "learning_rate": 9.070678537314918e-08, - "loss": 0.0807, - "reward": 0.4185267984867096, - "reward_std": 0.1096727978438139, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 24.75836944580078, + "kl": 4.1328125, + "learning_rate": 4.535339268657459e-07, + "loss": 0.3508, + "reward": 0.3995535895228386, + "reward_std": 0.17974304035305977, + "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482238650322, + "rewards/tag_count_reward": 0.352678582072258, "step": 929 }, { "clip_ratio": 0.0, - "completion_length": 1752.0291137695312, + "completion_length": 1547.9978637695312, "epoch": 0.27779852139496675, - "grad_norm": 0.23121437430381775, - "kl": 0.05596923828125, - "learning_rate": 9.067648031422445e-08, - "loss": 0.0783, - "reward": 0.5435268133878708, - "reward_std": 0.14009770564734936, - "rewards/accuracy_reward": 0.14285714784637094, + "grad_norm": 24.30841064453125, + "kl": 3.9921875, + "learning_rate": 4.5338240157112226e-07, + "loss": 0.3684, + "reward": 0.4983259066939354, + "reward_std": 0.18460837379097939, + "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4006696566939354, + "rewards/tag_count_reward": 0.3621651977300644, "step": 930 }, { "clip_ratio": 0.0, - "completion_length": 1795.3527526855469, + "completion_length": 1500.9442749023438, "epoch": 0.2780972294824882, - "grad_norm": 0.24493761360645294, - "kl": 0.05889892578125, - "learning_rate": 9.064613100327588e-08, - "loss": 0.0835, - "reward": 0.4838169887661934, - "reward_std": 0.16577173396945, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 55.247737884521484, + "kl": 3.66015625, + "learning_rate": 4.532306550163794e-07, + "loss": 0.378, + "reward": 0.444196455180645, + "reward_std": 0.17786908149719238, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3989955484867096, + "rewards/tag_count_reward": 0.372767873108387, "step": 931 }, { "clip_ratio": 0.0, - "completion_length": 1814.4911499023438, + "completion_length": 1631.9889221191406, "epoch": 0.2783959375700097, - "grad_norm": 0.21988573670387268, - "kl": 0.05987548828125, - "learning_rate": 9.061573747332052e-08, - "loss": 0.063, - "reward": 0.3967634066939354, - "reward_std": 0.15727835148572922, - "rewards/accuracy_reward": 0.029017857741564512, + "grad_norm": 23.401912689208984, + "kl": 4.90625, + "learning_rate": 4.530786873666026e-07, + "loss": 0.3831, + "reward": 0.3476562574505806, + "reward_std": 0.1953258216381073, + "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455559372902, + "rewards/tag_count_reward": 0.3231026902794838, "step": 932 }, { "clip_ratio": 0.0, - "completion_length": 1753.3862609863281, + "completion_length": 1534.9822387695312, "epoch": 0.27869464565753116, - "grad_norm": 0.23407068848609924, - "kl": 0.06036376953125, - "learning_rate": 9.058529975742356e-08, - "loss": 0.0788, - "reward": 0.4843750149011612, - "reward_std": 0.12481491826474667, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 24.194305419921875, + "kl": 4.47265625, + "learning_rate": 4.5292649878711783e-07, + "loss": 0.4093, + "reward": 0.4375000149011612, + "reward_std": 0.17405148595571518, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3995535895228386, + "rewards/tag_count_reward": 0.3549107313156128, "step": 933 }, { "clip_ratio": 0.0, - "completion_length": 1768.7857666015625, + "completion_length": 1578.6630249023438, "epoch": 0.27899335374505263, - "grad_norm": 0.23342807590961456, - "kl": 0.062255859375, - "learning_rate": 9.055481788869825e-08, - "loss": 0.0727, - "reward": 0.533482164144516, - "reward_std": 0.13505843468010426, - "rewards/accuracy_reward": 0.13839286682195961, + "grad_norm": 100.38318634033203, + "kl": 7.1953125, + "learning_rate": 4.5277408944349126e-07, + "loss": 0.5469, + "reward": 0.4693080633878708, + "reward_std": 0.1900699958205223, + "rewards/accuracy_reward": 0.13616072433069348, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3950893059372902, + "rewards/tag_count_reward": 0.3331473395228386, "step": 934 }, { "clip_ratio": 0.0, - "completion_length": 1769.1406860351562, + "completion_length": 1619.7567749023438, "epoch": 0.2792920618325741, - "grad_norm": 0.2210870087146759, - "kl": 0.06201171875, - "learning_rate": 9.052429190030589e-08, - "loss": 0.0573, - "reward": 0.4648437649011612, - "reward_std": 0.16147638112306595, - "rewards/accuracy_reward": 0.06919643329456449, + "grad_norm": 11.667678833007812, + "kl": 4.8828125, + "learning_rate": 4.526214595015294e-07, + "loss": 0.3957, + "reward": 0.4056919738650322, + "reward_std": 0.21412968263030052, + "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.395647332072258, + "rewards/tag_count_reward": 0.3387276902794838, "step": 935 }, { "clip_ratio": 0.0, - "completion_length": 1839.9554443359375, + "completion_length": 1616.3861999511719, "epoch": 0.27959076992009557, - "grad_norm": 0.2160664051771164, - "kl": 0.06390380859375, - "learning_rate": 9.049372182545573e-08, - "loss": 0.068, - "reward": 0.5156250223517418, - "reward_std": 0.17959412559866905, - "rewards/accuracy_reward": 0.125000003259629, + "grad_norm": 12.179121017456055, + "kl": 4.640625, + "learning_rate": 4.524686091272787e-07, + "loss": 0.3883, + "reward": 0.4648437723517418, + "reward_std": 0.22595339640974998, + "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3906250223517418, + "rewards/tag_count_reward": 0.3465401902794838, "step": 936 }, { "clip_ratio": 0.0, - "completion_length": 1758.0692749023438, + "completion_length": 1521.7187805175781, "epoch": 0.27988947800761704, - "grad_norm": 0.23590609431266785, - "kl": 0.06134033203125, - "learning_rate": 9.046310769740504e-08, - "loss": 0.0914, - "reward": 0.4771205559372902, - "reward_std": 0.18018774315714836, - "rewards/accuracy_reward": 0.08482143329456449, + "grad_norm": 30.797372817993164, + "kl": 4.44921875, + "learning_rate": 4.523155384870252e-07, + "loss": 0.4092, + "reward": 0.4296875149011612, + "reward_std": 0.1795186921954155, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392299123108387, + "rewards/tag_count_reward": 0.364955373108387, "step": 937 }, { "clip_ratio": 0.0, - "completion_length": 1698.2478332519531, + "completion_length": 1487.57373046875, "epoch": 0.2801881860951385, - "grad_norm": 0.17528730630874634, - "kl": 0.057373046875, - "learning_rate": 9.043244954945898e-08, - "loss": 0.0397, - "reward": 0.4570312723517418, - "reward_std": 0.16448046453297138, - "rewards/accuracy_reward": 0.04464285844005644, + "grad_norm": 19.977609634399414, + "kl": 3.85546875, + "learning_rate": 4.5216224774729497e-07, + "loss": 0.3328, + "reward": 0.4107142984867096, + "reward_std": 0.21190990880131721, + "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.412388414144516, + "rewards/tag_count_reward": 0.3727678805589676, "step": 938 }, { "clip_ratio": 0.0, - "completion_length": 1789.8862609863281, + "completion_length": 1599.6986999511719, "epoch": 0.28048689418266, - "grad_norm": 0.20718160271644592, - "kl": 0.06207275390625, - "learning_rate": 9.040174741497064e-08, - "loss": 0.0535, - "reward": 0.380580373108387, - "reward_std": 0.10218922980129719, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 97.52548217773438, + "kl": 6.546875, + "learning_rate": 4.520087370748532e-07, + "loss": 0.4424, + "reward": 0.3487723469734192, + "reward_std": 0.16231832653284073, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3761160895228386, + "rewards/tag_count_reward": 0.3420759066939354, "step": 939 }, { "clip_ratio": 0.0, - "completion_length": 1766.3482971191406, + "completion_length": 1566.5179138183594, "epoch": 0.28078560227018146, - "grad_norm": 0.2664683759212494, - "kl": 0.06536865234375, - "learning_rate": 9.03710013273409e-08, - "loss": 0.0761, - "reward": 0.4620535969734192, - "reward_std": 0.15096318162977695, + "grad_norm": 102.2385482788086, + "kl": 6.9296875, + "learning_rate": 4.518550066367045e-07, + "loss": 0.5074, + "reward": 0.4207589477300644, + "reward_std": 0.20485138893127441, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3973214402794838, + "rewards/tag_count_reward": 0.3560268059372902, "step": 940 }, { "clip_ratio": 0.0, - "completion_length": 1664.1808776855469, + "completion_length": 1472.9688110351562, "epoch": 0.2810843103577029, - "grad_norm": 0.2612977623939514, - "kl": 0.0638427734375, - "learning_rate": 9.03402113200185e-08, - "loss": 0.0948, - "reward": 0.5005580484867096, - "reward_std": 0.1435855869203806, - "rewards/accuracy_reward": 0.10491071501746774, + "grad_norm": 46.230628967285156, + "kl": 6.359375, + "learning_rate": 4.517010566000924e-07, + "loss": 0.5199, + "reward": 0.4514509215950966, + "reward_std": 0.15559089556336403, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.3666294813156128, "step": 941 }, { "clip_ratio": 0.0, - "completion_length": 1754.977783203125, + "completion_length": 1545.6317443847656, "epoch": 0.2813830184452244, - "grad_norm": 0.22245031595230103, - "kl": 0.06689453125, - "learning_rate": 9.030937742649993e-08, - "loss": 0.068, - "reward": 0.4453125223517418, - "reward_std": 0.13327210023999214, - "rewards/accuracy_reward": 0.037946431431919336, + "grad_norm": 15.406947135925293, + "kl": 5.5, + "learning_rate": 4.5154688713249964e-07, + "loss": 0.4416, + "reward": 0.3917410969734192, + "reward_std": 0.17182153835892677, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4073660969734192, + "rewards/tag_count_reward": 0.3671875074505806, "step": 942 }, { "clip_ratio": 0.0, - "completion_length": 1774.2813415527344, + "completion_length": 1576.47998046875, "epoch": 0.28168172653274587, - "grad_norm": 0.19380183517932892, - "kl": 0.06591796875, - "learning_rate": 9.027849968032948e-08, - "loss": 0.0644, - "reward": 0.4313616305589676, - "reward_std": 0.12820390611886978, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 19.918075561523438, + "kl": 5.359375, + "learning_rate": 4.513924984016474e-07, + "loss": 0.4169, + "reward": 0.403459832072258, + "reward_std": 0.18727761134505272, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3733259066939354, + "rewards/tag_count_reward": 0.3498884066939354, "step": 943 }, { "clip_ratio": 0.0, - "completion_length": 1805.9353637695312, + "completion_length": 1630.6764221191406, "epoch": 0.28198043462026734, - "grad_norm": 0.2214522361755371, - "kl": 0.064453125, - "learning_rate": 9.024757811509909e-08, - "loss": 0.0593, - "reward": 0.4056919813156128, - "reward_std": 0.12629977613687515, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 13.503260612487793, + "kl": 4.34375, + "learning_rate": 4.5123789057549544e-07, + "loss": 0.3424, + "reward": 0.3588169813156128, + "reward_std": 0.1735735982656479, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392299123108387, + "rewards/tag_count_reward": 0.3476562723517418, "step": 944 }, { "clip_ratio": 0.0, - "completion_length": 1665.5647888183594, + "completion_length": 1497.5648193359375, "epoch": 0.2822791427077888, - "grad_norm": 0.2137376070022583, - "kl": 0.06787109375, - "learning_rate": 9.02166127644484e-08, - "loss": 0.0668, - "reward": 0.4575893059372902, - "reward_std": 0.1289401352405548, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 44.53622817993164, + "kl": 4.21875, + "learning_rate": 4.51083063822242e-07, + "loss": 0.4182, + "reward": 0.4017857387661934, + "reward_std": 0.18243606202304363, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4174107313156128, + "rewards/tag_count_reward": 0.3727678805589676, "step": 945 }, { "clip_ratio": 0.0, - "completion_length": 1734.0157165527344, + "completion_length": 1498.1629943847656, "epoch": 0.2825778507953103, - "grad_norm": 0.24637185037136078, - "kl": 0.065185546875, - "learning_rate": 9.018560366206468e-08, - "loss": 0.0771, - "reward": 0.4112723469734192, - "reward_std": 0.13418780453503132, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 76.17205810546875, + "kl": 2.95703125, + "learning_rate": 4.509280183103234e-07, + "loss": 0.2919, + "reward": 0.3895089402794838, + "reward_std": 0.16386397182941437, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.3783482313156128, "step": 946 }, { "clip_ratio": 0.0, - "completion_length": 1750.3795471191406, + "completion_length": 1507.0692749023438, "epoch": 0.28287655888283175, - "grad_norm": 0.21654334664344788, - "kl": 0.06561279296875, - "learning_rate": 9.015455084168278e-08, - "loss": 0.0567, - "reward": 0.4581473395228386, - "reward_std": 0.16663485579192638, - "rewards/accuracy_reward": 0.07142857392318547, + "grad_norm": 54.65409469604492, + "kl": 3.671875, + "learning_rate": 4.507727542084139e-07, + "loss": 0.3581, + "reward": 0.4425223469734192, + "reward_std": 0.20599256455898285, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187723517418, + "rewards/tag_count_reward": 0.3710937649011612, "step": 947 }, { "clip_ratio": 0.0, - "completion_length": 1793.3639221191406, + "completion_length": 1624.0938415527344, "epoch": 0.2831752669703532, - "grad_norm": 0.21558420360088348, - "kl": 0.06732177734375, - "learning_rate": 9.012345433708512e-08, - "loss": 0.0601, - "reward": 0.4637276977300644, - "reward_std": 0.1650265622884035, - "rewards/accuracy_reward": 0.07812500465661287, + "grad_norm": 42.41265106201172, + "kl": 4.7421875, + "learning_rate": 4.506172716854256e-07, + "loss": 0.3438, + "reward": 0.4129464402794838, + "reward_std": 0.19797534495592117, + "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.3437500223517418, "step": 948 }, { "clip_ratio": 0.0, - "completion_length": 1860.82373046875, + "completion_length": 1641.9063110351562, "epoch": 0.2834739750578747, - "grad_norm": 0.212761789560318, - "kl": 0.0706787109375, - "learning_rate": 9.009231418210169e-08, - "loss": 0.0565, - "reward": 0.3761160895228386, - "reward_std": 0.11330828629434109, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 58.391300201416016, + "kl": 3.50390625, + "learning_rate": 4.5046157091050843e-07, + "loss": 0.3174, + "reward": 0.344866082072258, + "reward_std": 0.15947942808270454, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716518059372902, + "rewards/tag_count_reward": 0.3426339402794838, "step": 949 }, { "clip_ratio": 0.0, - "completion_length": 1781.0067749023438, + "completion_length": 1557.6050109863281, "epoch": 0.28377268314539617, - "grad_norm": 0.2170807421207428, - "kl": 0.0712890625, - "learning_rate": 9.006113041060989e-08, - "loss": 0.062, - "reward": 0.472098246216774, - "reward_std": 0.11003417894244194, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 75.96441650390625, + "kl": 3.1875, + "learning_rate": 4.503056520530494e-07, + "loss": 0.318, + "reward": 0.4492187723517418, + "reward_std": 0.15650830045342445, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.396205373108387, + "rewards/tag_count_reward": 0.3755580484867096, "step": 950 }, { "clip_ratio": 0.0, - "completion_length": 1712.654052734375, + "completion_length": 1515.0648193359375, "epoch": 0.28407139123291764, - "grad_norm": 0.23905488848686218, - "kl": 0.0736083984375, - "learning_rate": 9.002990305653462e-08, - "loss": 0.0633, - "reward": 0.5150669887661934, - "reward_std": 0.15841125510632992, - "rewards/accuracy_reward": 0.1026785746216774, + "grad_norm": 61.73088073730469, + "kl": 3.171875, + "learning_rate": 4.501495152826731e-07, + "loss": 0.3248, + "reward": 0.4587053805589676, + "reward_std": 0.2060912624001503, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.412388414144516, + "rewards/tag_count_reward": 0.3671875149011612, "step": 951 }, { "clip_ratio": 0.0, - "completion_length": 1776.2723999023438, + "completion_length": 1574.7210693359375, "epoch": 0.2843700993204391, - "grad_norm": 0.22599834203720093, - "kl": 0.0697021484375, - "learning_rate": 8.999863215384816e-08, - "loss": 0.0583, - "reward": 0.482142873108387, - "reward_std": 0.14340846240520477, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 11.68893051147461, + "kl": 4.92578125, + "learning_rate": 4.4999316076924075e-07, + "loss": 0.407, + "reward": 0.419084832072258, + "reward_std": 0.189629964530468, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.404017873108387, + "rewards/tag_count_reward": 0.361049123108387, "step": 952 }, { "clip_ratio": 0.0, - "completion_length": 1703.888427734375, + "completion_length": 1473.0201416015625, "epoch": 0.2846688074079606, - "grad_norm": 0.26630130410194397, - "kl": 0.0721435546875, - "learning_rate": 8.996731773657022e-08, - "loss": 0.0931, - "reward": 0.5184151902794838, - "reward_std": 0.18049802631139755, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 22.036500930786133, + "kl": 4.734375, + "learning_rate": 4.498365886828511e-07, + "loss": 0.4481, + "reward": 0.4520089477300644, + "reward_std": 0.2107294723391533, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366305589676, + "rewards/tag_count_reward": 0.3716517984867096, "step": 953 }, { "clip_ratio": 0.0, - "completion_length": 1719.1741943359375, + "completion_length": 1486.7702026367188, "epoch": 0.28496751549548205, - "grad_norm": 0.27357572317123413, - "kl": 0.074951171875, - "learning_rate": 8.993595983876779e-08, - "loss": 0.0953, - "reward": 0.4402901977300644, - "reward_std": 0.137423288077116, - "rewards/accuracy_reward": 0.044642857974395156, + "grad_norm": 18.980680465698242, + "kl": 4.44921875, + "learning_rate": 4.4967979919383894e-07, + "loss": 0.3634, + "reward": 0.4302455559372902, + "reward_std": 0.1791878640651703, + "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.395647332072258, + "rewards/tag_count_reward": 0.3900669813156128, "step": 954 }, { "clip_ratio": 0.0, - "completion_length": 1738.2210693359375, + "completion_length": 1533.7411499023438, "epoch": 0.2852662235830035, - "grad_norm": 0.25074896216392517, - "kl": 0.0745849609375, - "learning_rate": 8.99045584945552e-08, - "loss": 0.0722, - "reward": 0.459821455180645, - "reward_std": 0.16916027292609215, - "rewards/accuracy_reward": 0.06696428777649999, + "grad_norm": 54.8022575378418, + "kl": 6.3828125, + "learning_rate": 4.4952279247277603e-07, + "loss": 0.493, + "reward": 0.4107143059372902, + "reward_std": 0.22598502039909363, + "rewards/accuracy_reward": 0.04910714575089514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "rewards/tag_count_reward": 0.361607164144516, "step": 955 }, { "clip_ratio": 0.0, - "completion_length": 1776.83935546875, + "completion_length": 1552.0580749511719, "epoch": 0.285564931670525, - "grad_norm": 0.23001816868782043, - "kl": 0.0716552734375, - "learning_rate": 8.987311373809404e-08, - "loss": 0.0623, - "reward": 0.4453125074505806, - "reward_std": 0.18295440450310707, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 65.22401428222656, + "kl": 5.83984375, + "learning_rate": 4.493655686904702e-07, + "loss": 0.415, + "reward": 0.4436384066939354, + "reward_std": 0.22857194393873215, + "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3850446566939354, + "rewards/tag_count_reward": 0.361049123108387, "step": 956 }, { "clip_ratio": 0.0, - "completion_length": 1802.6920471191406, + "completion_length": 1624.6116638183594, "epoch": 0.28586363975804646, - "grad_norm": 0.2208871841430664, - "kl": 0.077880859375, - "learning_rate": 8.984162560359312e-08, - "loss": 0.0631, - "reward": 0.4302455559372902, - "reward_std": 0.1449550110846758, - "rewards/accuracy_reward": 0.037946430034935474, + "grad_norm": 105.107666015625, + "kl": 6.9609375, + "learning_rate": 4.492081280179656e-07, + "loss": 0.4641, + "reward": 0.3911830484867096, + "reward_std": 0.20640220120549202, + "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392299123108387, + "rewards/tag_count_reward": 0.3487723395228386, "step": 957 }, { "clip_ratio": 0.0, - "completion_length": 1703.8818054199219, + "completion_length": 1497.2634582519531, "epoch": 0.28616234784556793, - "grad_norm": 0.24453632533550262, - "kl": 0.0726318359375, - "learning_rate": 8.981009412530844e-08, - "loss": 0.0606, - "reward": 0.545758955180645, - "reward_std": 0.13683597929775715, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 10.020505905151367, + "kl": 5.625, + "learning_rate": 4.490504706265422e-07, + "loss": 0.4824, + "reward": 0.4882812723517418, + "reward_std": 0.191363837569952, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4341518059372902, + "rewards/tag_count_reward": 0.3789062649011612, "step": 958 }, { "clip_ratio": 0.0, - "completion_length": 1774.9286804199219, + "completion_length": 1601.649658203125, "epoch": 0.2864610559330894, - "grad_norm": 0.23841537535190582, - "kl": 0.0751953125, - "learning_rate": 8.977851933754316e-08, - "loss": 0.0751, - "reward": 0.5055803656578064, - "reward_std": 0.16229723952710629, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 35.52611541748047, + "kl": 5.37109375, + "learning_rate": 4.488925966877158e-07, + "loss": 0.4114, + "reward": 0.478794664144516, + "reward_std": 0.22123905643820763, + "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3984375149011612, + "rewards/tag_count_reward": 0.364955373108387, "step": 959 }, { "clip_ratio": 0.0, - "completion_length": 1781.4509582519531, + "completion_length": 1526.0022888183594, "epoch": 0.2867597640206109, - "grad_norm": 0.2894308865070343, - "kl": 0.08203125, - "learning_rate": 8.974690127464755e-08, - "loss": 0.0923, - "reward": 0.4787946715950966, - "reward_std": 0.15185745246708393, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 30.565229415893555, + "kl": 6.0078125, + "learning_rate": 4.487345063732377e-07, + "loss": 0.5076, + "reward": 0.4280134066939354, + "reward_std": 0.18954980745911598, + "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029018133878708, + "rewards/tag_count_reward": 0.3588169813156128, "step": 960 }, { "clip_ratio": 0.0, - "completion_length": 1735.7210693359375, + "completion_length": 1549.66748046875, "epoch": 0.28705847210813235, - "grad_norm": 0.25189149379730225, - "kl": 0.08154296875, - "learning_rate": 8.971523997101897e-08, - "loss": 0.083, - "reward": 0.4983259066939354, - "reward_std": 0.1648953091353178, - "rewards/accuracy_reward": 0.09151786379516125, + "grad_norm": 12.109847068786621, + "kl": 4.96875, + "learning_rate": 4.485761998550949e-07, + "loss": 0.4055, + "reward": 0.446986623108387, + "reward_std": 0.17507700063288212, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4068080559372902, + "rewards/tag_count_reward": 0.3777901977300644, "step": 961 }, { "clip_ratio": 0.0, - "completion_length": 1678.4666137695312, + "completion_length": 1553.6228637695312, "epoch": 0.2873571801956538, - "grad_norm": 0.23576343059539795, - "kl": 0.0745849609375, - "learning_rate": 8.96835354611018e-08, - "loss": 0.0739, - "reward": 0.5954241380095482, - "reward_std": 0.18078538589179516, - "rewards/accuracy_reward": 0.16741072502918541, + "grad_norm": 13.964942932128906, + "kl": 4.24609375, + "learning_rate": 4.4841767730550904e-07, + "loss": 0.3489, + "reward": 0.4916294813156128, + "reward_std": 0.22823909297585487, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.428013414144516, + "rewards/tag_count_reward": 0.3577009066939354, "step": 962 }, { "clip_ratio": 0.0, - "completion_length": 1666.8147888183594, + "completion_length": 1479.46435546875, "epoch": 0.2876558882831753, - "grad_norm": 0.22804342210292816, - "kl": 0.0804443359375, - "learning_rate": 8.965178777938749e-08, - "loss": 0.0685, - "reward": 0.4827009215950966, - "reward_std": 0.15786248072981834, - "rewards/accuracy_reward": 0.07142857322469354, + "grad_norm": 21.5960636138916, + "kl": 4.5859375, + "learning_rate": 4.4825893889693746e-07, + "loss": 0.398, + "reward": 0.4308035895228386, + "reward_std": 0.1750134490430355, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4112723395228386, + "rewards/tag_count_reward": 0.3772321566939354, "step": 963 }, { "clip_ratio": 0.0, - "completion_length": 1765.0224304199219, + "completion_length": 1552.9732666015625, "epoch": 0.28795459637069676, - "grad_norm": 0.24008038640022278, - "kl": 0.0772705078125, - "learning_rate": 8.961999696041435e-08, - "loss": 0.067, - "reward": 0.439174123108387, - "reward_std": 0.1530044749379158, - "rewards/accuracy_reward": 0.029017859138548374, + "grad_norm": 44.17295837402344, + "kl": 3.70703125, + "learning_rate": 4.4809998480207174e-07, + "loss": 0.3348, + "reward": 0.4112723395228386, + "reward_std": 0.19608675688505173, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4101562649011612, + "rewards/tag_count_reward": 0.3777901977300644, "step": 964 }, { "clip_ratio": 0.0, - "completion_length": 1684.9085388183594, + "completion_length": 1489.4197082519531, "epoch": 0.28825330445821823, - "grad_norm": 0.2501318156719208, - "kl": 0.079833984375, - "learning_rate": 8.958816303876768e-08, - "loss": 0.0758, - "reward": 0.4235491305589676, - "reward_std": 0.11233421973884106, - "rewards/accuracy_reward": 0.0133928582072258, + "grad_norm": 48.88083267211914, + "kl": 4.015625, + "learning_rate": 4.479408151938384e-07, + "loss": 0.3753, + "reward": 0.3761160895228386, + "reward_std": 0.1502522975206375, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4101562723517418, + "rewards/tag_count_reward": 0.3716518059372902, "step": 965 }, { "clip_ratio": 0.0, - "completion_length": 1678.6473999023438, + "completion_length": 1529.0513916015625, "epoch": 0.2885520125457397, - "grad_norm": 0.2547341287136078, - "kl": 0.0810546875, - "learning_rate": 8.95562860490797e-08, - "loss": 0.0816, - "reward": 0.545758955180645, - "reward_std": 0.1669504977762699, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 28.302608489990234, + "kl": 5.0546875, + "learning_rate": 4.4778143024539847e-07, + "loss": 0.3911, + "reward": 0.5027902126312256, + "reward_std": 0.21731223165988922, + "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4185268133878708, + "rewards/tag_count_reward": 0.3822544813156128, "step": 966 }, { "clip_ratio": 0.0, - "completion_length": 1700.7054138183594, + "completion_length": 1562.0804138183594, "epoch": 0.28885072063326117, - "grad_norm": 0.2696707844734192, - "kl": 0.08349609375, - "learning_rate": 8.952436602602942e-08, - "loss": 0.0717, - "reward": 0.4151785895228386, - "reward_std": 0.09124979004263878, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 73.09947967529297, + "kl": 6.359375, + "learning_rate": 4.476218301301471e-07, + "loss": 0.4723, + "reward": 0.3532366305589676, + "reward_std": 0.16485610231757164, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4129464477300644, + "rewards/tag_count_reward": 0.3487723395228386, "step": 967 }, { "clip_ratio": 0.0, - "completion_length": 1790.2925109863281, + "completion_length": 1620.7634582519531, "epoch": 0.28914942872078264, - "grad_norm": 0.262761652469635, - "kl": 0.0882568359375, - "learning_rate": 8.949240300434271e-08, - "loss": 0.078, - "reward": 0.4414062649011612, - "reward_std": 0.1703332457691431, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 33.76585006713867, + "kl": 5.234375, + "learning_rate": 4.4746201502171355e-07, + "loss": 0.3977, + "reward": 0.4179687649011612, + "reward_std": 0.22551409900188446, + "rewards/accuracy_reward": 0.055803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3967634066939354, + "rewards/tag_count_reward": 0.3621651977300644, "step": 968 }, { "clip_ratio": 0.0, - "completion_length": 1707.2991638183594, + "completion_length": 1590.0357666015625, "epoch": 0.28944813680830406, - "grad_norm": 0.2623555660247803, - "kl": 0.084716796875, - "learning_rate": 8.946039701879221e-08, - "loss": 0.0808, - "reward": 0.5161830633878708, - "reward_std": 0.16414041444659233, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 9.169942855834961, + "kl": 5.0703125, + "learning_rate": 4.4730198509396104e-07, + "loss": 0.4131, + "reward": 0.4453125223517418, + "reward_std": 0.23576795309782028, + "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4224330559372902, + "rewards/tag_count_reward": 0.3582589477300644, "step": 969 }, { "clip_ratio": 0.0, - "completion_length": 1680.3527526855469, + "completion_length": 1523.6763916015625, "epoch": 0.28974684489582553, - "grad_norm": 0.22559918463230133, - "kl": 0.08154296875, - "learning_rate": 8.942834810419729e-08, - "loss": 0.0644, - "reward": 0.4966518133878708, - "reward_std": 0.15261599607765675, - "rewards/accuracy_reward": 0.06919643003493547, + "grad_norm": 41.497596740722656, + "kl": 4.0, + "learning_rate": 4.471417405209864e-07, + "loss": 0.351, + "reward": 0.4637276977300644, + "reward_std": 0.23182670772075653, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4274553805589676, + "rewards/tag_count_reward": 0.3811384066939354, "step": 970 }, { "clip_ratio": 0.0, - "completion_length": 1714.1473999023438, + "completion_length": 1561.7188110351562, "epoch": 0.290045552983347, - "grad_norm": 0.28129154443740845, - "kl": 0.090087890625, - "learning_rate": 8.939625629542401e-08, - "loss": 0.0752, - "reward": 0.5753348395228386, - "reward_std": 0.19045444950461388, - "rewards/accuracy_reward": 0.16517858300358057, + "grad_norm": 14.74995231628418, + "kl": 4.91015625, + "learning_rate": 4.4698128147712004e-07, + "loss": 0.4235, + "reward": 0.5089285969734192, + "reward_std": 0.2200334146618843, + "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4101562723517418, + "rewards/tag_count_reward": 0.361607164144516, "step": 971 }, { "clip_ratio": 0.0, - "completion_length": 1804.57373046875, + "completion_length": 1591.1340026855469, "epoch": 0.29034426107086847, - "grad_norm": 0.2815034091472626, - "kl": 0.093994140625, - "learning_rate": 8.936412162738514e-08, - "loss": 0.0703, - "reward": 0.415736623108387, - "reward_std": 0.1430931482464075, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 13.123516082763672, + "kl": 5.078125, + "learning_rate": 4.468206081369257e-07, + "loss": 0.4161, + "reward": 0.3660714477300644, + "reward_std": 0.17188263684511185, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3978794813156128, + "rewards/tag_count_reward": 0.3571428805589676, "step": 972 }, { "clip_ratio": 0.0, - "completion_length": 1709.6295471191406, + "completion_length": 1512.4866638183594, "epoch": 0.29064296915838994, - "grad_norm": 0.2852582633495331, - "kl": 0.092041015625, - "learning_rate": 8.933194413504003e-08, - "loss": 0.0719, - "reward": 0.5094866305589676, - "reward_std": 0.1549396924674511, - "rewards/accuracy_reward": 0.0803571492433548, + "grad_norm": 35.884769439697266, + "kl": 4.2421875, + "learning_rate": 4.466597206752002e-07, + "loss": 0.4024, + "reward": 0.4453125298023224, + "reward_std": 0.22344591841101646, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4291294813156128, + "rewards/tag_count_reward": 0.3582589477300644, "step": 973 }, { "clip_ratio": 0.0, - "completion_length": 1804.9532165527344, + "completion_length": 1608.1407165527344, "epoch": 0.2909416772459114, - "grad_norm": 0.2334723323583603, - "kl": 0.0958251953125, - "learning_rate": 8.929972385339465e-08, - "loss": 0.0434, - "reward": 0.4804687574505806, - "reward_std": 0.10459140501916409, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 25.00407600402832, + "kl": 4.23828125, + "learning_rate": 4.464986192669733e-07, + "loss": 0.3379, + "reward": 0.4469866156578064, + "reward_std": 0.1797037310898304, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.3599330484867096, "step": 974 }, { "clip_ratio": 0.0, - "completion_length": 1623.6719360351562, + "completion_length": 1451.8259582519531, "epoch": 0.2912403853334329, - "grad_norm": 0.2864517867565155, - "kl": 0.091064453125, - "learning_rate": 8.926746081750152e-08, - "loss": 0.0935, - "reward": 0.506696455180645, - "reward_std": 0.11614243313670158, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 42.25094985961914, + "kl": 3.64453125, + "learning_rate": 4.463373040875076e-07, + "loss": 0.3223, + "reward": 0.4804687723517418, + "reward_std": 0.1670316755771637, + "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.424107164144516, + "rewards/tag_count_reward": 0.3889509066939354, "step": 975 }, { "clip_ratio": 0.0, - "completion_length": 1635.7367248535156, + "completion_length": 1466.7344665527344, "epoch": 0.29153909342095435, - "grad_norm": 0.3578152060508728, - "kl": 0.0927734375, - "learning_rate": 8.923515506245962e-08, - "loss": 0.1227, - "reward": 0.4776785969734192, - "reward_std": 0.13276739791035652, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 38.1767692565918, + "kl": 4.31640625, + "learning_rate": 4.461757753122981e-07, + "loss": 0.4222, + "reward": 0.4626116305589676, + "reward_std": 0.18099632114171982, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4218750223517418, + "rewards/tag_count_reward": 0.3956473395228386, "step": 976 }, { "clip_ratio": 0.0, - "completion_length": 1727.7478332519531, + "completion_length": 1517.8505249023438, "epoch": 0.2918378015084758, - "grad_norm": 0.23902197182178497, - "kl": 0.0960693359375, - "learning_rate": 8.920280662341448e-08, - "loss": 0.0557, - "reward": 0.498325914144516, - "reward_std": 0.15327100455760956, - "rewards/accuracy_reward": 0.07142857182770967, + "grad_norm": 28.05716896057129, + "kl": 4.59765625, + "learning_rate": 4.460140331170724e-07, + "loss": 0.4062, + "reward": 0.446986623108387, + "reward_std": 0.1984858587384224, + "rewards/accuracy_reward": 0.06473214761354029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4268973395228386, + "rewards/tag_count_reward": 0.3822544813156128, "step": 977 }, { "clip_ratio": 0.0, - "completion_length": 1739.8348999023438, + "completion_length": 1560.8929443359375, "epoch": 0.2921365095959973, - "grad_norm": 0.26423704624176025, - "kl": 0.09765625, - "learning_rate": 8.917041553555803e-08, - "loss": 0.0656, - "reward": 0.5206473469734192, - "reward_std": 0.11990879289805889, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 14.244260787963867, + "kl": 5.203125, + "learning_rate": 4.4585207767779016e-07, + "loss": 0.4359, + "reward": 0.4693080633878708, + "reward_std": 0.16065412759780884, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4202009066939354, + "rewards/tag_count_reward": 0.3710937649011612, "step": 978 }, { "clip_ratio": 0.0, - "completion_length": 1730.3929443359375, + "completion_length": 1561.8661499023438, "epoch": 0.29243521768351877, - "grad_norm": 0.20941242575645447, - "kl": 0.093994140625, - "learning_rate": 8.913798183412858e-08, - "loss": 0.0471, - "reward": 0.5390625223517418, - "reward_std": 0.15125593543052673, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 71.85823822021484, + "kl": 5.59375, + "learning_rate": 4.4568990917064294e-07, + "loss": 0.3834, + "reward": 0.5167410969734192, + "reward_std": 0.20071503147482872, + "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4073660895228386, + "rewards/tag_count_reward": 0.3850446492433548, "step": 979 }, { "clip_ratio": 0.0, - "completion_length": 1751.43310546875, + "completion_length": 1588.7679138183594, "epoch": 0.29273392577104024, - "grad_norm": 0.29243195056915283, - "kl": 0.0950927734375, - "learning_rate": 8.910550555441083e-08, - "loss": 0.0819, - "reward": 0.474888414144516, - "reward_std": 0.12978000938892365, - "rewards/accuracy_reward": 0.06250000186264515, + "grad_norm": 24.071256637573242, + "kl": 5.5546875, + "learning_rate": 4.4552752777205414e-07, + "loss": 0.4099, + "reward": 0.4335937723517418, + "reward_std": 0.1797768399119377, + "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4123884066939354, + "rewards/tag_count_reward": 0.3733259066939354, "step": 980 }, { "clip_ratio": 0.0, - "completion_length": 1618.0982666015625, + "completion_length": 1480.4375915527344, "epoch": 0.2930326338585617, - "grad_norm": 0.2802864611148834, - "kl": 0.0953369140625, - "learning_rate": 8.90729867317358e-08, - "loss": 0.0702, - "reward": 0.5669643133878708, - "reward_std": 0.1191399060189724, + "grad_norm": 75.88976287841797, + "kl": 6.81640625, + "learning_rate": 4.45364933658679e-07, + "loss": 0.5181, + "reward": 0.517857164144516, + "reward_std": 0.1631302312016487, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.419642873108387, + "rewards/tag_count_reward": 0.3705357387661934, "step": 981 }, { "clip_ratio": 0.0, - "completion_length": 1809.6608276367188, + "completion_length": 1627.0335693359375, "epoch": 0.2933313419460832, - "grad_norm": 0.2596522271633148, - "kl": 0.10400390625, - "learning_rate": 8.904042540148075e-08, - "loss": 0.0693, - "reward": 0.4363839477300644, - "reward_std": 0.11599664576351643, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 103.74700164794922, + "kl": 7.671875, + "learning_rate": 4.452021270074038e-07, + "loss": 0.5307, + "reward": 0.3777901902794838, + "reward_std": 0.15603306517004967, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3939732387661934, + "rewards/tag_count_reward": 0.337611623108387, "step": 982 }, { "clip_ratio": 0.0, - "completion_length": 1693.9085693359375, + "completion_length": 1560.0380249023438, "epoch": 0.29363005003360465, - "grad_norm": 0.2504347860813141, - "kl": 0.1011962890625, - "learning_rate": 8.900782159906925e-08, - "loss": 0.0671, - "reward": 0.5212053805589676, - "reward_std": 0.10055191069841385, - "rewards/accuracy_reward": 0.10267857578583062, + "grad_norm": 70.32150268554688, + "kl": 6.328125, + "learning_rate": 4.4503910799534626e-07, + "loss": 0.4856, + "reward": 0.4704241305589676, + "reward_std": 0.19276686757802963, + "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4185268059372902, + "rewards/tag_count_reward": 0.3677455559372902, "step": 983 }, { "clip_ratio": 0.0, - "completion_length": 1645.352783203125, + "completion_length": 1486.7656860351562, "epoch": 0.2939287581211261, - "grad_norm": 0.2420835793018341, - "kl": 0.096435546875, - "learning_rate": 8.897517535997103e-08, - "loss": 0.0704, - "reward": 0.5373884290456772, - "reward_std": 0.18229354172945023, - "rewards/accuracy_reward": 0.10491072246804833, + "grad_norm": 20.87571907043457, + "kl": 4.15234375, + "learning_rate": 4.448758767998552e-07, + "loss": 0.3801, + "reward": 0.4687500149011612, + "reward_std": 0.1901467889547348, + "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4324776977300644, + "rewards/tag_count_reward": 0.3906250223517418, "step": 984 }, { "clip_ratio": 0.0, - "completion_length": 1665.1697387695312, + "completion_length": 1568.76123046875, "epoch": 0.2942274662086476, - "grad_norm": 0.25104933977127075, - "kl": 0.1011962890625, - "learning_rate": 8.8942486719702e-08, - "loss": 0.0627, - "reward": 0.5245535969734192, - "reward_std": 0.15337442606687546, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 39.32463455200195, + "kl": 5.6796875, + "learning_rate": 4.4471243359851e-07, + "loss": 0.4367, + "reward": 0.4430803805589676, + "reward_std": 0.23091403022408485, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4352678805589676, + "rewards/tag_count_reward": 0.3582589402794838, "step": 985 }, { "clip_ratio": 0.0, - "completion_length": 1622.6272888183594, + "completion_length": 1467.9688110351562, "epoch": 0.29452617429616906, - "grad_norm": 0.3036876916885376, - "kl": 0.0966796875, - "learning_rate": 8.890975571382417e-08, - "loss": 0.0756, - "reward": 0.5608259290456772, - "reward_std": 0.16955275274813175, - "rewards/accuracy_reward": 0.12053571827709675, + "grad_norm": 29.723173141479492, + "kl": 3.82421875, + "learning_rate": 4.4454877856912086e-07, + "loss": 0.3649, + "reward": 0.518415205180645, + "reward_std": 0.20138828083872795, + "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.440290205180645, + "rewards/tag_count_reward": 0.404575914144516, "step": 986 }, { "clip_ratio": 0.0, - "completion_length": 1677.7188415527344, + "completion_length": 1500.9018249511719, "epoch": 0.29482488238369053, - "grad_norm": 0.28219255805015564, - "kl": 0.1053466796875, - "learning_rate": 8.887698237794571e-08, - "loss": 0.0718, - "reward": 0.5518973395228386, - "reward_std": 0.13723501935601234, - "rewards/accuracy_reward": 0.13169643376022577, + "grad_norm": 19.145767211914062, + "kl": 5.0859375, + "learning_rate": 4.4438491188972853e-07, + "loss": 0.4107, + "reward": 0.510044664144516, + "reward_std": 0.1924300603568554, + "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.420200914144516, + "rewards/tag_count_reward": 0.3738839477300644, "step": 987 }, { "clip_ratio": 0.0, - "completion_length": 1735.6964721679688, + "completion_length": 1585.5246276855469, "epoch": 0.295123590471212, - "grad_norm": 0.279223769903183, - "kl": 0.1131591796875, - "learning_rate": 8.884416674772077e-08, - "loss": 0.0595, - "reward": 0.5022321566939354, - "reward_std": 0.15307026728987694, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 27.292633056640625, + "kl": 4.046875, + "learning_rate": 4.442208337386039e-07, + "loss": 0.3557, + "reward": 0.4414062649011612, + "reward_std": 0.22749869152903557, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4129464477300644, + "rewards/tag_count_reward": 0.3588169813156128, "step": 988 }, { "clip_ratio": 0.0, - "completion_length": 1767.0804443359375, + "completion_length": 1594.3906860351562, "epoch": 0.2954222985587335, - "grad_norm": 0.28154799342155457, - "kl": 0.1092529296875, - "learning_rate": 8.881130885884954e-08, - "loss": 0.0631, - "reward": 0.5000000149011612, - "reward_std": 0.12521225586533546, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 28.84008026123047, + "kl": 4.2734375, + "learning_rate": 4.4405654429424774e-07, + "loss": 0.375, + "reward": 0.4296875074505806, + "reward_std": 0.18899821117520332, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4062500149011612, + "rewards/tag_count_reward": 0.3426339477300644, "step": 989 }, { "clip_ratio": 0.0, - "completion_length": 1662.2255249023438, + "completion_length": 1470.0580749511719, "epoch": 0.29572100664625495, - "grad_norm": 0.2366633415222168, - "kl": 0.1075439453125, - "learning_rate": 8.87784087470782e-08, - "loss": 0.0507, - "reward": 0.4698660969734192, - "reward_std": 0.13635482639074326, - "rewards/accuracy_reward": 0.04017857299186289, + "grad_norm": 8.223483085632324, + "kl": 4.4609375, + "learning_rate": 4.43892043735391e-07, + "loss": 0.3745, + "reward": 0.424665205180645, + "reward_std": 0.17849614843726158, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4296875149011612, + "rewards/tag_count_reward": 0.3822544813156128, "step": 990 }, { "clip_ratio": 0.0, - "completion_length": 1735.24560546875, + "completion_length": 1550.1898193359375, "epoch": 0.2960197147337764, - "grad_norm": 0.278126060962677, - "kl": 0.1087646484375, - "learning_rate": 8.874546644819883e-08, - "loss": 0.0788, - "reward": 0.5479910969734192, - "reward_std": 0.15367202647030354, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 17.552621841430664, + "kl": 4.7734375, + "learning_rate": 4.4372733224099413e-07, + "loss": 0.4114, + "reward": 0.4648437798023224, + "reward_std": 0.20557358115911484, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.416294664144516, + "rewards/tag_count_reward": 0.3554687649011612, "step": 991 }, { "clip_ratio": 0.0, - "completion_length": 1766.26123046875, + "completion_length": 1606.5022583007812, "epoch": 0.2963184228212979, - "grad_norm": 0.3086409866809845, - "kl": 0.1158447265625, - "learning_rate": 8.871248199804942e-08, - "loss": 0.0758, - "reward": 0.5078125298023224, - "reward_std": 0.16361425444483757, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 63.84352111816406, + "kl": 6.5546875, + "learning_rate": 4.4356240999024715e-07, + "loss": 0.4849, + "reward": 0.4308035895228386, + "reward_std": 0.1798197366297245, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4274553805589676, + "rewards/tag_count_reward": 0.3638392984867096, "step": 992 }, { "clip_ratio": 0.0, - "completion_length": 1790.4666137695312, + "completion_length": 1604.8348693847656, "epoch": 0.29661713090881936, - "grad_norm": 0.23227554559707642, - "kl": 0.11962890625, - "learning_rate": 8.867945543251385e-08, - "loss": 0.0494, - "reward": 0.4693080708384514, - "reward_std": 0.1362256295979023, - "rewards/accuracy_reward": 0.058035715483129025, + "grad_norm": 21.80406379699707, + "kl": 6.125, + "learning_rate": 4.4339727716256927e-07, + "loss": 0.4763, + "reward": 0.4034598395228386, + "reward_std": 0.16411307454109192, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4112723395228386, + "rewards/tag_count_reward": 0.3565848395228386, "step": 993 }, { "clip_ratio": 0.0, - "completion_length": 1646.0603332519531, + "completion_length": 1539.1942443847656, "epoch": 0.29691583899634083, - "grad_norm": 0.26702776551246643, - "kl": 0.1121826171875, - "learning_rate": 8.864638678752174e-08, - "loss": 0.0546, - "reward": 0.5636160969734192, - "reward_std": 0.1535173449665308, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 51.44820022583008, + "kl": 6.0703125, + "learning_rate": 4.432319339376087e-07, + "loss": 0.4495, + "reward": 0.4799107313156128, + "reward_std": 0.21330787986516953, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.443080373108387, + "rewards/tag_count_reward": 0.3794643059372902, "step": 994 }, { "clip_ratio": 0.0, - "completion_length": 1725.58935546875, + "completion_length": 1639.6094360351562, "epoch": 0.2972145470838623, - "grad_norm": 0.26914486289024353, - "kl": 0.110595703125, - "learning_rate": 8.861327609904857e-08, - "loss": 0.0647, - "reward": 0.5161830633878708, - "reward_std": 0.1295673232525587, - "rewards/accuracy_reward": 0.10044643213041127, + "grad_norm": 79.29431915283203, + "kl": 6.3515625, + "learning_rate": 4.4306638049524286e-07, + "loss": 0.4432, + "reward": 0.4425223395228386, + "reward_std": 0.1793471798300743, + "rewards/accuracy_reward": 0.09598214412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366156578064, + "rewards/tag_count_reward": 0.3465401902794838, "step": 995 }, { "clip_ratio": 0.0, - "completion_length": 1797.7344665527344, + "completion_length": 1640.0134582519531, "epoch": 0.2975132551713838, - "grad_norm": 0.2760086953639984, - "kl": 0.122802734375, - "learning_rate": 8.858012340311551e-08, - "loss": 0.0642, - "reward": 0.5563616380095482, - "reward_std": 0.1758715808391571, - "rewards/accuracy_reward": 0.15401786798611283, + "grad_norm": 58.035011291503906, + "kl": 6.1015625, + "learning_rate": 4.4290061701557756e-07, + "loss": 0.4487, + "reward": 0.4983259215950966, + "reward_std": 0.21006934344768524, + "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437649011612, + "rewards/tag_count_reward": 0.3510044887661934, "step": 996 }, { "clip_ratio": 0.0, - "completion_length": 1712.493408203125, + "completion_length": 1547.165283203125, "epoch": 0.29781196325890524, - "grad_norm": 0.27730369567871094, - "kl": 0.115966796875, - "learning_rate": 8.854692873578943e-08, - "loss": 0.0591, - "reward": 0.5184151902794838, - "reward_std": 0.15799195878207684, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 11.679484367370605, + "kl": 4.67578125, + "learning_rate": 4.427346436789472e-07, + "loss": 0.3812, + "reward": 0.4642857387661934, + "reward_std": 0.20103354007005692, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4268973395228386, + "rewards/tag_count_reward": 0.377232164144516, "step": 997 }, { "clip_ratio": 0.0, - "completion_length": 1650.3081359863281, + "completion_length": 1519.5759582519531, "epoch": 0.2981106713464267, - "grad_norm": 0.26459646224975586, - "kl": 0.114013671875, - "learning_rate": 8.851369213318291e-08, - "loss": 0.0637, - "reward": 0.5998884215950966, - "reward_std": 0.15004508569836617, - "rewards/accuracy_reward": 0.15178572479635477, + "grad_norm": 14.813997268676758, + "kl": 4.4140625, + "learning_rate": 4.425684606659146e-07, + "loss": 0.3565, + "reward": 0.5150669813156128, + "reward_std": 0.19538959488272667, + "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.448102705180645, + "rewards/tag_count_reward": 0.376674123108387, "step": 998 }, { "clip_ratio": 0.0, - "completion_length": 1684.1853637695312, + "completion_length": 1577.0045166015625, "epoch": 0.2984093794339482, - "grad_norm": 0.2363187074661255, - "kl": 0.1143798828125, - "learning_rate": 8.848041363145411e-08, - "loss": 0.0578, - "reward": 0.553013414144516, - "reward_std": 0.1673597116023302, - "rewards/accuracy_reward": 0.11160714738070965, + "grad_norm": 34.87970733642578, + "kl": 3.97265625, + "learning_rate": 4.4240206815727054e-07, + "loss": 0.3433, + "reward": 0.4525669887661934, + "reward_std": 0.20338879898190498, + "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062649011612, + "rewards/tag_count_reward": 0.365513414144516, "step": 999 }, { "clip_ratio": 0.0, - "completion_length": 1735.1697082519531, + "completion_length": 1577.1027526855469, "epoch": 0.29870808752146966, - "grad_norm": 0.3038051724433899, - "kl": 0.1246337890625, - "learning_rate": 8.844709326680675e-08, - "loss": 0.0713, - "reward": 0.5161830708384514, - "reward_std": 0.11465869098901749, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 8.51466178894043, + "kl": 4.74609375, + "learning_rate": 4.4223546633403373e-07, + "loss": 0.3881, + "reward": 0.4375000223517418, + "reward_std": 0.16045677661895752, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4380580559372902, + "rewards/tag_count_reward": 0.3593750149011612, "step": 1000 }, { "clip_ratio": 0.0, - "completion_length": 1590.7791137695312, + "completion_length": 1469.6429443359375, "epoch": 0.2990067956089911, - "grad_norm": 0.1942986249923706, - "kl": 0.10986328125, - "learning_rate": 8.841373107549013e-08, - "loss": 0.0431, - "reward": 0.540736623108387, - "reward_std": 0.13716292195022106, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 51.540889739990234, + "kl": 3.95703125, + "learning_rate": 4.420686553774506e-07, + "loss": 0.3679, + "reward": 0.4531250223517418, + "reward_std": 0.19104110449552536, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.451450914144516, + "rewards/tag_count_reward": 0.388392873108387, "step": 1001 }, { "clip_ratio": 0.0, - "completion_length": 1558.0759582519531, + "completion_length": 1485.7745971679688, "epoch": 0.2993055036965126, - "grad_norm": 0.22444775700569153, - "kl": 0.1058349609375, - "learning_rate": 8.838032709379906e-08, - "loss": 0.0448, - "reward": 0.5630580559372902, - "reward_std": 0.159863643348217, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 9.281465530395508, + "kl": 4.4453125, + "learning_rate": 4.4190163546899527e-07, + "loss": 0.3859, + "reward": 0.4676339477300644, + "reward_std": 0.22507762908935547, + "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455915205180645, + "rewards/tag_count_reward": 0.3738839477300644, "step": 1002 }, { "clip_ratio": 0.0, - "completion_length": 1623.3683776855469, - "epoch": 0.29960421178403407, - "grad_norm": 0.31433752179145813, - "kl": 0.12060546875, - "learning_rate": 8.834688135807376e-08, - "loss": 0.0701, - "reward": 0.589285746216774, - "reward_std": 0.1321140993386507, - "rewards/accuracy_reward": 0.1428571492433548, + "completion_length": 1518.8772888183594, + "epoch": 0.29960421178403407, + "grad_norm": 42.958457946777344, + "kl": 4.28515625, + "learning_rate": 4.4173440679036885e-07, + "loss": 0.3902, + "reward": 0.5251116305589676, + "reward_std": 0.20143787190318108, + "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4464285969734192, + "rewards/tag_count_reward": 0.3822544813156128, "step": 1003 }, { "clip_ratio": 0.0, - "completion_length": 1758.5759887695312, + "completion_length": 1593.94873046875, "epoch": 0.29990291987155554, - "grad_norm": 0.2489154040813446, - "kl": 0.1302490234375, - "learning_rate": 8.831339390469996e-08, - "loss": 0.048, - "reward": 0.5513393208384514, - "reward_std": 0.15616323426365852, - "rewards/accuracy_reward": 0.11383928824216127, + "grad_norm": 65.59077453613281, + "kl": 5.59375, + "learning_rate": 4.415669695234998e-07, + "loss": 0.3752, + "reward": 0.4754464477300644, + "reward_std": 0.20330213755369186, + "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4375000223517418, + "rewards/tag_count_reward": 0.3705357313156128, "step": 1004 }, { "clip_ratio": 0.0, - "completion_length": 1728.9152221679688, + "completion_length": 1592.5670166015625, "epoch": 0.300201627959077, - "grad_norm": 0.2689759433269501, - "kl": 0.1279296875, - "learning_rate": 8.827986477010871e-08, - "loss": 0.0659, - "reward": 0.568080373108387, - "reward_std": 0.13339640758931637, - "rewards/accuracy_reward": 0.13392857578583062, + "grad_norm": 16.7629451751709, + "kl": 4.984375, + "learning_rate": 4.413993238505436e-07, + "loss": 0.4012, + "reward": 0.4955357313156128, + "reward_std": 0.16545912250876427, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4341517984867096, + "rewards/tag_count_reward": 0.377232164144516, "step": 1005 }, { "clip_ratio": 0.0, - "completion_length": 1734.9152526855469, + "completion_length": 1588.5179443359375, "epoch": 0.3005003360465985, - "grad_norm": 0.26862844824790955, - "kl": 0.1279296875, - "learning_rate": 8.824629399077644e-08, - "loss": 0.0592, - "reward": 0.4681919813156128, - "reward_std": 0.14025800675153732, - "rewards/accuracy_reward": 0.03348214388824999, + "grad_norm": 108.32943725585938, + "kl": 6.8046875, + "learning_rate": 4.4123146995388215e-07, + "loss": 0.4744, + "reward": 0.3710937723517418, + "reward_std": 0.2016550935804844, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4347098469734192, + "rewards/tag_count_reward": 0.3487723469734192, "step": 1006 }, { "clip_ratio": 0.0, - "completion_length": 1627.4955749511719, + "completion_length": 1499.3951721191406, "epoch": 0.30079904413411995, - "grad_norm": 0.24022650718688965, - "kl": 0.1209716796875, - "learning_rate": 8.821268160322481e-08, - "loss": 0.0462, - "reward": 0.5585937798023224, - "reward_std": 0.17055577039718628, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 32.783966064453125, + "kl": 5.46875, + "learning_rate": 4.4106340801612405e-07, + "loss": 0.4417, + "reward": 0.4564732238650322, + "reward_std": 0.21038522943854332, + "rewards/accuracy_reward": 0.08035714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4670759066939354, + "rewards/tag_count_reward": 0.3761160895228386, "step": 1007 }, { "clip_ratio": 0.0, - "completion_length": 1727.43310546875, + "completion_length": 1608.5782165527344, "epoch": 0.3010977522216414, - "grad_norm": 0.26220059394836426, - "kl": 0.135986328125, - "learning_rate": 8.817902764402086e-08, - "loss": 0.0516, - "reward": 0.5412946715950966, - "reward_std": 0.11173194833099842, - "rewards/accuracy_reward": 0.09375000465661287, + "grad_norm": 48.1810417175293, + "kl": 5.8828125, + "learning_rate": 4.4089513822010435e-07, + "loss": 0.4162, + "reward": 0.448660746216774, + "reward_std": 0.15346087515354156, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4475446715950966, + "rewards/tag_count_reward": 0.3683035895228386, "step": 1008 }, { "clip_ratio": 0.0, - "completion_length": 1715.3951721191406, + "completion_length": 1617.8125610351562, "epoch": 0.3013964603091629, - "grad_norm": 0.23130014538764954, - "kl": 0.12158203125, - "learning_rate": 8.814533214977679e-08, - "loss": 0.0378, - "reward": 0.4977678805589676, - "reward_std": 0.11140180379152298, - "rewards/accuracy_reward": 0.04910714388824999, + "grad_norm": 26.268178939819336, + "kl": 4.609375, + "learning_rate": 4.407266607488839e-07, + "loss": 0.3183, + "reward": 0.426897332072258, + "reward_std": 0.1750059649348259, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4486607238650322, + "rewards/tag_count_reward": 0.3777901902794838, "step": 1009 }, { "clip_ratio": 0.0, - "completion_length": 1680.7545166015625, + "completion_length": 1541.1049499511719, "epoch": 0.30169516839668437, - "grad_norm": 0.2405877411365509, - "kl": 0.1280517578125, - "learning_rate": 8.811159515714997e-08, - "loss": 0.0564, - "reward": 0.5251116305589676, - "reward_std": 0.1084767933934927, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 17.426828384399414, + "kl": 4.125, + "learning_rate": 4.4055797578574983e-07, + "loss": 0.343, + "reward": 0.4743303805589676, + "reward_std": 0.1794714257121086, + "rewards/accuracy_reward": 0.08928572107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4425223395228386, + "rewards/tag_count_reward": 0.3850446566939354, "step": 1010 }, { "clip_ratio": 0.0, - "completion_length": 1684.4911499023438, + "completion_length": 1525.3795471191406, "epoch": 0.30199387648420584, - "grad_norm": 0.2592833638191223, - "kl": 0.12939453125, - "learning_rate": 8.807781670284295e-08, - "loss": 0.0526, - "reward": 0.5507812649011612, - "reward_std": 0.12560125067830086, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 50.402915954589844, + "kl": 3.671875, + "learning_rate": 4.403890835142148e-07, + "loss": 0.3282, + "reward": 0.4966518059372902, + "reward_std": 0.16985390335321426, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.454799123108387, + "rewards/tag_count_reward": 0.4006696492433548, "step": 1011 }, { "clip_ratio": 0.0, - "completion_length": 1667.6072082519531, + "completion_length": 1591.0380249023438, "epoch": 0.30229258457172725, - "grad_norm": 0.24880562722682953, - "kl": 0.1309814453125, - "learning_rate": 8.804399682360341e-08, - "loss": 0.0453, - "reward": 0.5848214477300644, - "reward_std": 0.1679949965327978, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 7.6185736656188965, + "kl": 4.71875, + "learning_rate": 4.4021998411801705e-07, + "loss": 0.3687, + "reward": 0.5139509215950966, + "reward_std": 0.23822612315416336, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575892984867096, + "rewards/tag_count_reward": 0.3755580484867096, "step": 1012 }, { "clip_ratio": 0.0, - "completion_length": 1699.3616943359375, + "completion_length": 1554.3215026855469, "epoch": 0.3025912926592487, - "grad_norm": 0.23419608175754547, - "kl": 0.1341552734375, - "learning_rate": 8.801013555622402e-08, - "loss": 0.0535, - "reward": 0.5535714626312256, - "reward_std": 0.1635556574910879, - "rewards/accuracy_reward": 0.09598214644938707, + "grad_norm": 53.159149169921875, + "kl": 3.890625, + "learning_rate": 4.400506777811201e-07, + "loss": 0.3674, + "reward": 0.446986623108387, + "reward_std": 0.20431933924555779, + "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893059372902, + "rewards/tag_count_reward": 0.3755580484867096, "step": 1013 }, { "clip_ratio": 0.0, - "completion_length": 1672.8907165527344, + "completion_length": 1580.9085388183594, "epoch": 0.3028900007467702, - "grad_norm": 0.23270738124847412, - "kl": 0.1300048828125, - "learning_rate": 8.797623293754255e-08, - "loss": 0.0497, - "reward": 0.6043526977300644, - "reward_std": 0.1255186665803194, - "rewards/accuracy_reward": 0.1406250037252903, + "grad_norm": 9.96097183227539, + "kl": 4.517578125, + "learning_rate": 4.3988116468771275e-07, + "loss": 0.3683, + "reward": 0.499441996216774, + "reward_std": 0.18105580657720566, + "rewards/accuracy_reward": 0.12946429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463727705180645, + "rewards/tag_count_reward": 0.3699776902794838, "step": 1014 }, { "clip_ratio": 0.0, - "completion_length": 1622.9777526855469, + "completion_length": 1578.1004943847656, "epoch": 0.30318870883429166, - "grad_norm": 0.2262869030237198, - "kl": 0.12939453125, - "learning_rate": 8.794228900444169e-08, - "loss": 0.0456, - "reward": 0.5625000223517418, - "reward_std": 0.11579805426299572, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 9.517431259155273, + "kl": 4.7421875, + "learning_rate": 4.397114450222085e-07, + "loss": 0.3903, + "reward": 0.4614955559372902, + "reward_std": 0.20815807580947876, + "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.361049123108387, "step": 1015 }, { "clip_ratio": 0.0, - "completion_length": 1492.0357666015625, + "completion_length": 1455.5982666015625, "epoch": 0.30348741692181314, - "grad_norm": 0.24470803141593933, - "kl": 0.1204833984375, - "learning_rate": 8.790830379384917e-08, - "loss": 0.0533, - "reward": 0.6032366454601288, - "reward_std": 0.1529507152736187, - "rewards/accuracy_reward": 0.1339285746216774, + "grad_norm": 49.265785217285156, + "kl": 3.33984375, + "learning_rate": 4.3954151896924586e-07, + "loss": 0.316, + "reward": 0.5212053805589676, + "reward_std": 0.19894807785749435, + "rewards/accuracy_reward": 0.10937500651925802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.411830373108387, "step": 1016 }, { "clip_ratio": 0.0, - "completion_length": 1632.4598693847656, + "completion_length": 1542.2009582519531, "epoch": 0.3037861250093346, - "grad_norm": 0.22619178891181946, - "kl": 0.1309814453125, - "learning_rate": 8.787427734273752e-08, - "loss": 0.0662, - "reward": 0.5613839626312256, - "reward_std": 0.16910400986671448, - "rewards/accuracy_reward": 0.09821429336443543, + "grad_norm": 14.753308296203613, + "kl": 3.91796875, + "learning_rate": 4.3937138671368756e-07, + "loss": 0.3152, + "reward": 0.4994419887661934, + "reward_std": 0.21065231412649155, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.396763414144516, "step": 1017 }, { "clip_ratio": 0.0, - "completion_length": 1601.6094360351562, + "completion_length": 1526.35498046875, "epoch": 0.3040848330968561, - "grad_norm": 0.2310887724161148, - "kl": 0.1300048828125, - "learning_rate": 8.78402096881242e-08, - "loss": 0.0473, - "reward": 0.6579241380095482, - "reward_std": 0.14792967773973942, - "rewards/accuracy_reward": 0.196428582072258, + "grad_norm": 6.20982027053833, + "kl": 4.34375, + "learning_rate": 4.39201048440621e-07, + "loss": 0.3671, + "reward": 0.5825893059372902, + "reward_std": 0.20545059069991112, + "rewards/accuracy_reward": 0.20312500977888703, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4614955633878708, + "rewards/tag_count_reward": 0.3794642984867096, "step": 1018 }, { "clip_ratio": 0.0, - "completion_length": 1566.3572082519531, + "completion_length": 1475.0581359863281, "epoch": 0.30438354118437755, - "grad_norm": 0.23909752070903778, - "kl": 0.121826171875, - "learning_rate": 8.780610086707147e-08, - "loss": 0.0435, - "reward": 0.6132812649011612, - "reward_std": 0.1267966851592064, - "rewards/accuracy_reward": 0.14508929289877415, + "grad_norm": 35.25672149658203, + "kl": 3.994140625, + "learning_rate": 4.390305043353574e-07, + "loss": 0.3371, + "reward": 0.5440848469734192, + "reward_std": 0.17119980603456497, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919887661934, + "rewards/tag_count_reward": 0.4056919813156128, "step": 1019 }, { "clip_ratio": 0.0, - "completion_length": 1685.4018859863281, + "completion_length": 1546.8438415527344, "epoch": 0.304682249271899, - "grad_norm": 0.2660771608352661, - "kl": 0.14013671875, - "learning_rate": 8.77719509166864e-08, - "loss": 0.0472, - "reward": 0.5959821715950966, - "reward_std": 0.12307244725525379, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 34.416229248046875, + "kl": 5.8359375, + "learning_rate": 4.38859754583432e-07, + "loss": 0.4598, + "reward": 0.5066964477300644, + "reward_std": 0.1853574588894844, + "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535895228386, + "rewards/tag_count_reward": 0.383928582072258, "step": 1020 }, { "clip_ratio": 0.0, - "completion_length": 1580.3594360351562, + "completion_length": 1491.5625610351562, "epoch": 0.3049809573594205, - "grad_norm": 0.2174219787120819, - "kl": 0.128662109375, - "learning_rate": 8.773775987412078e-08, - "loss": 0.0473, - "reward": 0.5368303805589676, - "reward_std": 0.11867643147706985, - "rewards/accuracy_reward": 0.06473214528523386, + "grad_norm": 86.2718505859375, + "kl": 6.28125, + "learning_rate": 4.386887993706039e-07, + "loss": 0.4302, + "reward": 0.459821455180645, + "reward_std": 0.19159509614109993, + "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982313156128, + "rewards/tag_count_reward": 0.388392873108387, "step": 1021 }, { "clip_ratio": 0.0, - "completion_length": 1638.0625610351562, + "completion_length": 1563.9844665527344, "epoch": 0.30527966544694196, - "grad_norm": 0.18605037033557892, - "kl": 0.132080078125, - "learning_rate": 8.770352777657112e-08, - "loss": 0.0266, - "reward": 0.5000000298023224, - "reward_std": 0.12246980052441359, - "rewards/accuracy_reward": 0.03348214412108064, + "grad_norm": 82.9708480834961, + "kl": 6.203125, + "learning_rate": 4.385176388828556e-07, + "loss": 0.4386, + "reward": 0.4068080484867096, + "reward_std": 0.18860863521695137, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.3777901902794838, "step": 1022 }, { "clip_ratio": 0.0, - "completion_length": 1663.0715026855469, + "completion_length": 1546.7523193359375, "epoch": 0.30557837353446343, - "grad_norm": 0.2319536656141281, - "kl": 0.132568359375, - "learning_rate": 8.766925466127858e-08, - "loss": 0.0446, - "reward": 0.6205357313156128, - "reward_std": 0.18215098977088928, - "rewards/accuracy_reward": 0.15178571827709675, + "grad_norm": 40.68043899536133, + "kl": 5.1328125, + "learning_rate": 4.383462733063929e-07, + "loss": 0.3875, + "reward": 0.553571455180645, + "reward_std": 0.24302224814891815, + "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500149011612, + "rewards/tag_count_reward": 0.3950893059372902, "step": 1023 }, { "clip_ratio": 0.0, - "completion_length": 1621.618408203125, + "completion_length": 1539.2255249023438, "epoch": 0.3058770816219849, - "grad_norm": 0.2912188470363617, - "kl": 0.1416015625, - "learning_rate": 8.763494056552896e-08, - "loss": 0.0541, - "reward": 0.4838169813156128, - "reward_std": 0.10559122078120708, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 14.853148460388184, + "kl": 5.0859375, + "learning_rate": 4.381747028276448e-07, + "loss": 0.3968, + "reward": 0.4051339477300644, + "reward_std": 0.1969018653035164, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919813156128, + "rewards/tag_count_reward": 0.380580373108387, "step": 1024 }, { "clip_ratio": 0.0, - "completion_length": 1727.4577026367188, + "completion_length": 1591.21435546875, "epoch": 0.3061757897095064, - "grad_norm": 0.2009354680776596, - "kl": 0.136962890625, - "learning_rate": 8.760058552665262e-08, - "loss": 0.0383, - "reward": 0.5479910969734192, - "reward_std": 0.1677742749452591, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 19.180971145629883, + "kl": 3.9375, + "learning_rate": 4.3800292763326307e-07, + "loss": 0.2954, + "reward": 0.446986623108387, + "reward_std": 0.23462994769215584, + "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018133878708, + "rewards/tag_count_reward": 0.3822544813156128, "step": 1025 }, { "clip_ratio": 0.0, - "completion_length": 1683.3929138183594, + "completion_length": 1544.1652526855469, "epoch": 0.30647449779702785, - "grad_norm": 0.22285275161266327, - "kl": 0.141357421875, - "learning_rate": 8.756618958202446e-08, - "loss": 0.0492, - "reward": 0.6322545036673546, - "reward_std": 0.11132025346159935, - "rewards/accuracy_reward": 0.17187500488944352, + "grad_norm": 33.8619270324707, + "kl": 4.109375, + "learning_rate": 4.3783094791012233e-07, + "loss": 0.36, + "reward": 0.5513393133878708, + "reward_std": 0.1862843707203865, + "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794887661934, + "rewards/tag_count_reward": 0.3794642984867096, "step": 1026 }, { "clip_ratio": 0.0, - "completion_length": 1655.9175109863281, + "completion_length": 1590.2054138183594, "epoch": 0.3067732058845493, - "grad_norm": 0.22298765182495117, - "kl": 0.141357421875, - "learning_rate": 8.753175276906394e-08, - "loss": 0.0599, - "reward": 0.5195312723517418, - "reward_std": 0.1522261407226324, - "rewards/accuracy_reward": 0.04910714388824999, + "grad_norm": 9.645373344421387, + "kl": 4.8984375, + "learning_rate": 4.376587638453197e-07, + "loss": 0.3721, + "reward": 0.4084821566939354, + "reward_std": 0.20559799298644066, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241380095482, + "rewards/tag_count_reward": 0.3772321566939354, "step": 1027 }, { "clip_ratio": 0.0, - "completion_length": 1677.5045166015625, + "completion_length": 1626.8415832519531, "epoch": 0.3070719139720708, - "grad_norm": 0.2298247516155243, - "kl": 0.132568359375, - "learning_rate": 8.74972751252349e-08, - "loss": 0.0537, - "reward": 0.5189732313156128, - "reward_std": 0.11821895837783813, - "rewards/accuracy_reward": 0.05803571571595967, + "grad_norm": 8.181839942932129, + "kl": 4.46484375, + "learning_rate": 4.374863756261745e-07, + "loss": 0.3375, + "reward": 0.4218750223517418, + "reward_std": 0.18572933599352837, + "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.3660714477300644, "step": 1028 }, { "clip_ratio": 0.0, - "completion_length": 1667.3750610351562, + "completion_length": 1596.3348999023438, "epoch": 0.30737062205959226, - "grad_norm": 0.2521454989910126, - "kl": 0.1385498046875, - "learning_rate": 8.746275668804565e-08, - "loss": 0.0473, - "reward": 0.5239955708384514, - "reward_std": 0.11458808556199074, - "rewards/accuracy_reward": 0.0580357164144516, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598395228386, + "grad_norm": 17.298545837402344, + "kl": 4.2265625, + "learning_rate": 4.373137834402283e-07, + "loss": 0.3378, + "reward": 0.4235491380095482, + "reward_std": 0.18014146015048027, + "rewards/accuracy_reward": 0.0513392873108387, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.369977705180645, "step": 1029 }, { "clip_ratio": 0.0, - "completion_length": 1649.7902526855469, + "completion_length": 1569.7500610351562, "epoch": 0.30766933014711373, - "grad_norm": 0.22305232286453247, - "kl": 0.13818359375, - "learning_rate": 8.742819749504889e-08, - "loss": 0.0578, - "reward": 0.5323661044239998, - "reward_std": 0.10416554100811481, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 58.50421142578125, + "kl": 3.69140625, + "learning_rate": 4.3714098747524445e-07, + "loss": 0.3306, + "reward": 0.4391741305589676, + "reward_std": 0.18023748695850372, + "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.3856026977300644, "step": 1030 }, { "clip_ratio": 0.0, - "completion_length": 1579.2969360351562, + "completion_length": 1501.5871276855469, "epoch": 0.3079680382346352, - "grad_norm": 0.2226778268814087, - "kl": 0.1302490234375, - "learning_rate": 8.73935975838416e-08, - "loss": 0.0306, - "reward": 0.566964328289032, - "reward_std": 0.15532585978507996, - "rewards/accuracy_reward": 0.08705357275903225, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4799107387661934, + "grad_norm": 46.79623031616211, + "kl": 3.875, + "learning_rate": 4.36967987919208e-07, + "loss": 0.3609, + "reward": 0.4425223469734192, + "reward_std": 0.23066024482250214, + "rewards/accuracy_reward": 0.0602678582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3822544738650322, "step": 1031 }, { "clip_ratio": 0.0, - "completion_length": 1565.9442749023438, + "completion_length": 1528.0558776855469, "epoch": 0.30826674632215667, - "grad_norm": 0.23260025680065155, - "kl": 0.139892578125, - "learning_rate": 8.735895699206511e-08, - "loss": 0.0602, - "reward": 0.631138414144516, - "reward_std": 0.15208750031888485, - "rewards/accuracy_reward": 0.1495535783469677, + "grad_norm": 9.182872772216797, + "kl": 4.796875, + "learning_rate": 4.367947849603256e-07, + "loss": 0.388, + "reward": 0.5251116380095482, + "reward_std": 0.250782061368227, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4815848469734192, + "rewards/tag_count_reward": 0.3822544738650322, "step": 1032 }, { "clip_ratio": 0.0, - "completion_length": 1622.3170471191406, + "completion_length": 1529.5156860351562, "epoch": 0.30856545440967814, - "grad_norm": 0.21874159574508667, - "kl": 0.14111328125, - "learning_rate": 8.732427575740499e-08, - "loss": 0.038, - "reward": 0.5256696790456772, - "reward_std": 0.1068424079567194, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 33.41720962524414, + "kl": 5.34375, + "learning_rate": 4.366213787870249e-07, + "loss": 0.3879, + "reward": 0.4268973395228386, + "reward_std": 0.18928997591137886, + "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4765625223517418, + "rewards/tag_count_reward": 0.3867187574505806, "step": 1033 }, { "clip_ratio": 0.0, - "completion_length": 1586.8326721191406, + "completion_length": 1481.4219360351562, "epoch": 0.3088641624971996, - "grad_norm": 0.2351917326450348, - "kl": 0.143798828125, - "learning_rate": 8.7289553917591e-08, - "loss": 0.0478, - "reward": 0.6110491156578064, - "reward_std": 0.08688551187515259, - "rewards/accuracy_reward": 0.13616072060540318, + "grad_norm": 9.278277397155762, + "kl": 5.171875, + "learning_rate": 4.3644776958795503e-07, + "loss": 0.4309, + "reward": 0.5200893208384514, + "reward_std": 0.17495165392756462, + "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.404017873108387, "step": 1034 }, { "clip_ratio": 0.0, - "completion_length": 1584.7768249511719, + "completion_length": 1496.4464721679688, "epoch": 0.3091628705847211, - "grad_norm": 0.2243494987487793, - "kl": 0.1396484375, - "learning_rate": 8.725479151039713e-08, - "loss": 0.0401, - "reward": 0.581473246216774, - "reward_std": 0.08651338517665863, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 47.507598876953125, + "kl": 5.421875, + "learning_rate": 4.362739575519856e-07, + "loss": 0.4096, + "reward": 0.5005580559372902, + "reward_std": 0.179439514875412, + "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478794664144516, + "rewards/tag_count_reward": 0.400111623108387, "step": 1035 }, { "clip_ratio": 0.0, - "completion_length": 1652.8817443847656, + "completion_length": 1551.5491943359375, "epoch": 0.30946157867224255, - "grad_norm": 0.21011167764663696, - "kl": 0.134033203125, - "learning_rate": 8.721998857364146e-08, - "loss": 0.0204, - "reward": 0.5965401977300644, - "reward_std": 0.12884885352104902, - "rewards/accuracy_reward": 0.11383929220028222, + "grad_norm": 228.46095275878906, + "kl": 6.5, + "learning_rate": 4.360999428682073e-07, + "loss": 0.4317, + "reward": 0.4966518059372902, + "reward_std": 0.18391728959977627, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.482700914144516, + "rewards/tag_count_reward": 0.4073660895228386, "step": 1036 }, { "clip_ratio": 0.0, - "completion_length": 1551.91748046875, + "completion_length": 1454.8192443847656, "epoch": 0.309760286759764, - "grad_norm": 0.19026295840740204, - "kl": 0.132568359375, - "learning_rate": 8.718514514518615e-08, - "loss": 0.0279, - "reward": 0.6188616305589676, - "reward_std": 0.11850288230925798, - "rewards/accuracy_reward": 0.1383928619325161, + "grad_norm": 32.96681213378906, + "kl": 3.765625, + "learning_rate": 4.3592572572593076e-07, + "loss": 0.3312, + "reward": 0.5318080633878708, + "reward_std": 0.19193265959620476, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687649011612, + "rewards/tag_count_reward": 0.4023437723517418, "step": 1037 }, { "clip_ratio": 0.0, - "completion_length": 1525.90185546875, + "completion_length": 1468.8505249023438, "epoch": 0.3100589948472855, - "grad_norm": 0.27082666754722595, - "kl": 0.1318359375, - "learning_rate": 8.715026126293748e-08, - "loss": 0.0637, - "reward": 0.6300223469734192, - "reward_std": 0.1468890905380249, - "rewards/accuracy_reward": 0.15401786379516125, + "grad_norm": 28.990102767944336, + "kl": 3.87109375, + "learning_rate": 4.357513063146874e-07, + "loss": 0.3639, + "reward": 0.5318080559372902, + "reward_std": 0.22349325940012932, + "rewards/accuracy_reward": 0.13169643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4760044813156128, + "rewards/tag_count_reward": 0.400111623108387, "step": 1038 }, { "clip_ratio": 0.0, - "completion_length": 1662.1139221191406, + "completion_length": 1481.2679138183594, "epoch": 0.31035770293480697, - "grad_norm": 0.20841997861862183, - "kl": 0.142333984375, - "learning_rate": 8.711533696484567e-08, - "loss": 0.0304, - "reward": 0.5703125298023224, - "reward_std": 0.14924515783786774, - "rewards/accuracy_reward": 0.10491071827709675, + "grad_norm": 68.97732543945312, + "kl": 2.6484375, + "learning_rate": 4.3557668482422836e-07, + "loss": 0.2548, + "reward": 0.5106027126312256, + "reward_std": 0.19009055942296982, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.4146205559372902, "step": 1039 }, { "clip_ratio": 0.0, - "completion_length": 1560.1563110351562, + "completion_length": 1477.6116638183594, "epoch": 0.31065641102232844, - "grad_norm": 0.21746402978897095, - "kl": 0.131591796875, - "learning_rate": 8.708037228890493e-08, - "loss": 0.0446, - "reward": 0.5870535969734192, - "reward_std": 0.12367326579988003, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 13.56062126159668, + "kl": 5.13671875, + "learning_rate": 4.3540186144452466e-07, + "loss": 0.4411, + "reward": 0.5050223395228386, + "reward_std": 0.1911173053085804, + "rewards/accuracy_reward": 0.10044643632136285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4799107387661934, + "rewards/tag_count_reward": 0.4045759066939354, "step": 1040 }, { "clip_ratio": 0.0, - "completion_length": 1683.4107666015625, + "completion_length": 1573.7277526855469, "epoch": 0.3109551191098499, - "grad_norm": 0.22252409160137177, - "kl": 0.15283203125, - "learning_rate": 8.704536727315341e-08, - "loss": 0.0368, - "reward": 0.5139509215950966, - "reward_std": 0.12993921153247356, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 53.514617919921875, + "kl": 5.609375, + "learning_rate": 4.3522683636576706e-07, + "loss": 0.4143, + "reward": 0.4034598395228386, + "reward_std": 0.1875198446214199, + "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.365513414144516, "step": 1041 }, { "clip_ratio": 0.0, - "completion_length": 1487.7590026855469, + "completion_length": 1413.1362609863281, "epoch": 0.3112538271973714, - "grad_norm": 0.20208729803562164, - "kl": 0.139404296875, - "learning_rate": 8.701032195567312e-08, - "loss": 0.0235, - "reward": 0.5931919738650322, - "reward_std": 0.1058555543422699, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 31.889387130737305, + "kl": 4.9140625, + "learning_rate": 4.3505160977836566e-07, + "loss": 0.402, + "reward": 0.5172991380095482, + "reward_std": 0.18907925486564636, + "rewards/accuracy_reward": 0.11383928870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4838169813156128, + "rewards/tag_count_reward": 0.4034598469734192, "step": 1042 }, { "clip_ratio": 0.0, - "completion_length": 1605.2255554199219, + "completion_length": 1490.4844665527344, "epoch": 0.31155253528489285, - "grad_norm": 0.21188002824783325, - "kl": 0.144775390625, - "learning_rate": 8.697523637458996e-08, - "loss": 0.0479, - "reward": 0.5345982387661934, - "reward_std": 0.1580672264099121, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 9.542855262756348, + "kl": 4.94140625, + "learning_rate": 4.3487618187294983e-07, + "loss": 0.4084, + "reward": 0.4369419813156128, + "reward_std": 0.2156931683421135, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982313156128, + "rewards/tag_count_reward": 0.387834832072258, "step": 1043 }, { "clip_ratio": 0.0, - "completion_length": 1532.8504943847656, + "completion_length": 1484.6384582519531, "epoch": 0.3118512433724143, - "grad_norm": 0.2196653187274933, - "kl": 0.139404296875, - "learning_rate": 8.69401105680736e-08, - "loss": 0.0471, - "reward": 0.552455373108387, - "reward_std": 0.10103965923190117, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 30.693645477294922, + "kl": 5.2421875, + "learning_rate": 4.34700552840368e-07, + "loss": 0.4237, + "reward": 0.4492187723517418, + "reward_std": 0.2037702091038227, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.3822544813156128, "step": 1044 }, { "clip_ratio": 0.0, - "completion_length": 1602.62060546875, + "completion_length": 1456.7255249023438, "epoch": 0.3121499514599358, - "grad_norm": 0.21668173372745514, - "kl": 0.145751953125, - "learning_rate": 8.690494457433743e-08, - "loss": 0.0292, - "reward": 0.522321455180645, - "reward_std": 0.09266739338636398, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 10.81494140625, + "kl": 4.84765625, + "learning_rate": 4.345247228716872e-07, + "loss": 0.4383, + "reward": 0.4492187649011612, + "reward_std": 0.1630881167948246, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143059372902, + "rewards/tag_count_reward": 0.400111623108387, "step": 1045 }, { "clip_ratio": 0.0, - "completion_length": 1676.0156860351562, + "completion_length": 1531.1161499023438, "epoch": 0.31244865954745726, - "grad_norm": 0.20220039784908295, - "kl": 0.148193359375, - "learning_rate": 8.686973843163867e-08, - "loss": 0.0292, - "reward": 0.5580357387661934, - "reward_std": 0.1304870406165719, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 52.00796127319336, + "kl": 3.421875, + "learning_rate": 4.3434869215819334e-07, + "loss": 0.3163, + "reward": 0.4626116380095482, + "reward_std": 0.1817893274128437, + "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4799107313156128, + "rewards/tag_count_reward": 0.3978794738650322, "step": 1046 }, { "clip_ratio": 0.0, - "completion_length": 1694.3214721679688, + "completion_length": 1573.1473999023438, "epoch": 0.31274736763497873, - "grad_norm": 0.23085469007492065, - "kl": 0.14501953125, - "learning_rate": 8.68344921782781e-08, - "loss": 0.0373, - "reward": 0.5820312649011612, - "reward_std": 0.12825932912528515, - "rewards/accuracy_reward": 0.10714286379516125, + "grad_norm": 56.746829986572266, + "kl": 3.2421875, + "learning_rate": 4.341724608913905e-07, + "loss": 0.2804, + "reward": 0.4983259215950966, + "reward_std": 0.191475048661232, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.395647332072258, "step": 1047 }, { "clip_ratio": 0.0, - "completion_length": 1603.3638916015625, + "completion_length": 1457.8148193359375, "epoch": 0.3130460757225002, - "grad_norm": 0.2133127748966217, - "kl": 0.1405029296875, - "learning_rate": 8.679920585260021e-08, - "loss": 0.0364, - "reward": 0.5714285969734192, - "reward_std": 0.12100539728999138, - "rewards/accuracy_reward": 0.08928571734577417, + "grad_norm": 14.838501930236816, + "kl": 4.3515625, + "learning_rate": 4.3399602926300107e-07, + "loss": 0.3507, + "reward": 0.4944196715950966, + "reward_std": 0.1919793225824833, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4821428805589676, + "rewards/tag_count_reward": 0.400669664144516, "step": 1048 }, { "clip_ratio": 0.0, - "completion_length": 1570.5625610351562, + "completion_length": 1465.8259582519531, "epoch": 0.3133447838100217, - "grad_norm": 0.19637030363082886, - "kl": 0.1337890625, - "learning_rate": 8.676387949299306e-08, - "loss": 0.0277, - "reward": 0.561941996216774, - "reward_std": 0.1193447932600975, - "rewards/accuracy_reward": 0.07589286100119352, + "grad_norm": 19.73334312438965, + "kl": 3.8984375, + "learning_rate": 4.3381939746496533e-07, + "loss": 0.3351, + "reward": 0.4938616305589676, + "reward_std": 0.19539830833673477, + "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4860491305589676, + "rewards/tag_count_reward": 0.4135044738650322, "step": 1049 }, { "clip_ratio": 0.0, - "completion_length": 1665.7233276367188, + "completion_length": 1590.7969360351562, "epoch": 0.31364349189754315, - "grad_norm": 0.20334792137145996, - "kl": 0.139892578125, - "learning_rate": 8.672851313788827e-08, - "loss": 0.0345, - "reward": 0.5212053805589676, - "reward_std": 0.11143927648663521, - "rewards/accuracy_reward": 0.05580357206054032, + "grad_norm": 12.36715316772461, + "kl": 4.7734375, + "learning_rate": 4.3364256568944135e-07, + "loss": 0.3699, + "reward": 0.4441964477300644, + "reward_std": 0.19625502079725266, + "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.3861607387661934, "step": 1050 }, { "clip_ratio": 0.0, - "completion_length": 1543.4263916015625, + "completion_length": 1440.2210693359375, "epoch": 0.3139421999850646, - "grad_norm": 0.20278795063495636, - "kl": 0.135498046875, - "learning_rate": 8.669310682576097e-08, - "loss": 0.0428, - "reward": 0.5864955633878708, - "reward_std": 0.10943897813558578, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 7.537537574768066, + "kl": 3.87109375, + "learning_rate": 4.3346553412880483e-07, + "loss": 0.3235, + "reward": 0.5072544813156128, + "reward_std": 0.16967600211501122, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4838169813156128, + "rewards/tag_count_reward": 0.4090401977300644, "step": 1051 }, { "clip_ratio": 0.0, - "completion_length": 1692.7545471191406, + "completion_length": 1629.1272888183594, "epoch": 0.3142409080725861, - "grad_norm": 0.21849234402179718, - "kl": 0.1533203125, - "learning_rate": 8.665766059512976e-08, - "loss": 0.0298, - "reward": 0.539620578289032, - "reward_std": 0.11498196423053741, - "rewards/accuracy_reward": 0.06696429057046771, + "grad_norm": 60.840091705322266, + "kl": 6.2578125, + "learning_rate": 4.332883029756488e-07, + "loss": 0.4206, + "reward": 0.420200914144516, + "reward_std": 0.18977253884077072, + "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.3666294738650322, "step": 1052 }, { "clip_ratio": 0.0, - "completion_length": 1704.0402526855469, + "completion_length": 1557.0335388183594, "epoch": 0.31453961616010756, - "grad_norm": 0.2294243425130844, - "kl": 0.14794921875, - "learning_rate": 8.662217448455665e-08, - "loss": 0.0406, - "reward": 0.6735491454601288, - "reward_std": 0.12353468500077724, - "rewards/accuracy_reward": 0.2098214402794838, + "grad_norm": 33.55450439453125, + "kl": 4.13671875, + "learning_rate": 4.3311087242278324e-07, + "loss": 0.3377, + "reward": 0.5825893133878708, + "reward_std": 0.17647244408726692, + "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.388392873108387, "step": 1053 }, { "clip_ratio": 0.0, - "completion_length": 1708.8929443359375, + "completion_length": 1622.3594360351562, "epoch": 0.31483832424762903, - "grad_norm": 0.20989161729812622, - "kl": 0.140625, - "learning_rate": 8.658664853264706e-08, - "loss": 0.037, - "reward": 0.5446428805589676, - "reward_std": 0.12317505292594433, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 10.01598834991455, + "kl": 5.515625, + "learning_rate": 4.329332426632353e-07, + "loss": 0.4227, + "reward": 0.4592634066939354, + "reward_std": 0.19166633114218712, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4709821715950966, + "rewards/tag_count_reward": 0.3922991305589676, "step": 1054 }, { "clip_ratio": 0.0, - "completion_length": 1567.4107666015625, + "completion_length": 1486.3170471191406, "epoch": 0.31513703233515045, - "grad_norm": 0.20462621748447418, - "kl": 0.134521484375, - "learning_rate": 8.655108277804975e-08, - "loss": 0.0229, - "reward": 0.5000000298023224, - "reward_std": 0.09150703065097332, - "rewards/accuracy_reward": 0.022321428870782256, + "grad_norm": 19.091739654541016, + "kl": 5.171875, + "learning_rate": 4.3275541389024873e-07, + "loss": 0.4047, + "reward": 0.430803582072258, + "reward_std": 0.18778089806437492, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785969734192, + "rewards/tag_count_reward": 0.3973214402794838, "step": 1055 }, { "clip_ratio": 0.0, - "completion_length": 1650.0625915527344, + "completion_length": 1499.7076721191406, "epoch": 0.3154357404226719, - "grad_norm": 0.21526570618152618, - "kl": 0.146240234375, - "learning_rate": 8.651547725945675e-08, - "loss": 0.0428, - "reward": 0.5206473395228386, - "reward_std": 0.1428373847156763, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 28.23240852355957, + "kl": 5.2890625, + "learning_rate": 4.325773862972838e-07, + "loss": 0.4085, + "reward": 0.4296875149011612, + "reward_std": 0.19161390513181686, + "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401902794838, + "rewards/tag_count_reward": 0.3984375149011612, "step": 1056 }, { "clip_ratio": 0.0, - "completion_length": 1587.9732971191406, + "completion_length": 1469.9688110351562, "epoch": 0.3157344485101934, - "grad_norm": 0.21939793229103088, - "kl": 0.14013671875, - "learning_rate": 8.647983201560341e-08, - "loss": 0.033, - "reward": 0.549107164144516, - "reward_std": 0.1225476823747158, - "rewards/accuracy_reward": 0.07366071874275804, + "grad_norm": 54.29639434814453, + "kl": 3.6171875, + "learning_rate": 4.323991600780171e-07, + "loss": 0.3544, + "reward": 0.467075914144516, + "reward_std": 0.17298756167292595, + "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4754464477300644, + "rewards/tag_count_reward": 0.4045759066939354, "step": 1057 }, { "clip_ratio": 0.0, - "completion_length": 1707.8415832519531, + "completion_length": 1573.2656860351562, "epoch": 0.31603315659771486, - "grad_norm": 0.23028230667114258, - "kl": 0.151123046875, - "learning_rate": 8.644414708526823e-08, - "loss": 0.0544, - "reward": 0.4799107313156128, - "reward_std": 0.10070587880909443, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 40.57893753051758, + "kl": 3.8046875, + "learning_rate": 4.322207354263412e-07, + "loss": 0.3293, + "reward": 0.4146205559372902, + "reward_std": 0.16566652804613113, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4012276902794838, "step": 1058 }, { "clip_ratio": 0.0, - "completion_length": 1595.5357666015625, + "completion_length": 1496.7857971191406, "epoch": 0.31633186468523633, - "grad_norm": 0.228091299533844, - "kl": 0.140869140625, - "learning_rate": 8.640842250727296e-08, - "loss": 0.0505, - "reward": 0.6021205708384514, - "reward_std": 0.10154673922806978, - "rewards/accuracy_reward": 0.12053572200238705, + "grad_norm": 37.15719223022461, + "kl": 4.11328125, + "learning_rate": 4.320421125363648e-07, + "loss": 0.3689, + "reward": 0.5167410969734192, + "reward_std": 0.1951860785484314, + "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4815848469734192, + "rewards/tag_count_reward": 0.3939732387661934, "step": 1059 }, { "clip_ratio": 0.0, - "completion_length": 1660.8215026855469, + "completion_length": 1522.79248046875, "epoch": 0.3166305727727578, - "grad_norm": 0.20039041340351105, - "kl": 0.14306640625, - "learning_rate": 8.63726583204824e-08, - "loss": 0.033, - "reward": 0.5184151977300644, - "reward_std": 0.12130638025701046, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 19.989177703857422, + "kl": 4.04296875, + "learning_rate": 4.31863291602412e-07, + "loss": 0.3258, + "reward": 0.4296875298023224, + "reward_std": 0.18419376760721207, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4029018059372902, "step": 1060 }, { "clip_ratio": 0.0, - "completion_length": 1711.2880554199219, + "completion_length": 1619.35498046875, "epoch": 0.31692928086027927, - "grad_norm": 0.24215756356716156, - "kl": 0.14892578125, - "learning_rate": 8.633685456380448e-08, - "loss": 0.0534, - "reward": 0.522879496216774, - "reward_std": 0.130187070928514, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 91.51507568359375, + "kl": 6.6484375, + "learning_rate": 4.316842728190224e-07, + "loss": 0.4395, + "reward": 0.440290205180645, + "reward_std": 0.1580643653869629, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.3911830559372902, "step": 1061 }, { "clip_ratio": 0.0, - "completion_length": 1681.7054443359375, + "completion_length": 1551.9129943847656, "epoch": 0.31722798894780074, - "grad_norm": 0.1994626224040985, - "kl": 0.140869140625, - "learning_rate": 8.630101127619019e-08, - "loss": 0.0371, - "reward": 0.5301339477300644, - "reward_std": 0.09784667752683163, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 91.06034088134766, + "kl": 6.40625, + "learning_rate": 4.31505056380951e-07, + "loss": 0.457, + "reward": 0.4475446715950966, + "reward_std": 0.17632845789194107, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474330373108387, + "rewards/tag_count_reward": 0.391741082072258, "step": 1062 }, { "clip_ratio": 0.0, - "completion_length": 1701.6072387695312, + "completion_length": 1572.274658203125, "epoch": 0.3175266970353222, - "grad_norm": 0.23544426262378693, - "kl": 0.147216796875, - "learning_rate": 8.626512849663352e-08, - "loss": 0.0508, - "reward": 0.5898437723517418, - "reward_std": 0.09754138719290495, - "rewards/accuracy_reward": 0.1205357238650322, + "grad_norm": 85.88459014892578, + "kl": 6.6953125, + "learning_rate": 4.313256424831676e-07, + "loss": 0.4709, + "reward": 0.5011160895228386, + "reward_std": 0.1591567024588585, + "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.3850446566939354, "step": 1063 }, { "clip_ratio": 0.0, - "completion_length": 1599.2835388183594, + "completion_length": 1470.0781860351562, "epoch": 0.3178254051228437, - "grad_norm": 0.26245152950286865, - "kl": 0.144775390625, - "learning_rate": 8.622920626417141e-08, - "loss": 0.0503, - "reward": 0.5552455559372902, - "reward_std": 0.12019930221140385, - "rewards/accuracy_reward": 0.08258929313160479, + "grad_norm": 26.231706619262695, + "kl": 4.125, + "learning_rate": 4.3114603132085703e-07, + "loss": 0.3578, + "reward": 0.486049123108387, + "reward_std": 0.17275992408394814, + "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.407924123108387, "step": 1064 }, { "clip_ratio": 0.0, - "completion_length": 1571.7813415527344, + "completion_length": 1497.01123046875, "epoch": 0.31812411321036516, - "grad_norm": 0.24956545233726501, - "kl": 0.13916015625, - "learning_rate": 8.619324461788373e-08, - "loss": 0.0591, - "reward": 0.5753348469734192, - "reward_std": 0.09457357879728079, - "rewards/accuracy_reward": 0.09151786169968545, + "grad_norm": 9.794968605041504, + "kl": 4.78125, + "learning_rate": 4.3096622308941863e-07, + "loss": 0.3651, + "reward": 0.4726562723517418, + "reward_std": 0.15365530364215374, + "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4838169887661934, + "rewards/tag_count_reward": 0.396763414144516, "step": 1065 }, { "clip_ratio": 0.0, - "completion_length": 1561.52685546875, + "completion_length": 1451.2656860351562, "epoch": 0.3184228212978866, - "grad_norm": 0.220716655254364, - "kl": 0.137451171875, - "learning_rate": 8.615724359689322e-08, - "loss": 0.0439, - "reward": 0.5524553954601288, - "reward_std": 0.14803632721304893, - "rewards/accuracy_reward": 0.07142857369035482, + "grad_norm": 19.662031173706055, + "kl": 4.23046875, + "learning_rate": 4.3078621798446605e-07, + "loss": 0.3343, + "reward": 0.4888393133878708, + "reward_std": 0.22691982984542847, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1066 }, { "clip_ratio": 0.0, - "completion_length": 1563.7478332519531, + "completion_length": 1494.1897888183594, "epoch": 0.3187215293854081, - "grad_norm": 0.22266879677772522, - "kl": 0.144287109375, - "learning_rate": 8.612120324036547e-08, - "loss": 0.0376, - "reward": 0.562500037252903, - "reward_std": 0.07703181356191635, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 47.1057014465332, + "kl": 5.2109375, + "learning_rate": 4.306060162018274e-07, + "loss": 0.3819, + "reward": 0.5022321566939354, + "reward_std": 0.14717273227870464, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4821428805589676, + "rewards/tag_count_reward": 0.4129464477300644, "step": 1067 }, { "clip_ratio": 0.0, - "completion_length": 1669.0625610351562, + "completion_length": 1570.8974304199219, "epoch": 0.31902023747292957, - "grad_norm": 0.23162741959095, - "kl": 0.149169921875, - "learning_rate": 8.608512358750884e-08, - "loss": 0.0237, - "reward": 0.607700914144516, - "reward_std": 0.1461593620479107, - "rewards/accuracy_reward": 0.12946428963914514, + "grad_norm": 27.620433807373047, + "kl": 5.33984375, + "learning_rate": 4.3042561793754425e-07, + "loss": 0.4005, + "reward": 0.4905134215950966, + "reward_std": 0.18510233610868454, + "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478236623108387, + "rewards/tag_count_reward": 0.3856026902794838, "step": 1068 }, { "clip_ratio": 0.0, - "completion_length": 1634.4241638183594, + "completion_length": 1556.3415832519531, "epoch": 0.31931894556045104, - "grad_norm": 0.2210591733455658, - "kl": 0.1453857421875, - "learning_rate": 8.604900467757448e-08, - "loss": 0.048, - "reward": 0.6333705633878708, - "reward_std": 0.20324255153536797, - "rewards/accuracy_reward": 0.15625000931322575, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205484867096, + "grad_norm": 13.912795066833496, + "kl": 4.30859375, + "learning_rate": 4.3024502338787246e-07, + "loss": 0.3389, + "reward": 0.5440848469734192, + "reward_std": 0.2520570158958435, + "rewards/accuracy_reward": 0.1450892947614193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3989955559372902, "step": 1069 }, { "clip_ratio": 0.0, - "completion_length": 1626.2366943359375, + "completion_length": 1454.8170471191406, "epoch": 0.3196176536479725, - "grad_norm": 0.21677260100841522, - "kl": 0.1395263671875, - "learning_rate": 8.601284654985622e-08, - "loss": 0.0298, - "reward": 0.6350446492433548, - "reward_std": 0.10520734824240208, - "rewards/accuracy_reward": 0.1540178656578064, + "grad_norm": 93.93225860595703, + "kl": 2.39453125, + "learning_rate": 4.3006423274928103e-07, + "loss": 0.2753, + "reward": 0.5641741305589676, + "reward_std": 0.19079000689089298, + "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810267984867096, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1070 }, { "clip_ratio": 0.0, - "completion_length": 1608.7389221191406, + "completion_length": 1446.9129943847656, "epoch": 0.319916361735494, - "grad_norm": 0.21813276410102844, - "kl": 0.140380859375, - "learning_rate": 8.597664924369055e-08, - "loss": 0.0331, - "reward": 0.5954241454601288, - "reward_std": 0.11927215196192265, - "rewards/accuracy_reward": 0.1160714328289032, + "grad_norm": 79.51519775390625, + "kl": 2.365234375, + "learning_rate": 4.298832462184527e-07, + "loss": 0.2427, + "reward": 0.5357143133878708, + "reward_std": 0.1665214877575636, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4793526977300644, + "rewards/tag_count_reward": 0.4196428805589676, "step": 1071 }, { "clip_ratio": 0.0, - "completion_length": 1664.5648193359375, + "completion_length": 1529.8661193847656, "epoch": 0.32021506982301545, - "grad_norm": 0.21472123265266418, - "kl": 0.146240234375, - "learning_rate": 8.594041279845659e-08, - "loss": 0.0332, - "reward": 0.5329241305589676, - "reward_std": 0.08927637059241533, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 85.4726791381836, + "kl": 2.80859375, + "learning_rate": 4.297020639922829e-07, + "loss": 0.288, + "reward": 0.4492187798023224, + "reward_std": 0.1789465695619583, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.3956473469734192, "step": 1072 }, { "clip_ratio": 0.0, - "completion_length": 1724.5804138183594, + "completion_length": 1594.0201416015625, "epoch": 0.3205137779105369, - "grad_norm": 0.21259424090385437, - "kl": 0.15380859375, - "learning_rate": 8.590413725357604e-08, - "loss": 0.0383, - "reward": 0.568080373108387, - "reward_std": 0.13309526070952415, - "rewards/accuracy_reward": 0.1026785746216774, + "grad_norm": 103.33338165283203, + "kl": 2.62109375, + "learning_rate": 4.295206862678802e-07, + "loss": 0.2648, + "reward": 0.4771205484867096, + "reward_std": 0.17957113310694695, + "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018133878708, + "rewards/tag_count_reward": 0.392299123108387, "step": 1073 }, { "clip_ratio": 0.0, - "completion_length": 1682.66748046875, + "completion_length": 1518.8371276855469, "epoch": 0.3208124859980584, - "grad_norm": 0.23524077236652374, - "kl": 0.15380859375, - "learning_rate": 8.586782264851314e-08, - "loss": 0.041, - "reward": 0.5217634215950966, - "reward_std": 0.09670024830847979, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 81.19915008544922, + "kl": 2.78515625, + "learning_rate": 4.293391132425657e-07, + "loss": 0.2653, + "reward": 0.470982164144516, + "reward_std": 0.19119353592395782, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562798023224, + "rewards/tag_count_reward": 0.3995535969734192, "step": 1074 }, { "clip_ratio": 0.0, - "completion_length": 1692.26123046875, + "completion_length": 1513.6897888183594, "epoch": 0.32111119408557987, - "grad_norm": 0.24329142272472382, - "kl": 0.1494140625, - "learning_rate": 8.583146902277463e-08, - "loss": 0.0536, - "reward": 0.579241082072258, - "reward_std": 0.09486950375139713, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 75.15235900878906, + "kl": 3.078125, + "learning_rate": 4.291573451138731e-07, + "loss": 0.2963, + "reward": 0.506138414144516, + "reward_std": 0.15147174708545208, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4787946566939354, + "rewards/tag_count_reward": 0.4213169813156128, "step": 1075 }, { "clip_ratio": 0.0, - "completion_length": 1660.1072082519531, + "completion_length": 1552.9599304199219, "epoch": 0.32140990217310134, - "grad_norm": 0.2437535524368286, - "kl": 0.139404296875, - "learning_rate": 8.579507641590969e-08, - "loss": 0.0418, - "reward": 0.5569196566939354, - "reward_std": 0.07268767803907394, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 60.64521026611328, + "kl": 4.14453125, + "learning_rate": 4.289753820795484e-07, + "loss": 0.3601, + "reward": 0.483258955180645, + "reward_std": 0.15666737407445908, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4787946566939354, + "rewards/tag_count_reward": 0.4029018059372902, "step": 1076 }, { "clip_ratio": 0.0, - "completion_length": 1696.1117248535156, + "completion_length": 1544.1942749023438, "epoch": 0.3217086102606228, - "grad_norm": 0.19137516617774963, - "kl": 0.1455078125, - "learning_rate": 8.575864486750989e-08, - "loss": 0.0262, - "reward": 0.5145089626312256, - "reward_std": 0.1096066189929843, - "rewards/accuracy_reward": 0.040178574388846755, + "grad_norm": 29.060739517211914, + "kl": 4.23828125, + "learning_rate": 4.287932243375494e-07, + "loss": 0.2942, + "reward": 0.4436384066939354, + "reward_std": 0.17921943590044975, + "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.407924123108387, "step": 1077 }, { "clip_ratio": 0.0, - "completion_length": 1521.3639221191406, + "completion_length": 1404.9598999023438, "epoch": 0.3220073183481443, - "grad_norm": 0.23278234899044037, - "kl": 0.128662109375, - "learning_rate": 8.57221744172092e-08, - "loss": 0.0546, - "reward": 0.6356027126312256, - "reward_std": 0.16497310996055603, - "rewards/accuracy_reward": 0.1540178619325161, + "grad_norm": 56.80302047729492, + "kl": 3.1171875, + "learning_rate": 4.28610872086046e-07, + "loss": 0.3162, + "reward": 0.5753348469734192, + "reward_std": 0.2275884449481964, + "rewards/accuracy_reward": 0.14285714970901608, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4815848395228386, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1078 }, { "clip_ratio": 0.0, - "completion_length": 1592.8415832519531, + "completion_length": 1480.1183776855469, "epoch": 0.32230602643566575, - "grad_norm": 0.1973426342010498, - "kl": 0.1351318359375, - "learning_rate": 8.568566510468391e-08, - "loss": 0.0184, - "reward": 0.6746652126312256, - "reward_std": 0.13232408929616213, - "rewards/accuracy_reward": 0.1964285783469677, + "grad_norm": 27.2441349029541, + "kl": 3.95703125, + "learning_rate": 4.284283255234195e-07, + "loss": 0.3146, + "reward": 0.5625000298023224, + "reward_std": 0.21537933684885502, + "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478236623108387, + "rewards/tag_count_reward": 0.4151785895228386, "step": 1079 }, { "clip_ratio": 0.0, - "completion_length": 1585.8281860351562, + "completion_length": 1493.6853332519531, "epoch": 0.3226047345231872, - "grad_norm": 0.21832284331321716, - "kl": 0.140869140625, - "learning_rate": 8.564911696965253e-08, - "loss": 0.0436, - "reward": 0.6110491380095482, - "reward_std": 0.11569014005362988, - "rewards/accuracy_reward": 0.13169643771834671, + "grad_norm": 15.151577949523926, + "kl": 4.671875, + "learning_rate": 4.282455848482627e-07, + "loss": 0.3867, + "reward": 0.5351562649011612, + "reward_std": 0.17754266411066055, + "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4793526977300644, + "rewards/tag_count_reward": 0.4101562723517418, "step": 1080 }, { "clip_ratio": 0.0, - "completion_length": 1621.66748046875, + "completion_length": 1497.3348999023438, "epoch": 0.3229034426107087, - "grad_norm": 0.20028424263000488, - "kl": 0.139404296875, - "learning_rate": 8.56125300518759e-08, - "loss": 0.0411, - "reward": 0.5518973544239998, - "reward_std": 0.1325104981660843, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 11.79818344116211, + "kl": 4.625, + "learning_rate": 4.280626502593795e-07, + "loss": 0.38, + "reward": 0.4581473469734192, + "reward_std": 0.16819462552666664, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.4001116305589676, "step": 1081 }, { "clip_ratio": 0.0, - "completion_length": 1619.44873046875, + "completion_length": 1486.8259582519531, "epoch": 0.32320215069823016, - "grad_norm": 0.21080844104290009, - "kl": 0.137939453125, - "learning_rate": 8.557590439115696e-08, - "loss": 0.0553, - "reward": 0.6852678954601288, - "reward_std": 0.15102420933544636, - "rewards/accuracy_reward": 0.20758929569274187, + "grad_norm": 24.63823890686035, + "kl": 5.1796875, + "learning_rate": 4.2787952195578483e-07, + "loss": 0.4114, + "reward": 0.5937500223517418, + "reward_std": 0.21099060028791428, + "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785969734192, + "rewards/tag_count_reward": 0.4129464477300644, "step": 1082 }, { "clip_ratio": 0.0, - "completion_length": 1726.2880249023438, + "completion_length": 1541.3192443847656, "epoch": 0.32350085878575163, - "grad_norm": 0.22042948007583618, - "kl": 0.15234375, - "learning_rate": 8.553924002734088e-08, - "loss": 0.0288, - "reward": 0.5898437723517418, - "reward_std": 0.10861228965222836, - "rewards/accuracy_reward": 0.1250000058207661, + "grad_norm": 10.474635124206543, + "kl": 4.8359375, + "learning_rate": 4.2769620013670437e-07, + "loss": 0.389, + "reward": 0.5401786044239998, + "reward_std": 0.16725080832839012, + "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437798023224, + "rewards/tag_count_reward": 0.4062500223517418, "step": 1083 }, { "clip_ratio": 0.0, - "completion_length": 1670.4264221191406, + "completion_length": 1537.6340026855469, "epoch": 0.3237995668732731, - "grad_norm": 0.24308711290359497, - "kl": 0.13818359375, - "learning_rate": 8.550253700031484e-08, - "loss": 0.044, - "reward": 0.5926339626312256, - "reward_std": 0.14088147319853306, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 25.016260147094727, + "kl": 4.49609375, + "learning_rate": 4.275126850015742e-07, + "loss": 0.328, + "reward": 0.510602705180645, + "reward_std": 0.19061278365552425, + "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4101562723517418, "step": 1084 }, { "clip_ratio": 0.0, - "completion_length": 1617.49560546875, + "completion_length": 1492.9107666015625, "epoch": 0.3240982749607946, - "grad_norm": 0.23145626485347748, - "kl": 0.1416015625, - "learning_rate": 8.546579535000818e-08, - "loss": 0.0471, - "reward": 0.5563616305589676, - "reward_std": 0.12924637831747532, - "rewards/accuracy_reward": 0.07589286286383867, + "grad_norm": 11.41002082824707, + "kl": 4.15234375, + "learning_rate": 4.273289767500409e-07, + "loss": 0.3133, + "reward": 0.5106026977300644, + "reward_std": 0.19239627942442894, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687649011612, + "rewards/tag_count_reward": 0.428013414144516, "step": 1085 }, { "clip_ratio": 0.0, - "completion_length": 1644.4398193359375, + "completion_length": 1530.9085388183594, "epoch": 0.32439698304831605, - "grad_norm": 0.1986616849899292, - "kl": 0.146484375, - "learning_rate": 8.54290151163922e-08, - "loss": 0.0283, - "reward": 0.568638414144516, - "reward_std": 0.1515771709382534, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 20.341691970825195, + "kl": 4.25390625, + "learning_rate": 4.27145075581961e-07, + "loss": 0.3675, + "reward": 0.4799107387661934, + "reward_std": 0.20326539874076843, + "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205633878708, + "rewards/tag_count_reward": 0.4107143059372902, "step": 1086 }, { "clip_ratio": 0.0, - "completion_length": 1665.8371276855469, + "completion_length": 1578.0491638183594, "epoch": 0.3246956911358375, - "grad_norm": 0.23818789422512054, - "kl": 0.140380859375, - "learning_rate": 8.539219633948019e-08, - "loss": 0.0555, - "reward": 0.560825914144516, - "reward_std": 0.1231945101171732, - "rewards/accuracy_reward": 0.08705357275903225, + "grad_norm": 110.20577239990234, + "kl": 6.8203125, + "learning_rate": 4.2696098169740094e-07, + "loss": 0.434, + "reward": 0.474888414144516, + "reward_std": 0.19509606808423996, + "rewards/accuracy_reward": 0.08482143585570157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723469734192, + "rewards/tag_count_reward": 0.3900669738650322, "step": 1087 }, { "clip_ratio": 0.0, - "completion_length": 1594.6652526855469, + "completion_length": 1482.8014221191406, "epoch": 0.324994399223359, - "grad_norm": 0.21889689564704895, - "kl": 0.132568359375, - "learning_rate": 8.535533905932736e-08, - "loss": 0.0449, - "reward": 0.5859375298023224, - "reward_std": 0.13284688163548708, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 58.631919860839844, + "kl": 3.69921875, + "learning_rate": 4.2677669529663686e-07, + "loss": 0.3422, + "reward": 0.538504496216774, + "reward_std": 0.20462054014205933, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.4268973469734192, "step": 1088 }, { "clip_ratio": 0.0, - "completion_length": 1665.33935546875, + "completion_length": 1557.1161193847656, "epoch": 0.32529310731088046, - "grad_norm": 0.2241174429655075, - "kl": 0.147216796875, - "learning_rate": 8.531844331603085e-08, - "loss": 0.0259, - "reward": 0.6668526977300644, - "reward_std": 0.18373204208910465, - "rewards/accuracy_reward": 0.191964291036129, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "grad_norm": 25.2952938079834, + "kl": 4.94140625, + "learning_rate": 4.2659221658015426e-07, + "loss": 0.3885, + "reward": 0.5463170036673546, + "reward_std": 0.23365453258156776, + "rewards/accuracy_reward": 0.15178571827709675, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.3922991305589676, "step": 1089 }, { "clip_ratio": 0.0, - "completion_length": 1727.3438110351562, + "completion_length": 1581.4085388183594, "epoch": 0.32559181539840193, - "grad_norm": 0.22069121897220612, - "kl": 0.1494140625, - "learning_rate": 8.528150914972958e-08, - "loss": 0.0374, - "reward": 0.5022321566939354, - "reward_std": 0.09843061864376068, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 65.10674285888672, + "kl": 5.890625, + "learning_rate": 4.264075457486479e-07, + "loss": 0.3865, + "reward": 0.4330357313156128, + "reward_std": 0.18515907786786556, + "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785895228386, + "rewards/tag_count_reward": 0.4017857313156128, "step": 1090 }, { "clip_ratio": 0.0, - "completion_length": 1638.5759582519531, + "completion_length": 1451.9531555175781, "epoch": 0.3258905234859234, - "grad_norm": 0.2035917490720749, - "kl": 0.148681640625, - "learning_rate": 8.524453660060432e-08, - "loss": 0.023, - "reward": 0.554129496216774, - "reward_std": 0.10034097451716661, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 25.68756103515625, + "kl": 4.87109375, + "learning_rate": 4.262226830030216e-07, + "loss": 0.3932, + "reward": 0.497767873108387, + "reward_std": 0.18824592605233192, + "rewards/accuracy_reward": 0.08035714458674192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4849330559372902, + "rewards/tag_count_reward": 0.4174107313156128, "step": 1091 }, { "clip_ratio": 0.0, - "completion_length": 1619.29248046875, + "completion_length": 1511.1897888183594, "epoch": 0.32618923157344487, - "grad_norm": 0.20635809004306793, - "kl": 0.14306640625, - "learning_rate": 8.520752570887757e-08, - "loss": 0.0465, - "reward": 0.5407366305589676, - "reward_std": 0.10177634842693806, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 49.686153411865234, + "kl": 5.42578125, + "learning_rate": 4.2603762854438786e-07, + "loss": 0.4052, + "reward": 0.466517873108387, + "reward_std": 0.18446096777915955, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.482700914144516, + "rewards/tag_count_reward": 0.4062500223517418, "step": 1092 }, { "clip_ratio": 0.0, - "completion_length": 1670.4286499023438, + "completion_length": 1557.26123046875, "epoch": 0.32648793966096634, - "grad_norm": 0.2021569013595581, - "kl": 0.14501953125, - "learning_rate": 8.517047651481356e-08, - "loss": 0.0305, - "reward": 0.5033482387661934, - "reward_std": 0.12523314356803894, - "rewards/accuracy_reward": 0.03125000209547579, + "grad_norm": 43.23857498168945, + "kl": 4.65234375, + "learning_rate": 4.258523825740678e-07, + "loss": 0.3074, + "reward": 0.4218750149011612, + "reward_std": 0.1619756855070591, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4017857313156128, "step": 1093 }, { "clip_ratio": 0.0, - "completion_length": 1599.0402526855469, + "completion_length": 1458.2455749511719, "epoch": 0.3267866477484878, - "grad_norm": 0.20637178421020508, - "kl": 0.14111328125, - "learning_rate": 8.513338905871818e-08, - "loss": 0.0402, - "reward": 0.651785746216774, - "reward_std": 0.09144028834998608, - "rewards/accuracy_reward": 0.17633929592557251, + "grad_norm": 43.935081481933594, + "kl": 4.125, + "learning_rate": 4.2566694529359095e-07, + "loss": 0.3686, + "reward": 0.5825893208384514, + "reward_std": 0.13865990564227104, + "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.475446455180645, + "rewards/tag_count_reward": 0.4196428880095482, "step": 1094 }, { "clip_ratio": 0.0, - "completion_length": 1661.9777526855469, + "completion_length": 1507.2188110351562, "epoch": 0.3270853558360093, - "grad_norm": 0.23309361934661865, - "kl": 0.1461181640625, - "learning_rate": 8.509626338093894e-08, - "loss": 0.0492, - "reward": 0.5876116305589676, - "reward_std": 0.1228338535875082, - "rewards/accuracy_reward": 0.11383929336443543, + "grad_norm": 35.38380813598633, + "kl": 4.3125, + "learning_rate": 4.254813169046947e-07, + "loss": 0.3804, + "reward": 0.5066964402794838, + "reward_std": 0.17994016036391258, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4017857313156128, "step": 1095 }, { "clip_ratio": 0.0, - "completion_length": 1613.9397888183594, + "completion_length": 1453.6116638183594, "epoch": 0.32738406392353075, - "grad_norm": 0.23873338103294373, - "kl": 0.146728515625, - "learning_rate": 8.505909952186495e-08, - "loss": 0.0439, - "reward": 0.6289062947034836, - "reward_std": 0.17286293022334576, - "rewards/accuracy_reward": 0.15401786006987095, + "grad_norm": 55.5343017578125, + "kl": 2.96875, + "learning_rate": 4.2529549760932474e-07, + "loss": 0.3138, + "reward": 0.6054687649011612, + "reward_std": 0.19688594713807106, + "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1096 }, { "clip_ratio": 0.0, - "completion_length": 1630.6630249023438, + "completion_length": 1479.1005554199219, "epoch": 0.3276827720110522, - "grad_norm": 0.22864045202732086, - "kl": 0.1494140625, - "learning_rate": 8.502189752192684e-08, - "loss": 0.0307, - "reward": 0.5652901902794838, - "reward_std": 0.14119158033281565, - "rewards/accuracy_reward": 0.08482143096625805, + "grad_norm": 56.875335693359375, + "kl": 3.3515625, + "learning_rate": 4.2510948760963425e-07, + "loss": 0.3147, + "reward": 0.4776785969734192, + "reward_std": 0.20819284953176975, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687649011612, + "rewards/tag_count_reward": 0.4174107313156128, "step": 1097 }, { "clip_ratio": 0.0, - "completion_length": 1630.7701721191406, + "completion_length": 1459.97998046875, "epoch": 0.32798148009857364, - "grad_norm": 0.2566523253917694, - "kl": 0.148193359375, - "learning_rate": 8.498465742159674e-08, - "loss": 0.0382, - "reward": 0.6434152126312256, - "reward_std": 0.13856972940266132, - "rewards/accuracy_reward": 0.1718750074505806, + "grad_norm": 36.21147918701172, + "kl": 3.5390625, + "learning_rate": 4.249232871079837e-07, + "loss": 0.303, + "reward": 0.5697544813156128, + "reward_std": 0.18278021924197674, + "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.471540205180645, + "rewards/tag_count_reward": 0.415736623108387, "step": 1098 }, { "clip_ratio": 0.0, - "completion_length": 1752.7501220703125, + "completion_length": 1549.1786193847656, "epoch": 0.3282801881860951, - "grad_norm": 0.20735283195972443, - "kl": 0.15087890625, - "learning_rate": 8.494737926138824e-08, - "loss": 0.0326, - "reward": 0.5597098469734192, - "reward_std": 0.13127627968788147, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 30.499515533447266, + "kl": 4.00390625, + "learning_rate": 4.2473689630694123e-07, + "loss": 0.3306, + "reward": 0.4972098469734192, + "reward_std": 0.19302776083350182, + "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1099 }, { "clip_ratio": 0.0, - "completion_length": 1662.2254943847656, + "completion_length": 1534.2456359863281, "epoch": 0.3285788962736166, - "grad_norm": 0.22211818397045135, - "kl": 0.14013671875, - "learning_rate": 8.49100630818563e-08, - "loss": 0.0204, - "reward": 0.6082589477300644, - "reward_std": 0.13468307629227638, - "rewards/accuracy_reward": 0.12723214668221772, + "grad_norm": 11.760320663452148, + "kl": 4.4140625, + "learning_rate": 4.245503154092815e-07, + "loss": 0.348, + "reward": 0.5000000149011612, + "reward_std": 0.19340495020151138, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810267984867096, + "rewards/tag_count_reward": 0.4129464477300644, "step": 1100 }, { "clip_ratio": 0.0, - "completion_length": 1720.57373046875, + "completion_length": 1579.08935546875, "epoch": 0.32887760436113805, - "grad_norm": 0.20824746787548065, - "kl": 0.147216796875, - "learning_rate": 8.487270892359728e-08, - "loss": 0.033, - "reward": 0.5831473395228386, - "reward_std": 0.09726918023079634, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 26.37672233581543, + "kl": 5.640625, + "learning_rate": 4.243635446179864e-07, + "loss": 0.4297, + "reward": 0.5078125298023224, + "reward_std": 0.17703606933355331, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4782366305589676, + "rewards/tag_count_reward": 0.4051339477300644, "step": 1101 }, { "clip_ratio": 0.0, - "completion_length": 1702.62060546875, + "completion_length": 1499.8103332519531, "epoch": 0.3291763124486595, - "grad_norm": 0.22491803765296936, - "kl": 0.14111328125, - "learning_rate": 8.483531682724886e-08, - "loss": 0.0419, - "reward": 0.5401786044239998, - "reward_std": 0.11980560887604952, - "rewards/accuracy_reward": 0.0691964328289032, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4709821715950966, + "grad_norm": 28.15887451171875, + "kl": 5.05859375, + "learning_rate": 4.241765841362443e-07, + "loss": 0.3777, + "reward": 0.4782366305589676, + "reward_std": 0.1633016001433134, + "rewards/accuracy_reward": 0.06026786006987095, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.4157366305589676, "step": 1102 }, { "clip_ratio": 0.0, - "completion_length": 1737.3661499023438, + "completion_length": 1543.3371276855469, "epoch": 0.329475020536181, - "grad_norm": 0.19051985442638397, - "kl": 0.152587890625, - "learning_rate": 8.479788683348994e-08, - "loss": 0.0291, - "reward": 0.5267857387661934, - "reward_std": 0.09877963736653328, - "rewards/accuracy_reward": 0.06250000116415322, + "grad_norm": 47.15367126464844, + "kl": 5.3359375, + "learning_rate": 4.239894341674497e-07, + "loss": 0.3869, + "reward": 0.4726562798023224, + "reward_std": 0.17321716621518135, + "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4056919813156128, "step": 1103 }, { "clip_ratio": 0.0, - "completion_length": 1664.9465026855469, + "completion_length": 1500.0000610351562, "epoch": 0.32977372862370247, - "grad_norm": 0.21242402493953705, - "kl": 0.1484375, - "learning_rate": 8.476041898304072e-08, - "loss": 0.0392, - "reward": 0.5675223395228386, - "reward_std": 0.10538461431860924, - "rewards/accuracy_reward": 0.08482143585570157, + "grad_norm": 41.22899627685547, + "kl": 3.56640625, + "learning_rate": 4.238020949152036e-07, + "loss": 0.3066, + "reward": 0.4871651902794838, + "reward_std": 0.16500382870435715, + "rewards/accuracy_reward": 0.0736607147846371, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.482700914144516, + "rewards/tag_count_reward": 0.4135044813156128, "step": 1104 }, { "clip_ratio": 0.0, - "completion_length": 1666.0313415527344, + "completion_length": 1533.9263916015625, "epoch": 0.33007243671122394, - "grad_norm": 0.2245297133922577, - "kl": 0.1474609375, - "learning_rate": 8.472291331666253e-08, - "loss": 0.0296, - "reward": 0.6160714626312256, - "reward_std": 0.10494551435112953, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 38.69378662109375, + "kl": 4.35546875, + "learning_rate": 4.2361456658331264e-07, + "loss": 0.3985, + "reward": 0.5396205633878708, + "reward_std": 0.1611441820859909, + "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.475446455180645, + "rewards/tag_count_reward": 0.4079241305589676, "step": 1105 }, { "clip_ratio": 0.0, - "completion_length": 1615.9018249511719, - "epoch": 0.3303711447987454, - "grad_norm": 0.24678324162960052, - "kl": 0.14697265625, - "learning_rate": 8.468536987515786e-08, - "loss": 0.0477, - "reward": 0.5943080633878708, - "reward_std": 0.10437554400414228, - "rewards/accuracy_reward": 0.12500000861473382, + "completion_length": 1478.3281860351562, + "epoch": 0.3303711447987454, + "grad_norm": 53.9987907409668, + "kl": 5.9375, + "learning_rate": 4.234268493757893e-07, + "loss": 0.4773, + "reward": 0.5251116380095482, + "reward_std": 0.15964259207248688, + "rewards/accuracy_reward": 0.12500000977888703, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4001116305589676, "step": 1106 }, { "clip_ratio": 0.0, - "completion_length": 1631.1116943359375, + "completion_length": 1437.2969360351562, "epoch": 0.3306698528862669, - "grad_norm": 0.23869720101356506, - "kl": 0.14892578125, - "learning_rate": 8.46477886993703e-08, - "loss": 0.0491, - "reward": 0.647879496216774, - "reward_std": 0.10885660164058208, - "rewards/accuracy_reward": 0.16741072246804833, + "grad_norm": 19.226041793823242, + "kl": 5.1484375, + "learning_rate": 4.2323894349685153e-07, + "loss": 0.458, + "reward": 0.5731026977300644, + "reward_std": 0.1622866876423359, + "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687723517418, + "rewards/tag_count_reward": 0.4146205559372902, "step": 1107 }, { "clip_ratio": 0.0, - "completion_length": 1574.0090026855469, + "completion_length": 1406.2299499511719, "epoch": 0.33096856097378835, - "grad_norm": 0.22231705486774445, - "kl": 0.139404296875, - "learning_rate": 8.461016983018449e-08, - "loss": 0.0449, - "reward": 0.6222098618745804, - "reward_std": 0.12327926699072123, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 50.84612274169922, + "kl": 5.609375, + "learning_rate": 4.2305084915092245e-07, + "loss": 0.4667, + "reward": 0.545200914144516, + "reward_std": 0.18520469777286053, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4815848395228386, + "rewards/tag_count_reward": 0.4179687649011612, "step": 1108 }, { "clip_ratio": 0.0, - "completion_length": 1667.7679443359375, + "completion_length": 1539.8728637695312, "epoch": 0.3312672690613098, - "grad_norm": 0.21051110327243805, - "kl": 0.143798828125, - "learning_rate": 8.457251330852607e-08, - "loss": 0.0249, - "reward": 0.6428571715950966, - "reward_std": 0.10460910946130753, - "rewards/accuracy_reward": 0.16741071757860482, + "grad_norm": 63.42246627807617, + "kl": 5.91796875, + "learning_rate": 4.2286256654263033e-07, + "loss": 0.4136, + "reward": 0.5758928805589676, + "reward_std": 0.15752670541405678, + "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4754464477300644, + "rewards/tag_count_reward": 0.4107142984867096, "step": 1109 }, { "clip_ratio": 0.0, - "completion_length": 1683.0514221191406, + "completion_length": 1523.2589721679688, "epoch": 0.3315659771488313, - "grad_norm": 0.21823646128177643, - "kl": 0.148681640625, - "learning_rate": 8.453481917536162e-08, - "loss": 0.045, - "reward": 0.532924123108387, - "reward_std": 0.15896953456103802, - "rewards/accuracy_reward": 0.06250000093132257, + "grad_norm": 28.401241302490234, + "kl": 5.0859375, + "learning_rate": 4.226740958768081e-07, + "loss": 0.3984, + "reward": 0.472098246216774, + "reward_std": 0.21684274449944496, + "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470424123108387, + "rewards/tag_count_reward": 0.4051339402794838, "step": 1110 }, { "clip_ratio": 0.0, - "completion_length": 1740.4465026855469, + "completion_length": 1555.7077026367188, "epoch": 0.33186468523635276, - "grad_norm": 0.2278495579957962, - "kl": 0.1494140625, - "learning_rate": 8.449708747169868e-08, - "loss": 0.0291, - "reward": 0.5987723395228386, - "reward_std": 0.1092462558299303, - "rewards/accuracy_reward": 0.12946429220028222, + "grad_norm": 6.435114860534668, + "kl": 4.953125, + "learning_rate": 4.224854373584934e-07, + "loss": 0.3701, + "reward": 0.5385044887661934, + "reward_std": 0.17384998872876167, + "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080633878708, + "rewards/tag_count_reward": 0.4068080559372902, "step": 1111 }, { "clip_ratio": 0.0, - "completion_length": 1619.5022888183594, + "completion_length": 1518.6920471191406, "epoch": 0.33216339332387423, - "grad_norm": 0.21619398891925812, - "kl": 0.1427001953125, - "learning_rate": 8.445931823858567e-08, - "loss": 0.0232, - "reward": 0.6707589477300644, - "reward_std": 0.17801100760698318, - "rewards/accuracy_reward": 0.18526786379516125, + "grad_norm": 24.98603057861328, + "kl": 4.96875, + "learning_rate": 4.222965911929283e-07, + "loss": 0.3725, + "reward": 0.569754496216774, + "reward_std": 0.23836610466241837, + "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4854910895228386, + "rewards/tag_count_reward": 0.3978794813156128, "step": 1112 }, { "clip_ratio": 0.0, - "completion_length": 1731.0915832519531, + "completion_length": 1557.9129943847656, "epoch": 0.3324621014113957, - "grad_norm": 0.22793787717819214, - "kl": 0.156982421875, - "learning_rate": 8.442151151711179e-08, - "loss": 0.0315, - "reward": 0.517857164144516, - "reward_std": 0.10013702604919672, - "rewards/accuracy_reward": 0.05133928800933063, + "grad_norm": 27.3829402923584, + "kl": 5.4921875, + "learning_rate": 4.2210755758555895e-07, + "loss": 0.4142, + "reward": 0.455357164144516, + "reward_std": 0.18140070885419846, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.3973214477300644, "step": 1113 }, { "clip_ratio": 0.0, - "completion_length": 1619.4263916015625, + "completion_length": 1478.5938110351562, "epoch": 0.3327608094989172, - "grad_norm": 0.21558085083961487, - "kl": 0.144287109375, - "learning_rate": 8.438366734840702e-08, - "loss": 0.0369, - "reward": 0.4972098544239998, - "reward_std": 0.11770172975957394, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 76.59188079833984, + "kl": 3.6328125, + "learning_rate": 4.219183367420351e-07, + "loss": 0.3486, + "reward": 0.4296875149011612, + "reward_std": 0.1655359622091055, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4073660895228386, "step": 1114 }, { "clip_ratio": 0.0, - "completion_length": 1690.4911499023438, + "completion_length": 1502.5804443359375, "epoch": 0.33305951758643865, - "grad_norm": 0.214212104678154, - "kl": 0.146728515625, - "learning_rate": 8.434578577364217e-08, - "loss": 0.0378, - "reward": 0.6199777126312256, - "reward_std": 0.1468144915997982, - "rewards/accuracy_reward": 0.14062500488944352, + "grad_norm": 40.41933059692383, + "kl": 3.53515625, + "learning_rate": 4.2172892886821087e-07, + "loss": 0.3171, + "reward": 0.5390625223517418, + "reward_std": 0.2093126941472292, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.479352705180645, + "rewards/tag_count_reward": 0.4118303805589676, "step": 1115 }, { "clip_ratio": 0.0, - "completion_length": 1644.2746276855469, + "completion_length": 1447.3014221191406, "epoch": 0.3333582256739601, - "grad_norm": 0.1925843358039856, - "kl": 0.142578125, - "learning_rate": 8.430786683402864e-08, - "loss": 0.0247, - "reward": 0.6930803954601288, - "reward_std": 0.15604027546942234, - "rewards/accuracy_reward": 0.2165178693830967, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4765625223517418, + "grad_norm": 66.87886047363281, + "kl": 3.01171875, + "learning_rate": 4.215393341701432e-07, + "loss": 0.2825, + "reward": 0.6065848469734192, + "reward_std": 0.19747721403837204, + "rewards/accuracy_reward": 0.1852678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4213169738650322, "step": 1116 }, { "clip_ratio": 0.0, - "completion_length": 1656.6630249023438, + "completion_length": 1484.9085388183594, "epoch": 0.3336569337614816, - "grad_norm": 0.21702232956886292, - "kl": 0.13525390625, - "learning_rate": 8.426991057081854e-08, - "loss": 0.0412, - "reward": 0.4949776977300644, - "reward_std": 0.10233835875988007, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 48.29243850708008, + "kl": 3.4921875, + "learning_rate": 4.2134955285409266e-07, + "loss": 0.3337, + "reward": 0.435267873108387, + "reward_std": 0.16089589893817902, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.4129464477300644, "step": 1117 }, { "clip_ratio": 0.0, - "completion_length": 1586.4598999023438, + "completion_length": 1443.3371276855469, "epoch": 0.33395564184900306, - "grad_norm": 0.20403294265270233, - "kl": 0.1329345703125, - "learning_rate": 8.423191702530453e-08, - "loss": 0.0228, - "reward": 0.5987723469734192, - "reward_std": 0.15261717326939106, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 33.04225158691406, + "kl": 3.8203125, + "learning_rate": 4.211595851265226e-07, + "loss": 0.3577, + "reward": 0.5089286044239998, + "reward_std": 0.17562073655426502, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4782366305589676, + "rewards/tag_count_reward": 0.4062500223517418, "step": 1118 }, { "clip_ratio": 0.0, - "completion_length": 1703.540283203125, + "completion_length": 1464.3550109863281, "epoch": 0.33425434993652453, - "grad_norm": 0.21101397275924683, - "kl": 0.14306640625, - "learning_rate": 8.419388623881988e-08, - "loss": 0.0364, - "reward": 0.5217634066939354, - "reward_std": 0.09090955089777708, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 55.92015838623047, + "kl": 3.90234375, + "learning_rate": 4.2096943119409944e-07, + "loss": 0.354, + "reward": 0.470982164144516, + "reward_std": 0.15139034762978554, + "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.419642873108387, "step": 1119 }, { "clip_ratio": 0.0, - "completion_length": 1774.3327026367188, + "completion_length": 1537.2098693847656, "epoch": 0.334553058024046, - "grad_norm": 0.21980011463165283, - "kl": 0.155029296875, - "learning_rate": 8.415581825273838e-08, - "loss": 0.0325, - "reward": 0.5245536044239998, - "reward_std": 0.10036454908549786, - "rewards/accuracy_reward": 0.05133928684517741, + "grad_norm": 55.06420135498047, + "kl": 6.5, + "learning_rate": 4.207790912636919e-07, + "loss": 0.4892, + "reward": 0.4419643059372902, + "reward_std": 0.16076616197824478, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143059372902, + "rewards/tag_count_reward": 0.3973214477300644, "step": 1120 }, { "clip_ratio": 0.0, - "completion_length": 1732.1451721191406, + "completion_length": 1429.3639221191406, "epoch": 0.3348517661115675, - "grad_norm": 0.2317965030670166, - "kl": 0.1533203125, - "learning_rate": 8.411771310847425e-08, - "loss": 0.0308, - "reward": 0.5262276977300644, - "reward_std": 0.0971129173412919, + "grad_norm": 12.442398071289062, + "kl": 4.58984375, + "learning_rate": 4.205885655423712e-07, + "loss": 0.3962, + "reward": 0.4765625149011612, + "reward_std": 0.154171384871006, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.4252232313156128, "step": 1121 }, { "clip_ratio": 0.0, - "completion_length": 1678.7344360351562, + "completion_length": 1464.7656555175781, "epoch": 0.33515047419908894, - "grad_norm": 0.2135549932718277, - "kl": 0.147705078125, - "learning_rate": 8.407957084748214e-08, - "loss": 0.0464, - "reward": 0.5189732313156128, - "reward_std": 0.10632148012518883, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 27.795188903808594, + "kl": 4.36328125, + "learning_rate": 4.203978542374107e-07, + "loss": 0.3934, + "reward": 0.4771205633878708, + "reward_std": 0.15596584975719452, + "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4765625149011612, + "rewards/tag_count_reward": 0.4235491305589676, "step": 1122 }, { "clip_ratio": 0.0, - "completion_length": 1657.47998046875, + "completion_length": 1507.69873046875, "epoch": 0.3354491822866104, - "grad_norm": 0.25725018978118896, - "kl": 0.1417236328125, - "learning_rate": 8.40413915112571e-08, - "loss": 0.0382, - "reward": 0.5407366305589676, - "reward_std": 0.10992639511823654, - "rewards/accuracy_reward": 0.06919643143191934, + "grad_norm": 20.383512496948242, + "kl": 5.375, + "learning_rate": 4.2020695755628555e-07, + "loss": 0.4356, + "reward": 0.4587053805589676, + "reward_std": 0.17846173979341984, + "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.471540205180645, + "rewards/tag_count_reward": 0.4029018059372902, "step": 1123 }, { "clip_ratio": 0.0, - "completion_length": 1652.8996276855469, + "completion_length": 1483.3259582519531, "epoch": 0.3357478903741319, - "grad_norm": 0.2193685621023178, - "kl": 0.142333984375, - "learning_rate": 8.400317514133454e-08, - "loss": 0.0204, - "reward": 0.5541294887661934, - "reward_std": 0.10619634203612804, - "rewards/accuracy_reward": 0.08705357415601611, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "grad_norm": 15.075225830078125, + "kl": 5.125, + "learning_rate": 4.200158757066727e-07, + "loss": 0.4314, + "reward": 0.5027901977300644, + "reward_std": 0.15048566833138466, + "rewards/accuracy_reward": 0.09598214481957257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4068080484867096, "step": 1124 }, { "clip_ratio": 0.0, - "completion_length": 1583.01123046875, + "completion_length": 1404.8460388183594, "epoch": 0.33604659846165336, - "grad_norm": 0.24071407318115234, - "kl": 0.1319580078125, - "learning_rate": 8.396492177929007e-08, - "loss": 0.0373, - "reward": 0.5541294887661934, - "reward_std": 0.14504565298557281, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 36.89778137207031, + "kl": 3.1484375, + "learning_rate": 4.198246088964504e-07, + "loss": 0.2715, + "reward": 0.5167410969734192, + "reward_std": 0.16084223426878452, + "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4760044887661934, + "rewards/tag_count_reward": 0.431919664144516, "step": 1125 }, { "clip_ratio": 0.0, - "completion_length": 1679.0670776367188, + "completion_length": 1464.2679138183594, "epoch": 0.3363453065491748, - "grad_norm": 0.21426263451576233, - "kl": 0.13671875, - "learning_rate": 8.392663146673964e-08, - "loss": 0.0327, - "reward": 0.577566996216774, - "reward_std": 0.13962592370808125, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 50.02904510498047, + "kl": 3.58984375, + "learning_rate": 4.196331573336982e-07, + "loss": 0.3321, + "reward": 0.5061384066939354, + "reward_std": 0.18709429539740086, + "rewards/accuracy_reward": 0.0870535729918629, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4815848395228386, + "rewards/tag_count_reward": 0.4190848395228386, "step": 1126 }, { "clip_ratio": 0.0, - "completion_length": 1733.0915832519531, + "completion_length": 1525.8572082519531, "epoch": 0.3366440146366963, - "grad_norm": 0.22753338515758514, - "kl": 0.1474609375, - "learning_rate": 8.388830424533934e-08, - "loss": 0.0314, - "reward": 0.5329241305589676, - "reward_std": 0.11375010944902897, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 28.44256019592285, + "kl": 5.03125, + "learning_rate": 4.194415212266967e-07, + "loss": 0.4236, + "reward": 0.4190848469734192, + "reward_std": 0.17601820826530457, + "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4793526977300644, + "rewards/tag_count_reward": 0.3989955559372902, "step": 1127 }, { "clip_ratio": 0.0, - "completion_length": 1660.8460388183594, + "completion_length": 1533.6295471191406, "epoch": 0.33694272272421777, - "grad_norm": 0.2224162071943283, - "kl": 0.134765625, - "learning_rate": 8.384994015678541e-08, - "loss": 0.0332, - "reward": 0.5569196790456772, - "reward_std": 0.10236164554953575, - "rewards/accuracy_reward": 0.08482143026776612, + "grad_norm": 29.089014053344727, + "kl": 5.36328125, + "learning_rate": 4.1924970078392706e-07, + "loss": 0.4133, + "reward": 0.4854911044239998, + "reward_std": 0.18045857921242714, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4073660969734192, "step": 1128 }, { "clip_ratio": 0.0, - "completion_length": 1724.2991943359375, + "completion_length": 1517.16748046875, "epoch": 0.33724143081173924, - "grad_norm": 0.21278256177902222, - "kl": 0.142333984375, - "learning_rate": 8.381153924281428e-08, - "loss": 0.0314, - "reward": 0.5401785969734192, - "reward_std": 0.12124043144285679, - "rewards/accuracy_reward": 0.07589286169968545, + "grad_norm": 35.516754150390625, + "kl": 5.1484375, + "learning_rate": 4.190576962140714e-07, + "loss": 0.3678, + "reward": 0.4799107387661934, + "reward_std": 0.15518346428871155, + "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4107143059372902, "step": 1129 }, { "clip_ratio": 0.0, - "completion_length": 1695.6027221679688, + "completion_length": 1480.3572082519531, "epoch": 0.3375401388992607, - "grad_norm": 0.22428056597709656, - "kl": 0.143310546875, - "learning_rate": 8.377310154520231e-08, - "loss": 0.0083, - "reward": 0.5619419813156128, - "reward_std": 0.13951344974339008, - "rewards/accuracy_reward": 0.08928571990691125, + "grad_norm": 18.417436599731445, + "kl": 4.83203125, + "learning_rate": 4.188655077260115e-07, + "loss": 0.3841, + "reward": 0.4877232313156128, + "reward_std": 0.17442062497138977, + "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1130 }, { "clip_ratio": 0.0, - "completion_length": 1766.6943054199219, + "completion_length": 1562.82373046875, "epoch": 0.3378388469867822, - "grad_norm": 0.23809663951396942, - "kl": 0.148681640625, - "learning_rate": 8.373462710576597e-08, - "loss": 0.0321, - "reward": 0.568080373108387, - "reward_std": 0.15607344917953014, - "rewards/accuracy_reward": 0.09375000419095159, + "grad_norm": 25.589128494262695, + "kl": 5.7734375, + "learning_rate": 4.186731355288299e-07, + "loss": 0.4549, + "reward": 0.4609375223517418, + "reward_std": 0.21183444187045097, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.3917410895228386, "step": 1131 }, { "clip_ratio": 0.0, - "completion_length": 1644.9331359863281, + "completion_length": 1411.1473999023438, "epoch": 0.33813755507430365, - "grad_norm": 0.2094835489988327, - "kl": 0.1337890625, - "learning_rate": 8.369611596636172e-08, - "loss": 0.0299, - "reward": 0.5011160895228386, - "reward_std": 0.08479496650397778, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 44.12651062011719, + "kl": 3.7109375, + "learning_rate": 4.184805798318086e-07, + "loss": 0.3563, + "reward": 0.450892873108387, + "reward_std": 0.1629181131720543, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4832589477300644, + "rewards/tag_count_reward": 0.428571455180645, "step": 1132 }, { "clip_ratio": 0.0, - "completion_length": 1652.6652221679688, + "completion_length": 1453.66748046875, "epoch": 0.3384362631618251, - "grad_norm": 0.23192130029201508, - "kl": 0.135009765625, - "learning_rate": 8.365756816888585e-08, - "loss": 0.0372, - "reward": 0.6367187649011612, - "reward_std": 0.12280203960835934, - "rewards/accuracy_reward": 0.1629464328289032, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "grad_norm": 51.030975341796875, + "kl": 4.0078125, + "learning_rate": 4.1828784084442924e-07, + "loss": 0.3604, + "reward": 0.5641741305589676, + "reward_std": 0.1582561917603016, + "rewards/accuracy_reward": 0.1428571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4213169887661934, "step": 1133 }, { "clip_ratio": 0.0, - "completion_length": 1718.3973693847656, + "completion_length": 1491.3281860351562, "epoch": 0.3387349712493466, - "grad_norm": 0.23400430381298065, - "kl": 0.146484375, - "learning_rate": 8.361898375527462e-08, - "loss": 0.0403, - "reward": 0.4960937798023224, - "reward_std": 0.12204671744257212, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 29.74655532836914, + "kl": 4.03125, + "learning_rate": 4.180949187763731e-07, + "loss": 0.3415, + "reward": 0.431919664144516, + "reward_std": 0.18150590732693672, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401902794838, + "rewards/tag_count_reward": 0.4073660969734192, "step": 1134 }, { "clip_ratio": 0.0, - "completion_length": 1686.8371276855469, + "completion_length": 1490.7009582519531, "epoch": 0.33903367933686807, - "grad_norm": 0.22563691437244415, - "kl": 0.145263671875, - "learning_rate": 8.358036276750406e-08, - "loss": 0.0286, - "reward": 0.5758928805589676, - "reward_std": 0.09978851303458214, - "rewards/accuracy_reward": 0.09598214854486287, + "grad_norm": 42.432926177978516, + "kl": 4.43359375, + "learning_rate": 4.179018138375203e-07, + "loss": 0.3659, + "reward": 0.4972098395228386, + "reward_std": 0.18259546533226967, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4799107387661934, + "rewards/tag_count_reward": 0.4146205633878708, "step": 1135 }, { "clip_ratio": 0.0, - "completion_length": 1633.2947082519531, + "completion_length": 1412.4576416015625, "epoch": 0.33933238742438954, - "grad_norm": 0.20661962032318115, - "kl": 0.13623046875, - "learning_rate": 8.354170524759007e-08, - "loss": 0.0233, - "reward": 0.5424107313156128, - "reward_std": 0.15225882083177567, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 38.338382720947266, + "kl": 3.7578125, + "learning_rate": 4.1770852623795035e-07, + "loss": 0.3601, + "reward": 0.4704241305589676, + "reward_std": 0.17673451639711857, + "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143133878708, + "rewards/tag_count_reward": 0.4190848395228386, "step": 1136 }, { "clip_ratio": 0.0, - "completion_length": 1698.4732666015625, + "completion_length": 1443.5335388183594, "epoch": 0.339631095511911, - "grad_norm": 0.26816004514694214, - "kl": 0.144775390625, - "learning_rate": 8.350301123758818e-08, - "loss": 0.0395, - "reward": 0.5156250223517418, - "reward_std": 0.09512320253998041, + "grad_norm": 40.047447204589844, + "kl": 3.453125, + "learning_rate": 4.175150561879409e-07, + "loss": 0.3142, + "reward": 0.4776785895228386, + "reward_std": 0.14084906689822674, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1137 }, { "clip_ratio": 0.0, - "completion_length": 1763.0469665527344, + "completion_length": 1521.6295166015625, "epoch": 0.3399298035994325, - "grad_norm": 0.22816461324691772, - "kl": 0.144287109375, - "learning_rate": 8.346428077959373e-08, - "loss": 0.0358, - "reward": 0.4994419887661934, - "reward_std": 0.14455678686499596, - "rewards/accuracy_reward": 0.03348214412108064, + "grad_norm": 22.019664764404297, + "kl": 5.01171875, + "learning_rate": 4.173214038979687e-07, + "loss": 0.4024, + "reward": 0.4347098395228386, + "reward_std": 0.16105489805340767, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598469734192, + "rewards/tag_count_reward": 0.4190848395228386, "step": 1138 }, { "clip_ratio": 0.0, - "completion_length": 1698.5715026855469, + "completion_length": 1450.7032165527344, "epoch": 0.34022851168695395, - "grad_norm": 0.21697375178337097, - "kl": 0.1422119140625, - "learning_rate": 8.342551391574164e-08, - "loss": 0.0304, - "reward": 0.537946455180645, - "reward_std": 0.10759567283093929, - "rewards/accuracy_reward": 0.0669642873108387, + "grad_norm": 22.877798080444336, + "kl": 4.78125, + "learning_rate": 4.171275695787082e-07, + "loss": 0.4292, + "reward": 0.4966518059372902, + "reward_std": 0.16517476364970207, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1139 }, { "clip_ratio": 0.0, - "completion_length": 1764.7857971191406, + "completion_length": 1540.3728332519531, "epoch": 0.3405272197744754, - "grad_norm": 0.23597422242164612, - "kl": 0.14208984375, - "learning_rate": 8.338671068820646e-08, - "loss": 0.0367, - "reward": 0.5625000298023224, - "reward_std": 0.13202224858105183, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 65.56777954101562, + "kl": 5.859375, + "learning_rate": 4.169335534410323e-07, + "loss": 0.4206, + "reward": 0.4877232313156128, + "reward_std": 0.18596554920077324, + "rewards/accuracy_reward": 0.07812500093132257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4095982387661934, "step": 1140 }, { "clip_ratio": 0.0, - "completion_length": 1716.8616638183594, + "completion_length": 1456.4286193847656, "epoch": 0.3408259278619969, - "grad_norm": 0.2070176750421524, - "kl": 0.14111328125, - "learning_rate": 8.33478711392023e-08, - "loss": 0.0356, - "reward": 0.6261160969734192, - "reward_std": 0.2087925523519516, - "rewards/accuracy_reward": 0.1607142947614193, + "grad_norm": 76.34263610839844, + "kl": 5.91796875, + "learning_rate": 4.167393556960115e-07, + "loss": 0.4556, + "reward": 0.5585937947034836, + "reward_std": 0.21691688522696495, + "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.420200914144516, "step": 1141 }, { "clip_ratio": 0.0, - "completion_length": 1689.1585388183594, + "completion_length": 1431.2723999023438, "epoch": 0.3411246359495183, - "grad_norm": 0.22548271715641022, - "kl": 0.13330078125, - "learning_rate": 8.330899531098279e-08, - "loss": 0.0375, - "reward": 0.4899553805589676, - "reward_std": 0.09248613379895687, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 14.116023063659668, + "kl": 4.3125, + "learning_rate": 4.1654497655491394e-07, + "loss": 0.3793, + "reward": 0.4425223395228386, + "reward_std": 0.13572319224476814, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1142 }, { "clip_ratio": 0.0, - "completion_length": 1680.0693054199219, + "completion_length": 1477.3147888183594, "epoch": 0.3414233440370398, - "grad_norm": 0.20976100862026215, - "kl": 0.135498046875, - "learning_rate": 8.3270083245841e-08, - "loss": 0.0254, - "reward": 0.4927455484867096, - "reward_std": 0.106557821854949, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 52.86299514770508, + "kl": 5.0078125, + "learning_rate": 4.16350416229205e-07, + "loss": 0.3734, + "reward": 0.4497768059372902, + "reward_std": 0.1685454361140728, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4185268133878708, "step": 1143 }, { "clip_ratio": 0.0, - "completion_length": 1639.1451416015625, + "completion_length": 1418.93310546875, "epoch": 0.34172205212456125, - "grad_norm": 0.208227276802063, - "kl": 0.1309814453125, - "learning_rate": 8.323113498610948e-08, - "loss": 0.0222, - "reward": 0.507254496216774, - "reward_std": 0.12993598356842995, - "rewards/accuracy_reward": 0.03348214388824999, + "grad_norm": 23.434572219848633, + "kl": 4.0703125, + "learning_rate": 4.161556749305474e-07, + "loss": 0.3248, + "reward": 0.489955373108387, + "reward_std": 0.18199655413627625, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4363839402794838, "step": 1144 }, { "clip_ratio": 0.0, - "completion_length": 1772.0067749023438, + "completion_length": 1552.60498046875, "epoch": 0.3420207602120827, - "grad_norm": 0.24281759560108185, - "kl": 0.147216796875, - "learning_rate": 8.319215057416008e-08, - "loss": 0.0288, - "reward": 0.6372768133878708, - "reward_std": 0.17501486465334892, - "rewards/accuracy_reward": 0.16741071827709675, + "grad_norm": 13.298458099365234, + "kl": 4.72265625, + "learning_rate": 4.159607528708003e-07, + "loss": 0.3844, + "reward": 0.5636161044239998, + "reward_std": 0.2067689411342144, + "rewards/accuracy_reward": 0.15178572130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660969734192, + "rewards/tag_count_reward": 0.411830373108387, "step": 1145 }, { "clip_ratio": 0.0, - "completion_length": 1769.7389221191406, + "completion_length": 1583.9442443847656, "epoch": 0.3423194682996042, - "grad_norm": 0.19667796790599823, - "kl": 0.1444091796875, - "learning_rate": 8.315313005240403e-08, - "loss": 0.031, - "reward": 0.5055803805589676, - "reward_std": 0.10227415151894093, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 11.088444709777832, + "kl": 3.8515625, + "learning_rate": 4.1576565026202013e-07, + "loss": 0.2959, + "reward": 0.4486607313156128, + "reward_std": 0.14514323510229588, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4017857387661934, "step": 1146 }, { "clip_ratio": 0.0, - "completion_length": 1713.2634582519531, + "completion_length": 1477.7590026855469, "epoch": 0.34261817638712566, - "grad_norm": 0.2331399917602539, - "kl": 0.143310546875, - "learning_rate": 8.311407346329184e-08, - "loss": 0.0509, - "reward": 0.6210937723517418, - "reward_std": 0.09989826008677483, - "rewards/accuracy_reward": 0.15401786798611283, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "grad_norm": 68.97903442382812, + "kl": 3.453125, + "learning_rate": 4.155703673164592e-07, + "loss": 0.36, + "reward": 0.5775669887661934, + "reward_std": 0.15102426148951054, + "rewards/accuracy_reward": 0.15178572619333863, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1147 }, { "clip_ratio": 0.0, - "completion_length": 1686.9308776855469, + "completion_length": 1423.2790832519531, "epoch": 0.34291688447464713, - "grad_norm": 0.21374836564064026, - "kl": 0.1329345703125, - "learning_rate": 8.307498084931325e-08, - "loss": 0.0331, - "reward": 0.6646205633878708, - "reward_std": 0.16400853171944618, - "rewards/accuracy_reward": 0.18750000465661287, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "grad_norm": 41.52378463745117, + "kl": 2.966796875, + "learning_rate": 4.153749042465663e-07, + "loss": 0.2801, + "reward": 0.6099330633878708, + "reward_std": 0.23177003860473633, + "rewards/accuracy_reward": 0.17410715040750802, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.435825914144516, "step": 1148 }, { "clip_ratio": 0.0, - "completion_length": 1720.7032165527344, + "completion_length": 1475.1786499023438, "epoch": 0.3432155925621686, - "grad_norm": 0.2740260660648346, - "kl": 0.1416015625, - "learning_rate": 8.303585225299721e-08, - "loss": 0.0496, - "reward": 0.6060268133878708, - "reward_std": 0.17689766921103, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 26.453310012817383, + "kl": 4.2265625, + "learning_rate": 4.1517926126498606e-07, + "loss": 0.3444, + "reward": 0.5195312723517418, + "reward_std": 0.19151940196752548, + "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1149 }, { "clip_ratio": 0.0, - "completion_length": 1660.1920471191406, + "completion_length": 1494.6072082519531, "epoch": 0.3435143006496901, - "grad_norm": 0.20964056253433228, - "kl": 0.131103515625, - "learning_rate": 8.299668771691177e-08, - "loss": 0.0484, - "reward": 0.5814732387661934, - "reward_std": 0.17432155273854733, - "rewards/accuracy_reward": 0.10714286053553224, + "grad_norm": 53.48565673828125, + "kl": 4.9765625, + "learning_rate": 4.149834385845589e-07, + "loss": 0.3549, + "reward": 0.526785746216774, + "reward_std": 0.22198162972927094, + "rewards/accuracy_reward": 0.10937500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.4174107313156128, "step": 1150 }, { "clip_ratio": 0.0, - "completion_length": 1740.9509887695312, + "completion_length": 1507.8259582519531, "epoch": 0.34381300873721155, - "grad_norm": 0.20554119348526, - "kl": 0.1417236328125, - "learning_rate": 8.295748728366412e-08, - "loss": 0.0369, - "reward": 0.5184151977300644, - "reward_std": 0.10017957910895348, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 20.78907585144043, + "kl": 4.46484375, + "learning_rate": 4.147874364183206e-07, + "loss": 0.3543, + "reward": 0.4888393133878708, + "reward_std": 0.15104317292571068, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.4218750149011612, "step": 1151 }, { "clip_ratio": 0.0, - "completion_length": 1759.7478332519531, + "completion_length": 1523.01123046875, "epoch": 0.344111716824733, - "grad_norm": 0.21933649480342865, - "kl": 0.145751953125, - "learning_rate": 8.291825099590048e-08, - "loss": 0.0339, - "reward": 0.5106026977300644, - "reward_std": 0.08998927846550941, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 46.41920471191406, + "kl": 6.01171875, + "learning_rate": 4.1459125497950245e-07, + "loss": 0.47, + "reward": 0.4497768059372902, + "reward_std": 0.12864741683006287, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919813156128, + "rewards/tag_count_reward": 0.411830373108387, "step": 1152 }, { "clip_ratio": 0.0, - "completion_length": 1758.0514221191406, + "completion_length": 1510.2879943847656, "epoch": 0.3444104249122545, - "grad_norm": 0.19308242201805115, - "kl": 0.14794921875, - "learning_rate": 8.28789788963061e-08, - "loss": 0.0346, - "reward": 0.654575914144516, - "reward_std": 0.11628289800137281, - "rewards/accuracy_reward": 0.18303571990691125, + "grad_norm": 15.101097106933594, + "kl": 4.75, + "learning_rate": 4.1439489448153046e-07, + "loss": 0.396, + "reward": 0.5904018133878708, + "reward_std": 0.16178684681653976, + "rewards/accuracy_reward": 0.16517857811413705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.4252232313156128, "step": 1153 }, { "clip_ratio": 0.0, - "completion_length": 1714.1406860351562, + "completion_length": 1538.6429138183594, "epoch": 0.34470913299977596, - "grad_norm": 0.27530810236930847, - "kl": 0.133544921875, - "learning_rate": 8.283967102760516e-08, - "loss": 0.039, - "reward": 0.629464328289032, - "reward_std": 0.1603500060737133, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 46.85988235473633, + "kl": 5.8984375, + "learning_rate": 4.141983551380258e-07, + "loss": 0.4232, + "reward": 0.5636160969734192, + "reward_std": 0.22882236912846565, + "rewards/accuracy_reward": 0.15848214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4051339477300644, "step": 1154 }, { "clip_ratio": 0.0, - "completion_length": 1617.6295471191406, + "completion_length": 1399.3973999023438, "epoch": 0.34500784108729743, - "grad_norm": 0.20944125950336456, - "kl": 0.1337890625, - "learning_rate": 8.280032743256078e-08, - "loss": 0.0295, - "reward": 0.6277901977300644, - "reward_std": 0.14028103463351727, - "rewards/accuracy_reward": 0.14508928963914514, + "grad_norm": 35.17140197753906, + "kl": 5.21875, + "learning_rate": 4.140016371628039e-07, + "loss": 0.4356, + "reward": 0.5251116380095482, + "reward_std": 0.19577893614768982, + "rewards/accuracy_reward": 0.09598214458674192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.482700914144516, + "rewards/tag_count_reward": 0.429129496216774, "step": 1155 }, { "clip_ratio": 0.0, - "completion_length": 1719.2634887695312, + "completion_length": 1500.9063110351562, "epoch": 0.3453065491748189, - "grad_norm": 0.21819403767585754, - "kl": 0.144287109375, - "learning_rate": 8.276094815397493e-08, - "loss": 0.0237, - "reward": 0.5585937947034836, - "reward_std": 0.11882592923939228, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 14.357109069824219, + "kl": 4.765625, + "learning_rate": 4.138047407698746e-07, + "loss": 0.381, + "reward": 0.5100446566939354, + "reward_std": 0.13611139729619026, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080633878708, + "rewards/tag_count_reward": 0.427455373108387, "step": 1156 }, { "clip_ratio": 0.0, - "completion_length": 1691.2947387695312, + "completion_length": 1404.4063110351562, "epoch": 0.34560525726234037, - "grad_norm": 0.2226998507976532, - "kl": 0.146728515625, - "learning_rate": 8.27215332346884e-08, - "loss": 0.0292, - "reward": 0.4877232313156128, - "reward_std": 0.10089302808046341, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 6.704031467437744, + "kl": 3.9140625, + "learning_rate": 4.13607666173442e-07, + "loss": 0.3405, + "reward": 0.4453125223517418, + "reward_std": 0.13210083171725273, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.4274553880095482, "step": 1157 }, { "clip_ratio": 0.0, - "completion_length": 1666.2210693359375, + "completion_length": 1422.1585083007812, "epoch": 0.34590396534986184, - "grad_norm": 0.24958078563213348, - "kl": 0.1392822265625, - "learning_rate": 8.268208271758077e-08, - "loss": 0.0421, - "reward": 0.6004464477300644, - "reward_std": 0.13468431867659092, - "rewards/accuracy_reward": 0.12500000279396772, + "grad_norm": 21.669687271118164, + "kl": 4.4140625, + "learning_rate": 4.1341041358790387e-07, + "loss": 0.3787, + "reward": 0.5295759215950966, + "reward_std": 0.16853434592485428, + "rewards/accuracy_reward": 0.10714286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4754464477300644, + "rewards/tag_count_reward": 0.4224330484867096, "step": 1158 }, { "clip_ratio": 0.0, - "completion_length": 1634.4889221191406, + "completion_length": 1417.33935546875, "epoch": 0.3462026734373833, - "grad_norm": 0.230885311961174, - "kl": 0.1270751953125, - "learning_rate": 8.264259664557034e-08, - "loss": 0.0384, - "reward": 0.553571455180645, - "reward_std": 0.10602692514657974, - "rewards/accuracy_reward": 0.08035714761354029, + "grad_norm": 57.74095153808594, + "kl": 3.734375, + "learning_rate": 4.132129832278517e-07, + "loss": 0.3731, + "reward": 0.5083705559372902, + "reward_std": 0.1640004850924015, + "rewards/accuracy_reward": 0.07812500046566129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143059372902, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1159 }, { "clip_ratio": 0.0, - "completion_length": 1600.9777526855469, + "completion_length": 1436.3103332519531, "epoch": 0.3465013815249048, - "grad_norm": 0.20940041542053223, - "kl": 0.12646484375, - "learning_rate": 8.260307506161406e-08, - "loss": 0.0291, - "reward": 0.5915178880095482, - "reward_std": 0.060087510384619236, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 37.90242385864258, + "kl": 4.08984375, + "learning_rate": 4.130153753080703e-07, + "loss": 0.3872, + "reward": 0.5329241305589676, + "reward_std": 0.1283276081085205, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785969734192, + "rewards/tag_count_reward": 0.423549123108387, "step": 1160 }, { "clip_ratio": 0.0, - "completion_length": 1729.9867248535156, + "completion_length": 1557.3504943847656, "epoch": 0.34680008961242625, - "grad_norm": 0.3456806540489197, - "kl": 0.1416015625, - "learning_rate": 8.256351800870757e-08, - "loss": 0.0415, - "reward": 0.5044643208384514, - "reward_std": 0.13232623413205147, - "rewards/accuracy_reward": 0.03571428754366934, + "grad_norm": 44.708621978759766, + "kl": 3.89453125, + "learning_rate": 4.1281759004353787e-07, + "loss": 0.3215, + "reward": 0.4430803805589676, + "reward_std": 0.1616559438407421, + "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1161 }, { "clip_ratio": 0.0, - "completion_length": 1667.1875610351562, + "completion_length": 1408.9554138183594, "epoch": 0.3470987976999477, - "grad_norm": 0.22154919803142548, - "kl": 0.137451171875, - "learning_rate": 8.252392552988506e-08, - "loss": 0.0393, - "reward": 0.6891741454601288, - "reward_std": 0.13524800539016724, - "rewards/accuracy_reward": 0.2142857275903225, + "grad_norm": 71.8008041381836, + "kl": 3.390625, + "learning_rate": 4.1261962764942526e-07, + "loss": 0.3446, + "reward": 0.635044664144516, + "reward_std": 0.15830965526401997, + "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.443080373108387, "step": 1162 }, { "clip_ratio": 0.0, - "completion_length": 1704.2433776855469, + "completion_length": 1476.4063415527344, "epoch": 0.3473975057874692, - "grad_norm": 0.2354849874973297, - "kl": 0.1396484375, - "learning_rate": 8.248429766821925e-08, - "loss": 0.0411, - "reward": 0.5513393133878708, - "reward_std": 0.085940882563591, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 34.387786865234375, + "kl": 3.95703125, + "learning_rate": 4.1242148834109623e-07, + "loss": 0.3382, + "reward": 0.5111607238650322, + "reward_std": 0.16255823522806168, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.424107164144516, "step": 1163 }, { "clip_ratio": 0.0, - "completion_length": 1716.6317749023438, + "completion_length": 1499.9732666015625, "epoch": 0.34769621387499067, - "grad_norm": 0.3020344078540802, - "kl": 0.1422119140625, - "learning_rate": 8.244463446682137e-08, - "loss": 0.0318, - "reward": 0.5424107387661934, - "reward_std": 0.126263115555048, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 15.148104667663574, + "kl": 5.1171875, + "learning_rate": 4.1222317233410684e-07, + "loss": 0.4481, + "reward": 0.4642857313156128, + "reward_std": 0.19027116894721985, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.3973214402794838, "step": 1164 }, { "clip_ratio": 0.0, - "completion_length": 1652.2656860351562, + "completion_length": 1432.9241638183594, "epoch": 0.34799492196251214, - "grad_norm": 0.23401020467281342, - "kl": 0.134765625, - "learning_rate": 8.240493596884112e-08, - "loss": 0.0406, - "reward": 0.547433041036129, - "reward_std": 0.12445837818086147, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 15.272122383117676, + "kl": 4.203125, + "learning_rate": 4.1202467984420564e-07, + "loss": 0.321, + "reward": 0.5066964626312256, + "reward_std": 0.15943104773759842, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1165 }, { "clip_ratio": 0.0, - "completion_length": 1677.8259582519531, + "completion_length": 1473.8527526855469, "epoch": 0.3482936300500336, - "grad_norm": 0.2503893971443176, - "kl": 0.1300048828125, - "learning_rate": 8.236520221746657e-08, - "loss": 0.0355, - "reward": 0.7003348618745804, - "reward_std": 0.17907553538680077, - "rewards/accuracy_reward": 0.2276785783469677, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "grad_norm": 38.41749954223633, + "kl": 4.4140625, + "learning_rate": 4.1182601108733284e-07, + "loss": 0.3305, + "reward": 0.6010044813156128, + "reward_std": 0.2197759486734867, + "rewards/accuracy_reward": 0.1808035783469677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.420200914144516, "step": 1166 }, { "clip_ratio": 0.0, - "completion_length": 1795.0603332519531, + "completion_length": 1575.5871276855469, "epoch": 0.3485923381375551, - "grad_norm": 0.2109696865081787, - "kl": 0.1368408203125, - "learning_rate": 8.232543325592412e-08, - "loss": 0.0374, - "reward": 0.5792410969734192, - "reward_std": 0.1615912076085806, - "rewards/accuracy_reward": 0.11607143469154835, + "grad_norm": 27.68985939025879, + "kl": 3.5390625, + "learning_rate": 4.1162716627962064e-07, + "loss": 0.2855, + "reward": 0.5033482387661934, + "reward_std": 0.1654346026480198, + "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696566939354, + "rewards/tag_count_reward": 0.4319196566939354, "step": 1167 }, { "clip_ratio": 0.0, - "completion_length": 1724.83935546875, + "completion_length": 1522.13623046875, "epoch": 0.34889104622507655, - "grad_norm": 0.23298686742782593, - "kl": 0.1416015625, - "learning_rate": 8.228562912747854e-08, - "loss": 0.0194, - "reward": 0.511160746216774, - "reward_std": 0.13142837211489677, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 9.421676635742188, + "kl": 4.98828125, + "learning_rate": 4.114281456373927e-07, + "loss": 0.4147, + "reward": 0.4363839477300644, + "reward_std": 0.17174207419157028, + "rewards/accuracy_reward": 0.026785714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143133878708, + "rewards/tag_count_reward": 0.4095982238650322, "step": 1168 }, { "clip_ratio": 0.0, - "completion_length": 1645.2076416015625, + "completion_length": 1430.0536499023438, "epoch": 0.349189754312598, - "grad_norm": 0.24598577618598938, - "kl": 0.137939453125, - "learning_rate": 8.224578987543278e-08, - "loss": 0.0396, - "reward": 0.516741082072258, - "reward_std": 0.0989828109741211, - "rewards/accuracy_reward": 0.03794643119908869, + "grad_norm": 42.35009002685547, + "kl": 5.41015625, + "learning_rate": 4.1122894937716393e-07, + "loss": 0.4413, + "reward": 0.4525669813156128, + "reward_std": 0.1388267558068037, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478794664144516, + "rewards/tag_count_reward": 0.4235491305589676, "step": 1169 }, { "clip_ratio": 0.0, - "completion_length": 1676.1607971191406, + "completion_length": 1458.8728332519531, "epoch": 0.3494884624001195, - "grad_norm": 0.20887064933776855, - "kl": 0.13623046875, - "learning_rate": 8.220591554312808e-08, - "loss": 0.0321, - "reward": 0.6082589626312256, - "reward_std": 0.14825339056551456, - "rewards/accuracy_reward": 0.129464291036129, + "grad_norm": 21.947912216186523, + "kl": 4.65234375, + "learning_rate": 4.110295777156404e-07, + "loss": 0.3893, + "reward": 0.5401785969734192, + "reward_std": 0.16548151895403862, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478794664144516, + "rewards/tag_count_reward": 0.4330357238650322, "step": 1170 }, { "clip_ratio": 0.0, - "completion_length": 1676.8527526855469, + "completion_length": 1509.3795166015625, "epoch": 0.34978717048764096, - "grad_norm": 0.23790766298770905, - "kl": 0.13623046875, - "learning_rate": 8.216600617394378e-08, - "loss": 0.0363, - "reward": 0.5859375298023224, - "reward_std": 0.15044516883790493, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 7.804295063018799, + "kl": 4.29296875, + "learning_rate": 4.108300308697189e-07, + "loss": 0.3397, + "reward": 0.4899553805589676, + "reward_std": 0.1958666853606701, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.4185268133878708, "step": 1171 }, { "clip_ratio": 0.0, - "completion_length": 1673.79248046875, + "completion_length": 1479.15185546875, "epoch": 0.35008587857516243, - "grad_norm": 0.2933920919895172, - "kl": 0.1348876953125, - "learning_rate": 8.212606181129737e-08, - "loss": 0.0462, - "reward": 0.5937500298023224, - "reward_std": 0.10990095883607864, - "rewards/accuracy_reward": 0.12500000419095159, + "grad_norm": 14.75503158569336, + "kl": 4.0546875, + "learning_rate": 4.106303090564868e-07, + "loss": 0.3257, + "reward": 0.5390625298023224, + "reward_std": 0.1663689874112606, + "rewards/accuracy_reward": 0.12276786146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.416294664144516, "step": 1172 }, { "clip_ratio": 0.0, - "completion_length": 1789.4978332519531, + "completion_length": 1601.1741638183594, "epoch": 0.3503845866626839, - "grad_norm": 0.26893261075019836, - "kl": 0.14111328125, - "learning_rate": 8.20860824986444e-08, - "loss": 0.0375, - "reward": 0.5345982313156128, - "reward_std": 0.14046846143901348, - "rewards/accuracy_reward": 0.0736607201397419, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.458705373108387, + "grad_norm": 28.54844093322754, + "kl": 4.15234375, + "learning_rate": 4.1043041249322195e-07, + "loss": 0.3287, + "reward": 0.4726562723517418, + "reward_std": 0.16040240973234177, + "rewards/accuracy_reward": 0.0691964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4034598395228386, "step": 1173 }, { "clip_ratio": 0.0, - "completion_length": 1706.4889221191406, + "completion_length": 1510.2969665527344, "epoch": 0.3506832947502054, - "grad_norm": 0.21533861756324768, - "kl": 0.1331787109375, - "learning_rate": 8.204606827947844e-08, - "loss": 0.0354, - "reward": 0.5279018133878708, - "reward_std": 0.13476452976465225, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 27.811861038208008, + "kl": 3.90625, + "learning_rate": 4.102303413973922e-07, + "loss": 0.3353, + "reward": 0.4799107387661934, + "reward_std": 0.19103499501943588, + "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4174107387661934, "step": 1174 }, { "clip_ratio": 0.0, - "completion_length": 1730.9665832519531, + "completion_length": 1541.1741943359375, "epoch": 0.35098200283772685, - "grad_norm": 0.25929346680641174, - "kl": 0.13427734375, - "learning_rate": 8.200601919733106e-08, - "loss": 0.0342, - "reward": 0.6417411118745804, - "reward_std": 0.18176382407546043, - "rewards/accuracy_reward": 0.1696428582072258, + "grad_norm": 24.54498291015625, + "kl": 4.48046875, + "learning_rate": 4.1003009598665526e-07, + "loss": 0.3259, + "reward": 0.5937500298023224, + "reward_std": 0.22726193442940712, + "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4241071566939354, "step": 1175 }, { "clip_ratio": 0.0, - "completion_length": 1687.3281860351562, + "completion_length": 1499.8616638183594, "epoch": 0.3512807109252483, - "grad_norm": 0.2851468622684479, - "kl": 0.137939453125, - "learning_rate": 8.196593529577171e-08, - "loss": 0.0416, - "reward": 0.5719866380095482, - "reward_std": 0.126269293949008, - "rewards/accuracy_reward": 0.1026785746216774, + "grad_norm": 50.237117767333984, + "kl": 4.27734375, + "learning_rate": 4.098296764788586e-07, + "loss": 0.3786, + "reward": 0.4938616380095482, + "reward_std": 0.15878462046384811, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080484867096, + "rewards/tag_count_reward": 0.4246651977300644, "step": 1176 }, { "clip_ratio": 0.0, - "completion_length": 1704.7277526855469, + "completion_length": 1474.5714721679688, "epoch": 0.3515794190127698, - "grad_norm": 0.22054962813854218, - "kl": 0.133544921875, - "learning_rate": 8.192581661840777e-08, - "loss": 0.0214, - "reward": 0.568638414144516, - "reward_std": 0.12290458008646965, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 14.401457786560059, + "kl": 4.0703125, + "learning_rate": 4.096290830920388e-07, + "loss": 0.3511, + "reward": 0.5189732313156128, + "reward_std": 0.16647867485880852, + "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4793526977300644, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1177 }, { "clip_ratio": 0.0, - "completion_length": 1656.6786499023438, + "completion_length": 1411.8013916015625, "epoch": 0.35187812710029126, - "grad_norm": 0.26680484414100647, - "kl": 0.144775390625, - "learning_rate": 8.188566320888439e-08, - "loss": 0.026, - "reward": 0.6584821790456772, - "reward_std": 0.1677748654037714, - "rewards/accuracy_reward": 0.18303572572767735, + "grad_norm": 42.403282165527344, + "kl": 4.68359375, + "learning_rate": 4.0942831604442197e-07, + "loss": 0.3737, + "reward": 0.5708705633878708, + "reward_std": 0.15510273538529873, + "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.475446455180645, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1178 }, { "clip_ratio": 0.0, - "completion_length": 1744.4933776855469, + "completion_length": 1529.0067443847656, "epoch": 0.35217683518781273, - "grad_norm": 0.3084305226802826, - "kl": 0.144287109375, - "learning_rate": 8.184547511088459e-08, - "loss": 0.0396, - "reward": 0.5301339626312256, - "reward_std": 0.10558440629392862, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 21.990720748901367, + "kl": 4.5859375, + "learning_rate": 4.0922737555442295e-07, + "loss": 0.3652, + "reward": 0.463169664144516, + "reward_std": 0.14694657735526562, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660969734192, + "rewards/tag_count_reward": 0.4140625149011612, "step": 1179 }, { "clip_ratio": 0.0, - "completion_length": 1686.7366943359375, + "completion_length": 1501.8929138183594, "epoch": 0.3524755432753342, - "grad_norm": 0.21247237920761108, - "kl": 0.1376953125, - "learning_rate": 8.180525236812905e-08, - "loss": 0.0394, - "reward": 0.6612723395228386, - "reward_std": 0.10150512494146824, - "rewards/accuracy_reward": 0.17633929778821766, + "grad_norm": 31.601350784301758, + "kl": 4.27734375, + "learning_rate": 4.0902626184064525e-07, + "loss": 0.3647, + "reward": 0.5920759215950966, + "reward_std": 0.16835422441363335, + "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4849330559372902, + "rewards/tag_count_reward": 0.4179687649011612, "step": 1180 }, { "clip_ratio": 0.0, - "completion_length": 1709.2366638183594, + "completion_length": 1522.8192443847656, "epoch": 0.3527742513628557, - "grad_norm": 0.2924985885620117, - "kl": 0.140869140625, - "learning_rate": 8.176499502437619e-08, - "loss": 0.0504, - "reward": 0.5797991380095482, - "reward_std": 0.20059667900204659, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 88.24003601074219, + "kl": 5.9375, + "learning_rate": 4.0882497512188097e-07, + "loss": 0.4166, + "reward": 0.5228794887661934, + "reward_std": 0.2252089325338602, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919887661934, + "rewards/tag_count_reward": 0.420200914144516, "step": 1181 }, { "clip_ratio": 0.0, - "completion_length": 1660.6094360351562, + "completion_length": 1479.9531555175781, "epoch": 0.35307295945037714, - "grad_norm": 0.24284911155700684, - "kl": 0.1285400390625, - "learning_rate": 8.172470312342205e-08, - "loss": 0.0492, - "reward": 0.6088169813156128, - "reward_std": 0.11207661032676697, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 25.299148559570312, + "kl": 4.296875, + "learning_rate": 4.0862351561711026e-07, + "loss": 0.3294, + "reward": 0.5859375447034836, + "reward_std": 0.14981507323682308, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919813156128, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1182 }, { "clip_ratio": 0.0, - "completion_length": 1639.04248046875, + "completion_length": 1468.3795471191406, "epoch": 0.3533716675378986, - "grad_norm": 0.3262500762939453, - "kl": 0.139404296875, - "learning_rate": 8.168437670910024e-08, - "loss": 0.0267, - "reward": 0.5150669887661934, - "reward_std": 0.10656709410250187, - "rewards/accuracy_reward": 0.04464285774156451, + "grad_norm": 31.78482437133789, + "kl": 4.8203125, + "learning_rate": 4.0842188354550124e-07, + "loss": 0.3514, + "reward": 0.4620535895228386, + "reward_std": 0.14866582490503788, + "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.4241071566939354, "step": 1183 }, { "clip_ratio": 0.0, - "completion_length": 1700.5870971679688, + "completion_length": 1450.7813415527344, "epoch": 0.3536703756254201, - "grad_norm": 0.3385465741157532, - "kl": 0.140869140625, - "learning_rate": 8.1644015825282e-08, - "loss": 0.0221, - "reward": 0.5530134215950966, - "reward_std": 0.08895238768309355, - "rewards/accuracy_reward": 0.08258928940631449, + "grad_norm": 24.50768280029297, + "kl": 4.8203125, + "learning_rate": 4.0822007912641004e-07, + "loss": 0.3612, + "reward": 0.5066964477300644, + "reward_std": 0.1542123407125473, + "rewards/accuracy_reward": 0.08482143003493547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1184 }, { "clip_ratio": 0.0, - "completion_length": 1765.0938415527344, + "completion_length": 1614.9554138183594, "epoch": 0.3539690837129415, - "grad_norm": 0.20640279352664948, - "kl": 0.14208984375, - "learning_rate": 8.1603620515876e-08, - "loss": 0.0271, - "reward": 0.5066964477300644, - "reward_std": 0.12632647715508938, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 72.21038055419922, + "kl": 6.33984375, + "learning_rate": 4.0801810257938e-07, + "loss": 0.4325, + "reward": 0.4095982387661934, + "reward_std": 0.1483864150941372, + "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.396205373108387, "step": 1185 }, { "clip_ratio": 0.0, - "completion_length": 1743.5313110351562, + "completion_length": 1570.0514221191406, "epoch": 0.35426779180046297, - "grad_norm": 0.25297096371650696, - "kl": 0.1328125, - "learning_rate": 8.156319082482834e-08, - "loss": 0.0306, - "reward": 0.4732143133878708, - "reward_std": 0.11051256582140923, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 18.263654708862305, + "kl": 3.7578125, + "learning_rate": 4.0781595412414173e-07, + "loss": 0.2738, + "reward": 0.4330357387661934, + "reward_std": 0.14903313852846622, + "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535969734192, + "rewards/tag_count_reward": 0.4196428805589676, "step": 1186 }, { "clip_ratio": 0.0, - "completion_length": 1804.8125915527344, + "completion_length": 1558.1451721191406, "epoch": 0.35456649988798444, - "grad_norm": 0.23310376703739166, - "kl": 0.13916015625, - "learning_rate": 8.15227267961226e-08, - "loss": 0.0233, - "reward": 0.5273437723517418, - "reward_std": 0.148987359367311, - "rewards/accuracy_reward": 0.05580357555299997, + "grad_norm": 39.52939987182617, + "kl": 2.59765625, + "learning_rate": 4.07613633980613e-07, + "loss": 0.2128, + "reward": 0.4810268133878708, + "reward_std": 0.18253256753087044, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.436383955180645, "step": 1187 }, { "clip_ratio": 0.0, - "completion_length": 1718.5759887695312, + "completion_length": 1450.3884887695312, "epoch": 0.3548652079755059, - "grad_norm": 0.2584851086139679, - "kl": 0.1435546875, - "learning_rate": 8.148222847377968e-08, - "loss": 0.0398, - "reward": 0.4854910969734192, - "reward_std": 0.11427139490842819, - "rewards/accuracy_reward": 0.017857143888249993, + "grad_norm": 66.02269744873047, + "kl": 3.04296875, + "learning_rate": 4.074111423688984e-07, + "loss": 0.3098, + "reward": 0.4469866305589676, + "reward_std": 0.14859113842248917, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339402794838, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1188 }, { "clip_ratio": 0.0, - "completion_length": 1746.5067749023438, + "completion_length": 1482.7813110351562, "epoch": 0.3551639160630274, - "grad_norm": 0.3609263598918915, - "kl": 0.14697265625, - "learning_rate": 8.144169590185774e-08, - "loss": 0.0423, - "reward": 0.6021205633878708, - "reward_std": 0.1272274674847722, - "rewards/accuracy_reward": 0.13839286379516125, + "grad_norm": 67.85047912597656, + "kl": 2.7578125, + "learning_rate": 4.072084795092887e-07, + "loss": 0.2597, + "reward": 0.5647321715950966, + "reward_std": 0.1490248292684555, + "rewards/accuracy_reward": 0.1294642905704677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.435267873108387, "step": 1189 }, { "clip_ratio": 0.0, - "completion_length": 1626.4509887695312, + "completion_length": 1441.26123046875, "epoch": 0.35546262415054886, - "grad_norm": 0.20711418986320496, - "kl": 0.1380615234375, - "learning_rate": 8.14011291244523e-08, - "loss": 0.0224, - "reward": 0.590959832072258, - "reward_std": 0.14170705154538155, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 60.97877502441406, + "kl": 2.87109375, + "learning_rate": 4.070056456222615e-07, + "loss": 0.2629, + "reward": 0.5217634290456772, + "reward_std": 0.18236533366143703, + "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4793526977300644, + "rewards/tag_count_reward": 0.4213169887661934, "step": 1190 }, { "clip_ratio": 0.0, - "completion_length": 1726.7835388183594, + "completion_length": 1522.1830749511719, "epoch": 0.3557613322380703, - "grad_norm": 0.47821712493896484, - "kl": 0.1409912109375, - "learning_rate": 8.1360528185696e-08, - "loss": 0.0303, - "reward": 0.5887276902794838, - "reward_std": 0.1491672284901142, - "rewards/accuracy_reward": 0.129464291036129, + "grad_norm": 40.4552116394043, + "kl": 3.34375, + "learning_rate": 4.0680264092848005e-07, + "loss": 0.2655, + "reward": 0.5468750223517418, + "reward_std": 0.15329928323626518, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1191 }, { "clip_ratio": 0.0, - "completion_length": 1786.0156860351562, + "completion_length": 1563.6496276855469, "epoch": 0.3560600403255918, - "grad_norm": 0.2532656490802765, - "kl": 0.143798828125, - "learning_rate": 8.13198931297587e-08, - "loss": 0.0357, - "reward": 0.590401828289032, - "reward_std": 0.15527752600610256, - "rewards/accuracy_reward": 0.12276786379516125, + "grad_norm": 13.308736801147461, + "kl": 4.7265625, + "learning_rate": 4.065994656487935e-07, + "loss": 0.3668, + "reward": 0.5329241454601288, + "reward_std": 0.20560182258486748, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467633955180645, + "rewards/tag_count_reward": 0.4146205559372902, "step": 1192 }, { "clip_ratio": 0.0, - "completion_length": 1691.2545166015625, + "completion_length": 1519.2701416015625, "epoch": 0.35635874841311327, - "grad_norm": 0.2601299285888672, - "kl": 0.13671875, - "learning_rate": 8.127922400084735e-08, - "loss": 0.0168, - "reward": 0.5864955633878708, - "reward_std": 0.11414723563939333, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 11.208361625671387, + "kl": 4.0859375, + "learning_rate": 4.063961200042368e-07, + "loss": 0.3282, + "reward": 0.544084832072258, + "reward_std": 0.17383621633052826, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1193 }, { "clip_ratio": 0.0, - "completion_length": 1643.4353332519531, + "completion_length": 1451.466552734375, "epoch": 0.35665745650063474, - "grad_norm": 0.4615395665168762, - "kl": 0.13232421875, - "learning_rate": 8.1238520843206e-08, - "loss": 0.0487, - "reward": 0.577566996216774, - "reward_std": 0.13984175957739353, - "rewards/accuracy_reward": 0.11830357951112092, + "grad_norm": 35.108638763427734, + "kl": 5.3515625, + "learning_rate": 4.0619260421602997e-07, + "loss": 0.432, + "reward": 0.5323660969734192, + "reward_std": 0.18533695861697197, + "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4185267984867096, "step": 1194 }, { "clip_ratio": 0.0, - "completion_length": 1709.9353637695312, + "completion_length": 1482.6630249023438, "epoch": 0.3569561645881562, - "grad_norm": 0.5405179858207703, - "kl": 0.138427734375, - "learning_rate": 8.119778370111566e-08, - "loss": 0.0399, - "reward": 0.6372768133878708, - "reward_std": 0.09669766202569008, - "rewards/accuracy_reward": 0.18750000488944352, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768059372902, + "grad_norm": 132.93759155273438, + "kl": 6.890625, + "learning_rate": 4.0598891850557823e-07, + "loss": 0.4819, + "reward": 0.6065848469734192, + "reward_std": 0.14587992243468761, + "rewards/accuracy_reward": 0.19642858300358057, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4101562574505806, "step": 1195 }, { "clip_ratio": 0.0, - "completion_length": 1792.0134582519531, + "completion_length": 1621.5781860351562, "epoch": 0.3572548726756777, - "grad_norm": 0.5009599328041077, - "kl": 0.15283203125, - "learning_rate": 8.115701261889436e-08, - "loss": 0.0316, - "reward": 0.5474330484867096, - "reward_std": 0.15210044756531715, - "rewards/accuracy_reward": 0.08035714738070965, + "grad_norm": 179.8414764404297, + "kl": 9.0078125, + "learning_rate": 4.0578506309447183e-07, + "loss": 0.5792, + "reward": 0.4570312723517418, + "reward_std": 0.18259981274604797, + "rewards/accuracy_reward": 0.06026785844005644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4670759215950966, + "rewards/tag_count_reward": 0.3967634066939354, "step": 1196 }, { "clip_ratio": 0.0, - "completion_length": 1642.5290832519531, + "completion_length": 1391.9554138183594, "epoch": 0.35755358076319915, - "grad_norm": 0.3033026158809662, - "kl": 0.13916015625, - "learning_rate": 8.111620764089706e-08, - "loss": 0.046, - "reward": 0.5948660969734192, - "reward_std": 0.13370712287724018, - "rewards/accuracy_reward": 0.11607143701985478, + "grad_norm": 89.42304229736328, + "kl": 6.3203125, + "learning_rate": 4.055810382044853e-07, + "loss": 0.482, + "reward": 0.5239955484867096, + "reward_std": 0.17241018265485764, + "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478794664144516, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1197 }, { "clip_ratio": 0.0, - "completion_length": 1647.2746276855469, + "completion_length": 1384.3973693847656, "epoch": 0.3578522888507206, - "grad_norm": 0.29035618901252747, - "kl": 0.1376953125, - "learning_rate": 8.107536881151556e-08, - "loss": 0.0314, - "reward": 0.7025669813156128, - "reward_std": 0.17277800850570202, - "rewards/accuracy_reward": 0.22767858766019344, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "grad_norm": 27.747486114501953, + "kl": 4.84375, + "learning_rate": 4.053768440575778e-07, + "loss": 0.4131, + "reward": 0.6517857387661934, + "reward_std": 0.20018428191542625, + "rewards/accuracy_reward": 0.21875001210719347, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1198 }, { "clip_ratio": 0.0, - "completion_length": 1636.82373046875, + "completion_length": 1433.4442749023438, "epoch": 0.3581509969382421, - "grad_norm": 0.3057563900947571, - "kl": 0.1322021484375, - "learning_rate": 8.10344961751785e-08, - "loss": 0.0151, - "reward": 0.543526828289032, - "reward_std": 0.11512240953743458, - "rewards/accuracy_reward": 0.06250000442378223, + "grad_norm": 29.561710357666016, + "kl": 4.7734375, + "learning_rate": 4.051724808758925e-07, + "loss": 0.3898, + "reward": 0.4871652126312256, + "reward_std": 0.149665467441082, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.4335937649011612, "step": 1199 }, { "clip_ratio": 0.0, - "completion_length": 1680.8326721191406, + "completion_length": 1465.4844360351562, "epoch": 0.35844970502576357, - "grad_norm": 0.3387852609157562, - "kl": 0.137939453125, - "learning_rate": 8.099358977635131e-08, - "loss": 0.0503, - "reward": 0.566406287252903, - "reward_std": 0.08309149462729692, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 37.81698989868164, + "kl": 4.43359375, + "learning_rate": 4.0496794888175656e-07, + "loss": 0.3727, + "reward": 0.5156250223517418, + "reward_std": 0.1488787792623043, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205633878708, + "rewards/tag_count_reward": 0.424107164144516, "step": 1200 }, { "clip_ratio": 0.0, - "completion_length": 1700.2969055175781, + "completion_length": 1473.9219360351562, "epoch": 0.35874841311328504, - "grad_norm": 0.32654014229774475, - "kl": 0.13525390625, - "learning_rate": 8.095264965953614e-08, - "loss": 0.0269, - "reward": 0.635044664144516, - "reward_std": 0.13473091460764408, - "rewards/accuracy_reward": 0.1540178656578064, + "grad_norm": 27.308847427368164, + "kl": 3.91796875, + "learning_rate": 4.0476324829768074e-07, + "loss": 0.338, + "reward": 0.574776828289032, + "reward_std": 0.17400255054235458, + "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.431919664144516, "step": 1201 }, { "clip_ratio": 0.0, - "completion_length": 1614.8773193359375, + "completion_length": 1443.3951721191406, "epoch": 0.3590471212008065, - "grad_norm": 0.35284537076950073, - "kl": 0.11669921875, - "learning_rate": 8.091167586927184e-08, - "loss": 0.0365, - "reward": 0.556919664144516, - "reward_std": 0.07702694088220596, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 16.00550079345703, + "kl": 4.5, + "learning_rate": 4.0455837934635914e-07, + "loss": 0.3553, + "reward": 0.5094866380095482, + "reward_std": 0.12299532815814018, + "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4787946566939354, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1202 }, { "clip_ratio": 0.0, - "completion_length": 1695.1607971191406, + "completion_length": 1448.2232971191406, "epoch": 0.359345829288328, - "grad_norm": 0.6437035799026489, - "kl": 0.14013671875, - "learning_rate": 8.087066845013383e-08, - "loss": 0.0295, - "reward": 0.4955357387661934, - "reward_std": 0.11327672936022282, - "rewards/accuracy_reward": 0.024553572526201606, + "grad_norm": 52.6285400390625, + "kl": 5.4140625, + "learning_rate": 4.043533422506692e-07, + "loss": 0.3888, + "reward": 0.4380580633878708, + "reward_std": 0.1409877184778452, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.420200914144516, "step": 1203 }, { "clip_ratio": 0.0, - "completion_length": 1701.7679443359375, + "completion_length": 1508.9978332519531, "epoch": 0.35964453737584945, - "grad_norm": 0.26192015409469604, - "kl": 0.1395263671875, - "learning_rate": 8.082962744673424e-08, - "loss": 0.0305, - "reward": 0.5044643133878708, - "reward_std": 0.12017765641212463, - "rewards/accuracy_reward": 0.03348214365541935, + "grad_norm": 39.36411666870117, + "kl": 4.0859375, + "learning_rate": 4.041481372336712e-07, + "loss": 0.3583, + "reward": 0.4737723395228386, + "reward_std": 0.16968788392841816, + "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4246651977300644, "step": 1204 }, { "clip_ratio": 0.0, - "completion_length": 1798.2032165527344, + "completion_length": 1582.37060546875, "epoch": 0.3599432454633709, - "grad_norm": 0.577788770198822, - "kl": 0.14697265625, - "learning_rate": 8.078855290372159e-08, - "loss": 0.0299, - "reward": 0.5279018208384514, - "reward_std": 0.14358583837747574, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 67.00851440429688, + "kl": 5.99609375, + "learning_rate": 4.0394276451860796e-07, + "loss": 0.4066, + "reward": 0.4614955633878708, + "reward_std": 0.19210495799779892, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.3989955559372902, "step": 1205 }, { "clip_ratio": 0.0, - "completion_length": 1654.93310546875, + "completion_length": 1442.5870971679688, "epoch": 0.3602419535508924, - "grad_norm": 0.34376293420791626, - "kl": 0.137451171875, - "learning_rate": 8.074744486578099e-08, - "loss": 0.0329, - "reward": 0.563616082072258, - "reward_std": 0.13018788397312164, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 36.35771179199219, + "kl": 4.171875, + "learning_rate": 4.0373722432890493e-07, + "loss": 0.3554, + "reward": 0.5044643208384514, + "reward_std": 0.17856038361787796, + "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982238650322, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1206 }, { "clip_ratio": 0.0, - "completion_length": 1734.5201416015625, + "completion_length": 1521.3795471191406, "epoch": 0.36054066163841386, - "grad_norm": 0.3217662572860718, - "kl": 0.149658203125, - "learning_rate": 8.070630337763395e-08, - "loss": 0.0467, - "reward": 0.5279018133878708, - "reward_std": 0.16313879936933517, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 25.4012508392334, + "kl": 4.70703125, + "learning_rate": 4.035315168881698e-07, + "loss": 0.3385, + "reward": 0.467633955180645, + "reward_std": 0.18183300644159317, + "rewards/accuracy_reward": 0.051339286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660969734192, + "rewards/tag_count_reward": 0.416294664144516, "step": 1207 }, { "clip_ratio": 0.0, - "completion_length": 1726.0692443847656, + "completion_length": 1449.4040832519531, "epoch": 0.36083936972593533, - "grad_norm": 0.5210367441177368, - "kl": 0.150390625, - "learning_rate": 8.066512848403837e-08, - "loss": 0.028, - "reward": 0.5273437649011612, - "reward_std": 0.11997803766280413, - "rewards/accuracy_reward": 0.058035718044266105, + "grad_norm": 50.16354751586914, + "kl": 3.171875, + "learning_rate": 4.0332564242019184e-07, + "loss": 0.3005, + "reward": 0.498325914144516, + "reward_std": 0.16363093256950378, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.435825914144516, "step": 1208 }, { "clip_ratio": 0.0, - "completion_length": 1701.3282165527344, + "completion_length": 1492.0826721191406, "epoch": 0.3611380778134568, - "grad_norm": 0.36235496401786804, - "kl": 0.13818359375, - "learning_rate": 8.062392022978852e-08, - "loss": 0.0376, - "reward": 0.5959821715950966, - "reward_std": 0.15333703346550465, - "rewards/accuracy_reward": 0.12276786426082253, + "grad_norm": 10.722467422485352, + "kl": 3.78515625, + "learning_rate": 4.031196011489426e-07, + "loss": 0.3004, + "reward": 0.5284598395228386, + "reward_std": 0.18046438694000244, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732142984867096, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1209 }, { "clip_ratio": 0.0, - "completion_length": 1655.0491638183594, + "completion_length": 1451.1317443847656, "epoch": 0.3614367859009783, - "grad_norm": 0.29348304867744446, - "kl": 0.1341552734375, - "learning_rate": 8.05826786597149e-08, - "loss": 0.0378, - "reward": 0.5390625149011612, - "reward_std": 0.12427590135484934, - "rewards/accuracy_reward": 0.05803571455180645, + "grad_norm": 50.4427604675293, + "kl": 2.6640625, + "learning_rate": 4.0291339329857453e-07, + "loss": 0.255, + "reward": 0.4765625223517418, + "reward_std": 0.15627948567271233, + "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810267984867096, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1210 }, { "clip_ratio": 0.0, - "completion_length": 1742.0469360351562, + "completion_length": 1566.4353332519531, "epoch": 0.36173549398849975, - "grad_norm": 0.553865373134613, - "kl": 0.1434326171875, - "learning_rate": 8.054140381868436e-08, - "loss": 0.0384, - "reward": 0.521763414144516, - "reward_std": 0.13404459320008755, - "rewards/accuracy_reward": 0.058035716181620955, + "grad_norm": 13.415938377380371, + "kl": 4.5859375, + "learning_rate": 4.027070190934218e-07, + "loss": 0.3322, + "reward": 0.4631696566939354, + "reward_std": 0.14856291934847832, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4162946566939354, "step": 1211 }, { "clip_ratio": 0.0, - "completion_length": 1710.1652526855469, + "completion_length": 1508.0804138183594, "epoch": 0.3620342020760212, - "grad_norm": 0.5365976691246033, - "kl": 0.140380859375, - "learning_rate": 8.050009575159982e-08, - "loss": 0.0475, - "reward": 0.590959832072258, - "reward_std": 0.14667624235153198, - "rewards/accuracy_reward": 0.1227678656578064, + "grad_norm": 7.612099647521973, + "kl": 4.21484375, + "learning_rate": 4.025004787579991e-07, + "loss": 0.3088, + "reward": 0.5351562798023224, + "reward_std": 0.1676801424473524, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.468191996216774, + "rewards/tag_count_reward": 0.4146205559372902, "step": 1212 }, { "clip_ratio": 0.0, - "completion_length": 1677.6875610351562, + "completion_length": 1501.0469360351562, "epoch": 0.3623329101635427, - "grad_norm": 1.0318975448608398, - "kl": 0.1304931640625, - "learning_rate": 8.045875450340043e-08, - "loss": 0.0311, - "reward": 0.6082589477300644, - "reward_std": 0.11356337554752827, - "rewards/accuracy_reward": 0.13169643213041127, + "grad_norm": 67.95134735107422, + "kl": 2.94921875, + "learning_rate": 4.0229377251700213e-07, + "loss": 0.2738, + "reward": 0.5725446790456772, + "reward_std": 0.19888300448656082, + "rewards/accuracy_reward": 0.14285714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4765625223517418, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1213 }, { "clip_ratio": 0.0, - "completion_length": 1691.9732971191406, + "completion_length": 1409.7366943359375, "epoch": 0.36263161825106416, - "grad_norm": 4.091546535491943, - "kl": 0.1611328125, - "learning_rate": 8.041738011906143e-08, - "loss": 0.0216, - "reward": 0.5435268133878708, - "reward_std": 0.09906738251447678, - "rewards/accuracy_reward": 0.060267859138548374, + "grad_norm": 68.59136199951172, + "kl": 2.70703125, + "learning_rate": 4.0208690059530714e-07, + "loss": 0.2687, + "reward": 0.505580373108387, + "reward_std": 0.13024732656776905, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4832589477300644, + "rewards/tag_count_reward": 0.447544664144516, "step": 1214 }, { "clip_ratio": 0.0, - "completion_length": 1486.38623046875, + "completion_length": 1284.0245971679688, "epoch": 0.36293032633858563, - "grad_norm": 0.4423186480998993, - "kl": 0.11669921875, - "learning_rate": 8.037597264359408e-08, - "loss": 0.0258, - "reward": 0.635044664144516, - "reward_std": 0.14046559110283852, - "rewards/accuracy_reward": 0.14508928847499192, + "grad_norm": 13.629560470581055, + "kl": 3.06640625, + "learning_rate": 4.018798632179704e-07, + "loss": 0.267, + "reward": 0.5842634215950966, + "reward_std": 0.18835154175758362, + "rewards/accuracy_reward": 0.1294642877765, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.489955373108387, + "rewards/tag_count_reward": 0.454799123108387, "step": 1215 }, { "clip_ratio": 0.0, - "completion_length": 1712.21435546875, + "completion_length": 1492.5156860351562, "epoch": 0.3632290344261071, - "grad_norm": 0.6982455849647522, - "kl": 0.138427734375, - "learning_rate": 8.033453212204566e-08, - "loss": 0.032, - "reward": 0.5340401977300644, - "reward_std": 0.13519262336194515, - "rewards/accuracy_reward": 0.06250000302679837, + "grad_norm": 24.961658477783203, + "kl": 4.046875, + "learning_rate": 4.016726606102283e-07, + "loss": 0.3393, + "reward": 0.4927455559372902, + "reward_std": 0.17969315871596336, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.428013414144516, "step": 1216 }, { "clip_ratio": 0.0, - "completion_length": 1741.4264221191406, + "completion_length": 1528.6406860351562, "epoch": 0.36352774251362857, - "grad_norm": 1.0419704914093018, - "kl": 0.15576171875, - "learning_rate": 8.02930585994994e-08, - "loss": 0.0177, - "reward": 0.5005580559372902, - "reward_std": 0.1078734789043665, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 5.946451187133789, + "kl": 4.3125, + "learning_rate": 4.01465292997497e-07, + "loss": 0.3174, + "reward": 0.4570312649011612, + "reward_std": 0.1583476047962904, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4235491156578064, "step": 1217 }, { "clip_ratio": 0.0, - "completion_length": 1661.3973999023438, + "completion_length": 1441.76123046875, "epoch": 0.36382645060115004, - "grad_norm": 0.4151582419872284, - "kl": 0.136474609375, - "learning_rate": 8.025155212107441e-08, - "loss": 0.0271, - "reward": 0.5708705708384514, - "reward_std": 0.15007525496184826, - "rewards/accuracy_reward": 0.09821429033763707, + "grad_norm": 23.61194610595703, + "kl": 4.47265625, + "learning_rate": 4.0125776060537207e-07, + "loss": 0.3737, + "reward": 0.5251116305589676, + "reward_std": 0.18760451301932335, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1218 }, { "clip_ratio": 0.0, - "completion_length": 1692.0759887695312, + "completion_length": 1532.9241638183594, "epoch": 0.3641251586886715, - "grad_norm": 0.5288482904434204, - "kl": 0.13671875, - "learning_rate": 8.021001273192569e-08, - "loss": 0.0288, - "reward": 0.578683078289032, - "reward_std": 0.1350945420563221, - "rewards/accuracy_reward": 0.0959821455180645, + "grad_norm": 46.158592224121094, + "kl": 5.25390625, + "learning_rate": 4.0105006365962847e-07, + "loss": 0.3629, + "reward": 0.4944196715950966, + "reward_std": 0.18677605129778385, + "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.482700914144516, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1219 }, { "clip_ratio": 0.0, - "completion_length": 1640.4532165527344, + "completion_length": 1374.5871276855469, "epoch": 0.364423866776193, - "grad_norm": 1.0874390602111816, - "kl": 0.1412353515625, - "learning_rate": 8.016844047724403e-08, - "loss": 0.0355, - "reward": 0.6160714477300644, - "reward_std": 0.15175963565707207, - "rewards/accuracy_reward": 0.1450892947614193, + "grad_norm": 35.42365646362305, + "kl": 4.57421875, + "learning_rate": 4.008422023862201e-07, + "loss": 0.3679, + "reward": 0.6004464477300644, + "reward_std": 0.18019600957632065, + "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4709821566939354, + "rewards/tag_count_reward": 0.435267873108387, "step": 1220 }, { "clip_ratio": 0.0, - "completion_length": 1845.4911499023438, + "completion_length": 1625.3192749023438, "epoch": 0.36472257486371445, - "grad_norm": 0.558825671672821, - "kl": 0.15771484375, - "learning_rate": 8.012683540225593e-08, - "loss": 0.0372, - "reward": 0.4960937798023224, - "reward_std": 0.16429485380649567, - "rewards/accuracy_reward": 0.042410716880112886, + "grad_norm": 145.6487579345703, + "kl": 7.875, + "learning_rate": 4.0063417701127964e-07, + "loss": 0.5064, + "reward": 0.4414062649011612, + "reward_std": 0.19841773808002472, + "rewards/accuracy_reward": 0.03794642956927419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4536830484867096, + "rewards/tag_count_reward": 0.4034598395228386, "step": 1221 }, { "clip_ratio": 0.0, - "completion_length": 1812.2969665527344, + "completion_length": 1608.1205749511719, "epoch": 0.3650212829512359, - "grad_norm": 0.6462035179138184, - "kl": 0.14453125, - "learning_rate": 8.008519755222368e-08, - "loss": 0.0267, - "reward": 0.5385044887661934, - "reward_std": 0.1588397566229105, - "rewards/accuracy_reward": 0.07812500209547579, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794887661934, + "grad_norm": 127.34754180908203, + "kl": 6.890625, + "learning_rate": 4.004259877611184e-07, + "loss": 0.4276, + "reward": 0.4854911044239998, + "reward_std": 0.21125541254878044, + "rewards/accuracy_reward": 0.07366071850992739, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.4095982238650322, "step": 1222 }, { "clip_ratio": 0.0, - "completion_length": 1741.54248046875, + "completion_length": 1460.8326721191406, "epoch": 0.3653199910387574, - "grad_norm": 0.9606361389160156, - "kl": 0.15625, - "learning_rate": 8.004352697244515e-08, - "loss": 0.0302, - "reward": 0.5558035969734192, - "reward_std": 0.11687474884092808, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 71.03634643554688, + "kl": 5.7421875, + "learning_rate": 4.0021763486222577e-07, + "loss": 0.4341, + "reward": 0.513392873108387, + "reward_std": 0.17122315801680088, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.4218750149011612, "step": 1223 }, { "clip_ratio": 0.0, - "completion_length": 1762.5558776855469, + "completion_length": 1556.290283203125, "epoch": 0.36561869912627887, - "grad_norm": 0.7040682435035706, - "kl": 0.155029296875, - "learning_rate": 8.000182370825386e-08, - "loss": 0.024, - "reward": 0.6155134290456772, - "reward_std": 0.1837761402130127, - "rewards/accuracy_reward": 0.1450892947614193, + "grad_norm": 100.50299072265625, + "kl": 6.1796875, + "learning_rate": 4.0000911854126927e-07, + "loss": 0.4169, + "reward": 0.4960937649011612, + "reward_std": 0.19401892088353634, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.3934151977300644, "step": 1224 }, { "clip_ratio": 0.0, - "completion_length": 1764.4866943359375, + "completion_length": 1467.2567749023438, "epoch": 0.36591740721380034, - "grad_norm": 0.29301971197128296, - "kl": 0.148681640625, - "learning_rate": 7.996008780501888e-08, - "loss": 0.0291, - "reward": 0.4893973469734192, - "reward_std": 0.10965314321219921, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 12.638955116271973, + "kl": 3.9375, + "learning_rate": 3.998004390250944e-07, + "loss": 0.3276, + "reward": 0.4419643059372902, + "reward_std": 0.12977353297173977, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1225 }, { "clip_ratio": 0.0, - "completion_length": 1668.52685546875, + "completion_length": 1380.0067443847656, "epoch": 0.3662161153013218, - "grad_norm": 0.3946899175643921, - "kl": 0.13330078125, - "learning_rate": 7.991831930814474e-08, - "loss": 0.032, - "reward": 0.5474330633878708, - "reward_std": 0.10323346313089132, - "rewards/accuracy_reward": 0.0691964328289032, + "grad_norm": 67.33377838134766, + "kl": 2.72265625, + "learning_rate": 3.995915965407237e-07, + "loss": 0.2842, + "reward": 0.5078125223517418, + "reward_std": 0.13513562828302383, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4782366305589676, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1226 }, { "clip_ratio": 0.0, - "completion_length": 1719.57373046875, + "completion_length": 1481.3683471679688, "epoch": 0.3665148233888433, - "grad_norm": 0.6650804877281189, - "kl": 0.137451171875, - "learning_rate": 7.987651826307154e-08, - "loss": 0.0434, - "reward": 0.6356027275323868, - "reward_std": 0.17151613719761372, - "rewards/accuracy_reward": 0.16741071874275804, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919887661934, + "grad_norm": 43.77455520629883, + "kl": 2.84375, + "learning_rate": 3.9938259131535767e-07, + "loss": 0.2419, + "reward": 0.5937500149011612, + "reward_std": 0.20719616673886776, + "rewards/accuracy_reward": 0.16071428847499192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1227 }, { "clip_ratio": 0.0, - "completion_length": 1667.3147888183594, + "completion_length": 1419.0402526855469, "epoch": 0.3668135314763647, - "grad_norm": 0.877891480922699, - "kl": 0.138916015625, - "learning_rate": 7.983468471527466e-08, - "loss": 0.0278, - "reward": 0.607700914144516, - "reward_std": 0.08987836167216301, - "rewards/accuracy_reward": 0.12723214854486287, + "grad_norm": 50.266544342041016, + "kl": 3.62109375, + "learning_rate": 3.991734235763733e-07, + "loss": 0.3232, + "reward": 0.545758955180645, + "reward_std": 0.13034122064709663, + "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687723517418, + "rewards/tag_count_reward": 0.4319196715950966, "step": 1228 }, { "clip_ratio": 0.0, - "completion_length": 1732.8862609863281, + "completion_length": 1449.4375610351562, "epoch": 0.36711223956388617, - "grad_norm": 0.7604981660842896, - "kl": 0.134765625, - "learning_rate": 7.979281871026493e-08, - "loss": 0.0263, - "reward": 0.5625000298023224, - "reward_std": 0.19574349001049995, - "rewards/accuracy_reward": 0.098214291036129, + "grad_norm": 36.54620361328125, + "kl": 2.87890625, + "learning_rate": 3.9896409355132463e-07, + "loss": 0.2679, + "reward": 0.5251116380095482, + "reward_std": 0.22271152585744858, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857313156128, + "rewards/tag_count_reward": 0.4425223469734192, "step": 1229 }, { "clip_ratio": 0.0, - "completion_length": 1695.0870971679688, + "completion_length": 1471.602783203125, "epoch": 0.36741094765140764, - "grad_norm": 0.3105793595314026, - "kl": 0.1343994140625, - "learning_rate": 7.975092029358845e-08, - "loss": 0.0339, - "reward": 0.6093750149011612, - "reward_std": 0.15325173363089561, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 33.91981506347656, + "kl": 3.78125, + "learning_rate": 3.9875460146794225e-07, + "loss": 0.338, + "reward": 0.5279017984867096, + "reward_std": 0.19705335423350334, + "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785895228386, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1230 }, { "clip_ratio": 0.0, - "completion_length": 1685.5201416015625, + "completion_length": 1458.9933776855469, "epoch": 0.3677096557389291, - "grad_norm": 0.66167151927948, - "kl": 0.1337890625, - "learning_rate": 7.970898951082661e-08, - "loss": 0.0384, - "reward": 0.568638414144516, - "reward_std": 0.10059706121683121, - "rewards/accuracy_reward": 0.09151786309666932, + "grad_norm": 12.131470680236816, + "kl": 4.5078125, + "learning_rate": 3.9854494755413305e-07, + "loss": 0.3636, + "reward": 0.5139509215950966, + "reward_std": 0.13212929107248783, + "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1231 }, { "clip_ratio": 0.0, - "completion_length": 1795.0514221191406, + "completion_length": 1521.7812805175781, "epoch": 0.3680083638264506, - "grad_norm": 0.4606172740459442, - "kl": 0.148193359375, - "learning_rate": 7.966702640759596e-08, - "loss": 0.0319, - "reward": 0.5323661044239998, - "reward_std": 0.1401620414108038, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 20.1412353515625, + "kl": 4.38671875, + "learning_rate": 3.983351320379798e-07, + "loss": 0.3303, + "reward": 0.4827009215950966, + "reward_std": 0.16790449246764183, + "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.458705373108387, + "rewards/tag_count_reward": 0.4202009066939354, "step": 1232 }, { "clip_ratio": 0.0, - "completion_length": 1676.0156860351562, + "completion_length": 1450.8214721679688, "epoch": 0.36830707191397205, - "grad_norm": 0.8043585419654846, - "kl": 0.13623046875, - "learning_rate": 7.962503102954826e-08, - "loss": 0.0437, - "reward": 0.6635044813156128, - "reward_std": 0.16072359215468168, - "rewards/accuracy_reward": 0.191964291036129, + "grad_norm": 16.944995880126953, + "kl": 4.376953125, + "learning_rate": 3.981251551477413e-07, + "loss": 0.3506, + "reward": 0.5909598544239998, + "reward_std": 0.18590334989130497, + "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.428013414144516, "step": 1233 }, { "clip_ratio": 0.0, - "completion_length": 1740.49560546875, + "completion_length": 1468.1808776855469, "epoch": 0.3686057800014935, - "grad_norm": 0.9433277249336243, - "kl": 0.1439208984375, - "learning_rate": 7.958300342237036e-08, - "loss": 0.0445, - "reward": 0.5753348469734192, - "reward_std": 0.1652218960225582, - "rewards/accuracy_reward": 0.09821429010480642, + "grad_norm": 17.270156860351562, + "kl": 3.91796875, + "learning_rate": 3.9791501711185185e-07, + "loss": 0.2959, + "reward": 0.5357143133878708, + "reward_std": 0.18748386204242706, + "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1234 }, { "clip_ratio": 0.0, - "completion_length": 1715.6295471191406, + "completion_length": 1475.4308776855469, "epoch": 0.368904488089015, - "grad_norm": 14.638603210449219, - "kl": 0.192138671875, - "learning_rate": 7.954094363178421e-08, - "loss": 0.0441, - "reward": 0.5943080633878708, - "reward_std": 0.11570219323039055, - "rewards/accuracy_reward": 0.11383928917348385, + "grad_norm": 18.383817672729492, + "kl": 5.15234375, + "learning_rate": 3.9770471815892105e-07, + "loss": 0.4113, + "reward": 0.518415205180645, + "reward_std": 0.1484800148755312, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4804687798023224, + "rewards/tag_count_reward": 0.4246651977300644, "step": 1235 }, { "clip_ratio": 0.0, - "completion_length": 1805.7567749023438, + "completion_length": 1603.9264221191406, "epoch": 0.36920319617653646, - "grad_norm": 0.7420095205307007, - "kl": 0.15185546875, - "learning_rate": 7.949885170354668e-08, - "loss": 0.0277, - "reward": 0.5039062723517418, - "reward_std": 0.12945188023149967, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 11.350081443786621, + "kl": 4.29296875, + "learning_rate": 3.974942585177334e-07, + "loss": 0.3153, + "reward": 0.4654017984867096, + "reward_std": 0.1712711751461029, + "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463727705180645, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1236 }, { "clip_ratio": 0.0, - "completion_length": 1779.1429443359375, + "completion_length": 1532.0246276855469, "epoch": 0.36950190426405793, - "grad_norm": 0.5507447719573975, - "kl": 0.141357421875, - "learning_rate": 7.94567276834497e-08, - "loss": 0.0388, - "reward": 0.5518973469734192, - "reward_std": 0.12991894222795963, - "rewards/accuracy_reward": 0.09151786426082253, + "grad_norm": 11.328507423400879, + "kl": 4.71875, + "learning_rate": 3.9728363841724857e-07, + "loss": 0.3603, + "reward": 0.5128348469734192, + "reward_std": 0.16434019804000854, + "rewards/accuracy_reward": 0.09151786030270159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794813156128, + "rewards/tag_count_reward": 0.4213169738650322, "step": 1237 }, { "clip_ratio": 0.0, - "completion_length": 1687.55810546875, + "completion_length": 1501.7723693847656, "epoch": 0.3698006123515794, - "grad_norm": 0.5548381209373474, - "kl": 0.1361083984375, - "learning_rate": 7.941457161732009e-08, - "loss": 0.0341, - "reward": 0.593191996216774, - "reward_std": 0.13350950740277767, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 14.709986686706543, + "kl": 3.3046875, + "learning_rate": 3.970728580866005e-07, + "loss": 0.2546, + "reward": 0.5619419664144516, + "reward_std": 0.15384064614772797, + "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598469734192, + "rewards/tag_count_reward": 0.432477705180645, "step": 1238 }, { "clip_ratio": 0.0, - "completion_length": 1694.0915832519531, + "completion_length": 1449.1964721679688, "epoch": 0.3700993204391009, - "grad_norm": 0.8695433139801025, - "kl": 0.142333984375, - "learning_rate": 7.93723835510195e-08, - "loss": 0.0508, - "reward": 0.5864955633878708, - "reward_std": 0.17348028533160686, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 14.81948471069336, + "kl": 3.88671875, + "learning_rate": 3.9686191775509746e-07, + "loss": 0.3489, + "reward": 0.5485491380095482, + "reward_std": 0.19902825728058815, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.465959832072258, + "rewards/tag_count_reward": 0.4257812649011612, "step": 1239 }, { "clip_ratio": 0.0, - "completion_length": 1707.8304443359375, + "completion_length": 1436.9687805175781, "epoch": 0.37039802852662235, - "grad_norm": 0.37631094455718994, - "kl": 0.1365966796875, - "learning_rate": 7.933016353044438e-08, - "loss": 0.0237, - "reward": 0.6367187649011612, - "reward_std": 0.15816038567572832, - "rewards/accuracy_reward": 0.15848215157166123, + "grad_norm": 14.535292625427246, + "kl": 2.60546875, + "learning_rate": 3.9665081765222193e-07, + "loss": 0.2204, + "reward": 0.5998884215950966, + "reward_std": 0.17028621025383472, + "rewards/accuracy_reward": 0.15178572479635477, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4782366305589676, + "rewards/tag_count_reward": 0.4481026977300644, "step": 1240 }, { "clip_ratio": 0.0, - "completion_length": 1763.13623046875, + "completion_length": 1476.9241638183594, "epoch": 0.3706967366141438, - "grad_norm": 2.334547758102417, - "kl": 0.162109375, - "learning_rate": 7.928791160152603e-08, - "loss": 0.0304, - "reward": 0.5228794813156128, - "reward_std": 0.10041273571550846, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 16.168439865112305, + "kl": 4.03515625, + "learning_rate": 3.9643955800763013e-07, + "loss": 0.3348, + "reward": 0.4648437649011612, + "reward_std": 0.13842986337840557, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4179687723517418, "step": 1241 }, { "clip_ratio": 0.0, - "completion_length": 1804.9598999023438, + "completion_length": 1551.8951721191406, "epoch": 0.3709954447016653, - "grad_norm": 0.7659576535224915, - "kl": 0.149169921875, - "learning_rate": 7.924562781023036e-08, - "loss": 0.0489, - "reward": 0.489955373108387, - "reward_std": 0.16066263616085052, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 114.10850524902344, + "kl": 6.7109375, + "learning_rate": 3.962281390511518e-07, + "loss": 0.4803, + "reward": 0.4458705559372902, + "reward_std": 0.16462809219956398, + "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4520089477300644, + "rewards/tag_count_reward": 0.4213169887661934, "step": 1242 }, { "clip_ratio": 0.0, - "completion_length": 1787.2210388183594, + "completion_length": 1610.2232971191406, "epoch": 0.37129415278918676, - "grad_norm": 0.38669925928115845, - "kl": 0.1405029296875, - "learning_rate": 7.920331220255802e-08, - "loss": 0.0313, - "reward": 0.5206473544239998, - "reward_std": 0.13546007871627808, - "rewards/accuracy_reward": 0.058035718742758036, + "grad_norm": 47.2646598815918, + "kl": 4.77734375, + "learning_rate": 3.9601656101279013e-07, + "loss": 0.3125, + "reward": 0.4620535895228386, + "reward_std": 0.16126078367233276, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.4107142984867096, "step": 1243 }, { "clip_ratio": 0.0, - "completion_length": 1747.1719360351562, + "completion_length": 1541.47998046875, "epoch": 0.37159286087670823, - "grad_norm": 0.23473793268203735, - "kl": 0.1419677734375, - "learning_rate": 7.916096482454424e-08, - "loss": 0.0251, - "reward": 0.544642873108387, - "reward_std": 0.1547807902097702, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 19.098623275756836, + "kl": 3.8515625, + "learning_rate": 3.958048241227212e-07, + "loss": 0.2727, + "reward": 0.4854910895228386, + "reward_std": 0.17604700475931168, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857313156128, + "rewards/tag_count_reward": 0.4274553805589676, "step": 1244 }, { "clip_ratio": 0.0, - "completion_length": 1687.18310546875, + "completion_length": 1433.0090026855469, "epoch": 0.3718915689642297, - "grad_norm": 0.3714737892150879, - "kl": 0.13720703125, - "learning_rate": 7.911858572225883e-08, - "loss": 0.0303, - "reward": 0.545758955180645, - "reward_std": 0.15192694775760174, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 91.07913970947266, + "kl": 4.94921875, + "learning_rate": 3.9559292861129414e-07, + "loss": 0.3587, + "reward": 0.510602705180645, + "reward_std": 0.1472528986632824, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467633955180645, + "rewards/tag_count_reward": 0.448102705180645, "step": 1245 }, { "clip_ratio": 0.0, - "completion_length": 1715.0447082519531, + "completion_length": 1494.8773193359375, "epoch": 0.3721902770517512, - "grad_norm": 0.6218249201774597, - "kl": 0.132568359375, - "learning_rate": 7.907617494180606e-08, - "loss": 0.03, - "reward": 0.6110491305589676, - "reward_std": 0.11741734109818935, - "rewards/accuracy_reward": 0.14062500488944352, + "grad_norm": 19.430967330932617, + "kl": 3.171875, + "learning_rate": 3.9538087470903024e-07, + "loss": 0.2301, + "reward": 0.589285746216774, + "reward_std": 0.16713102720677853, + "rewards/accuracy_reward": 0.14285714854486287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241380095482, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1246 }, { "clip_ratio": 0.0, - "completion_length": 1820.9732971191406, + "completion_length": 1608.5090026855469, "epoch": 0.37248898513927264, - "grad_norm": 0.3130519390106201, - "kl": 0.14794921875, - "learning_rate": 7.903373252932473e-08, - "loss": 0.035, - "reward": 0.4955357387661934, - "reward_std": 0.17637742310762405, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 10.652753829956055, + "kl": 4.29296875, + "learning_rate": 3.951686626466236e-07, + "loss": 0.3012, + "reward": 0.4938616305589676, + "reward_std": 0.1974820401519537, + "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.4268973469734192, "step": 1247 }, { "clip_ratio": 0.0, - "completion_length": 1652.0603332519531, + "completion_length": 1397.2701721191406, "epoch": 0.3727876932267941, - "grad_norm": 0.6303621530532837, - "kl": 0.13818359375, - "learning_rate": 7.899125853098802e-08, - "loss": 0.0382, - "reward": 0.6132812798023224, - "reward_std": 0.15240469947457314, - "rewards/accuracy_reward": 0.1361607238650322, + "grad_norm": 20.05364418029785, + "kl": 3.93359375, + "learning_rate": 3.949562926549401e-07, + "loss": 0.3347, + "reward": 0.5664062723517418, + "reward_std": 0.1871238425374031, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4458705633878708, "step": 1248 }, { "clip_ratio": 0.0, - "completion_length": 1709.9286499023438, + "completion_length": 1523.6920471191406, "epoch": 0.3730864013143156, - "grad_norm": 0.23611068725585938, - "kl": 0.138671875, - "learning_rate": 7.894875299300346e-08, - "loss": 0.0319, - "reward": 0.6188616454601288, - "reward_std": 0.13613969273865223, - "rewards/accuracy_reward": 0.14285714668221772, + "grad_norm": 17.155492782592773, + "kl": 4.89453125, + "learning_rate": 3.947437649650173e-07, + "loss": 0.3885, + "reward": 0.5530134215950966, + "reward_std": 0.1734784133732319, + "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4760044813156128, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1249 }, { "clip_ratio": 0.0, - "completion_length": 1642.1630554199219, + "completion_length": 1447.9197082519531, "epoch": 0.37338510940183706, - "grad_norm": 0.7264059782028198, - "kl": 0.1307373046875, - "learning_rate": 7.890621596161294e-08, - "loss": 0.0396, - "reward": 0.5602678805589676, - "reward_std": 0.09427577070891857, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 47.11534881591797, + "kl": 3.232421875, + "learning_rate": 3.945310798080647e-07, + "loss": 0.301, + "reward": 0.5094866305589676, + "reward_std": 0.15938874520361423, + "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4799107313156128, + "rewards/tag_count_reward": 0.431361623108387, "step": 1250 }, { "clip_ratio": 0.0, - "completion_length": 1767.15185546875, + "completion_length": 1574.5915832519531, "epoch": 0.3736838174893585, - "grad_norm": 0.9131835699081421, - "kl": 0.1396484375, - "learning_rate": 7.886364748309258e-08, - "loss": 0.0302, - "reward": 0.5133928805589676, - "reward_std": 0.13194986805319786, - "rewards/accuracy_reward": 0.035714287078008056, + "grad_norm": 24.14457130432129, + "kl": 4.72265625, + "learning_rate": 3.943182374154629e-07, + "loss": 0.3417, + "reward": 0.4508928805589676, + "reward_std": 0.17686554789543152, + "rewards/accuracy_reward": 0.024553572293370962, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4776785969734192, + "rewards/tag_count_reward": 0.4263393059372902, "step": 1251 }, { "clip_ratio": 0.0, - "completion_length": 1687.7679138183594, + "completion_length": 1467.1853332519531, "epoch": 0.37398252557688, - "grad_norm": 0.9863004684448242, - "kl": 0.1331787109375, - "learning_rate": 7.882104760375268e-08, - "loss": 0.0373, - "reward": 0.5597098469734192, - "reward_std": 0.16337066143751144, - "rewards/accuracy_reward": 0.09151786286383867, + "grad_norm": 46.41028594970703, + "kl": 4.42578125, + "learning_rate": 3.941052380187634e-07, + "loss": 0.3044, + "reward": 0.5429687798023224, + "reward_std": 0.2112753801047802, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919813156128, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1252 }, { "clip_ratio": 0.0, - "completion_length": 1746.618408203125, + "completion_length": 1503.4532165527344, "epoch": 0.37428123366440147, - "grad_norm": 0.6451116800308228, - "kl": 0.1282958984375, - "learning_rate": 7.877841636993777e-08, - "loss": 0.0266, - "reward": 0.536830373108387, - "reward_std": 0.08803942240774632, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 44.662269592285156, + "kl": 5.07421875, + "learning_rate": 3.938920818496888e-07, + "loss": 0.3604, + "reward": 0.4910714477300644, + "reward_std": 0.11502698808908463, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4810268059372902, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1253 }, { "clip_ratio": 0.0, - "completion_length": 1682.0067443847656, + "completion_length": 1472.4152221679688, "epoch": 0.37457994175192294, - "grad_norm": 3.13533616065979, - "kl": 0.1416015625, - "learning_rate": 7.873575382802642e-08, - "loss": 0.0214, - "reward": 0.5368303880095482, - "reward_std": 0.11858363449573517, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 88.24928283691406, + "kl": 2.556640625, + "learning_rate": 3.9367876914013207e-07, + "loss": 0.2725, + "reward": 0.5167410969734192, + "reward_std": 0.17018435895442963, + "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982238650322, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1254 }, { "clip_ratio": 0.0, - "completion_length": 1788.1473999023438, + "completion_length": 1597.9598999023438, "epoch": 0.3748786498394444, - "grad_norm": 0.6987788081169128, - "kl": 0.1370849609375, - "learning_rate": 7.869306002443132e-08, - "loss": 0.0249, - "reward": 0.4776785969734192, - "reward_std": 0.07759265601634979, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 38.169376373291016, + "kl": 3.01953125, + "learning_rate": 3.9346530012215664e-07, + "loss": 0.2234, + "reward": 0.4302455484867096, + "reward_std": 0.13925860822200775, + "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143133878708, + "rewards/tag_count_reward": 0.423549123108387, "step": 1255 }, { "clip_ratio": 0.0, - "completion_length": 1701.7165832519531, + "completion_length": 1451.62060546875, "epoch": 0.3751773579269659, - "grad_norm": 0.834409236907959, - "kl": 0.1488037109375, - "learning_rate": 7.865033500559914e-08, - "loss": 0.0212, - "reward": 0.5585937649011612, - "reward_std": 0.08870344422757626, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 90.10745239257812, + "kl": 2.1328125, + "learning_rate": 3.932516750279957e-07, + "loss": 0.211, + "reward": 0.5234375149011612, + "reward_std": 0.10717580281198025, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1256 }, { "clip_ratio": 0.0, - "completion_length": 1756.9130249023438, + "completion_length": 1538.10498046875, "epoch": 0.37547606601448735, - "grad_norm": 0.6691091060638428, - "kl": 0.143310546875, - "learning_rate": 7.860757881801052e-08, - "loss": 0.0307, - "reward": 0.489955373108387, - "reward_std": 0.1356595791876316, - "rewards/accuracy_reward": 0.026785715948790312, + "grad_norm": 59.14175796508789, + "kl": 2.65234375, + "learning_rate": 3.9303789409005253e-07, + "loss": 0.2273, + "reward": 0.4475446566939354, + "reward_std": 0.16204673424363136, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1257 }, { "clip_ratio": 0.0, - "completion_length": 1657.2813110351562, + "completion_length": 1445.5045166015625, "epoch": 0.3757747741020088, - "grad_norm": 4.319146633148193, - "kl": 0.1514892578125, - "learning_rate": 7.856479150817996e-08, - "loss": 0.0348, - "reward": 0.577008955180645, - "reward_std": 0.09619904682040215, - "rewards/accuracy_reward": 0.09821429220028222, + "grad_norm": 67.37281036376953, + "kl": 3.2734375, + "learning_rate": 3.928239575408998e-07, + "loss": 0.3127, + "reward": 0.521763414144516, + "reward_std": 0.13154826872050762, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.478794664144516, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1258 }, { "clip_ratio": 0.0, - "completion_length": 1814.4599304199219, + "completion_length": 1522.935302734375, "epoch": 0.3760734821895303, - "grad_norm": 0.9047808647155762, - "kl": 0.142822265625, - "learning_rate": 7.852197312265592e-08, - "loss": 0.0237, - "reward": 0.5591518059372902, - "reward_std": 0.13245822675526142, - "rewards/accuracy_reward": 0.08482143213041127, + "grad_norm": 15.712522506713867, + "kl": 3.89453125, + "learning_rate": 3.926098656132796e-07, + "loss": 0.3025, + "reward": 0.5000000298023224, + "reward_std": 0.16033565811812878, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.424107164144516, "step": 1259 }, { "clip_ratio": 0.0, - "completion_length": 1727.62060546875, + "completion_length": 1493.9643859863281, "epoch": 0.37637219027705177, - "grad_norm": 0.5258731245994568, - "kl": 0.14599609375, - "learning_rate": 7.847912370802055e-08, - "loss": 0.0352, - "reward": 0.5831473469734192, - "reward_std": 0.15029644221067429, - "rewards/accuracy_reward": 0.11383929010480642, + "grad_norm": 66.83240509033203, + "kl": 3.4296875, + "learning_rate": 3.9239561854010276e-07, + "loss": 0.2679, + "reward": 0.5323660969734192, + "reward_std": 0.15324660763144493, + "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4274553805589676, "step": 1260 }, { "clip_ratio": 0.0, - "completion_length": 1673.2969665527344, + "completion_length": 1459.47998046875, "epoch": 0.37667089836457324, - "grad_norm": 0.46633604168891907, - "kl": 0.135009765625, - "learning_rate": 7.843624331088985e-08, - "loss": 0.0294, - "reward": 0.5747768133878708, - "reward_std": 0.14269863814115524, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 78.82701110839844, + "kl": 5.6328125, + "learning_rate": 3.9218121655444926e-07, + "loss": 0.3833, + "reward": 0.5262276977300644, + "reward_std": 0.15068095549941063, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1261 }, { "clip_ratio": 0.0, - "completion_length": 1713.76123046875, + "completion_length": 1479.1630249023438, "epoch": 0.3769696064520947, - "grad_norm": 0.5224746465682983, - "kl": 0.131103515625, - "learning_rate": 7.839333197791349e-08, - "loss": 0.0219, - "reward": 0.6071428805589676, - "reward_std": 0.10118240676820278, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 19.32759666442871, + "kl": 4.3125, + "learning_rate": 3.9196665988956743e-07, + "loss": 0.3278, + "reward": 0.5725446715950966, + "reward_std": 0.14285107888281345, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4754464477300644, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1262 }, { "clip_ratio": 0.0, - "completion_length": 1796.8884887695312, + "completion_length": 1568.6763916015625, "epoch": 0.3772683145396162, - "grad_norm": 0.4939546287059784, - "kl": 0.157958984375, - "learning_rate": 7.835038975577477e-08, - "loss": 0.0288, - "reward": 0.608816996216774, - "reward_std": 0.1667841151356697, - "rewards/accuracy_reward": 0.14955358067527413, + "grad_norm": 40.979087829589844, + "kl": 5.5703125, + "learning_rate": 3.9175194877887387e-07, + "loss": 0.408, + "reward": 0.5585937723517418, + "reward_std": 0.22559519112110138, + "rewards/accuracy_reward": 0.14508928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4135044813156128, "step": 1263 }, { "clip_ratio": 0.0, - "completion_length": 1683.8438110351562, + "completion_length": 1419.2366638183594, "epoch": 0.37756702262713765, - "grad_norm": 0.8761662840843201, - "kl": 0.130615234375, - "learning_rate": 7.830741669119063e-08, - "loss": 0.0469, - "reward": 0.5597098469734192, - "reward_std": 0.1470310389995575, - "rewards/accuracy_reward": 0.08705357415601611, + "grad_norm": 54.73875427246094, + "kl": 5.2421875, + "learning_rate": 3.9153708345595315e-07, + "loss": 0.4203, + "reward": 0.5078125298023224, + "reward_std": 0.14354369789361954, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562649011612, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1264 }, { "clip_ratio": 0.0, - "completion_length": 1749.8326416015625, + "completion_length": 1534.3929138183594, "epoch": 0.3778657307146591, - "grad_norm": 0.641232430934906, - "kl": 0.141845703125, - "learning_rate": 7.826441283091156e-08, - "loss": 0.0351, - "reward": 0.5396205484867096, - "reward_std": 0.12154448963701725, - "rewards/accuracy_reward": 0.06473214481957257, + "grad_norm": 73.04139709472656, + "kl": 5.8671875, + "learning_rate": 3.913220641545578e-07, + "loss": 0.4089, + "reward": 0.4815848469734192, + "reward_std": 0.13637750782072544, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.4280134215950966, "step": 1265 }, { "clip_ratio": 0.0, - "completion_length": 1761.7009582519531, + "completion_length": 1517.2165832519531, "epoch": 0.3781644388021806, - "grad_norm": 0.30661165714263916, - "kl": 0.141845703125, - "learning_rate": 7.822137822172155e-08, - "loss": 0.022, - "reward": 0.509486623108387, - "reward_std": 0.1465079579502344, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 55.29218292236328, + "kl": 4.83984375, + "learning_rate": 3.9110689110860774e-07, + "loss": 0.3252, + "reward": 0.4994419887661934, + "reward_std": 0.16542514227330685, + "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.448102705180645, "step": 1266 }, { "clip_ratio": 0.0, - "completion_length": 1728.149658203125, + "completion_length": 1499.93310546875, "epoch": 0.37846314688970206, - "grad_norm": 0.6783213019371033, - "kl": 0.138916015625, - "learning_rate": 7.817831291043802e-08, - "loss": 0.0307, - "reward": 0.5518973469734192, - "reward_std": 0.10954977758228779, - "rewards/accuracy_reward": 0.0870535783469677, + "grad_norm": 102.9538803100586, + "kl": 6.4140625, + "learning_rate": 3.908915645521901e-07, + "loss": 0.4136, + "reward": 0.5089285895228386, + "reward_std": 0.14185668528079987, + "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.4218750149011612, "step": 1267 }, { "clip_ratio": 0.0, - "completion_length": 1674.3906860351562, + "completion_length": 1454.9152526855469, "epoch": 0.37876185497722353, - "grad_norm": 0.5877175331115723, - "kl": 0.1243896484375, - "learning_rate": 7.81352169439118e-08, - "loss": 0.0445, - "reward": 0.6155134290456772, - "reward_std": 0.12115206755697727, - "rewards/accuracy_reward": 0.1383928619325161, + "grad_norm": 48.31977462768555, + "kl": 3.23828125, + "learning_rate": 3.90676084719559e-07, + "loss": 0.2689, + "reward": 0.5803571790456772, + "reward_std": 0.16422819904983044, + "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4441964477300644, "step": 1268 }, { "clip_ratio": 0.0, - "completion_length": 1779.94873046875, + "completion_length": 1561.9822082519531, "epoch": 0.379060563064745, - "grad_norm": 0.3624577820301056, - "kl": 0.149169921875, - "learning_rate": 7.809209036902711e-08, - "loss": 0.0248, - "reward": 0.5195312798023224, - "reward_std": 0.16249455697834492, - "rewards/accuracy_reward": 0.05357143236324191, + "grad_norm": 38.79275131225586, + "kl": 4.8203125, + "learning_rate": 3.904604518451356e-07, + "loss": 0.333, + "reward": 0.4570312723517418, + "reward_std": 0.17782100848853588, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598395228386, + "rewards/tag_count_reward": 0.4056919813156128, "step": 1269 }, { "clip_ratio": 0.0, - "completion_length": 1788.4286499023438, + "completion_length": 1553.1005249023438, "epoch": 0.3793592711522665, - "grad_norm": 1.5788301229476929, - "kl": 0.160888671875, - "learning_rate": 7.804893323270143e-08, - "loss": 0.0263, - "reward": 0.6093750223517418, - "reward_std": 0.15506966970860958, - "rewards/accuracy_reward": 0.14508929289877415, + "grad_norm": 18.94418716430664, + "kl": 4.66796875, + "learning_rate": 3.9024466616350713e-07, + "loss": 0.3554, + "reward": 0.537388414144516, + "reward_std": 0.19001031666994095, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.403459832072258, "step": 1270 }, { "clip_ratio": 0.0, - "completion_length": 1655.9822082519531, + "completion_length": 1413.15185546875, "epoch": 0.3796579792397879, - "grad_norm": 0.814338743686676, - "kl": 0.129638671875, - "learning_rate": 7.800574558188546e-08, - "loss": 0.0309, - "reward": 0.5708705708384514, - "reward_std": 0.13487258180975914, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 37.71917724609375, + "kl": 3.4765625, + "learning_rate": 3.900287279094273e-07, + "loss": 0.3311, + "reward": 0.5334821715950966, + "reward_std": 0.18773821741342545, + "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562649011612, + "rewards/tag_count_reward": 0.439732164144516, "step": 1271 }, { "clip_ratio": 0.0, - "completion_length": 1705.68310546875, + "completion_length": 1471.1250305175781, "epoch": 0.37995668732730936, - "grad_norm": 0.5581037402153015, - "kl": 0.138916015625, - "learning_rate": 7.796252746356317e-08, - "loss": 0.0496, - "reward": 0.5390625223517418, - "reward_std": 0.12978593073785305, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 81.22847747802734, + "kl": 2.0859375, + "learning_rate": 3.898126373178158e-07, + "loss": 0.2341, + "reward": 0.5061384066939354, + "reward_std": 0.17557835765182972, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.4347098469734192, "step": 1272 }, { "clip_ratio": 0.0, - "completion_length": 1761.9241943359375, + "completion_length": 1611.2857971191406, "epoch": 0.38025539541483083, - "grad_norm": 0.8112559914588928, - "kl": 0.1365966796875, - "learning_rate": 7.791927892475161e-08, - "loss": 0.0201, - "reward": 0.643973246216774, - "reward_std": 0.17798936553299427, - "rewards/accuracy_reward": 0.1808035746216774, + "grad_norm": 12.227075576782227, + "kl": 4.15625, + "learning_rate": 3.89596394623758e-07, + "loss": 0.2844, + "reward": 0.5613839477300644, + "reward_std": 0.2170797437429428, + "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696566939354, + "rewards/tag_count_reward": 0.4095982387661934, "step": 1273 }, { "clip_ratio": 0.0, - "completion_length": 1713.0603637695312, + "completion_length": 1466.0781860351562, "epoch": 0.3805541035023523, - "grad_norm": 0.3517560362815857, - "kl": 0.148193359375, - "learning_rate": 7.787600001250098e-08, - "loss": 0.0334, - "reward": 0.5742187798023224, - "reward_std": 0.16524307988584042, - "rewards/accuracy_reward": 0.10044643562287092, + "grad_norm": 61.89448547363281, + "kl": 2.96875, + "learning_rate": 3.893800000625049e-07, + "loss": 0.2724, + "reward": 0.525111623108387, + "reward_std": 0.17459117993712425, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4402901902794838, "step": 1274 }, { "clip_ratio": 0.0, - "completion_length": 1756.5134582519531, + "completion_length": 1517.3192749023438, "epoch": 0.3808528115898738, - "grad_norm": 1.056689739227295, - "kl": 0.14111328125, - "learning_rate": 7.783269077389446e-08, - "loss": 0.0284, - "reward": 0.5825893133878708, - "reward_std": 0.13609974458813667, - "rewards/accuracy_reward": 0.12053571874275804, + "grad_norm": 50.783721923828125, + "kl": 2.7744140625, + "learning_rate": 3.891634538694723e-07, + "loss": 0.225, + "reward": 0.5731026902794838, + "reward_std": 0.1678936406970024, + "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535969734192, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1275 }, { "clip_ratio": 0.0, - "completion_length": 1699.6607971191406, + "completion_length": 1455.9197082519531, "epoch": 0.38115151967739525, - "grad_norm": 1.1303997039794922, - "kl": 0.1370849609375, - "learning_rate": 7.778935125604829e-08, - "loss": 0.04, - "reward": 0.5435268133878708, - "reward_std": 0.12397899758070707, - "rewards/accuracy_reward": 0.07366071874275804, + "grad_norm": 67.63020324707031, + "kl": 3.287109375, + "learning_rate": 3.889467562802414e-07, + "loss": 0.3179, + "reward": 0.5066964477300644, + "reward_std": 0.16733341291546822, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660895228386, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1276 }, { "clip_ratio": 0.0, - "completion_length": 1719.1875915527344, + "completion_length": 1532.4465026855469, "epoch": 0.3814502277649167, - "grad_norm": 0.5371478796005249, - "kl": 0.14306640625, - "learning_rate": 7.77459815061116e-08, - "loss": 0.0313, - "reward": 0.580357164144516, - "reward_std": 0.07994947023689747, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 27.781240463256836, + "kl": 5.14453125, + "learning_rate": 3.88729907530558e-07, + "loss": 0.3658, + "reward": 0.5212053805589676, + "reward_std": 0.1363147608935833, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.4095982313156128, "step": 1277 }, { "clip_ratio": 0.0, - "completion_length": 1726.8036499023438, + "completion_length": 1495.6942749023438, "epoch": 0.3817489358524382, - "grad_norm": 0.7165974974632263, - "kl": 0.144287109375, - "learning_rate": 7.770258157126641e-08, - "loss": 0.0295, - "reward": 0.663504496216774, - "reward_std": 0.19127115607261658, - "rewards/accuracy_reward": 0.19866072246804833, + "grad_norm": 36.19876480102539, + "kl": 4.7734375, + "learning_rate": 3.885129078563321e-07, + "loss": 0.3575, + "reward": 0.5797991380095482, + "reward_std": 0.21181683987379074, + "rewards/accuracy_reward": 0.16741072060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.412388414144516, "step": 1278 }, { "clip_ratio": 0.0, - "completion_length": 1795.3951721191406, + "completion_length": 1540.8482971191406, "epoch": 0.38204764393995966, - "grad_norm": 1.294432282447815, - "kl": 0.1484375, - "learning_rate": 7.765915149872764e-08, - "loss": 0.0391, - "reward": 0.5658482387661934, - "reward_std": 0.13576405681669712, - "rewards/accuracy_reward": 0.10044643259607255, + "grad_norm": 9.210381507873535, + "kl": 4.48046875, + "learning_rate": 3.8829575749363817e-07, + "loss": 0.3254, + "reward": 0.5178571715950966, + "reward_std": 0.14574753865599632, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1279 }, { "clip_ratio": 0.0, - "completion_length": 1607.0536499023438, + "completion_length": 1431.0379943847656, "epoch": 0.38234635202748113, - "grad_norm": 2.470337390899658, - "kl": 0.126708984375, - "learning_rate": 7.761569133574291e-08, - "loss": 0.0358, - "reward": 0.5334821566939354, - "reward_std": 0.13278614543378353, - "rewards/accuracy_reward": 0.058035718044266105, + "grad_norm": 22.136837005615234, + "kl": 4.04296875, + "learning_rate": 3.880784566787145e-07, + "loss": 0.3627, + "reward": 0.4709821715950966, + "reward_std": 0.1648912914097309, + "rewards/accuracy_reward": 0.03794642956927419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4754464477300644, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1280 }, { "clip_ratio": 0.0, - "completion_length": 1767.97998046875, + "completion_length": 1555.7723693847656, "epoch": 0.3826450601150026, - "grad_norm": 1.5150407552719116, - "kl": 0.1416015625, - "learning_rate": 7.757220112959264e-08, - "loss": 0.0259, - "reward": 0.5078125223517418, - "reward_std": 0.12292543612420559, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 9.385151863098145, + "kl": 3.6484375, + "learning_rate": 3.878610056479632e-07, + "loss": 0.2766, + "reward": 0.4665178805589676, + "reward_std": 0.18152549117803574, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4263393059372902, "step": 1281 }, { "clip_ratio": 0.0, - "completion_length": 1751.8326721191406, + "completion_length": 1572.9688110351562, "epoch": 0.38294376820252407, - "grad_norm": 0.42861834168434143, - "kl": 0.134765625, - "learning_rate": 7.75286809275899e-08, - "loss": 0.028, - "reward": 0.6216518133878708, - "reward_std": 0.20609359443187714, - "rewards/accuracy_reward": 0.1562500037252903, + "grad_norm": 46.929527282714844, + "kl": 4.375, + "learning_rate": 3.8764340463794954e-07, + "loss": 0.2764, + "reward": 0.5518973469734192, + "reward_std": 0.20453695766627789, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018133878708, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1282 }, { "clip_ratio": 0.0, - "completion_length": 1740.7054138183594, + "completion_length": 1501.2009887695312, "epoch": 0.38324247629004554, - "grad_norm": 0.865935742855072, - "kl": 0.130615234375, - "learning_rate": 7.748513077708043e-08, - "loss": 0.0318, - "reward": 0.5664062649011612, - "reward_std": 0.1288299784064293, - "rewards/accuracy_reward": 0.10267857578583062, + "grad_norm": 34.31199645996094, + "kl": 3.296875, + "learning_rate": 3.8742565388540214e-07, + "loss": 0.2697, + "reward": 0.5468750223517418, + "reward_std": 0.1539290864020586, + "rewards/accuracy_reward": 0.09598214761354029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1283 }, { "clip_ratio": 0.0, - "completion_length": 1778.4420776367188, + "completion_length": 1583.1697082519531, "epoch": 0.383541184377567, - "grad_norm": 0.2813398838043213, - "kl": 0.1396484375, - "learning_rate": 7.74415507254425e-08, - "loss": 0.0379, - "reward": 0.5859375149011612, - "reward_std": 0.10904659889638424, - "rewards/accuracy_reward": 0.12053572130389512, + "grad_norm": 60.0711784362793, + "kl": 6.140625, + "learning_rate": 3.8720775362721253e-07, + "loss": 0.4217, + "reward": 0.5284598395228386, + "reward_std": 0.13499336317181587, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1284 }, { "clip_ratio": 0.0, - "completion_length": 1716.3416137695312, + "completion_length": 1478.0625915527344, "epoch": 0.3838398924650885, - "grad_norm": 0.5163976550102234, - "kl": 0.13623046875, - "learning_rate": 7.739794082008695e-08, - "loss": 0.0305, - "reward": 0.5580357313156128, - "reward_std": 0.10818899609148502, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 19.3468017578125, + "kl": 4.53125, + "learning_rate": 3.8698970410043475e-07, + "loss": 0.3328, + "reward": 0.5195312723517418, + "reward_std": 0.14610961265861988, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.428013414144516, "step": 1285 }, { "clip_ratio": 0.0, - "completion_length": 1658.5179443359375, + "completion_length": 1456.8125610351562, "epoch": 0.38413860055260995, - "grad_norm": 0.7133060693740845, - "kl": 0.1298828125, - "learning_rate": 7.735430110845706e-08, - "loss": 0.0206, - "reward": 0.4955357238650322, - "reward_std": 0.11346527375280857, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 30.484533309936523, + "kl": 4.55859375, + "learning_rate": 3.8677150554228534e-07, + "loss": 0.3279, + "reward": 0.455357164144516, + "reward_std": 0.1288006417453289, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1286 }, { "clip_ratio": 0.0, - "completion_length": 1705.5960693359375, + "completion_length": 1487.9085388183594, "epoch": 0.3844373086401314, - "grad_norm": 0.5766462087631226, - "kl": 0.132080078125, - "learning_rate": 7.731063163802858e-08, - "loss": 0.0359, - "reward": 0.5239955633878708, - "reward_std": 0.09278230927884579, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 50.648948669433594, + "kl": 2.96484375, + "learning_rate": 3.865531581901429e-07, + "loss": 0.27, + "reward": 0.4888393133878708, + "reward_std": 0.120752127841115, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4748884066939354, + "rewards/tag_count_reward": 0.444196455180645, "step": 1287 }, { "clip_ratio": 0.0, - "completion_length": 1727.9822692871094, + "completion_length": 1526.22998046875, "epoch": 0.3847360167276529, - "grad_norm": 1.3250805139541626, - "kl": 0.137939453125, - "learning_rate": 7.726693245630961e-08, - "loss": 0.0457, - "reward": 0.608816996216774, - "reward_std": 0.1485845260322094, - "rewards/accuracy_reward": 0.145089291036129, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463727705180645, + "grad_norm": 24.656667709350586, + "kl": 4.09375, + "learning_rate": 3.8633466228154805e-07, + "loss": 0.3221, + "reward": 0.5786830708384514, + "reward_std": 0.1672380492091179, + "rewards/accuracy_reward": 0.1428571529686451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4358259215950966, "step": 1288 }, { "clip_ratio": 0.0, - "completion_length": 1688.8125610351562, + "completion_length": 1463.0536499023438, "epoch": 0.38503472481517437, - "grad_norm": 0.9647566676139832, - "kl": 0.128662109375, - "learning_rate": 7.722320361084056e-08, - "loss": 0.0331, - "reward": 0.5669643059372902, - "reward_std": 0.14875544048845768, - "rewards/accuracy_reward": 0.10267857555299997, + "grad_norm": 63.84490966796875, + "kl": 2.583984375, + "learning_rate": 3.861160180542028e-07, + "loss": 0.2289, + "reward": 0.5541294738650322, + "reward_std": 0.18520366586744785, + "rewards/accuracy_reward": 0.10937500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4447544887661934, "step": 1289 }, { "clip_ratio": 0.0, - "completion_length": 1685.4688415527344, + "completion_length": 1388.7745971679688, "epoch": 0.38533343290269584, - "grad_norm": 5.374574661254883, - "kl": 0.1611328125, - "learning_rate": 7.717944514919415e-08, - "loss": 0.0452, - "reward": 0.498883955180645, - "reward_std": 0.10127542167901993, - "rewards/accuracy_reward": 0.029017857974395156, + "grad_norm": 63.11453628540039, + "kl": 1.92578125, + "learning_rate": 3.8589722574597077e-07, + "loss": 0.1756, + "reward": 0.4877232313156128, + "reward_std": 0.11110446229577065, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660895228386, + "rewards/tag_count_reward": 0.4587053805589676, "step": 1290 }, { "clip_ratio": 0.0, - "completion_length": 1744.2947082519531, + "completion_length": 1527.0447082519531, "epoch": 0.3856321409902173, - "grad_norm": 6.303050994873047, - "kl": 0.180419921875, - "learning_rate": 7.713565711897529e-08, - "loss": 0.0377, - "reward": 0.5262276902794838, - "reward_std": 0.12736345175653696, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 18.445737838745117, + "kl": 3.53125, + "learning_rate": 3.8567828559487645e-07, + "loss": 0.2742, + "reward": 0.482142873108387, + "reward_std": 0.18583733588457108, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1291 }, { "clip_ratio": 0.0, - "completion_length": 1643.1094360351562, + "completion_length": 1402.5715026855469, "epoch": 0.3859308490777388, - "grad_norm": 1.4845486879348755, - "kl": 0.1361083984375, - "learning_rate": 7.709183956782108e-08, - "loss": 0.0358, - "reward": 0.6428571715950966, - "reward_std": 0.13421891443431377, - "rewards/accuracy_reward": 0.1718750074505806, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4709821715950966, + "grad_norm": 14.82524585723877, + "kl": 3.35546875, + "learning_rate": 3.854591978391054e-07, + "loss": 0.2731, + "reward": 0.6166294813156128, + "reward_std": 0.16885140351951122, + "rewards/accuracy_reward": 0.1718750111758709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1292 }, { "clip_ratio": 0.0, - "completion_length": 1753.8772888183594, + "completion_length": 1451.9576721191406, "epoch": 0.38622955716526025, - "grad_norm": 1.1027843952178955, - "kl": 0.140380859375, - "learning_rate": 7.704799254340069e-08, - "loss": 0.0281, - "reward": 0.533482164144516, - "reward_std": 0.12119701318442822, - "rewards/accuracy_reward": 0.06473214505240321, + "grad_norm": 27.912349700927734, + "kl": 4.10546875, + "learning_rate": 3.8523996271700343e-07, + "loss": 0.3641, + "reward": 0.4815848469734192, + "reward_std": 0.14909899793565273, + "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.4302455484867096, "step": 1293 }, { "clip_ratio": 0.0, - "completion_length": 1758.7166137695312, + "completion_length": 1482.2344665527344, "epoch": 0.3865282652527817, - "grad_norm": 0.5079576969146729, - "kl": 0.147705078125, - "learning_rate": 7.70041160934154e-08, - "loss": 0.0366, - "reward": 0.5591518059372902, - "reward_std": 0.16533365659415722, + "grad_norm": 13.441256523132324, + "kl": 3.7265625, + "learning_rate": 3.85020580467077e-07, + "loss": 0.3085, + "reward": 0.5396205633878708, + "reward_std": 0.17871698923408985, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.448102705180645, "step": 1294 }, { "clip_ratio": 0.0, - "completion_length": 1653.33935546875, + "completion_length": 1462.2857666015625, "epoch": 0.3868269733403032, - "grad_norm": 1.3651697635650635, - "kl": 0.14404296875, - "learning_rate": 7.696021026559849e-08, - "loss": 0.0212, - "reward": 0.6244419738650322, - "reward_std": 0.09979642182588577, - "rewards/accuracy_reward": 0.1473214365541935, + "grad_norm": 63.02687072753906, + "kl": 4.83984375, + "learning_rate": 3.8480105132799244e-07, + "loss": 0.3327, + "reward": 0.5770089477300644, + "reward_std": 0.150366535410285, + "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.431919664144516, "step": 1295 }, { "clip_ratio": 0.0, - "completion_length": 1683.3103332519531, + "completion_length": 1485.4621276855469, "epoch": 0.38712568142782466, - "grad_norm": 1.6812763214111328, - "kl": 0.154296875, - "learning_rate": 7.69162751077152e-08, - "loss": 0.0339, - "reward": 0.5909598469734192, - "reward_std": 0.1519194319844246, - "rewards/accuracy_reward": 0.11607143399305642, + "grad_norm": 110.61550903320312, + "kl": 6.109375, + "learning_rate": 3.84581375538576e-07, + "loss": 0.4225, + "reward": 0.5212053656578064, + "reward_std": 0.18514585867524147, + "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.431919664144516, "step": 1296 }, { "clip_ratio": 0.0, - "completion_length": 1738.7478332519531, + "completion_length": 1547.7522888183594, "epoch": 0.38742438951534613, - "grad_norm": 8.515798568725586, - "kl": 0.211181640625, - "learning_rate": 7.687231066756267e-08, - "loss": 0.0387, - "reward": 0.535156287252903, - "reward_std": 0.17287108674645424, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 111.26005554199219, + "kl": 6.796875, + "learning_rate": 3.843615533378133e-07, + "loss": 0.4716, + "reward": 0.4659598395228386, + "reward_std": 0.2050965502858162, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.412388414144516, "step": 1297 }, { "clip_ratio": 0.0, - "completion_length": 1826.7657165527344, + "completion_length": 1688.2835388183594, "epoch": 0.3877230976028676, - "grad_norm": 20.166942596435547, - "kl": 0.251953125, - "learning_rate": 7.68283169929699e-08, - "loss": 0.029, - "reward": 0.5619419813156128, - "reward_std": 0.2168467864394188, - "rewards/accuracy_reward": 0.09598214644938707, + "grad_norm": 147.81222534179688, + "kl": 7.84375, + "learning_rate": 3.841415849648495e-07, + "loss": 0.4872, + "reward": 0.4693080559372902, + "reward_std": 0.22909464314579964, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598395228386, + "rewards/tag_count_reward": 0.3934151977300644, "step": 1298 }, { "clip_ratio": 0.0, - "completion_length": 1706.149658203125, + "completion_length": 1473.0291137695312, "epoch": 0.3880218056903891, - "grad_norm": 6.263221263885498, - "kl": 0.187744140625, - "learning_rate": 7.67842941317977e-08, - "loss": 0.0449, - "reward": 0.576450914144516, - "reward_std": 0.1679827943444252, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 116.06525421142578, + "kl": 6.6953125, + "learning_rate": 3.839214706589885e-07, + "loss": 0.4722, + "reward": 0.5340402126312256, + "reward_std": 0.18341074883937836, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1299 }, { "clip_ratio": 0.0, - "completion_length": 1716.82373046875, + "completion_length": 1519.5000915527344, "epoch": 0.38832051377791055, - "grad_norm": 15.967903137207031, - "kl": 0.237548828125, - "learning_rate": 7.674024213193863e-08, - "loss": 0.0492, - "reward": 0.5502232387661934, - "reward_std": 0.14624688401818275, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 167.61610412597656, + "kl": 7.484375, + "learning_rate": 3.837012106596932e-07, + "loss": 0.4546, + "reward": 0.491071455180645, + "reward_std": 0.1913202926516533, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.4241071566939354, "step": 1300 }, { "clip_ratio": 0.0, - "completion_length": 1691.0648193359375, + "completion_length": 1519.1451721191406, "epoch": 0.388619221865432, - "grad_norm": 1.320677399635315, - "kl": 0.12451171875, - "learning_rate": 7.669616104131696e-08, - "loss": 0.0307, - "reward": 0.5915178805589676, - "reward_std": 0.15220184437930584, - "rewards/accuracy_reward": 0.12500000558793545, + "grad_norm": 60.138526916503906, + "kl": 4.75, + "learning_rate": 3.8348080520658477e-07, + "loss": 0.3322, + "reward": 0.5647321715950966, + "reward_std": 0.15331684984266758, + "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1301 }, { "clip_ratio": 0.0, - "completion_length": 1831.102783203125, + "completion_length": 1553.0067749023438, "epoch": 0.3889179299529535, - "grad_norm": 0.7081120610237122, - "kl": 0.1446533203125, - "learning_rate": 7.665205090788855e-08, - "loss": 0.0331, - "reward": 0.6082589626312256, - "reward_std": 0.18291030451655388, - "rewards/accuracy_reward": 0.14285714738070965, + "grad_norm": 23.569028854370117, + "kl": 3.9140625, + "learning_rate": 3.832602545394427e-07, + "loss": 0.3211, + "reward": 0.5552455708384514, + "reward_std": 0.20371198281645775, + "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018133878708, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1302 }, { "clip_ratio": 0.0, - "completion_length": 1737.4264221191406, + "completion_length": 1464.9643249511719, "epoch": 0.38921663804047496, - "grad_norm": 0.45418041944503784, - "kl": 0.1376953125, - "learning_rate": 7.660791177964094e-08, - "loss": 0.0341, - "reward": 0.5083705559372902, - "reward_std": 0.12306816130876541, - "rewards/accuracy_reward": 0.03348214412108064, + "grad_norm": 10.51165771484375, + "kl": 3.84375, + "learning_rate": 3.830395588982047e-07, + "loss": 0.3108, + "reward": 0.4603794813156128, + "reward_std": 0.13618669658899307, + "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.435825914144516, "step": 1303 }, { "clip_ratio": 0.0, - "completion_length": 1691.4442749023438, + "completion_length": 1502.5715026855469, "epoch": 0.38951534612799643, - "grad_norm": 0.6808429956436157, - "kl": 0.1392822265625, - "learning_rate": 7.65637437045932e-08, - "loss": 0.0342, - "reward": 0.529575914144516, - "reward_std": 0.08997167367488146, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 41.0344123840332, + "kl": 3.6484375, + "learning_rate": 3.8281871852296597e-07, + "loss": 0.3125, + "reward": 0.4693080484867096, + "reward_std": 0.1203591600060463, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.426897332072258, "step": 1304 }, { "clip_ratio": 0.0, - "completion_length": 1731.32373046875, + "completion_length": 1508.6495971679688, "epoch": 0.3898140542155179, - "grad_norm": 0.676544725894928, - "kl": 0.141845703125, - "learning_rate": 7.651954673079581e-08, - "loss": 0.0394, - "reward": 0.517299123108387, - "reward_std": 0.1506677307188511, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 8.734365463256836, + "kl": 3.5078125, + "learning_rate": 3.825977336539791e-07, + "loss": 0.2661, + "reward": 0.4955357313156128, + "reward_std": 0.17815385200083256, + "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1305 }, { "clip_ratio": 0.0, - "completion_length": 1764.65185546875, + "completion_length": 1537.33935546875, "epoch": 0.3901127623030394, - "grad_norm": 0.4944855868816376, - "kl": 0.140625, - "learning_rate": 7.647532090633077e-08, - "loss": 0.0268, - "reward": 0.577566996216774, - "reward_std": 0.14919354021549225, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 33.38508224487305, + "kl": 2.86328125, + "learning_rate": 3.8237660453165386e-07, + "loss": 0.2541, + "reward": 0.5323660969734192, + "reward_std": 0.15103640407323837, + "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.468191996216774, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1306 }, { "clip_ratio": 0.0, - "completion_length": 1642.9040832519531, + "completion_length": 1440.5402221679688, "epoch": 0.39041147039056084, - "grad_norm": 0.6087823510169983, - "kl": 0.1348876953125, - "learning_rate": 7.643106627931146e-08, - "loss": 0.0356, - "reward": 0.5837053954601288, - "reward_std": 0.10825843177735806, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 52.93671417236328, + "kl": 3.154296875, + "learning_rate": 3.821553313965573e-07, + "loss": 0.2862, + "reward": 0.5546875223517418, + "reward_std": 0.12212117575109005, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1307 }, { "clip_ratio": 0.0, - "completion_length": 1789.7880554199219, + "completion_length": 1544.071533203125, "epoch": 0.3907101784780823, - "grad_norm": 1.1876097917556763, - "kl": 0.156494140625, - "learning_rate": 7.638678289788256e-08, - "loss": 0.0382, - "reward": 0.508928582072258, - "reward_std": 0.15873987041413784, - "rewards/accuracy_reward": 0.046875002793967724, + "grad_norm": 9.903542518615723, + "kl": 4.45703125, + "learning_rate": 3.8193391448941277e-07, + "loss": 0.3392, + "reward": 0.4709821715950966, + "reward_std": 0.20191789418458939, + "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535895228386, + "rewards/tag_count_reward": 0.4174107387661934, "step": 1308 }, { "clip_ratio": 0.0, - "completion_length": 1659.0826416015625, + "completion_length": 1392.1496276855469, "epoch": 0.3910088865656038, - "grad_norm": 0.7656062841415405, - "kl": 0.133056640625, - "learning_rate": 7.634247081022003e-08, - "loss": 0.0228, - "reward": 0.5708705633878708, - "reward_std": 0.12633774057030678, - "rewards/accuracy_reward": 0.09598214644938707, + "grad_norm": 68.98812866210938, + "kl": 2.068359375, + "learning_rate": 3.8171235405110013e-07, + "loss": 0.2012, + "reward": 0.5167411044239998, + "reward_std": 0.1570369489490986, + "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474888414144516, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1309 }, { "clip_ratio": 0.0, - "completion_length": 1762.5447387695312, + "completion_length": 1513.3840026855469, "epoch": 0.39130759465312526, - "grad_norm": 0.9180033802986145, - "kl": 0.13916015625, - "learning_rate": 7.629813006453112e-08, - "loss": 0.0323, - "reward": 0.546316996216774, - "reward_std": 0.16565529257059097, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 72.37940979003906, + "kl": 2.47265625, + "learning_rate": 3.814906503226556e-07, + "loss": 0.2274, + "reward": 0.5239955633878708, + "reward_std": 0.182341530919075, + "rewards/accuracy_reward": 0.08258929150179029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4614955559372902, + "rewards/tag_count_reward": 0.4414062723517418, "step": 1310 }, { "clip_ratio": 0.0, - "completion_length": 1740.1295776367188, + "completion_length": 1474.1183776855469, "epoch": 0.39160630274064673, - "grad_norm": 0.7226965427398682, - "kl": 0.15673828125, - "learning_rate": 7.625376070905418e-08, - "loss": 0.0323, - "reward": 0.5223214700818062, - "reward_std": 0.13736676797270775, - "rewards/accuracy_reward": 0.06250000465661287, + "grad_norm": 40.21659851074219, + "kl": 3.302734375, + "learning_rate": 3.812688035452709e-07, + "loss": 0.2926, + "reward": 0.4866071715950966, + "reward_std": 0.14452451653778553, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4598214477300644, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1311 }, { "clip_ratio": 0.0, - "completion_length": 1758.4554138183594, + "completion_length": 1515.60498046875, "epoch": 0.3919050108281682, - "grad_norm": 2.8768632411956787, - "kl": 0.166015625, - "learning_rate": 7.620936279205874e-08, - "loss": 0.0408, - "reward": 0.5563616305589676, - "reward_std": 0.136737247928977, - "rewards/accuracy_reward": 0.08705357369035482, + "grad_norm": 25.677051544189453, + "kl": 3.69921875, + "learning_rate": 3.810468139602937e-07, + "loss": 0.3181, + "reward": 0.5200893133878708, + "reward_std": 0.19294268265366554, + "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4263393133878708, "step": 1312 }, { "clip_ratio": 0.0, - "completion_length": 1698.1540832519531, + "completion_length": 1462.0313110351562, "epoch": 0.39220371891568967, - "grad_norm": 0.4387132525444031, - "kl": 0.1329345703125, - "learning_rate": 7.616493636184537e-08, - "loss": 0.0246, - "reward": 0.6768973469734192, - "reward_std": 0.1821621172130108, - "rewards/accuracy_reward": 0.2075892984867096, + "grad_norm": 17.955053329467773, + "kl": 3.08203125, + "learning_rate": 3.8082468180922683e-07, + "loss": 0.1982, + "reward": 0.6417410969734192, + "reward_std": 0.20492911711335182, + "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.447544664144516, "step": 1313 }, { "clip_ratio": 0.0, - "completion_length": 1564.0848693847656, + "completion_length": 1393.0804443359375, "epoch": 0.3925024270032111, - "grad_norm": 0.5251122117042542, - "kl": 0.11962890625, - "learning_rate": 7.612048146674568e-08, - "loss": 0.0564, - "reward": 0.5837053954601288, - "reward_std": 0.14451200515031815, - "rewards/accuracy_reward": 0.10491072107106447, + "grad_norm": 60.0616569519043, + "kl": 4.91796875, + "learning_rate": 3.806024073337284e-07, + "loss": 0.3678, + "reward": 0.5362723395228386, + "reward_std": 0.1712651439011097, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4787946715950966, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1314 }, { "clip_ratio": 0.0, - "completion_length": 1785.5000915527344, + "completion_length": 1588.6965026855469, "epoch": 0.39280113509073256, - "grad_norm": 0.6266716718673706, - "kl": 0.1455078125, - "learning_rate": 7.607599815512224e-08, - "loss": 0.0333, - "reward": 0.6099330559372902, - "reward_std": 0.15365520492196083, - "rewards/accuracy_reward": 0.14955358067527413, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.460379496216774, + "grad_norm": 35.023162841796875, + "kl": 4.66015625, + "learning_rate": 3.803799907756112e-07, + "loss": 0.3264, + "reward": 0.5786830633878708, + "reward_std": 0.16471973434090614, + "rewards/accuracy_reward": 0.14062500488944352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1315 }, { "clip_ratio": 0.0, - "completion_length": 1763.1741943359375, + "completion_length": 1522.5313110351562, "epoch": 0.393099843178254, - "grad_norm": 1.3012800216674805, - "kl": 0.13232421875, - "learning_rate": 7.603148647536853e-08, - "loss": 0.0331, - "reward": 0.5837053880095482, - "reward_std": 0.1287491787225008, - "rewards/accuracy_reward": 0.11607143003493547, + "grad_norm": 21.160337448120117, + "kl": 3.59765625, + "learning_rate": 3.801574323768426e-07, + "loss": 0.2876, + "reward": 0.5401785895228386, + "reward_std": 0.15324592031538486, + "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467633955180645, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1316 }, { "clip_ratio": 0.0, - "completion_length": 1689.4710388183594, + "completion_length": 1490.04248046875, "epoch": 0.3933985512657755, - "grad_norm": 0.5159251093864441, - "kl": 0.137451171875, - "learning_rate": 7.598694647590888e-08, - "loss": 0.0219, - "reward": 0.5133928805589676, - "reward_std": 0.15311224572360516, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 18.046106338500977, + "kl": 3.025390625, + "learning_rate": 3.7993473237954445e-07, + "loss": 0.2205, + "reward": 0.4799107313156128, + "reward_std": 0.16653244942426682, + "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.466517873108387, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1317 }, { "clip_ratio": 0.0, - "completion_length": 1725.7545166015625, + "completion_length": 1505.1652526855469, "epoch": 0.39369725935329697, - "grad_norm": 1.9464383125305176, - "kl": 0.136474609375, - "learning_rate": 7.594237820519848e-08, - "loss": 0.0469, - "reward": 0.5608259066939354, - "reward_std": 0.14003478921949863, - "rewards/accuracy_reward": 0.08705357648432255, + "grad_norm": 10.065442085266113, + "kl": 3.38671875, + "learning_rate": 3.7971189102599246e-07, + "loss": 0.2522, + "reward": 0.5279018059372902, + "reward_std": 0.19211949035525322, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723395228386, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1318 }, { "clip_ratio": 0.0, - "completion_length": 1756.0491638183594, + "completion_length": 1517.68310546875, "epoch": 0.39399596744081844, - "grad_norm": 0.7225290536880493, - "kl": 0.15576171875, - "learning_rate": 7.58977817117232e-08, - "loss": 0.0229, - "reward": 0.5870535969734192, - "reward_std": 0.09319883957505226, - "rewards/accuracy_reward": 0.12500000488944352, + "grad_norm": 35.18148422241211, + "kl": 3.41796875, + "learning_rate": 3.7948890855861603e-07, + "loss": 0.2611, + "reward": 0.5613839477300644, + "reward_std": 0.11361411958932877, + "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620536044239998, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1319 }, { "clip_ratio": 0.0, - "completion_length": 1783.1541137695312, + "completion_length": 1495.7523193359375, "epoch": 0.3942946755283399, - "grad_norm": 1.2104487419128418, - "kl": 0.157958984375, - "learning_rate": 7.585315704399968e-08, - "loss": 0.0302, - "reward": 0.5396205633878708, - "reward_std": 0.13092371076345444, - "rewards/accuracy_reward": 0.08035714412108064, + "grad_norm": 16.243579864501953, + "kl": 3.59375, + "learning_rate": 3.7926578521999843e-07, + "loss": 0.2723, + "reward": 0.517857164144516, + "reward_std": 0.15928495675325394, + "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.439732164144516, "step": 1320 }, { "clip_ratio": 0.0, - "completion_length": 1742.2768859863281, + "completion_length": 1458.6585388183594, "epoch": 0.3945933836158614, - "grad_norm": 0.4759024381637573, - "kl": 0.15185546875, - "learning_rate": 7.580850425057519e-08, - "loss": 0.0149, - "reward": 0.5117187723517418, - "reward_std": 0.14630357176065445, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 29.111801147460938, + "kl": 4.40234375, + "learning_rate": 3.7904252125287594e-07, + "loss": 0.319, + "reward": 0.4737723395228386, + "reward_std": 0.15724255610257387, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1321 }, { "clip_ratio": 0.0, - "completion_length": 1791.1228332519531, + "completion_length": 1520.7433776855469, "epoch": 0.39489209170338285, - "grad_norm": 0.8203977942466736, - "kl": 0.158935546875, - "learning_rate": 7.576382338002757e-08, - "loss": 0.0229, - "reward": 0.4838169813156128, - "reward_std": 0.13685563020408154, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 24.655000686645508, + "kl": 3.7109375, + "learning_rate": 3.788191169001379e-07, + "loss": 0.311, + "reward": 0.4564732387661934, + "reward_std": 0.17004377208650112, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4570312649011612, + "rewards/tag_count_reward": 0.4319196566939354, "step": 1322 }, { "clip_ratio": 0.0, - "completion_length": 1698.3125610351562, + "completion_length": 1447.1384582519531, "epoch": 0.3951907997909043, - "grad_norm": 1.0484340190887451, - "kl": 0.143798828125, - "learning_rate": 7.571911448096525e-08, - "loss": 0.0371, - "reward": 0.5385044813156128, - "reward_std": 0.12733584083616734, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 11.153548240661621, + "kl": 3.95703125, + "learning_rate": 3.785955724048262e-07, + "loss": 0.3263, + "reward": 0.5016741305589676, + "reward_std": 0.13410876877605915, + "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.4458705633878708, "step": 1323 }, { "clip_ratio": 0.0, - "completion_length": 1667.5670471191406, + "completion_length": 1419.1942749023438, "epoch": 0.3954895078784258, - "grad_norm": 1.2993844747543335, - "kl": 0.1424560546875, - "learning_rate": 7.56743776020271e-08, - "loss": 0.0484, - "reward": 0.5613839477300644, - "reward_std": 0.11942668631672859, - "rewards/accuracy_reward": 0.09375000116415322, + "grad_norm": 41.64998245239258, + "kl": 5.015625, + "learning_rate": 3.7837188801013553e-07, + "loss": 0.3784, + "reward": 0.5256696715950966, + "reward_std": 0.14723791368305683, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1324 }, { "clip_ratio": 0.0, - "completion_length": 1702.3170776367188, + "completion_length": 1480.8438110351562, "epoch": 0.39578821596594727, - "grad_norm": 1.2707188129425049, - "kl": 0.1461181640625, - "learning_rate": 7.562961279188249e-08, - "loss": 0.0352, - "reward": 0.5842634290456772, - "reward_std": 0.11969576589763165, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 26.610553741455078, + "kl": 4.4453125, + "learning_rate": 3.7814806395941246e-07, + "loss": 0.3203, + "reward": 0.5608259290456772, + "reward_std": 0.1297959890216589, + "rewards/accuracy_reward": 0.1227678582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4570312723517418, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1325 }, { "clip_ratio": 0.0, - "completion_length": 1669.1273193359375, + "completion_length": 1444.3884582519531, "epoch": 0.39608692405346874, - "grad_norm": 0.8886904716491699, - "kl": 0.143310546875, - "learning_rate": 7.558482009923113e-08, - "loss": 0.0385, - "reward": 0.6110491454601288, - "reward_std": 0.11979898996651173, - "rewards/accuracy_reward": 0.14062500558793545, + "grad_norm": 19.633434295654297, + "kl": 4.21484375, + "learning_rate": 3.7792410049615567e-07, + "loss": 0.3692, + "reward": 0.5680803805589676, + "reward_std": 0.16257998906075954, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470424123108387, + "rewards/tag_count_reward": 0.4386160969734192, "step": 1326 }, { "clip_ratio": 0.0, - "completion_length": 1728.9598693847656, + "completion_length": 1493.0290832519531, "epoch": 0.3963856321409902, - "grad_norm": 0.6016729474067688, - "kl": 0.1494140625, - "learning_rate": 7.553999957280308e-08, - "loss": 0.0343, - "reward": 0.5792411044239998, - "reward_std": 0.12792439758777618, - "rewards/accuracy_reward": 0.1071428656578064, + "grad_norm": 37.3018684387207, + "kl": 4.82421875, + "learning_rate": 3.776999978640154e-07, + "loss": 0.3433, + "reward": 0.5429687649011612, + "reward_std": 0.1649802289903164, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4720982387661934, + "rewards/tag_count_reward": 0.4268973395228386, "step": 1327 }, { "clip_ratio": 0.0, - "completion_length": 1763.5425109863281, + "completion_length": 1547.6496276855469, "epoch": 0.3966843402285117, - "grad_norm": 0.8871358036994934, - "kl": 0.152099609375, - "learning_rate": 7.549515126135871e-08, - "loss": 0.0305, - "reward": 0.5982142984867096, - "reward_std": 0.10454865358769894, - "rewards/accuracy_reward": 0.12723214784637094, + "grad_norm": 12.610398292541504, + "kl": 4.40234375, + "learning_rate": 3.774757563067935e-07, + "loss": 0.3254, + "reward": 0.5585937723517418, + "reward_std": 0.14634745381772518, + "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4291294813156128, "step": 1328 }, { "clip_ratio": 0.0, - "completion_length": 1827.7657165527344, + "completion_length": 1561.5960693359375, "epoch": 0.39698304831603315, - "grad_norm": 1.6228326559066772, - "kl": 0.165771484375, - "learning_rate": 7.545027521368854e-08, - "loss": 0.0286, - "reward": 0.5658482611179352, - "reward_std": 0.18191486969590187, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 25.606586456298828, + "kl": 5.0078125, + "learning_rate": 3.7725137606844273e-07, + "loss": 0.3586, + "reward": 0.5066964477300644, + "reward_std": 0.19275490753352642, + "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410895228386, + "rewards/tag_count_reward": 0.4218750074505806, "step": 1329 }, { "clip_ratio": 0.0, - "completion_length": 1716.2411193847656, + "completion_length": 1482.3572082519531, "epoch": 0.3972817564035546, - "grad_norm": 1.155138373374939, - "kl": 0.15771484375, - "learning_rate": 7.540537147861332e-08, - "loss": 0.0348, - "reward": 0.521205373108387, - "reward_std": 0.14142991602420807, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 11.971052169799805, + "kl": 4.35546875, + "learning_rate": 3.770268573930666e-07, + "loss": 0.3745, + "reward": 0.486607164144516, + "reward_std": 0.19278451800346375, + "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1330 }, { "clip_ratio": 0.0, - "completion_length": 1768.4844665527344, + "completion_length": 1507.8996276855469, "epoch": 0.3975804644910761, - "grad_norm": 0.7360109090805054, - "kl": 0.156982421875, - "learning_rate": 7.536044010498395e-08, - "loss": 0.0368, - "reward": 0.6205357536673546, - "reward_std": 0.11732006445527077, - "rewards/accuracy_reward": 0.1629464365541935, + "grad_norm": 58.936798095703125, + "kl": 3.23046875, + "learning_rate": 3.7680220052491974e-07, + "loss": 0.2903, + "reward": 0.588727705180645, + "reward_std": 0.120096854865551, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893059372902, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1331 }, { "clip_ratio": 0.0, - "completion_length": 1654.0491638183594, + "completion_length": 1419.8728637695312, "epoch": 0.39787917257859756, - "grad_norm": 1.2172755002975464, - "kl": 0.150390625, - "learning_rate": 7.531548114168132e-08, - "loss": 0.0491, - "reward": 0.6305803805589676, - "reward_std": 0.14415135234594345, - "rewards/accuracy_reward": 0.16964286426082253, + "grad_norm": 46.11073684692383, + "kl": 2.7265625, + "learning_rate": 3.765774057084066e-07, + "loss": 0.2468, + "reward": 0.6121652200818062, + "reward_std": 0.1577361524105072, + "rewards/accuracy_reward": 0.16517857951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1332 }, { "clip_ratio": 0.0, - "completion_length": 1741.3014221191406, + "completion_length": 1562.2165832519531, "epoch": 0.39817788066611903, - "grad_norm": 0.7192592620849609, - "kl": 0.1412353515625, - "learning_rate": 7.52704946376164e-08, - "loss": 0.0271, - "reward": 0.5390625149011612, - "reward_std": 0.12837214209139347, - "rewards/accuracy_reward": 0.07142857322469354, + "grad_norm": 92.28487396240234, + "kl": 5.5859375, + "learning_rate": 3.76352473188082e-07, + "loss": 0.354, + "reward": 0.495535746216774, + "reward_std": 0.14340483769774437, + "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339402794838, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1333 }, { "clip_ratio": 0.0, - "completion_length": 1762.4956359863281, + "completion_length": 1492.5603332519531, "epoch": 0.3984765887536405, - "grad_norm": 3.2300150394439697, - "kl": 0.166259765625, - "learning_rate": 7.522548064173008e-08, - "loss": 0.0342, - "reward": 0.5820312723517418, - "reward_std": 0.14732768200337887, - "rewards/accuracy_reward": 0.12276786169968545, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4592634215950966, + "grad_norm": 15.35764217376709, + "kl": 3.9375, + "learning_rate": 3.761274032086504e-07, + "loss": 0.3135, + "reward": 0.5345982238650322, + "reward_std": 0.1641866471618414, + "rewards/accuracy_reward": 0.09598214970901608, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1334 }, { "clip_ratio": 0.0, - "completion_length": 1773.0134887695312, + "completion_length": 1543.8304138183594, "epoch": 0.398775296841162, - "grad_norm": 0.5092839598655701, - "kl": 0.13671875, - "learning_rate": 7.51804392029932e-08, - "loss": 0.0313, - "reward": 0.5859375447034836, - "reward_std": 0.14513026736676693, - "rewards/accuracy_reward": 0.1227678656578064, + "grad_norm": 17.701705932617188, + "kl": 3.646484375, + "learning_rate": 3.7590219601496596e-07, + "loss": 0.2776, + "reward": 0.5714285969734192, + "reward_std": 0.17822470515966415, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696715950966, + "rewards/tag_count_reward": 0.4441964477300644, "step": 1335 }, { "clip_ratio": 0.0, - "completion_length": 1765.3282165527344, + "completion_length": 1545.1808776855469, "epoch": 0.39907400492868345, - "grad_norm": 1.5990031957626343, - "kl": 0.14453125, - "learning_rate": 7.51353703704064e-08, - "loss": 0.037, - "reward": 0.6026786118745804, - "reward_std": 0.12355122715234756, - "rewards/accuracy_reward": 0.13392857951112092, + "grad_norm": 100.78438568115234, + "kl": 5.3046875, + "learning_rate": 3.7567685185203203e-07, + "loss": 0.3347, + "reward": 0.6104911118745804, + "reward_std": 0.17931295558810234, + "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1336 }, { "clip_ratio": 0.0, - "completion_length": 1755.2523193359375, + "completion_length": 1599.5223693847656, "epoch": 0.3993727130162049, - "grad_norm": 1.9984264373779297, - "kl": 0.148681640625, - "learning_rate": 7.509027419300016e-08, - "loss": 0.0401, - "reward": 0.5970982387661934, - "reward_std": 0.13936078920960426, + "grad_norm": 153.1273193359375, + "kl": 7.546875, + "learning_rate": 3.754513709650008e-07, + "loss": 0.4674, + "reward": 0.5647321715950966, + "reward_std": 0.1779017485678196, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4587053805589676, + "rewards/tag_count_reward": 0.4263393059372902, "step": 1337 }, { "clip_ratio": 0.0, - "completion_length": 1675.6116943359375, + "completion_length": 1419.1317749023438, "epoch": 0.3996714211037264, - "grad_norm": 0.9094367027282715, - "kl": 0.129150390625, - "learning_rate": 7.504515071983472e-08, - "loss": 0.0413, - "reward": 0.640625037252903, - "reward_std": 0.16363739967346191, - "rewards/accuracy_reward": 0.15625000558793545, + "grad_norm": 71.07659149169922, + "kl": 5.2890625, + "learning_rate": 3.7522575359917364e-07, + "loss": 0.4236, + "reward": 0.5781250298023224, + "reward_std": 0.1936220731586218, + "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4843750223517418, + "rewards/tag_count_reward": 0.4397321715950966, "step": 1338 }, { "clip_ratio": 0.0, - "completion_length": 1717.3638916015625, + "completion_length": 1441.5000915527344, "epoch": 0.39997012919124786, - "grad_norm": 1.0398073196411133, - "kl": 0.153564453125, - "learning_rate": 7.5e-08, - "loss": 0.0255, - "reward": 0.6143973469734192, - "reward_std": 0.1585216587409377, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 57.1750602722168, + "kl": 4.849609375, + "learning_rate": 3.75e-07, + "loss": 0.3621, + "reward": 0.5887276977300644, + "reward_std": 0.16095156036317348, + "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.471540205180645, + "rewards/tag_count_reward": 0.4503348395228386, "step": 1339 }, { "clip_ratio": 0.0, - "completion_length": 1739.3415832519531, + "completion_length": 1512.4442443847656, "epoch": 0.40026883727876933, - "grad_norm": 4.104918003082275, - "kl": 0.16748046875, - "learning_rate": 7.495482208261553e-08, - "loss": 0.0362, - "reward": 0.623325914144516, - "reward_std": 0.16582290455698967, - "rewards/accuracy_reward": 0.15848215389996767, + "grad_norm": 109.33737182617188, + "kl": 5.7265625, + "learning_rate": 3.747741104130777e-07, + "loss": 0.4089, + "reward": 0.581473246216774, + "reward_std": 0.16761961951851845, + "rewards/accuracy_reward": 0.13616072316654027, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437649011612, + "rewards/tag_count_reward": 0.4453125149011612, "step": 1340 }, { "clip_ratio": 0.0, - "completion_length": 1801.1318054199219, + "completion_length": 1577.8415832519531, "epoch": 0.4005675453662908, - "grad_norm": 2.408722400665283, - "kl": 0.15771484375, - "learning_rate": 7.490961701683048e-08, - "loss": 0.0336, - "reward": 0.5636160969734192, - "reward_std": 0.1759437471628189, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 43.297401428222656, + "kl": 4.80859375, + "learning_rate": 3.7454808508415237e-07, + "loss": 0.3224, + "reward": 0.517299123108387, + "reward_std": 0.1896546706557274, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.458705373108387, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1341 }, { "clip_ratio": 0.0, - "completion_length": 1689.6005249023438, + "completion_length": 1471.38623046875, "epoch": 0.40086625345381227, - "grad_norm": 1.7797974348068237, - "kl": 0.14892578125, - "learning_rate": 7.48643848518235e-08, - "loss": 0.0422, - "reward": 0.5636160895228386, - "reward_std": 0.10696693882346153, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 10.072811126708984, + "kl": 3.6640625, + "learning_rate": 3.743219242591175e-07, + "loss": 0.2735, + "reward": 0.531250037252903, + "reward_std": 0.12994037196040154, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660895228386, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1342 }, { "clip_ratio": 0.0, - "completion_length": 1709.1830749511719, + "completion_length": 1524.3907165527344, "epoch": 0.40116496154133374, - "grad_norm": 1.4601484537124634, - "kl": 0.1513671875, - "learning_rate": 7.481912563680279e-08, - "loss": 0.0287, - "reward": 0.5239955559372902, - "reward_std": 0.13747013360261917, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 39.315330505371094, + "kl": 2.78515625, + "learning_rate": 3.7409562818401395e-07, + "loss": 0.2113, + "reward": 0.5055803880095482, + "reward_std": 0.19099636003375053, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4614955559372902, + "rewards/tag_count_reward": 0.431919664144516, "step": 1343 }, { "clip_ratio": 0.0, - "completion_length": 1789.8951721191406, + "completion_length": 1559.1563110351562, "epoch": 0.4014636696288552, - "grad_norm": 1.1765477657318115, - "kl": 0.150390625, - "learning_rate": 7.477383942100592e-08, - "loss": 0.0249, - "reward": 0.5345982313156128, - "reward_std": 0.14813394844532013, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 48.78361129760742, + "kl": 2.896484375, + "learning_rate": 3.738691971050296e-07, + "loss": 0.2417, + "reward": 0.5044642984867096, + "reward_std": 0.15767910704016685, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410969734192, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1344 }, { "clip_ratio": 0.0, - "completion_length": 1800.6697387695312, + "completion_length": 1556.5067443847656, "epoch": 0.4017623777163767, - "grad_norm": 4.183812618255615, - "kl": 0.168701171875, - "learning_rate": 7.472852625369985e-08, - "loss": 0.0306, - "reward": 0.526227705180645, - "reward_std": 0.12996549904346466, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 26.767873764038086, + "kl": 3.23828125, + "learning_rate": 3.7364263126849924e-07, + "loss": 0.2685, + "reward": 0.5050223469734192, + "reward_std": 0.1915640141814947, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470424123108387, + "rewards/tag_count_reward": 0.4335937649011612, "step": 1345 }, { "clip_ratio": 0.0, - "completion_length": 1740.1183776855469, + "completion_length": 1523.2723999023438, "epoch": 0.40206108580389815, - "grad_norm": 4.379576206207275, - "kl": 0.1641845703125, - "learning_rate": 7.468318618418088e-08, - "loss": 0.0273, - "reward": 0.5239955633878708, - "reward_std": 0.09269173629581928, - "rewards/accuracy_reward": 0.05133928661234677, + "grad_norm": 52.8403205871582, + "kl": 2.568359375, + "learning_rate": 3.734159309209044e-07, + "loss": 0.2298, + "reward": 0.4977678880095482, + "reward_std": 0.10809685103595257, + "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562798023224, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1346 }, { "clip_ratio": 0.0, - "completion_length": 1744.6429138183594, + "completion_length": 1586.5313415527344, "epoch": 0.4023597938914196, - "grad_norm": 1.8378609418869019, - "kl": 0.1494140625, - "learning_rate": 7.463781926177455e-08, - "loss": 0.0293, - "reward": 0.5039062649011612, - "reward_std": 0.13653886504471302, - "rewards/accuracy_reward": 0.04464285867288709, + "grad_norm": 19.748279571533203, + "kl": 3.69921875, + "learning_rate": 3.7318909630887276e-07, + "loss": 0.2391, + "reward": 0.466517873108387, + "reward_std": 0.15702650509774685, + "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1347 }, { "clip_ratio": 0.0, - "completion_length": 1771.7768859863281, + "completion_length": 1622.2121276855469, "epoch": 0.4026585019789411, - "grad_norm": 2.2206056118011475, - "kl": 0.15087890625, - "learning_rate": 7.459242553583563e-08, - "loss": 0.0241, - "reward": 0.532366082072258, - "reward_std": 0.15256562270224094, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 8.773845672607422, + "kl": 3.95703125, + "learning_rate": 3.7296212767917813e-07, + "loss": 0.2685, + "reward": 0.474330373108387, + "reward_std": 0.15803285501897335, + "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.458705373108387, + "rewards/tag_count_reward": 0.416294664144516, "step": 1348 }, { "clip_ratio": 0.0, - "completion_length": 1773.4442749023438, + "completion_length": 1521.8706359863281, "epoch": 0.40295721006646257, - "grad_norm": 0.9824259281158447, - "kl": 0.15087890625, - "learning_rate": 7.454700505574803e-08, - "loss": 0.0206, - "reward": 0.631696455180645, - "reward_std": 0.16333747282624245, - "rewards/accuracy_reward": 0.16517858067527413, + "grad_norm": 19.897489547729492, + "kl": 3.9921875, + "learning_rate": 3.7273502527874017e-07, + "loss": 0.2927, + "reward": 0.5630580633878708, + "reward_std": 0.14937039092183113, + "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.466517873108387, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1349 }, { "clip_ratio": 0.0, - "completion_length": 1740.5045471191406, + "completion_length": 1516.1540832519531, "epoch": 0.40325591815398404, - "grad_norm": 1.2426023483276367, - "kl": 0.137451171875, - "learning_rate": 7.450155787092483e-08, - "loss": 0.0415, - "reward": 0.6015625223517418, - "reward_std": 0.11820575781166553, - "rewards/accuracy_reward": 0.13392857951112092, + "grad_norm": 51.03802490234375, + "kl": 3.0, + "learning_rate": 3.725077893546241e-07, + "loss": 0.2527, + "reward": 0.5982143208384514, + "reward_std": 0.12906441651284695, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1350 }, { "clip_ratio": 0.0, - "completion_length": 1813.9197387695312, + "completion_length": 1643.2121276855469, "epoch": 0.4035546262415055, - "grad_norm": 6.620652675628662, - "kl": 0.1767578125, - "learning_rate": 7.445608403080806e-08, - "loss": 0.0266, - "reward": 0.5541295036673546, - "reward_std": 0.16720747202634811, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 42.61442565917969, + "kl": 4.84375, + "learning_rate": 3.722804201540403e-07, + "loss": 0.3132, + "reward": 0.4659598395228386, + "reward_std": 0.20050760731101036, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723469734192, + "rewards/tag_count_reward": 0.4079241305589676, "step": 1351 }, { "clip_ratio": 0.0, - "completion_length": 1751.97998046875, + "completion_length": 1532.3081359863281, "epoch": 0.403853334329027, - "grad_norm": 0.8044705390930176, - "kl": 0.137939453125, - "learning_rate": 7.441058358486879e-08, - "loss": 0.0301, - "reward": 0.4726562723517418, - "reward_std": 0.09006266947835684, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 53.0906867980957, + "kl": 2.953125, + "learning_rate": 3.7205291792434397e-07, + "loss": 0.2272, + "reward": 0.4508928805589676, + "reward_std": 0.10211499035358429, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598395228386, + "rewards/tag_count_reward": 0.4464285969734192, "step": 1352 }, { "clip_ratio": 0.0, - "completion_length": 1745.6407165527344, + "completion_length": 1510.7322082519531, "epoch": 0.40415204241654845, - "grad_norm": 0.6883458495140076, - "kl": 0.145751953125, - "learning_rate": 7.43650565826071e-08, - "loss": 0.0329, - "reward": 0.5189732387661934, - "reward_std": 0.11833003908395767, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 7.686004638671875, + "kl": 3.671875, + "learning_rate": 3.718252829130355e-07, + "loss": 0.2604, + "reward": 0.4765625223517418, + "reward_std": 0.14839400351047516, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375149011612, + "rewards/tag_count_reward": 0.4207589477300644, "step": 1353 }, { "clip_ratio": 0.0, - "completion_length": 1758.63623046875, + "completion_length": 1624.33935546875, "epoch": 0.4044507505040699, - "grad_norm": 2.022012948989868, - "kl": 0.130126953125, - "learning_rate": 7.431950307355188e-08, - "loss": 0.0303, - "reward": 0.6194196715950966, - "reward_std": 0.12730952445417643, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 48.989070892333984, + "kl": 4.515625, + "learning_rate": 3.715975153677594e-07, + "loss": 0.2782, + "reward": 0.5652902126312256, + "reward_std": 0.17247237637639046, + "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.4246651902794838, "step": 1354 }, { "clip_ratio": 0.0, - "completion_length": 1671.9889221191406, + "completion_length": 1429.0223999023438, "epoch": 0.4047494585915914, - "grad_norm": 0.6624343395233154, - "kl": 0.1414794921875, - "learning_rate": 7.427392310726087e-08, - "loss": 0.0309, - "reward": 0.5731026977300644, - "reward_std": 0.1160865519195795, - "rewards/accuracy_reward": 0.09598214784637094, + "grad_norm": 13.560033798217773, + "kl": 4.046875, + "learning_rate": 3.7136961553630437e-07, + "loss": 0.3189, + "reward": 0.5189732387661934, + "reward_std": 0.12493516132235527, + "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1355 }, { "clip_ratio": 0.0, - "completion_length": 1724.3817749023438, + "completion_length": 1511.5335693359375, "epoch": 0.40504816667911286, - "grad_norm": 0.5301927328109741, - "kl": 0.1396484375, - "learning_rate": 7.422831673332064e-08, - "loss": 0.0417, - "reward": 0.5167410969734192, - "reward_std": 0.10934286285191774, - "rewards/accuracy_reward": 0.042410717345774174, + "grad_norm": 32.09595489501953, + "kl": 3.63671875, + "learning_rate": 3.711415836666032e-07, + "loss": 0.2944, + "reward": 0.4760044887661934, + "reward_std": 0.1568305417895317, + "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474330373108387, + "rewards/tag_count_reward": 0.435825914144516, "step": 1356 }, { "clip_ratio": 0.0, - "completion_length": 1744.1272888183594, + "completion_length": 1534.790283203125, "epoch": 0.4053468747666343, - "grad_norm": 1.0052516460418701, - "kl": 0.1435546875, - "learning_rate": 7.418268400134643e-08, - "loss": 0.0308, - "reward": 0.5256696566939354, - "reward_std": 0.13329633511602879, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 23.30109977722168, + "kl": 4.171875, + "learning_rate": 3.7091342000673216e-07, + "loss": 0.2897, + "reward": 0.4972098395228386, + "reward_std": 0.16137411631643772, + "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375149011612, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1357 }, { "clip_ratio": 0.0, - "completion_length": 1663.3348999023438, + "completion_length": 1418.4085388183594, "epoch": 0.40564558285415575, - "grad_norm": 1.8446307182312012, - "kl": 0.1533203125, - "learning_rate": 7.413702496098218e-08, - "loss": 0.0316, - "reward": 0.561383955180645, - "reward_std": 0.09909173753112555, - "rewards/accuracy_reward": 0.09151785937137902, + "grad_norm": 31.130617141723633, + "kl": 2.78515625, + "learning_rate": 3.706851248049109e-07, + "loss": 0.2305, + "reward": 0.5440848469734192, + "reward_std": 0.11512686870992184, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4698660969734192, + "rewards/tag_count_reward": 0.4525669813156128, "step": 1358 }, { "clip_ratio": 0.0, - "completion_length": 1719.8125610351562, + "completion_length": 1511.7344665527344, "epoch": 0.4059442909416772, - "grad_norm": 3.487643003463745, - "kl": 0.1556396484375, - "learning_rate": 7.409133966190045e-08, - "loss": 0.0314, - "reward": 0.581473246216774, - "reward_std": 0.15905306488275528, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 41.73582077026367, + "kl": 3.078125, + "learning_rate": 3.704566983095022e-07, + "loss": 0.2581, + "reward": 0.5803571715950966, + "reward_std": 0.17149999737739563, + "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696566939354, + "rewards/tag_count_reward": 0.450892873108387, "step": 1359 }, { "clip_ratio": 0.0, - "completion_length": 1823.0201416015625, + "completion_length": 1620.9197387695312, "epoch": 0.4062429990291987, - "grad_norm": 0.7606572508811951, - "kl": 0.150146484375, - "learning_rate": 7.404562815380238e-08, - "loss": 0.0254, - "reward": 0.5518973395228386, - "reward_std": 0.11160346120595932, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 17.62242317199707, + "kl": 3.91015625, + "learning_rate": 3.7022814076901187e-07, + "loss": 0.2696, + "reward": 0.5156250149011612, + "reward_std": 0.12963330373167992, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437649011612, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1360 }, { "clip_ratio": 0.0, - "completion_length": 1766.7634582519531, + "completion_length": 1617.7254943847656, "epoch": 0.40654170711672016, - "grad_norm": 1.6737440824508667, - "kl": 0.15283203125, - "learning_rate": 7.399989048641759e-08, - "loss": 0.0384, - "reward": 0.635044664144516, - "reward_std": 0.13106032647192478, - "rewards/accuracy_reward": 0.1741071529686451, + "grad_norm": 21.543987274169922, + "kl": 3.84765625, + "learning_rate": 3.6999945243208795e-07, + "loss": 0.2756, + "reward": 0.6026785969734192, + "reward_std": 0.18320314958691597, + "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.419642873108387, "step": 1361 }, { "clip_ratio": 0.0, - "completion_length": 1748.7478637695312, + "completion_length": 1549.6005249023438, "epoch": 0.40684041520424163, - "grad_norm": 1.1856629848480225, - "kl": 0.142333984375, - "learning_rate": 7.39541267095042e-08, - "loss": 0.0366, - "reward": 0.549665205180645, - "reward_std": 0.15015487559139729, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 62.5572509765625, + "kl": 5.1953125, + "learning_rate": 3.69770633547521e-07, + "loss": 0.3651, + "reward": 0.556919664144516, + "reward_std": 0.20681850612163544, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.4475446566939354, "step": 1362 }, { "clip_ratio": 0.0, - "completion_length": 1722.6384887695312, + "completion_length": 1519.8683471679688, "epoch": 0.4071391232917631, - "grad_norm": 0.41842812299728394, - "kl": 0.144775390625, - "learning_rate": 7.390833687284872e-08, - "loss": 0.037, - "reward": 0.565290205180645, - "reward_std": 0.17396751046180725, - "rewards/accuracy_reward": 0.10491071827709675, + "grad_norm": 47.360172271728516, + "kl": 4.57421875, + "learning_rate": 3.695416843642436e-07, + "loss": 0.3031, + "reward": 0.517299123108387, + "reward_std": 0.1952129751443863, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794813156128, + "rewards/tag_count_reward": 0.428013414144516, "step": 1363 }, { "clip_ratio": 0.0, - "completion_length": 1738.7411499023438, + "completion_length": 1547.5648193359375, "epoch": 0.4074378313792846, - "grad_norm": 1.1935629844665527, - "kl": 0.152099609375, - "learning_rate": 7.386252102626598e-08, - "loss": 0.0357, - "reward": 0.6316964626312256, - "reward_std": 0.15275218151509762, - "rewards/accuracy_reward": 0.16294643469154835, + "grad_norm": 46.98716354370117, + "kl": 4.81640625, + "learning_rate": 3.693126051313299e-07, + "loss": 0.3093, + "reward": 0.5697544887661934, + "reward_std": 0.15692155621945858, + "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500223517418, + "rewards/tag_count_reward": 0.431361623108387, "step": 1364 }, { "clip_ratio": 0.0, - "completion_length": 1758.1764526367188, + "completion_length": 1513.3125915527344, "epoch": 0.40773653946680605, - "grad_norm": 1.6057977676391602, - "kl": 0.161865234375, - "learning_rate": 7.381667921959915e-08, - "loss": 0.0365, - "reward": 0.4888393133878708, - "reward_std": 0.1378103718161583, - "rewards/accuracy_reward": 0.02232142980210483, + "grad_norm": 74.69768524169922, + "kl": 4.98828125, + "learning_rate": 3.6908339609799573e-07, + "loss": 0.3093, + "reward": 0.4866071566939354, + "reward_std": 0.17243976891040802, + "rewards/accuracy_reward": 0.04687500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.466517873108387, + "rewards/tag_count_reward": 0.439732164144516, "step": 1365 }, { "clip_ratio": 0.0, - "completion_length": 1806.1473999023438, + "completion_length": 1603.1965026855469, "epoch": 0.4080352475543275, - "grad_norm": 1.1660295724868774, - "kl": 0.17529296875, - "learning_rate": 7.37708115027196e-08, - "loss": 0.0392, - "reward": 0.5758928880095482, - "reward_std": 0.14891951903700829, - "rewards/accuracy_reward": 0.11830357578583062, + "grad_norm": 13.175323486328125, + "kl": 3.90625, + "learning_rate": 3.68854057513598e-07, + "loss": 0.2816, + "reward": 0.5457589626312256, + "reward_std": 0.17364555783569813, + "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893133878708, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1366 }, { "clip_ratio": 0.0, - "completion_length": 1751.1697387695312, + "completion_length": 1566.3348693847656, "epoch": 0.408333955641849, - "grad_norm": 0.8645027875900269, - "kl": 0.157958984375, - "learning_rate": 7.372491792552693e-08, - "loss": 0.0319, - "reward": 0.6467634215950966, - "reward_std": 0.13580774422734976, - "rewards/accuracy_reward": 0.1808035857975483, + "grad_norm": 37.77369689941406, + "kl": 4.6953125, + "learning_rate": 3.6862458962763463e-07, + "loss": 0.3137, + "reward": 0.6004464626312256, + "reward_std": 0.17737188376486301, + "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598469734192, + "rewards/tag_count_reward": 0.4218750149011612, "step": 1367 }, { "clip_ratio": 0.0, - "completion_length": 1812.69873046875, + "completion_length": 1624.1563110351562, "epoch": 0.40863266372937046, - "grad_norm": 2.263353109359741, - "kl": 0.16943359375, - "learning_rate": 7.367899853794885e-08, - "loss": 0.0298, - "reward": 0.5418526977300644, - "reward_std": 0.16290037892758846, - "rewards/accuracy_reward": 0.07812500302679837, + "grad_norm": 33.757781982421875, + "kl": 3.4296875, + "learning_rate": 3.6839499268974426e-07, + "loss": 0.2491, + "reward": 0.5145089477300644, + "reward_std": 0.20166927576065063, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1368 }, { "clip_ratio": 0.0, - "completion_length": 1750.3773193359375, + "completion_length": 1564.602783203125, "epoch": 0.40893137181689193, - "grad_norm": 1.66549551486969, - "kl": 0.1632080078125, - "learning_rate": 7.363305338994115e-08, - "loss": 0.0344, - "reward": 0.5859375298023224, - "reward_std": 0.1492079794406891, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 36.540061950683594, + "kl": 3.271484375, + "learning_rate": 3.6816526694970576e-07, + "loss": 0.2632, + "reward": 0.5731026977300644, + "reward_std": 0.15532847307622433, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4676339477300644, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1369 }, { "clip_ratio": 0.0, - "completion_length": 1751.2679138183594, + "completion_length": 1544.01123046875, "epoch": 0.4092300799044134, - "grad_norm": 0.9856199026107788, - "kl": 0.154052734375, - "learning_rate": 7.358708253148766e-08, - "loss": 0.0308, - "reward": 0.5714285895228386, - "reward_std": 0.1772101316601038, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 44.578365325927734, + "kl": 4.453125, + "learning_rate": 3.679354126574383e-07, + "loss": 0.2939, + "reward": 0.5680803805589676, + "reward_std": 0.2071404941380024, + "rewards/accuracy_reward": 0.12276786495931447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1370 }, { "clip_ratio": 0.0, - "completion_length": 1817.2634582519531, + "completion_length": 1618.6585388183594, "epoch": 0.4095287879919349, - "grad_norm": 2.5607857704162598, - "kl": 0.180908203125, - "learning_rate": 7.354108601260014e-08, - "loss": 0.0368, - "reward": 0.487165205180645, - "reward_std": 0.1073995865881443, - "rewards/accuracy_reward": 0.020089285913854837, + "grad_norm": 17.439876556396484, + "kl": 4.70703125, + "learning_rate": 3.677054300630007e-07, + "loss": 0.3349, + "reward": 0.4508928805589676, + "reward_std": 0.1469549499452114, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1371 }, { "clip_ratio": 0.0, - "completion_length": 1715.3683776855469, + "completion_length": 1464.6340026855469, "epoch": 0.40982749607945634, - "grad_norm": 1.5471996068954468, - "kl": 0.145263671875, - "learning_rate": 7.349506388331833e-08, - "loss": 0.0316, - "reward": 0.5591518208384514, - "reward_std": 0.1387225929647684, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 20.87938690185547, + "kl": 3.578125, + "learning_rate": 3.674753194165917e-07, + "loss": 0.2665, + "reward": 0.5262277126312256, + "reward_std": 0.14989223890006542, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4743303805589676, + "rewards/tag_count_reward": 0.4570312649011612, "step": 1372 }, { "clip_ratio": 0.0, - "completion_length": 1745.3371276855469, + "completion_length": 1520.2634887695312, "epoch": 0.4101262041669778, - "grad_norm": 0.772765040397644, - "kl": 0.15771484375, - "learning_rate": 7.344901619370977e-08, - "loss": 0.0398, - "reward": 0.5758928805589676, - "reward_std": 0.14908982627093792, - "rewards/accuracy_reward": 0.10937500116415322, + "grad_norm": 17.004886627197266, + "kl": 3.15234375, + "learning_rate": 3.672450809685488e-07, + "loss": 0.2324, + "reward": 0.5379464477300644, + "reward_std": 0.19398957304656506, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.466517873108387, + "rewards/tag_count_reward": 0.435267873108387, "step": 1373 }, { "clip_ratio": 0.0, - "completion_length": 1748.0402526855469, + "completion_length": 1507.2232971191406, "epoch": 0.4104249122544993, - "grad_norm": 1.762274146080017, - "kl": 0.168701171875, - "learning_rate": 7.340294299386984e-08, - "loss": 0.0324, - "reward": 0.5200893059372902, - "reward_std": 0.16958842612802982, - "rewards/accuracy_reward": 0.058035715483129025, + "grad_norm": 42.45034408569336, + "kl": 3.13671875, + "learning_rate": 3.670147149693492e-07, + "loss": 0.2762, + "reward": 0.478236623108387, + "reward_std": 0.16527236253023148, + "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535895228386, + "rewards/tag_count_reward": 0.440290205180645, "step": 1374 }, { "clip_ratio": 0.0, - "completion_length": 1782.2657165527344, + "completion_length": 1580.7902221679688, "epoch": 0.41072362034202076, - "grad_norm": 1.3104974031448364, - "kl": 0.1702880859375, - "learning_rate": 7.335684433392169e-08, - "loss": 0.0307, - "reward": 0.4793526977300644, - "reward_std": 0.11422690190374851, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 36.500213623046875, + "kl": 3.3671875, + "learning_rate": 3.667842216696084e-07, + "loss": 0.2735, + "reward": 0.4447544887661934, + "reward_std": 0.1446375548839569, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4268973469734192, "step": 1375 }, { "clip_ratio": 0.0, - "completion_length": 1794.3058776855469, + "completion_length": 1545.47998046875, "epoch": 0.4110223284295422, - "grad_norm": 1.0438309907913208, - "kl": 0.173095703125, - "learning_rate": 7.33107202640161e-08, - "loss": 0.0448, - "reward": 0.5585937798023224, - "reward_std": 0.12565508484840393, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 18.709720611572266, + "kl": 3.89453125, + "learning_rate": 3.665536013200805e-07, + "loss": 0.3045, + "reward": 0.5669643133878708, + "reward_std": 0.15088903717696667, + "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.446986623108387, + "rewards/tag_count_reward": 0.444196455180645, "step": 1376 }, { "clip_ratio": 0.0, - "completion_length": 1685.4375610351562, + "completion_length": 1461.0268249511719, "epoch": 0.4113210365170637, - "grad_norm": 2.2266764640808105, - "kl": 0.159423828125, - "learning_rate": 7.326457083433155e-08, - "loss": 0.0422, - "reward": 0.626674123108387, - "reward_std": 0.12033795192837715, - "rewards/accuracy_reward": 0.15848215157166123, + "grad_norm": 19.115894317626953, + "kl": 4.0234375, + "learning_rate": 3.663228541716578e-07, + "loss": 0.3283, + "reward": 0.5965401977300644, + "reward_std": 0.117519935593009, + "rewards/accuracy_reward": 0.15178571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4681919813156128, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1377 }, { "clip_ratio": 0.0, - "completion_length": 1701.1094665527344, + "completion_length": 1524.3147583007812, "epoch": 0.41161974460458517, - "grad_norm": 5.1130690574646, - "kl": 0.169189453125, - "learning_rate": 7.321839609507412e-08, - "loss": 0.0259, - "reward": 0.6456473469734192, - "reward_std": 0.14158300962299109, - "rewards/accuracy_reward": 0.1785714365541935, + "grad_norm": 33.395790100097656, + "kl": 4.3203125, + "learning_rate": 3.660919804753706e-07, + "loss": 0.2971, + "reward": 0.6082589626312256, + "reward_std": 0.13959864526987076, + "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1378 }, { "clip_ratio": 0.0, - "completion_length": 1783.2656860351562, + "completion_length": 1515.0201721191406, "epoch": 0.41191845269210664, - "grad_norm": 2.003352165222168, - "kl": 0.1611328125, - "learning_rate": 7.31721960964774e-08, - "loss": 0.0403, - "reward": 0.5725446790456772, - "reward_std": 0.1405333299189806, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 20.93138313293457, + "kl": 3.77734375, + "learning_rate": 3.6586098048238693e-07, + "loss": 0.289, + "reward": 0.555803619325161, + "reward_std": 0.1417046468704939, + "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1379 }, { "clip_ratio": 0.0, - "completion_length": 1743.2255554199219, + "completion_length": 1470.96435546875, "epoch": 0.4122171607796281, - "grad_norm": 1.695895791053772, - "kl": 0.165771484375, - "learning_rate": 7.312597088880242e-08, - "loss": 0.0438, - "reward": 0.5111607387661934, - "reward_std": 0.11690266244113445, - "rewards/accuracy_reward": 0.05133928800933063, + "grad_norm": 11.871637344360352, + "kl": 4.00390625, + "learning_rate": 3.6562985444401205e-07, + "loss": 0.2963, + "reward": 0.4927455559372902, + "reward_std": 0.13928680680692196, + "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4598214402794838, + "rewards/tag_count_reward": 0.4369419887661934, "step": 1380 }, { "clip_ratio": 0.0, - "completion_length": 1640.07373046875, + "completion_length": 1398.1965026855469, "epoch": 0.4125158688671496, - "grad_norm": 0.9683740735054016, - "kl": 0.142578125, - "learning_rate": 7.307972052233771e-08, - "loss": 0.0274, - "reward": 0.603794664144516, - "reward_std": 0.16658007726073265, - "rewards/accuracy_reward": 0.12946429220028222, + "grad_norm": 32.21767807006836, + "kl": 3.21875, + "learning_rate": 3.6539860261168854e-07, + "loss": 0.2274, + "reward": 0.5792411044239998, + "reward_std": 0.16181877814233303, + "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.474330373108387, + "rewards/tag_count_reward": 0.4564732387661934, "step": 1381 }, { "clip_ratio": 0.0, - "completion_length": 1894.4085388183594, + "completion_length": 1709.1139221191406, "epoch": 0.41281457695467105, - "grad_norm": 1.2980173826217651, - "kl": 0.162353515625, - "learning_rate": 7.303344504739913e-08, - "loss": 0.0245, - "reward": 0.5675223469734192, - "reward_std": 0.15550493821501732, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 6.037398815155029, + "kl": 3.8046875, + "learning_rate": 3.6516722523699566e-07, + "loss": 0.237, + "reward": 0.5474330559372902, + "reward_std": 0.18723542988300323, + "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4581473469734192, + "rewards/tag_count_reward": 0.4335937649011612, "step": 1382 }, { "clip_ratio": 0.0, - "completion_length": 1739.7277526855469, + "completion_length": 1517.6072082519531, "epoch": 0.4131132850421925, - "grad_norm": 0.8146997094154358, - "kl": 0.146240234375, - "learning_rate": 7.298714451432985e-08, - "loss": 0.0355, - "reward": 0.5463169738650322, - "reward_std": 0.1263580620288849, - "rewards/accuracy_reward": 0.06919643119908869, + "grad_norm": 32.25304412841797, + "kl": 2.845703125, + "learning_rate": 3.649357225716493e-07, + "loss": 0.2132, + "reward": 0.5206473469734192, + "reward_std": 0.14910650067031384, + "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205484867096, + "rewards/tag_count_reward": 0.451450914144516, "step": 1383 }, { "clip_ratio": 0.0, - "completion_length": 1661.3750610351562, + "completion_length": 1499.4532165527344, "epoch": 0.413411993129714, - "grad_norm": 1.5825769901275635, - "kl": 0.1474609375, - "learning_rate": 7.294081897350033e-08, - "loss": 0.0352, - "reward": 0.5597098395228386, - "reward_std": 0.10513161309063435, - "rewards/accuracy_reward": 0.0870535783469677, + "grad_norm": 65.30013275146484, + "kl": 2.064453125, + "learning_rate": 3.6470409486750166e-07, + "loss": 0.1768, + "reward": 0.529575914144516, + "reward_std": 0.11234602890908718, + "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562723517418, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1384 }, { "clip_ratio": 0.0, - "completion_length": 1745.1607971191406, + "completion_length": 1576.0625915527344, "epoch": 0.41371070121723547, - "grad_norm": 3.734862804412842, - "kl": 0.167236328125, - "learning_rate": 7.289446847530822e-08, - "loss": 0.0339, - "reward": 0.5491071566939354, - "reward_std": 0.1631463374942541, - "rewards/accuracy_reward": 0.08928571944124997, + "grad_norm": 56.75543975830078, + "kl": 3.125, + "learning_rate": 3.6447234237654104e-07, + "loss": 0.2835, + "reward": 0.4921875223517418, + "reward_std": 0.16849613189697266, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459821455180645, + "rewards/tag_count_reward": 0.4252232387661934, "step": 1385 }, { "clip_ratio": 0.0, - "completion_length": 1680.2232666015625, + "completion_length": 1465.1138916015625, "epoch": 0.41400940930475694, - "grad_norm": 0.9969618320465088, - "kl": 0.1611328125, - "learning_rate": 7.284809307017829e-08, - "loss": 0.041, - "reward": 0.5864955633878708, - "reward_std": 0.14473088830709457, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 129.37539672851562, + "kl": 1.8232421875, + "learning_rate": 3.642404653508915e-07, + "loss": 0.2061, + "reward": 0.5412946715950966, + "reward_std": 0.13545162789523602, + "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4659598395228386, + "rewards/tag_count_reward": 0.4497767984867096, "step": 1386 }, { "clip_ratio": 0.0, - "completion_length": 1766.0469665527344, + "completion_length": 1512.62060546875, "epoch": 0.4143081173922784, - "grad_norm": 0.8232771754264832, - "kl": 0.177734375, - "learning_rate": 7.280169280856247e-08, - "loss": 0.0347, - "reward": 0.5362723544239998, - "reward_std": 0.14446361176669598, - "rewards/accuracy_reward": 0.07366071874275804, + "grad_norm": 66.4278335571289, + "kl": 2.07421875, + "learning_rate": 3.640084640428123e-07, + "loss": 0.186, + "reward": 0.5178571790456772, + "reward_std": 0.1457260400056839, + "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1387 }, { "clip_ratio": 0.0, - "completion_length": 1754.57373046875, + "completion_length": 1573.3148193359375, "epoch": 0.4146068254797999, - "grad_norm": 1.8358914852142334, - "kl": 0.156005859375, - "learning_rate": 7.275526774093966e-08, - "loss": 0.0221, - "reward": 0.5412946715950966, - "reward_std": 0.17275275103747845, - "rewards/accuracy_reward": 0.08035714412108064, + "grad_norm": 22.603727340698242, + "kl": 2.8828125, + "learning_rate": 3.6377633870469827e-07, + "loss": 0.213, + "reward": 0.5150669887661934, + "reward_std": 0.198160320520401, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1388 }, { "clip_ratio": 0.0, - "completion_length": 1710.3170166015625, + "completion_length": 1474.3326721191406, "epoch": 0.41490553356732135, - "grad_norm": 4.341665267944336, - "kl": 0.194091796875, - "learning_rate": 7.270881791781582e-08, - "loss": 0.0386, - "reward": 0.5362723469734192, - "reward_std": 0.1566976197063923, - "rewards/accuracy_reward": 0.06919643213041127, + "grad_norm": 53.31907653808594, + "kl": 3.111328125, + "learning_rate": 3.6354408958907906e-07, + "loss": 0.2898, + "reward": 0.545758955180645, + "reward_std": 0.1775260902941227, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.443080373108387, "step": 1389 }, { "clip_ratio": 0.0, - "completion_length": 1659.5022888183594, + "completion_length": 1493.77685546875, "epoch": 0.4152042416548428, - "grad_norm": 1.9271124601364136, - "kl": 0.15869140625, - "learning_rate": 7.266234338972378e-08, - "loss": 0.034, - "reward": 0.5368303954601288, - "reward_std": 0.12941434793174267, - "rewards/accuracy_reward": 0.07589285913854837, + "grad_norm": 22.970779418945312, + "kl": 3.15234375, + "learning_rate": 3.633117169486189e-07, + "loss": 0.2451, + "reward": 0.5245536044239998, + "reward_std": 0.16777056455612183, + "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4419643133878708, "step": 1390 }, { "clip_ratio": 0.0, - "completion_length": 1694.872802734375, + "completion_length": 1475.8036499023438, "epoch": 0.4155029497423643, - "grad_norm": 1.1832762956619263, - "kl": 0.157470703125, - "learning_rate": 7.261584420722327e-08, - "loss": 0.0395, - "reward": 0.5156250149011612, - "reward_std": 0.14715416356921196, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 47.83264923095703, + "kl": 4.796875, + "learning_rate": 3.630792210361164e-07, + "loss": 0.3795, + "reward": 0.4827009290456772, + "reward_std": 0.17200934328138828, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1391 }, { "clip_ratio": 0.0, - "completion_length": 1710.2344665527344, + "completion_length": 1460.90185546875, "epoch": 0.41580165782988576, - "grad_norm": 2.135064125061035, - "kl": 0.17919921875, - "learning_rate": 7.256932042090086e-08, - "loss": 0.0406, - "reward": 0.5083705633878708, - "reward_std": 0.16825828701257706, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 70.47198486328125, + "kl": 5.1640625, + "learning_rate": 3.6284660210450434e-07, + "loss": 0.3802, + "reward": 0.5078125298023224, + "reward_std": 0.2051066756248474, + "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4570312723517418, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1392 }, { "clip_ratio": 0.0, - "completion_length": 1762.388427734375, + "completion_length": 1515.7813110351562, "epoch": 0.41610036591740723, - "grad_norm": 1.981034278869629, - "kl": 0.165283203125, - "learning_rate": 7.252277208136988e-08, - "loss": 0.0338, - "reward": 0.5915178805589676, - "reward_std": 0.10691705159842968, - "rewards/accuracy_reward": 0.12053572107106447, + "grad_norm": 19.61629867553711, + "kl": 3.55859375, + "learning_rate": 3.6261386040684937e-07, + "loss": 0.261, + "reward": 0.5502232387661934, + "reward_std": 0.12935829907655716, + "rewards/accuracy_reward": 0.11830357811413705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.4319196566939354, "step": 1393 }, { "clip_ratio": 0.0, - "completion_length": 1854.9889221191406, + "completion_length": 1641.930908203125, "epoch": 0.4163990740049287, - "grad_norm": 1.8790518045425415, - "kl": 0.191650390625, - "learning_rate": 7.247619923927034e-08, - "loss": 0.0266, - "reward": 0.5463169813156128, - "reward_std": 0.17803269252181053, - "rewards/accuracy_reward": 0.08705357881262898, + "grad_norm": 127.1767578125, + "kl": 6.9609375, + "learning_rate": 3.623809961963517e-07, + "loss": 0.4316, + "reward": 0.5078125149011612, + "reward_std": 0.1983569785952568, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1394 }, { "clip_ratio": 0.0, - "completion_length": 1747.12060546875, + "completion_length": 1583.2768859863281, "epoch": 0.4166977820924502, - "grad_norm": 1.871187686920166, - "kl": 0.1650390625, - "learning_rate": 7.242960194526892e-08, - "loss": 0.0386, - "reward": 0.4832589626312256, - "reward_std": 0.12013974413275719, + "grad_norm": 87.8060531616211, + "kl": 6.1953125, + "learning_rate": 3.621480097263446e-07, + "loss": 0.3973, + "reward": 0.4430803805589676, + "reward_std": 0.16278668493032455, "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696715950966, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1395 }, { "clip_ratio": 0.0, - "completion_length": 1834.5693054199219, + "completion_length": 1688.4308471679688, "epoch": 0.41699649017997165, - "grad_norm": 1.2831584215164185, - "kl": 0.16455078125, - "learning_rate": 7.238298025005892e-08, - "loss": 0.0307, - "reward": 0.5228794813156128, - "reward_std": 0.17044532112777233, - "rewards/accuracy_reward": 0.06026786123402417, + "grad_norm": 149.83041381835938, + "kl": 7.5703125, + "learning_rate": 3.619149012502946e-07, + "loss": 0.4537, + "reward": 0.446986623108387, + "reward_std": 0.15790134854614735, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4626116305589676, + "rewards/tag_count_reward": 0.4179687723517418, "step": 1396 }, { "clip_ratio": 0.0, - "completion_length": 1759.4911499023438, + "completion_length": 1560.6808776855469, "epoch": 0.4172951982674931, - "grad_norm": 1.8287596702575684, - "kl": 0.17919921875, - "learning_rate": 7.23363342043602e-08, - "loss": 0.0348, - "reward": 0.4977678805589676, - "reward_std": 0.14488459937274456, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 52.01384735107422, + "kl": 5.109375, + "learning_rate": 3.6168167102180095e-07, + "loss": 0.3328, + "reward": 0.4464285895228386, + "reward_std": 0.1442829705774784, + "rewards/accuracy_reward": 0.01785714295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535969734192, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1397 }, { "clip_ratio": 0.0, - "completion_length": 1728.51123046875, + "completion_length": 1514.9911193847656, "epoch": 0.4175939063550146, - "grad_norm": 3.045013904571533, - "kl": 0.196533203125, - "learning_rate": 7.228966385891902e-08, - "loss": 0.0334, - "reward": 0.4799107313156128, - "reward_std": 0.09163136966526508, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 25.073022842407227, + "kl": 5.4375, + "learning_rate": 3.6144831929459513e-07, + "loss": 0.4076, + "reward": 0.4397321715950966, + "reward_std": 0.13864734955132008, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.470982164144516, + "rewards/tag_count_reward": 0.424107164144516, "step": 1398 }, { "clip_ratio": 0.0, - "completion_length": 1679.3616638183594, + "completion_length": 1513.9420776367188, "epoch": 0.41789261444253606, - "grad_norm": 1.309074878692627, - "kl": 0.159912109375, - "learning_rate": 7.224296926450822e-08, - "loss": 0.0217, - "reward": 0.568638414144516, - "reward_std": 0.12967008631676435, - "rewards/accuracy_reward": 0.098214291036129, + "grad_norm": 11.512999534606934, + "kl": 3.98828125, + "learning_rate": 3.612148463225411e-07, + "loss": 0.3118, + "reward": 0.5111607313156128, + "reward_std": 0.16364446468651295, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1399 }, { "clip_ratio": 0.0, - "completion_length": 1647.5513916015625, + "completion_length": 1446.2500915527344, "epoch": 0.4181913225300575, - "grad_norm": 0.8187063336372375, - "kl": 0.1407470703125, - "learning_rate": 7.21962504719269e-08, - "loss": 0.0266, - "reward": 0.5262276977300644, - "reward_std": 0.12730016373097897, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 58.58030319213867, + "kl": 3.15625, + "learning_rate": 3.6098125235963445e-07, + "loss": 0.2997, + "reward": 0.4927455559372902, + "reward_std": 0.17848804593086243, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4771205559372902, + "rewards/tag_count_reward": 0.4436384066939354, "step": 1400 }, { "clip_ratio": 0.0, - "completion_length": 1758.8014221191406, + "completion_length": 1577.99560546875, "epoch": 0.41849003061757895, - "grad_norm": 3.0006866455078125, - "kl": 0.152587890625, - "learning_rate": 7.214950753200052e-08, - "loss": 0.0356, - "reward": 0.4977678805589676, - "reward_std": 0.151550754904747, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 13.345547676086426, + "kl": 3.90625, + "learning_rate": 3.607475376600026e-07, + "loss": 0.2612, + "reward": 0.447544664144516, + "reward_std": 0.1498772781342268, + "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455357164144516, + "rewards/tag_count_reward": 0.4229910969734192, "step": 1401 }, { "clip_ratio": 0.0, - "completion_length": 1703.3683776855469, + "completion_length": 1500.4866943359375, "epoch": 0.4187887387051004, - "grad_norm": 3.2982559204101562, - "kl": 0.14013671875, - "learning_rate": 7.210274049558082e-08, - "loss": 0.0495, - "reward": 0.5719866305589676, - "reward_std": 0.193830706179142, - "rewards/accuracy_reward": 0.0982142873108387, + "grad_norm": 33.27885055541992, + "kl": 3.2578125, + "learning_rate": 3.605137024779041e-07, + "loss": 0.26, + "reward": 0.5284598469734192, + "reward_std": 0.1894755121320486, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723469734192, + "rewards/tag_count_reward": 0.443638414144516, "step": 1402 }, { "clip_ratio": 0.0, - "completion_length": 1784.4621276855469, + "completion_length": 1587.5022888183594, "epoch": 0.4190874467926219, - "grad_norm": 1.6579530239105225, - "kl": 0.182861328125, - "learning_rate": 7.205594941354579e-08, - "loss": 0.0337, - "reward": 0.5340401977300644, - "reward_std": 0.13825492933392525, - "rewards/accuracy_reward": 0.07142857438884676, + "grad_norm": 13.882495880126953, + "kl": 4.27734375, + "learning_rate": 3.6027974706772894e-07, + "loss": 0.3148, + "reward": 0.4765625298023224, + "reward_std": 0.145012766122818, + "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.462611623108387, + "rewards/tag_count_reward": 0.4252232313156128, "step": 1403 }, { "clip_ratio": 0.0, - "completion_length": 1749.9152526855469, + "completion_length": 1583.1384582519531, "epoch": 0.41938615488014336, - "grad_norm": 3.4398396015167236, - "kl": 0.146484375, - "learning_rate": 7.20091343367995e-08, - "loss": 0.0409, - "reward": 0.5552455633878708, - "reward_std": 0.20011531189084053, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 8.958839416503906, + "kl": 3.548828125, + "learning_rate": 3.6004567168399755e-07, + "loss": 0.2409, + "reward": 0.4933035969734192, + "reward_std": 0.18743673339486122, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1404 }, { "clip_ratio": 0.0, - "completion_length": 1735.5759582519531, + "completion_length": 1565.4420166015625, "epoch": 0.41968486296766483, - "grad_norm": 2.3317577838897705, - "kl": 0.18115234375, - "learning_rate": 7.196229531627218e-08, - "loss": 0.046, - "reward": 0.5691964626312256, - "reward_std": 0.18292045779526234, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 13.716802597045898, + "kl": 3.80859375, + "learning_rate": 3.598114765813609e-07, + "loss": 0.31, + "reward": 0.5083705633878708, + "reward_std": 0.2209208831191063, + "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4190848469734192, "step": 1405 }, { "clip_ratio": 0.0, - "completion_length": 1689.4576721191406, + "completion_length": 1511.0313110351562, "epoch": 0.4199835710551863, - "grad_norm": 1.1456248760223389, - "kl": 0.175537109375, - "learning_rate": 7.19154324029201e-08, - "loss": 0.0307, - "reward": 0.5831473618745804, - "reward_std": 0.1662580668926239, - "rewards/accuracy_reward": 0.12500000279396772, + "grad_norm": 19.03940773010254, + "kl": 3.71484375, + "learning_rate": 3.595771620146005e-07, + "loss": 0.2765, + "reward": 0.5446428880095482, + "reward_std": 0.1843728944659233, + "rewards/accuracy_reward": 0.12053572339937091, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4581473469734192, + "rewards/tag_count_reward": 0.424107164144516, "step": 1406 }, { "clip_ratio": 0.0, - "completion_length": 1711.3772888183594, + "completion_length": 1569.7969665527344, "epoch": 0.42028227914270777, - "grad_norm": 1.0716581344604492, - "kl": 0.158935546875, - "learning_rate": 7.18685456477255e-08, - "loss": 0.0369, - "reward": 0.5658482313156128, - "reward_std": 0.12083438225090504, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 7.5712409019470215, + "kl": 4.15625, + "learning_rate": 3.593427282386275e-07, + "loss": 0.2899, + "reward": 0.5200893133878708, + "reward_std": 0.18085487559437752, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4765625223517418, + "rewards/tag_count_reward": 0.4263393133878708, "step": 1407 }, { "clip_ratio": 0.0, - "completion_length": 1755.6072387695312, + "completion_length": 1628.9063110351562, "epoch": 0.42058098723022924, - "grad_norm": 1.9187897443771362, - "kl": 0.169921875, - "learning_rate": 7.182163510169658e-08, - "loss": 0.0327, - "reward": 0.564732164144516, - "reward_std": 0.1439322642982006, - "rewards/accuracy_reward": 0.09821429336443543, + "grad_norm": 73.74468994140625, + "kl": 5.9765625, + "learning_rate": 3.591081755084829e-07, + "loss": 0.3905, + "reward": 0.4497768059372902, + "reward_std": 0.1802276372909546, + "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4665178805589676, + "rewards/tag_count_reward": 0.3805803805589676, "step": 1408 }, { "clip_ratio": 0.0, - "completion_length": 1805.5983276367188, + "completion_length": 1577.8348999023438, "epoch": 0.4208796953177507, - "grad_norm": 1.4323629140853882, - "kl": 0.179931640625, - "learning_rate": 7.177470081586742e-08, - "loss": 0.0334, - "reward": 0.5390625298023224, - "reward_std": 0.14281874150037766, - "rewards/accuracy_reward": 0.07366071944124997, + "grad_norm": 19.72395133972168, + "kl": 4.12109375, + "learning_rate": 3.5887350407933707e-07, + "loss": 0.2968, + "reward": 0.526227705180645, + "reward_std": 0.17388656735420227, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/tag_count_reward": 0.4369419887661934, "step": 1409 }, { "clip_ratio": 0.0, - "completion_length": 1766.962158203125, + "completion_length": 1554.0893249511719, "epoch": 0.4211784034052722, - "grad_norm": 0.9981937408447266, - "kl": 0.16845703125, - "learning_rate": 7.172774284129792e-08, - "loss": 0.0393, - "reward": 0.5424107313156128, - "reward_std": 0.15237823501229286, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 28.011688232421875, + "kl": 4.30078125, + "learning_rate": 3.586387142064896e-07, + "loss": 0.3206, + "reward": 0.5005580559372902, + "reward_std": 0.17357107624411583, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4246651977300644, "step": 1410 }, { "clip_ratio": 0.0, - "completion_length": 1731.7232971191406, + "completion_length": 1537.9107971191406, "epoch": 0.42147711149279365, - "grad_norm": 2.867185115814209, - "kl": 0.15478515625, - "learning_rate": 7.168076122907377e-08, - "loss": 0.034, - "reward": 0.5496651977300644, - "reward_std": 0.09381785988807678, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 111.5328140258789, + "kl": 2.62890625, + "learning_rate": 3.584038061453689e-07, + "loss": 0.2643, + "reward": 0.5055803880095482, + "reward_std": 0.12599072977900505, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4296875298023224, "step": 1411 }, { "clip_ratio": 0.0, - "completion_length": 1765.1183776855469, + "completion_length": 1563.2634582519531, "epoch": 0.4217758195803151, - "grad_norm": 2.7077040672302246, - "kl": 0.179443359375, - "learning_rate": 7.163375603030634e-08, - "loss": 0.0379, - "reward": 0.5641741305589676, - "reward_std": 0.1479755975306034, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 94.41519927978516, + "kl": 2.80859375, + "learning_rate": 3.5816878015153166e-07, + "loss": 0.2511, + "reward": 0.5329241380095482, + "reward_std": 0.19719773903489113, + "rewards/accuracy_reward": 0.12053572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669887661934, + "rewards/tag_count_reward": 0.412388414144516, "step": 1412 }, { "clip_ratio": 0.0, - "completion_length": 1780.0737609863281, + "completion_length": 1588.7076721191406, "epoch": 0.4220745276678366, - "grad_norm": 1.5574159622192383, - "kl": 0.17041015625, - "learning_rate": 7.158672729613268e-08, - "loss": 0.0337, - "reward": 0.581473246216774, - "reward_std": 0.2033590953797102, - "rewards/accuracy_reward": 0.13616071874275804, + "grad_norm": 55.93451690673828, + "kl": 2.51171875, + "learning_rate": 3.579336364806634e-07, + "loss": 0.1938, + "reward": 0.5373884066939354, + "reward_std": 0.22761493548750877, + "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4453125223517418, + "rewards/tag_count_reward": 0.4168526902794838, "step": 1413 }, { "clip_ratio": 0.0, - "completion_length": 1726.7745971679688, + "completion_length": 1509.8728332519531, "epoch": 0.42237323575535807, - "grad_norm": 2.1048178672790527, - "kl": 0.1953125, - "learning_rate": 7.153967507771546e-08, - "loss": 0.0365, - "reward": 0.4955357387661934, - "reward_std": 0.13654660806059837, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 12.809821128845215, + "kl": 4.27734375, + "learning_rate": 3.576983753885773e-07, + "loss": 0.3024, + "reward": 0.4380580559372902, + "reward_std": 0.14709274284541607, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857387661934, + "rewards/tag_count_reward": 0.4224330559372902, "step": 1414 }, { "clip_ratio": 0.0, - "completion_length": 1641.7478637695312, + "completion_length": 1384.6340026855469, "epoch": 0.42267194384287954, - "grad_norm": 9.182212829589844, - "kl": 0.228759765625, - "learning_rate": 7.149259942624287e-08, - "loss": 0.0402, - "reward": 0.5089286044239998, - "reward_std": 0.13670150935649872, + "grad_norm": 54.9464225769043, + "kl": 2.83203125, + "learning_rate": 3.574629971312143e-07, + "loss": 0.274, + "reward": 0.4927455633878708, + "reward_std": 0.1536429561674595, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4732143133878708, + "rewards/tag_count_reward": 0.4570312649011612, "step": 1415 }, { "clip_ratio": 0.0, - "completion_length": 1756.2165832519531, + "completion_length": 1531.2969665527344, "epoch": 0.422970651930401, - "grad_norm": 2.029055595397949, - "kl": 0.180419921875, - "learning_rate": 7.144550039292858e-08, - "loss": 0.0313, - "reward": 0.5770089626312256, - "reward_std": 0.09351379238069057, + "grad_norm": 23.288984298706055, + "kl": 4.8671875, + "learning_rate": 3.572275019646429e-07, + "loss": 0.4149, + "reward": 0.5329241305589676, + "reward_std": 0.12793505750596523, "rewards/accuracy_reward": 0.10937500488944352, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4654018059372902, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.423549123108387, "step": 1416 }, { "clip_ratio": 0.0, - "completion_length": 1770.9107666015625, + "completion_length": 1622.8996276855469, "epoch": 0.4232693600179225, - "grad_norm": 28.341960906982422, - "kl": 0.374755859375, - "learning_rate": 7.139837802901176e-08, - "loss": 0.0555, - "reward": 0.6261161044239998, - "reward_std": 0.1473751924932003, - "rewards/accuracy_reward": 0.1763392947614193, + "grad_norm": 84.50669860839844, + "kl": 6.5703125, + "learning_rate": 3.5699189014505884e-07, + "loss": 0.435, + "reward": 0.576450914144516, + "reward_std": 0.1688869632780552, + "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768059372902, + "rewards/tag_count_reward": 0.404575914144516, "step": 1417 }, { "clip_ratio": 0.0, - "completion_length": 1732.2076721191406, + "completion_length": 1502.6563415527344, "epoch": 0.42356806810544395, - "grad_norm": 1.5801912546157837, - "kl": 0.16943359375, - "learning_rate": 7.135123238575692e-08, - "loss": 0.0295, - "reward": 0.4771205559372902, - "reward_std": 0.07714268937706947, + "grad_norm": 29.186649322509766, + "kl": 4.86328125, + "learning_rate": 3.567561619287846e-07, + "loss": 0.3552, + "reward": 0.4380580484867096, + "reward_std": 0.11341207846999168, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562649011612, + "rewards/tag_count_reward": 0.4335937649011612, "step": 1418 }, { "clip_ratio": 0.0, - "completion_length": 1732.6027526855469, - "epoch": 0.4238667761929654, - "grad_norm": 2.0495262145996094, - "kl": 0.193603515625, - "learning_rate": 7.130406351445388e-08, - "loss": 0.03, - "reward": 0.5987723544239998, - "reward_std": 0.18800780177116394, - "rewards/accuracy_reward": 0.12946429150179029, + "completion_length": 1519.5089721679688, + "epoch": 0.4238667761929654, + "grad_norm": 96.34249114990234, + "kl": 6.21875, + "learning_rate": 3.5652031757226944e-07, + "loss": 0.4184, + "reward": 0.5139509215950966, + "reward_std": 0.20502140372991562, + "rewards/accuracy_reward": 0.09598214528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4693080559372902, + "rewards/tag_count_reward": 0.4179687723517418, "step": 1419 }, { "clip_ratio": 0.0, - "completion_length": 1710.4308776855469, + "completion_length": 1493.4085693359375, "epoch": 0.4241654842804869, - "grad_norm": 1.9320907592773438, - "kl": 0.16845703125, - "learning_rate": 7.125687146641776e-08, - "loss": 0.0452, - "reward": 0.517857164144516, - "reward_std": 0.1130204051733017, - "rewards/accuracy_reward": 0.05580357415601611, + "grad_norm": 29.343441009521484, + "kl": 4.21875, + "learning_rate": 3.5628435733208884e-07, + "loss": 0.3108, + "reward": 0.4810268133878708, + "reward_std": 0.14347251504659653, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4620535969734192, + "rewards/tag_count_reward": 0.431919664144516, "step": 1420 }, { "clip_ratio": 0.0, - "completion_length": 1806.2232971191406, + "completion_length": 1583.85498046875, "epoch": 0.42446419236800836, - "grad_norm": 3.1423752307891846, - "kl": 0.173095703125, - "learning_rate": 7.120965629298889e-08, - "loss": 0.0433, - "reward": 0.529575914144516, - "reward_std": 0.13162134028971195, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 50.71596908569336, + "kl": 5.875, + "learning_rate": 3.5604828146494446e-07, + "loss": 0.4257, + "reward": 0.487723246216774, + "reward_std": 0.17060846835374832, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455915205180645, + "rewards/tag_count_reward": 0.4095982387661934, "step": 1421 }, { "clip_ratio": 0.0, - "completion_length": 1766.0603332519531, + "completion_length": 1570.9866943359375, "epoch": 0.42476290045552983, - "grad_norm": 3.2638509273529053, - "kl": 0.186279296875, - "learning_rate": 7.116241804553275e-08, - "loss": 0.0274, - "reward": 0.5731026977300644, - "reward_std": 0.25132795609533787, - "rewards/accuracy_reward": 0.10937500861473382, + "grad_norm": 14.927299499511719, + "kl": 3.6953125, + "learning_rate": 3.5581209022766373e-07, + "loss": 0.2568, + "reward": 0.5027901902794838, + "reward_std": 0.223589938133955, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4637276977300644, + "rewards/tag_count_reward": 0.4179687649011612, "step": 1422 }, { "clip_ratio": 0.0, - "completion_length": 1794.94873046875, + "completion_length": 1520.9197082519531, "epoch": 0.4250616085430513, - "grad_norm": 2.8029048442840576, - "kl": 0.194091796875, - "learning_rate": 7.11151567754399e-08, - "loss": 0.041, - "reward": 0.5251116380095482, - "reward_std": 0.15213181264698505, - "rewards/accuracy_reward": 0.06919643119908869, + "grad_norm": 11.268389701843262, + "kl": 3.80859375, + "learning_rate": 3.555757838771995e-07, + "loss": 0.3062, + "reward": 0.4966517984867096, + "reward_std": 0.16206291317939758, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455915205180645, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1423 }, { "clip_ratio": 0.0, - "completion_length": 1737.6228332519531, + "completion_length": 1560.7277526855469, "epoch": 0.4253603166305728, - "grad_norm": 1.3011380434036255, - "kl": 0.1737060546875, - "learning_rate": 7.106787253412599e-08, - "loss": 0.0449, - "reward": 0.5920759290456772, - "reward_std": 0.13316684029996395, - "rewards/accuracy_reward": 0.12053571618162096, + "grad_norm": 32.90434646606445, + "kl": 4.36328125, + "learning_rate": 3.5533936267063e-07, + "loss": 0.3028, + "reward": 0.5390625223517418, + "reward_std": 0.18615295737981796, + "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4715401977300644, + "rewards/tag_count_reward": 0.4162946715950966, "step": 1424 }, { "clip_ratio": 0.0, - "completion_length": 1709.7590026855469, + "completion_length": 1510.4866943359375, "epoch": 0.42565902471809425, - "grad_norm": 4.745073318481445, - "kl": 0.15234375, - "learning_rate": 7.102056537303165e-08, - "loss": 0.0403, - "reward": 0.5664062649011612, - "reward_std": 0.14981608465313911, - "rewards/accuracy_reward": 0.10491072107106447, + "grad_norm": 44.8250617980957, + "kl": 2.921875, + "learning_rate": 3.5510282686515827e-07, + "loss": 0.2537, + "reward": 0.5552455708384514, + "reward_std": 0.15971957333385944, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4614955559372902, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1425 }, { "clip_ratio": 0.0, - "completion_length": 1719.6250610351562, + "completion_length": 1488.1250610351562, "epoch": 0.4259577328056157, - "grad_norm": 2.077944278717041, - "kl": 0.188720703125, - "learning_rate": 7.09732353436224e-08, - "loss": 0.0393, - "reward": 0.5044643133878708, - "reward_std": 0.11685349978506565, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 49.80710983276367, + "kl": 2.84765625, + "learning_rate": 3.54866176718112e-07, + "loss": 0.265, + "reward": 0.4804687723517418, + "reward_std": 0.1348081212490797, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455357164144516, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1426 }, { "clip_ratio": 0.0, - "completion_length": 1749.5067749023438, + "completion_length": 1542.5848999023438, "epoch": 0.4262564408931372, - "grad_norm": 7.690280437469482, - "kl": 0.2333984375, - "learning_rate": 7.09258824973887e-08, - "loss": 0.0348, - "reward": 0.5552455633878708, - "reward_std": 0.19595784693956375, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 31.625080108642578, + "kl": 3.41796875, + "learning_rate": 3.546294124869435e-07, + "loss": 0.2733, + "reward": 0.5625000298023224, + "reward_std": 0.2448401041328907, + "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669887661934, + "rewards/tag_count_reward": 0.424107164144516, "step": 1427 }, { "clip_ratio": 0.0, - "completion_length": 1811.2746276855469, + "completion_length": 1634.5960693359375, "epoch": 0.42655514898065866, - "grad_norm": 1.0722169876098633, - "kl": 0.184326171875, - "learning_rate": 7.08785068858458e-08, - "loss": 0.0434, - "reward": 0.5530134215950966, - "reward_std": 0.1804629061371088, - "rewards/accuracy_reward": 0.10267858067527413, + "grad_norm": 15.517450332641602, + "kl": 3.88671875, + "learning_rate": 3.54392534429229e-07, + "loss": 0.2862, + "reward": 0.5044643133878708, + "reward_std": 0.2133963294327259, + "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4503348469734192, + "rewards/tag_count_reward": 0.4062500223517418, "step": 1428 }, { "clip_ratio": 0.0, - "completion_length": 1703.5022888183594, + "completion_length": 1473.6875610351562, "epoch": 0.42685385706818013, - "grad_norm": 1.7046350240707397, - "kl": 0.1383056640625, - "learning_rate": 7.083110856053374e-08, - "loss": 0.0277, - "reward": 0.6400669887661934, - "reward_std": 0.1529311165213585, - "rewards/accuracy_reward": 0.1696428656578064, + "grad_norm": 43.29099655151367, + "kl": 2.48046875, + "learning_rate": 3.5415554280266866e-07, + "loss": 0.2151, + "reward": 0.5892857387661934, + "reward_std": 0.19006782211363316, + "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4704241305589676, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1429 }, { "clip_ratio": 0.0, - "completion_length": 1718.0090026855469, + "completion_length": 1550.7098693847656, "epoch": 0.4271525651557016, - "grad_norm": 3.160212755203247, - "kl": 0.167724609375, - "learning_rate": 7.07836875730172e-08, - "loss": 0.0438, - "reward": 0.501674123108387, - "reward_std": 0.1809944435954094, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 19.029016494750977, + "kl": 3.65234375, + "learning_rate": 3.53918437865086e-07, + "loss": 0.2727, + "reward": 0.4603794813156128, + "reward_std": 0.1681793536990881, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669813156128, + "rewards/tag_count_reward": 0.4246651977300644, "step": 1430 }, { "clip_ratio": 0.0, - "completion_length": 1785.2701721191406, + "completion_length": 1637.0603332519531, "epoch": 0.4274512732432231, - "grad_norm": 5.450253486633301, - "kl": 0.22705078125, - "learning_rate": 7.073624397488562e-08, - "loss": 0.037, - "reward": 0.5217634215950966, - "reward_std": 0.14870845340192318, - "rewards/accuracy_reward": 0.06919643376022577, + "grad_norm": 10.773706436157227, + "kl": 4.984375, + "learning_rate": 3.536812198744281e-07, + "loss": 0.3636, + "reward": 0.4776785895228386, + "reward_std": 0.16874925792217255, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669813156128, + "rewards/tag_count_reward": 0.4129464402794838, "step": 1431 }, { "clip_ratio": 0.0, - "completion_length": 1710.3951416015625, + "completion_length": 1546.6607971191406, "epoch": 0.42774998133074454, - "grad_norm": 2.681758403778076, - "kl": 0.179931640625, - "learning_rate": 7.068877781775298e-08, - "loss": 0.041, - "reward": 0.525669664144516, - "reward_std": 0.15878508985042572, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 8.724166870117188, + "kl": 4.21484375, + "learning_rate": 3.5344388908876486e-07, + "loss": 0.3301, + "reward": 0.4877232387661934, + "reward_std": 0.166711013764143, + "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410895228386, + "rewards/tag_count_reward": 0.427455373108387, "step": 1432 }, { "clip_ratio": 0.0, - "completion_length": 1718.7947082519531, + "completion_length": 1499.1340026855469, "epoch": 0.428048689418266, - "grad_norm": 4.986361980438232, - "kl": 0.186279296875, - "learning_rate": 7.064128915325777e-08, - "loss": 0.0418, - "reward": 0.5602678805589676, - "reward_std": 0.14029842987656593, - "rewards/accuracy_reward": 0.10044643026776612, + "grad_norm": 76.31558227539062, + "kl": 5.46484375, + "learning_rate": 3.5320644576628884e-07, + "loss": 0.3667, + "reward": 0.527343787252903, + "reward_std": 0.14706859178841114, + "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459821455180645, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1433 }, { "clip_ratio": 0.0, - "completion_length": 1771.7634887695312, + "completion_length": 1613.0447387695312, "epoch": 0.4283473975057875, - "grad_norm": 26.727081298828125, - "kl": 0.342041015625, - "learning_rate": 7.059377803306304e-08, - "loss": 0.0379, - "reward": 0.510602705180645, - "reward_std": 0.12217930890619755, - "rewards/accuracy_reward": 0.044642857974395156, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4637276977300644, + "grad_norm": 23.628870010375977, + "kl": 3.7578125, + "learning_rate": 3.5296889016531517e-07, + "loss": 0.2757, + "reward": 0.4654017984867096, + "reward_std": 0.13195237703621387, + "rewards/accuracy_reward": 0.044642860535532236, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4207589402794838, "step": 1434 }, { "clip_ratio": 0.0, - "completion_length": 1737.3282165527344, + "completion_length": 1527.8482971191406, "epoch": 0.42864610559330896, - "grad_norm": 2.2418785095214844, - "kl": 0.1806640625, - "learning_rate": 7.05462445088562e-08, - "loss": 0.0191, - "reward": 0.5786830559372902, - "reward_std": 0.1390148140490055, - "rewards/accuracy_reward": 0.11830357694998384, + "grad_norm": 11.513998031616211, + "kl": 4.36328125, + "learning_rate": 3.5273122254428103e-07, + "loss": 0.3325, + "reward": 0.514508955180645, + "reward_std": 0.14873602241277695, + "rewards/accuracy_reward": 0.1004464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794813156128, + "rewards/tag_count_reward": 0.4140625149011612, "step": 1435 }, { "clip_ratio": 0.0, - "completion_length": 1736.2612609863281, + "completion_length": 1513.5692749023438, "epoch": 0.42894481368083043, - "grad_norm": 2.415242910385132, - "kl": 0.16748046875, - "learning_rate": 7.04986886323491e-08, - "loss": 0.0402, - "reward": 0.5368303805589676, - "reward_std": 0.1331777311861515, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 85.2144775390625, + "kl": 6.234375, + "learning_rate": 3.524934431617455e-07, + "loss": 0.4462, + "reward": 0.5256696715950966, + "reward_std": 0.18492629751563072, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467633955180645, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1436 }, { "clip_ratio": 0.0, - "completion_length": 1700.1786499023438, + "completion_length": 1447.3929138183594, "epoch": 0.4292435217683519, - "grad_norm": 1.5120190382003784, - "kl": 0.14501953125, - "learning_rate": 7.045111045527784e-08, - "loss": 0.0371, - "reward": 0.5440848469734192, - "reward_std": 0.1247011786326766, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 13.547998428344727, + "kl": 3.53515625, + "learning_rate": 3.522555522763892e-07, + "loss": 0.2963, + "reward": 0.5474330633878708, + "reward_std": 0.18129126355051994, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4726562649011612, + "rewards/tag_count_reward": 0.4447544887661934, "step": 1437 }, { "clip_ratio": 0.0, - "completion_length": 1712.1340026855469, + "completion_length": 1547.4933471679688, "epoch": 0.42954222985587337, - "grad_norm": 5.5469489097595215, - "kl": 0.1898193359375, - "learning_rate": 7.040351002940284e-08, - "loss": 0.0393, - "reward": 0.533482164144516, - "reward_std": 0.15495873801410198, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 59.071956634521484, + "kl": 5.3046875, + "learning_rate": 3.5201755014701417e-07, + "loss": 0.3525, + "reward": 0.490513414144516, + "reward_std": 0.1838302668184042, + "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4598214477300644, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1438 }, { "clip_ratio": 0.0, - "completion_length": 1745.3884582519531, + "completion_length": 1557.091552734375, "epoch": 0.42984093794339484, - "grad_norm": 4.4258503913879395, - "kl": 0.24169921875, - "learning_rate": 7.035588740650869e-08, - "loss": 0.0373, - "reward": 0.6043526977300644, - "reward_std": 0.10688499361276627, - "rewards/accuracy_reward": 0.1540178656578064, + "grad_norm": 31.595563888549805, + "kl": 5.5703125, + "learning_rate": 3.5177943703254344e-07, + "loss": 0.3958, + "reward": 0.5652901902794838, + "reward_std": 0.13521074689924717, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4503348395228386, + "rewards/tag_count_reward": 0.4135044813156128, "step": 1439 }, { "clip_ratio": 0.0, - "completion_length": 1669.2723999023438, + "completion_length": 1499.9420166015625, "epoch": 0.4301396460309163, - "grad_norm": 14.657601356506348, - "kl": 0.222900390625, - "learning_rate": 7.030824263840411e-08, - "loss": 0.0361, - "reward": 0.5630580484867096, - "reward_std": 0.13529329933226109, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 32.997589111328125, + "kl": 4.14453125, + "learning_rate": 3.5154121319202056e-07, + "loss": 0.3236, + "reward": 0.519531287252903, + "reward_std": 0.16492615081369877, + "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4737723469734192, + "rewards/tag_count_reward": 0.4369419887661934, "step": 1440 }, { "clip_ratio": 0.0, - "completion_length": 1726.3170166015625, + "completion_length": 1500.37060546875, "epoch": 0.4304383541184378, - "grad_norm": 2.4075443744659424, - "kl": 0.184814453125, - "learning_rate": 7.0260575776922e-08, - "loss": 0.0308, - "reward": 0.5267857387661934, - "reward_std": 0.14173967763781548, - "rewards/accuracy_reward": 0.066964291036129, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4575893133878708, + "grad_norm": 94.89285278320312, + "kl": 5.6328125, + "learning_rate": 3.5130287888460996e-07, + "loss": 0.3785, + "reward": 0.518415205180645, + "reward_std": 0.17342890799045563, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.435825914144516, "step": 1441 }, { "clip_ratio": 0.0, - "completion_length": 1658.2254943847656, + "completion_length": 1469.9442749023438, "epoch": 0.43073706220595925, - "grad_norm": 3.9782540798187256, - "kl": 0.180419921875, - "learning_rate": 7.021288687391917e-08, - "loss": 0.0533, - "reward": 0.5111607238650322, - "reward_std": 0.14588196761906147, - "rewards/accuracy_reward": 0.06026786030270159, + "grad_norm": 40.93423843383789, + "kl": 5.12890625, + "learning_rate": 3.510644343695958e-07, + "loss": 0.3724, + "reward": 0.475446455180645, + "reward_std": 0.12870310619473457, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1442 }, { "clip_ratio": 0.0, - "completion_length": 1777.1072387695312, + "completion_length": 1567.0603332519531, "epoch": 0.43103577029348067, - "grad_norm": 2.7899763584136963, - "kl": 0.216064453125, - "learning_rate": 7.016517598127649e-08, - "loss": 0.0341, - "reward": 0.4681919887661934, - "reward_std": 0.14654777385294437, - "rewards/accuracy_reward": 0.020089287078008056, + "grad_norm": 81.92736053466797, + "kl": 5.6484375, + "learning_rate": 3.508258799063825e-07, + "loss": 0.3536, + "reward": 0.4386160969734192, + "reward_std": 0.12087966687977314, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.448102705180645, + "rewards/tag_count_reward": 0.4274553805589676, "step": 1443 }, { "clip_ratio": 0.0, - "completion_length": 1699.5491638183594, + "completion_length": 1543.9554138183594, "epoch": 0.43133447838100214, - "grad_norm": 0.9529091715812683, - "kl": 0.177490234375, - "learning_rate": 7.011744315089874e-08, - "loss": 0.0158, - "reward": 0.4960937723517418, - "reward_std": 0.13016806170344353, - "rewards/accuracy_reward": 0.040178573690354824, + "grad_norm": 45.483177185058594, + "kl": 3.30078125, + "learning_rate": 3.505872157544937e-07, + "loss": 0.2656, + "reward": 0.4497768059372902, + "reward_std": 0.15735751204192638, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4559151902794838, + "rewards/tag_count_reward": 0.416294664144516, "step": 1444 }, { "clip_ratio": 0.0, - "completion_length": 1671.22998046875, + "completion_length": 1445.2567443847656, "epoch": 0.4316331864685236, - "grad_norm": 3.639699697494507, - "kl": 0.1890869140625, - "learning_rate": 7.006968843471459e-08, - "loss": 0.0367, - "reward": 0.5094866380095482, - "reward_std": 0.12793422117829323, - "rewards/accuracy_reward": 0.046875001629814506, + "grad_norm": 21.5874080657959, + "kl": 3.9453125, + "learning_rate": 3.5034844217357296e-07, + "loss": 0.2762, + "reward": 0.4849330559372902, + "reward_std": 0.15366273932158947, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.462611623108387, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1445 }, { "clip_ratio": 0.0, - "completion_length": 1666.5647888183594, + "completion_length": 1499.8661193847656, "epoch": 0.4319318945560451, - "grad_norm": 5.040361404418945, - "kl": 0.23828125, - "learning_rate": 7.002191188467645e-08, - "loss": 0.052, - "reward": 0.5915178880095482, - "reward_std": 0.17016583681106567, - "rewards/accuracy_reward": 0.1361607238650322, + "grad_norm": 9.277941703796387, + "kl": 3.66015625, + "learning_rate": 3.5010955942338226e-07, + "loss": 0.2954, + "reward": 0.5212053880095482, + "reward_std": 0.15784416906535625, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455357164144516, + "rewards/tag_count_reward": 0.4140625223517418, "step": 1446 }, { "clip_ratio": 0.0, - "completion_length": 1764.305908203125, + "completion_length": 1587.1741943359375, "epoch": 0.43223060264356655, - "grad_norm": 1.748007893562317, - "kl": 0.182861328125, - "learning_rate": 6.997411355276053e-08, - "loss": 0.0352, - "reward": 0.5390625223517418, - "reward_std": 0.14689892902970314, - "rewards/accuracy_reward": 0.07812500302679837, + "grad_norm": 63.69154357910156, + "kl": 2.8203125, + "learning_rate": 3.4987056776380263e-07, + "loss": 0.2343, + "reward": 0.5078125149011612, + "reward_std": 0.1770716942846775, + "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375298023224, + "rewards/tag_count_reward": 0.422991082072258, "step": 1447 }, { "clip_ratio": 0.0, - "completion_length": 1631.2366638183594, + "completion_length": 1441.5491943359375, "epoch": 0.432529310731088, - "grad_norm": 2.322678804397583, - "kl": 0.192626953125, - "learning_rate": 6.992629349096674e-08, - "loss": 0.0393, - "reward": 0.5424107313156128, - "reward_std": 0.1258144360035658, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 54.58761978149414, + "kl": 2.1875, + "learning_rate": 3.4963146745483373e-07, + "loss": 0.1728, + "reward": 0.5167410969734192, + "reward_std": 0.1260305643081665, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4642857313156128, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1448 }, { "clip_ratio": 0.0, - "completion_length": 1628.4710693359375, + "completion_length": 1462.5291137695312, "epoch": 0.4328280188186095, - "grad_norm": 1.8233749866485596, - "kl": 0.1787109375, - "learning_rate": 6.987845175131863e-08, - "loss": 0.0494, - "reward": 0.6054687798023224, - "reward_std": 0.13540387898683548, - "rewards/accuracy_reward": 0.13839286309666932, + "grad_norm": 57.36056900024414, + "kl": 2.681640625, + "learning_rate": 3.493922587565932e-07, + "loss": 0.242, + "reward": 0.5636160969734192, + "reward_std": 0.1607311051338911, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4670759066939354, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1449 }, { "clip_ratio": 0.0, - "completion_length": 1782.3014221191406, + "completion_length": 1555.2098693847656, "epoch": 0.43312672690613097, - "grad_norm": 4.336082458496094, - "kl": 0.24560546875, - "learning_rate": 6.98305883858633e-08, - "loss": 0.0383, - "reward": 0.5948660895228386, - "reward_std": 0.11965364217758179, - "rewards/accuracy_reward": 0.13169643143191934, + "grad_norm": 75.47383880615234, + "kl": 3.25390625, + "learning_rate": 3.4915294192931653e-07, + "loss": 0.2861, + "reward": 0.5552455484867096, + "reward_std": 0.15638377144932747, + "rewards/accuracy_reward": 0.12946429080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4631696566939354, + "rewards/tag_count_reward": 0.4257812649011612, "step": 1450 }, { "clip_ratio": 0.0, - "completion_length": 1729.3862609863281, + "completion_length": 1499.2232666015625, "epoch": 0.43342543499365244, - "grad_norm": 2.238530397415161, - "kl": 0.1669921875, - "learning_rate": 6.978270344667142e-08, - "loss": 0.0331, - "reward": 0.5647321715950966, - "reward_std": 0.14220053888857365, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 55.58460235595703, + "kl": 2.71875, + "learning_rate": 3.489135172333571e-07, + "loss": 0.229, + "reward": 0.5318080633878708, + "reward_std": 0.15975451469421387, + "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893059372902, + "rewards/tag_count_reward": 0.4335937649011612, "step": 1451 }, { "clip_ratio": 0.0, - "completion_length": 1626.9933776855469, + "completion_length": 1398.4442443847656, "epoch": 0.4337241430811739, - "grad_norm": 5.047297477722168, - "kl": 0.17822265625, - "learning_rate": 6.97347969858371e-08, - "loss": 0.0663, - "reward": 0.6813616454601288, - "reward_std": 0.18194879591464996, - "rewards/accuracy_reward": 0.22321429289877415, + "grad_norm": 77.27457427978516, + "kl": 2.486328125, + "learning_rate": 3.486739849291855e-07, + "loss": 0.2463, + "reward": 0.6523437798023224, + "reward_std": 0.1756640113890171, + "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4581473469734192, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1452 }, { "clip_ratio": 0.0, - "completion_length": 1781.2366638183594, + "completion_length": 1590.3884582519531, "epoch": 0.4340228511686954, - "grad_norm": 2.281564950942993, - "kl": 0.2138671875, - "learning_rate": 6.968686905547789e-08, - "loss": 0.0309, - "reward": 0.4587053805589676, - "reward_std": 0.1100735291838646, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 43.664154052734375, + "kl": 3.078125, + "learning_rate": 3.4843434527738946e-07, + "loss": 0.2385, + "reward": 0.4497768059372902, + "reward_std": 0.1464109607040882, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768133878708, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1453 }, { "clip_ratio": 0.0, - "completion_length": 1745.5781860351562, + "completion_length": 1522.0670166015625, "epoch": 0.43432155925621685, - "grad_norm": 2.257305860519409, - "kl": 0.216796875, - "learning_rate": 6.963891970773465e-08, - "loss": 0.0408, - "reward": 0.5050223469734192, - "reward_std": 0.13397254049777985, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 35.21329116821289, + "kl": 3.52734375, + "learning_rate": 3.4819459853867325e-07, + "loss": 0.2855, + "reward": 0.490513414144516, + "reward_std": 0.15257912501692772, + "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4559151977300644, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1454 }, { "clip_ratio": 0.0, - "completion_length": 1635.1317749023438, + "completion_length": 1490.2902221679688, "epoch": 0.4346202673437383, - "grad_norm": 4.003342151641846, - "kl": 0.172119140625, - "learning_rate": 6.95909489947716e-08, - "loss": 0.0437, - "reward": 0.4921875298023224, - "reward_std": 0.1483860146254301, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 53.89543914794922, + "kl": 3.3984375, + "learning_rate": 3.47954744973858e-07, + "loss": 0.2765, + "reward": 0.4642857313156128, + "reward_std": 0.1521309707313776, + "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.458705373108387, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1455 }, { "clip_ratio": 0.0, - "completion_length": 1761.2188415527344, + "completion_length": 1535.40185546875, "epoch": 0.4349189754312598, - "grad_norm": 4.152327537536621, - "kl": 0.15576171875, - "learning_rate": 6.954295696877616e-08, - "loss": 0.0306, - "reward": 0.5234375149011612, - "reward_std": 0.20480914413928986, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 10.348383903503418, + "kl": 3.4375, + "learning_rate": 3.477147848438808e-07, + "loss": 0.2616, + "reward": 0.5117187798023224, + "reward_std": 0.21689007431268692, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1456 }, { "clip_ratio": 0.0, - "completion_length": 1804.8906860351562, + "completion_length": 1605.7724304199219, "epoch": 0.43521768351878126, - "grad_norm": 4.975869655609131, - "kl": 0.165771484375, - "learning_rate": 6.949494368195895e-08, - "loss": 0.0446, - "reward": 0.552455373108387, - "reward_std": 0.16121200285851955, - "rewards/accuracy_reward": 0.09375000465661287, + "grad_norm": 12.209020614624023, + "kl": 4.419921875, + "learning_rate": 3.4747471840979474e-07, + "loss": 0.3037, + "reward": 0.521205373108387, + "reward_std": 0.14623479172587395, + "rewards/accuracy_reward": 0.0848214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4587053805589676, + "rewards/tag_count_reward": 0.4363839402794838, "step": 1457 }, { "clip_ratio": 0.0, - "completion_length": 1730.9911499023438, + "completion_length": 1509.3013916015625, "epoch": 0.43551639160630273, - "grad_norm": 3.4852325916290283, - "kl": 0.174560546875, - "learning_rate": 6.944690918655373e-08, - "loss": 0.0413, - "reward": 0.621651828289032, - "reward_std": 0.17502212896943092, - "rewards/accuracy_reward": 0.1584821529686451, + "grad_norm": 57.3026123046875, + "kl": 5.6484375, + "learning_rate": 3.472345459327686e-07, + "loss": 0.4028, + "reward": 0.5909598469734192, + "reward_std": 0.20330269634723663, + "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.463169664144516, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1458 }, { "clip_ratio": 0.0, - "completion_length": 1700.5223999023438, + "completion_length": 1494.9219360351562, "epoch": 0.4358150996938242, - "grad_norm": 3.82846736907959, - "kl": 0.17724609375, - "learning_rate": 6.939885353481731e-08, - "loss": 0.0482, - "reward": 0.5625000223517418, - "reward_std": 0.17112857103347778, - "rewards/accuracy_reward": 0.10937500116415322, + "grad_norm": 26.43115234375, + "kl": 4.28125, + "learning_rate": 3.469942676740866e-07, + "loss": 0.3275, + "reward": 0.537946455180645, + "reward_std": 0.14555110223591328, + "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4531250223517418, + "rewards/tag_count_reward": 0.439732164144516, "step": 1459 }, { "clip_ratio": 0.0, - "completion_length": 1684.9509887695312, + "completion_length": 1504.8594360351562, "epoch": 0.4361138077813457, - "grad_norm": 2.43122935295105, - "kl": 0.1458740234375, - "learning_rate": 6.935077677902955e-08, - "loss": 0.0307, - "reward": 0.5133928954601288, - "reward_std": 0.13598651625216007, - "rewards/accuracy_reward": 0.04464286006987095, + "grad_norm": 42.313411712646484, + "kl": 4.2890625, + "learning_rate": 3.467538838951477e-07, + "loss": 0.2896, + "reward": 0.4732143133878708, + "reward_std": 0.15532775782048702, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4687500149011612, + "rewards/tag_count_reward": 0.444196455180645, "step": 1460 }, { "clip_ratio": 0.0, - "completion_length": 1770.3929138183594, + "completion_length": 1538.9598999023438, "epoch": 0.43641251586886715, - "grad_norm": 3.121317148208618, - "kl": 0.18408203125, - "learning_rate": 6.930267897149321e-08, - "loss": 0.0333, - "reward": 0.5239955559372902, - "reward_std": 0.15368079394102097, - "rewards/accuracy_reward": 0.06696428870782256, + "grad_norm": 26.194128036499023, + "kl": 4.3671875, + "learning_rate": 3.4651339485746604e-07, + "loss": 0.3041, + "reward": 0.509486623108387, + "reward_std": 0.14994021505117416, + "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4570312649011612, + "rewards/tag_count_reward": 0.444754496216774, "step": 1461 }, { "clip_ratio": 0.0, - "completion_length": 1789.0826721191406, + "completion_length": 1608.1161193847656, "epoch": 0.4367112239563886, - "grad_norm": 1.1817512512207031, - "kl": 0.20849609375, - "learning_rate": 6.925456016453403e-08, - "loss": 0.0425, - "reward": 0.4743303880095482, - "reward_std": 0.16072050482034683, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 7.320902347564697, + "kl": 4.19140625, + "learning_rate": 3.462728008226702e-07, + "loss": 0.303, + "reward": 0.4732143059372902, + "reward_std": 0.17594783008098602, + "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4453125149011612, + "rewards/tag_count_reward": 0.4308035969734192, "step": 1462 }, { "clip_ratio": 0.0, - "completion_length": 1805.618408203125, + "completion_length": 1653.4063110351562, "epoch": 0.4370099320439101, - "grad_norm": 5.794968128204346, - "kl": 0.2021484375, - "learning_rate": 6.920642041050055e-08, - "loss": 0.0343, - "reward": 0.538504496216774, - "reward_std": 0.16400157287716866, - "rewards/accuracy_reward": 0.07812500186264515, + "grad_norm": 20.456615447998047, + "kl": 3.9375, + "learning_rate": 3.4603210205250274e-07, + "loss": 0.2577, + "reward": 0.497209832072258, + "reward_std": 0.20526098273694515, + "rewards/accuracy_reward": 0.06473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794887661934, + "rewards/tag_count_reward": 0.432477705180645, "step": 1463 }, { "clip_ratio": 0.0, - "completion_length": 1805.0045471191406, + "completion_length": 1597.2768859863281, "epoch": 0.43730864013143156, - "grad_norm": 2.7312002182006836, - "kl": 0.220458984375, - "learning_rate": 6.915825976176408e-08, - "loss": 0.0395, - "reward": 0.5111607387661934, - "reward_std": 0.14537925273180008, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 53.321109771728516, + "kl": 4.92578125, + "learning_rate": 3.457912988088204e-07, + "loss": 0.3355, + "reward": 0.4654017984867096, + "reward_std": 0.14293274469673634, + "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.4319196566939354, "step": 1464 }, { "clip_ratio": 0.0, - "completion_length": 1730.4041137695312, + "completion_length": 1526.5715026855469, "epoch": 0.43760734821895303, - "grad_norm": 4.585665702819824, - "kl": 0.189453125, - "learning_rate": 6.91100782707187e-08, - "loss": 0.0476, - "reward": 0.510044664144516, - "reward_std": 0.11228752508759499, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 40.586669921875, + "kl": 4.89453125, + "learning_rate": 3.455503913535935e-07, + "loss": 0.3583, + "reward": 0.4787946790456772, + "reward_std": 0.1226488221436739, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375223517418, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1465 }, { "clip_ratio": 0.0, - "completion_length": 1747.40185546875, + "completion_length": 1609.5893859863281, "epoch": 0.4379060563064745, - "grad_norm": 6.700340270996094, - "kl": 0.201904296875, - "learning_rate": 6.906187598978116e-08, - "loss": 0.0424, - "reward": 0.463169664144516, - "reward_std": 0.1067213136702776, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 68.1153793334961, + "kl": 5.1015625, + "learning_rate": 3.453093799489058e-07, + "loss": 0.3132, + "reward": 0.4274553805589676, + "reward_std": 0.12730297818779945, + "rewards/accuracy_reward": 0.01116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410895228386, + "rewards/tag_count_reward": 0.416294664144516, "step": 1466 }, { "clip_ratio": 0.0, - "completion_length": 1733.35498046875, + "completion_length": 1517.0916137695312, "epoch": 0.43820476439399597, - "grad_norm": 9.505707740783691, - "kl": 0.28076171875, - "learning_rate": 6.90136529713908e-08, - "loss": 0.061, - "reward": 0.5111607313156128, - "reward_std": 0.1558833047747612, - "rewards/accuracy_reward": 0.06473214505240321, + "grad_norm": 34.04472351074219, + "kl": 3.26953125, + "learning_rate": 3.45068264856954e-07, + "loss": 0.2822, + "reward": 0.486607164144516, + "reward_std": 0.12591844610869884, + "rewards/accuracy_reward": 0.05133928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4464285895228386, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1467 }, { "clip_ratio": 0.0, - "completion_length": 1732.2366943359375, + "completion_length": 1498.4509582519531, "epoch": 0.43850347248151744, - "grad_norm": 4.460550308227539, - "kl": 0.177734375, - "learning_rate": 6.896540926800956e-08, - "loss": 0.0501, - "reward": 0.5781250298023224, - "reward_std": 0.20044038817286491, - "rewards/accuracy_reward": 0.12053572246804833, + "grad_norm": 60.066123962402344, + "kl": 2.5625, + "learning_rate": 3.4482704634004776e-07, + "loss": 0.2427, + "reward": 0.5502232313156128, + "reward_std": 0.15972594544291496, + "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893133878708, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1468 }, { "clip_ratio": 0.0, - "completion_length": 1824.5938110351562, + "completion_length": 1657.2277526855469, "epoch": 0.4388021805690389, - "grad_norm": 1.9409314393997192, - "kl": 0.24755859375, - "learning_rate": 6.891714493212181e-08, - "loss": 0.04, - "reward": 0.450892873108387, - "reward_std": 0.13363760523498058, - "rewards/accuracy_reward": 0.013392857508733869, + "grad_norm": 8.72428035736084, + "kl": 3.78515625, + "learning_rate": 3.445857246606091e-07, + "loss": 0.2448, + "reward": 0.4414062649011612, + "reward_std": 0.13980548828840256, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4375000149011612, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1469 }, { "clip_ratio": 0.0, - "completion_length": 1713.9375610351562, + "completion_length": 1545.3438110351562, "epoch": 0.4391008886565604, - "grad_norm": 5.538493633270264, - "kl": 0.251708984375, - "learning_rate": 6.886886001623449e-08, - "loss": 0.0604, - "reward": 0.5312500298023224, - "reward_std": 0.1658532302826643, - "rewards/accuracy_reward": 0.07366071734577417, + "grad_norm": 11.99938678741455, + "kl": 3.26953125, + "learning_rate": 3.4434430008117244e-07, + "loss": 0.2451, + "reward": 0.5083705559372902, + "reward_std": 0.1546725481748581, + "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575892984867096, + "rewards/tag_count_reward": 0.443638414144516, "step": 1470 }, { "clip_ratio": 0.0, - "completion_length": 1795.2880249023438, + "completion_length": 1526.4420471191406, "epoch": 0.43939959674408186, - "grad_norm": 3.8613600730895996, - "kl": 0.25390625, - "learning_rate": 6.882055457287681e-08, - "loss": 0.0399, - "reward": 0.5781250223517418, - "reward_std": 0.12114301975816488, - "rewards/accuracy_reward": 0.12053571967408061, + "grad_norm": 55.323673248291016, + "kl": 2.5546875, + "learning_rate": 3.4410277286438406e-07, + "loss": 0.2262, + "reward": 0.569196455180645, + "reward_std": 0.14800458028912544, + "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575893059372902, + "rewards/tag_count_reward": 0.439732164144516, "step": 1471 }, { "clip_ratio": 0.0, - "completion_length": 1683.3259582519531, + "completion_length": 1448.294677734375, "epoch": 0.4396983048316033, - "grad_norm": 4.575190544128418, - "kl": 0.208740234375, - "learning_rate": 6.877222865460036e-08, - "loss": 0.0466, - "reward": 0.5117187798023224, - "reward_std": 0.1446188259869814, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 24.32415008544922, + "kl": 3.083984375, + "learning_rate": 3.438611432730018e-07, + "loss": 0.257, + "reward": 0.4893973469734192, + "reward_std": 0.14233149029314518, + "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4492187649011612, + "rewards/tag_count_reward": 0.4313616380095482, "step": 1472 }, { "clip_ratio": 0.0, - "completion_length": 1804.0960388183594, + "completion_length": 1589.7590026855469, "epoch": 0.4399970129191248, - "grad_norm": 6.360369682312012, - "kl": 0.25146484375, - "learning_rate": 6.872388231397902e-08, - "loss": 0.0345, - "reward": 0.5306919813156128, - "reward_std": 0.12234256789088249, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 8.666537284851074, + "kl": 3.47265625, + "learning_rate": 3.436194115698951e-07, + "loss": 0.2496, + "reward": 0.498325914144516, + "reward_std": 0.11551776342093945, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4503348395228386, + "rewards/tag_count_reward": 0.4202009066939354, "step": 1473 }, { "clip_ratio": 0.0, - "completion_length": 1761.6719360351562, + "completion_length": 1503.8348999023438, "epoch": 0.44029572100664627, - "grad_norm": 4.692800045013428, - "kl": 0.197021484375, - "learning_rate": 6.867551560360886e-08, - "loss": 0.0521, - "reward": 0.529575914144516, - "reward_std": 0.14943641610443592, - "rewards/accuracy_reward": 0.06696428777649999, + "grad_norm": 43.407936096191406, + "kl": 2.63671875, + "learning_rate": 3.433775780180443e-07, + "loss": 0.2349, + "reward": 0.5078125298023224, + "reward_std": 0.12676135264337063, + "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.462611623108387, + "rewards/tag_count_reward": 0.4542410969734192, "step": 1474 }, { "clip_ratio": 0.0, - "completion_length": 1694.9175109863281, + "completion_length": 1487.1496276855469, "epoch": 0.44059442909416774, - "grad_norm": 7.108395099639893, - "kl": 0.214599609375, - "learning_rate": 6.862712857610812e-08, - "loss": 0.0682, - "reward": 0.569196455180645, - "reward_std": 0.11283695697784424, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 6.639986038208008, + "kl": 3.345703125, + "learning_rate": 3.4313564288054057e-07, + "loss": 0.2639, + "reward": 0.577008955180645, + "reward_std": 0.1256884355098009, + "rewards/accuracy_reward": 0.1250000058207661, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.452008955180645, "step": 1475 }, { "clip_ratio": 0.0, - "completion_length": 1735.6764221191406, + "completion_length": 1568.7165832519531, "epoch": 0.4408931371816892, - "grad_norm": 2.5402982234954834, - "kl": 0.214599609375, - "learning_rate": 6.857872128411713e-08, - "loss": 0.0457, - "reward": 0.584821455180645, - "reward_std": 0.2200615219771862, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 18.400861740112305, + "kl": 3.572265625, + "learning_rate": 3.428936064205856e-07, + "loss": 0.2642, + "reward": 0.5658482313156128, + "reward_std": 0.2613684870302677, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459821455180645, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1476 }, { "clip_ratio": 0.0, - "completion_length": 1833.8750915527344, + "completion_length": 1639.6898193359375, "epoch": 0.4411918452692107, - "grad_norm": 2.155735969543457, - "kl": 0.221435546875, - "learning_rate": 6.85302937802983e-08, - "loss": 0.0441, - "reward": 0.5345982313156128, - "reward_std": 0.13874274864792824, - "rewards/accuracy_reward": 0.08705357182770967, + "grad_norm": 58.76015090942383, + "kl": 5.453125, + "learning_rate": 3.426514689014915e-07, + "loss": 0.3518, + "reward": 0.5251116380095482, + "reward_std": 0.1464933268725872, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4475446566939354, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1477 }, { "clip_ratio": 0.0, - "completion_length": 1720.2523193359375, + "completion_length": 1561.919677734375, "epoch": 0.44149055335673215, - "grad_norm": 2.9566116333007812, - "kl": 0.23583984375, - "learning_rate": 6.848184611733601e-08, - "loss": 0.0475, - "reward": 0.5122768059372902, - "reward_std": 0.12652655877172947, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 72.22064971923828, + "kl": 5.54296875, + "learning_rate": 3.4240923058668e-07, + "loss": 0.332, + "reward": 0.4910714477300644, + "reward_std": 0.11900698393583298, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4453125223517418, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1478 }, { "clip_ratio": 0.0, - "completion_length": 1722.9531860351562, + "completion_length": 1569.6116638183594, "epoch": 0.4417892614442536, - "grad_norm": 4.475882053375244, - "kl": 0.1943359375, - "learning_rate": 6.843337834793654e-08, - "loss": 0.0366, - "reward": 0.529575914144516, - "reward_std": 0.1490265280008316, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 56.408321380615234, + "kl": 5.47265625, + "learning_rate": 3.4216689173968274e-07, + "loss": 0.3496, + "reward": 0.483258955180645, + "reward_std": 0.14187972992658615, + "rewards/accuracy_reward": 0.05580357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.451450914144516, + "rewards/tag_count_reward": 0.4274553805589676, "step": 1479 }, { "clip_ratio": 0.0, - "completion_length": 1755.4197082519531, + "completion_length": 1566.555908203125, "epoch": 0.4420879695317751, - "grad_norm": 2.6337547302246094, - "kl": 0.248046875, - "learning_rate": 6.838489052482811e-08, - "loss": 0.0388, - "reward": 0.571986623108387, - "reward_std": 0.11804737336933613, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 12.119752883911133, + "kl": 3.984375, + "learning_rate": 3.4192445262414055e-07, + "loss": 0.2823, + "reward": 0.5747768059372902, + "reward_std": 0.14305485226213932, + "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.451450914144516, + "rewards/tag_count_reward": 0.4386160969734192, "step": 1480 }, { "clip_ratio": 0.0, - "completion_length": 1718.9107971191406, + "completion_length": 1520.4330749511719, "epoch": 0.44238667761929656, - "grad_norm": 3.707733154296875, - "kl": 0.209716796875, - "learning_rate": 6.833638270076071e-08, - "loss": 0.0345, - "reward": 0.5172991380095482, - "reward_std": 0.12803454603999853, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 22.189239501953125, + "kl": 3.9765625, + "learning_rate": 3.4168191350380353e-07, + "loss": 0.2782, + "reward": 0.497767873108387, + "reward_std": 0.1384769007563591, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4547991305589676, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1481 }, { "clip_ratio": 0.0, - "completion_length": 1725.4554138183594, + "completion_length": 1538.6161499023438, "epoch": 0.44268538570681804, - "grad_norm": 2.7273573875427246, - "kl": 0.257568359375, - "learning_rate": 6.828785492850608e-08, - "loss": 0.0457, - "reward": 0.5781250298023224, - "reward_std": 0.11614903435111046, + "grad_norm": 34.014244079589844, + "kl": 4.52734375, + "learning_rate": 3.414392746425304e-07, + "loss": 0.3381, + "reward": 0.5457589477300644, + "reward_std": 0.13536109775304794, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4575892984867096, + "rewards/tag_count_reward": 0.4252232313156128, "step": 1482 }, { "clip_ratio": 0.0, - "completion_length": 1763.7076721191406, + "completion_length": 1558.8103637695312, "epoch": 0.4429840937943395, - "grad_norm": 1.843213438987732, - "kl": 0.23681640625, - "learning_rate": 6.823930726085767e-08, - "loss": 0.0508, - "reward": 0.5262276977300644, - "reward_std": 0.20135858096182346, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 82.58380889892578, + "kl": 2.3828125, + "learning_rate": 3.4119653630428837e-07, + "loss": 0.2231, + "reward": 0.4938616305589676, + "reward_std": 0.20607616938650608, + "rewards/accuracy_reward": 0.06473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4547991305589676, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1483 }, { "clip_ratio": 0.0, - "completion_length": 1803.6920471191406, + "completion_length": 1595.9063110351562, "epoch": 0.443282801881861, - "grad_norm": 7.721102237701416, - "kl": 0.287109375, - "learning_rate": 6.819073975063063e-08, - "loss": 0.0488, - "reward": 0.513950914144516, - "reward_std": 0.15922896191477776, - "rewards/accuracy_reward": 0.07142857648432255, + "grad_norm": 71.35625457763672, + "kl": 2.099609375, + "learning_rate": 3.409536987531532e-07, + "loss": 0.1807, + "reward": 0.5117187723517418, + "reward_std": 0.14415345899760723, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4425223469734192, + "rewards/tag_count_reward": 0.435825914144516, "step": 1484 }, { "clip_ratio": 0.0, - "completion_length": 1663.29248046875, + "completion_length": 1463.3259887695312, "epoch": 0.44358150996938245, - "grad_norm": 4.069639682769775, - "kl": 0.2509765625, - "learning_rate": 6.814215245066163e-08, - "loss": 0.0573, - "reward": 0.5340401977300644, - "reward_std": 0.14930472522974014, - "rewards/accuracy_reward": 0.0691964328289032, + "grad_norm": 99.4144515991211, + "kl": 2.330078125, + "learning_rate": 3.407107622533081e-07, + "loss": 0.2604, + "reward": 0.4972098469734192, + "reward_std": 0.16033474542200565, + "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4648437723517418, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1485 }, { "clip_ratio": 0.0, - "completion_length": 1773.4888916015625, + "completion_length": 1539.9621276855469, "epoch": 0.44388021805690386, - "grad_norm": 5.7992048263549805, - "kl": 0.26611328125, - "learning_rate": 6.809354541380886e-08, - "loss": 0.0607, - "reward": 0.5987723469734192, - "reward_std": 0.20901568233966827, - "rewards/accuracy_reward": 0.15178571827709675, + "grad_norm": 74.43984985351562, + "kl": 2.060546875, + "learning_rate": 3.404677270690443e-07, + "loss": 0.1973, + "reward": 0.5848214477300644, + "reward_std": 0.1568812057375908, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4469866380095482, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1486 }, { "clip_ratio": 0.0, - "completion_length": 1785.9598999023438, + "completion_length": 1640.8572082519531, "epoch": 0.44417892614442533, - "grad_norm": 6.109893321990967, - "kl": 0.269287109375, - "learning_rate": 6.804491869295206e-08, - "loss": 0.0422, - "reward": 0.5619419813156128, - "reward_std": 0.17488190159201622, - "rewards/accuracy_reward": 0.11830358020961285, + "grad_norm": 37.137569427490234, + "kl": 2.84765625, + "learning_rate": 3.402245934647603e-07, + "loss": 0.2019, + "reward": 0.518415205180645, + "reward_std": 0.18236207589507103, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.443638414144516, + "rewards/tag_count_reward": 0.4157366305589676, "step": 1487 }, { "clip_ratio": 0.0, - "completion_length": 1733.5179443359375, + "completion_length": 1533.2389221191406, "epoch": 0.4444776342319468, - "grad_norm": 3.326977252960205, - "kl": 0.19091796875, - "learning_rate": 6.79962723409923e-08, - "loss": 0.0386, - "reward": 0.5591518208384514, - "reward_std": 0.12357292883098125, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 11.866114616394043, + "kl": 4.046875, + "learning_rate": 3.399813617049615e-07, + "loss": 0.2837, + "reward": 0.547991082072258, + "reward_std": 0.16181799583137035, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467633955180645, + "rewards/tag_count_reward": 0.4408482313156128, "step": 1488 }, { "clip_ratio": 0.0, - "completion_length": 1812.5045776367188, + "completion_length": 1586.3147888183594, "epoch": 0.4447763423194683, - "grad_norm": 2.3242106437683105, - "kl": 0.2294921875, - "learning_rate": 6.794760641085207e-08, - "loss": 0.0484, - "reward": 0.4933036044239998, - "reward_std": 0.1400228850543499, - "rewards/accuracy_reward": 0.04017857275903225, + "grad_norm": 8.2020902633667, + "kl": 3.83984375, + "learning_rate": 3.397380320542603e-07, + "loss": 0.2906, + "reward": 0.4659598395228386, + "reward_std": 0.13205212727189064, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4531250223517418, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1489 }, { "clip_ratio": 0.0, - "completion_length": 1755.0179138183594, + "completion_length": 1571.9844360351562, "epoch": 0.44507505040698975, - "grad_norm": 7.687953472137451, - "kl": 0.25341796875, - "learning_rate": 6.78989209554751e-08, - "loss": 0.0489, - "reward": 0.5424107313156128, - "reward_std": 0.13834340125322342, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 128.61541748046875, + "kl": 7.0625, + "learning_rate": 3.394946047773755e-07, + "loss": 0.4516, + "reward": 0.5005580633878708, + "reward_std": 0.14292092248797417, + "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4531250223517418, + "rewards/tag_count_reward": 0.4179687798023224, "step": 1490 }, { "clip_ratio": 0.0, - "completion_length": 1682.9933471679688, + "completion_length": 1519.2924499511719, "epoch": 0.4453737584945112, - "grad_norm": 5.7059645652771, - "kl": 0.2666015625, - "learning_rate": 6.785021602782644e-08, - "loss": 0.0669, - "reward": 0.5256696566939354, - "reward_std": 0.15839041396975517, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 45.22102737426758, + "kl": 4.16015625, + "learning_rate": 3.3925108013913217e-07, + "loss": 0.2896, + "reward": 0.5111607313156128, + "reward_std": 0.16807224228978157, + "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768059372902, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1491 }, { "clip_ratio": 0.0, - "completion_length": 1785.2612609863281, + "completion_length": 1505.8482360839844, "epoch": 0.4456724665820327, - "grad_norm": 3.7409005165100098, - "kl": 0.29443359375, - "learning_rate": 6.780149168089225e-08, - "loss": 0.0524, - "reward": 0.4676339402794838, - "reward_std": 0.15025346726179123, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 38.634185791015625, + "kl": 3.82421875, + "learning_rate": 3.390074584044612e-07, + "loss": 0.2456, + "reward": 0.467633955180645, + "reward_std": 0.13928714580833912, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4363839477300644, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1492 }, { "clip_ratio": 0.0, - "completion_length": 1675.4866943359375, + "completion_length": 1439.7679443359375, "epoch": 0.44597117466955416, - "grad_norm": 2.305168390274048, - "kl": 0.2490234375, - "learning_rate": 6.775274796767981e-08, - "loss": 0.0367, - "reward": 0.5703125298023224, - "reward_std": 0.1642828807234764, - "rewards/accuracy_reward": 0.10937500279396772, + "grad_norm": 15.070361137390137, + "kl": 3.1875, + "learning_rate": 3.3876373983839904e-07, + "loss": 0.2673, + "reward": 0.5390625149011612, + "reward_std": 0.1723210271447897, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4609375149011612, + "rewards/tag_count_reward": 0.447544664144516, "step": 1493 }, { "clip_ratio": 0.0, - "completion_length": 1810.9286499023438, + "completion_length": 1623.99560546875, "epoch": 0.44626988275707563, - "grad_norm": 8.755234718322754, - "kl": 0.328125, - "learning_rate": 6.770398494121756e-08, - "loss": 0.0375, - "reward": 0.490513414144516, - "reward_std": 0.1681601796299219, - "rewards/accuracy_reward": 0.04017857392318547, + "grad_norm": 38.99897384643555, + "kl": 4.4140625, + "learning_rate": 3.385199247060878e-07, + "loss": 0.2821, + "reward": 0.447544664144516, + "reward_std": 0.14903432875871658, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4503348395228386, + "rewards/tag_count_reward": 0.4252232387661934, "step": 1494 }, { "clip_ratio": 0.0, - "completion_length": 1851.3081359863281, + "completion_length": 1688.185302734375, "epoch": 0.4465685908445971, - "grad_norm": 4.7878098487854, - "kl": 0.2734375, - "learning_rate": 6.765520265455487e-08, - "loss": 0.0431, - "reward": 0.5223214626312256, - "reward_std": 0.17797556146979332, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 22.86096954345703, + "kl": 4.47265625, + "learning_rate": 3.382760132727744e-07, + "loss": 0.2946, + "reward": 0.5033482387661934, + "reward_std": 0.20057121478021145, + "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4464285895228386, + "rewards/tag_count_reward": 0.416294664144516, "step": 1495 }, { "clip_ratio": 0.0, - "completion_length": 1693.1965026855469, + "completion_length": 1551.3348999023438, "epoch": 0.4468672989321186, - "grad_norm": 1.8489055633544922, - "kl": 0.253173828125, - "learning_rate": 6.760640116076209e-08, - "loss": 0.0565, - "reward": 0.5574777126312256, - "reward_std": 0.1406924668699503, - "rewards/accuracy_reward": 0.10491072246804833, + "grad_norm": 38.5744743347168, + "kl": 4.9453125, + "learning_rate": 3.380320058038105e-07, + "loss": 0.3182, + "reward": 0.509486623108387, + "reward_std": 0.14495181292295456, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669887661934, + "rewards/tag_count_reward": 0.4179687649011612, "step": 1496 }, { "clip_ratio": 0.0, - "completion_length": 1719.7478637695312, + "completion_length": 1547.85498046875, "epoch": 0.44716600701964004, - "grad_norm": 6.144237995147705, - "kl": 0.22265625, - "learning_rate": 6.755758051293046e-08, - "loss": 0.0498, - "reward": 0.4882812649011612, - "reward_std": 0.10345883294939995, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 32.32548522949219, + "kl": 3.8125, + "learning_rate": 3.377879025646523e-07, + "loss": 0.305, + "reward": 0.4464285895228386, + "reward_std": 0.13425000198185444, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4614955559372902, + "rewards/tag_count_reward": 0.4263393059372902, "step": 1497 }, { "clip_ratio": 0.0, - "completion_length": 1808.837158203125, + "completion_length": 1646.5670166015625, "epoch": 0.4474647151071615, - "grad_norm": 5.352488040924072, - "kl": 0.3310546875, - "learning_rate": 6.750874076417205e-08, - "loss": 0.0434, - "reward": 0.5273437723517418, - "reward_std": 0.186806483194232, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 15.072153091430664, + "kl": 3.32421875, + "learning_rate": 3.375437038208603e-07, + "loss": 0.2123, + "reward": 0.491629496216774, + "reward_std": 0.160063149407506, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4425223395228386, + "rewards/tag_count_reward": 0.4224330559372902, "step": 1498 }, { "clip_ratio": 0.0, - "completion_length": 1818.85498046875, + "completion_length": 1623.4241943359375, "epoch": 0.447763423194683, - "grad_norm": 2.198775291442871, - "kl": 0.227783203125, - "learning_rate": 6.745988196761975e-08, - "loss": 0.0381, - "reward": 0.5329241380095482, - "reward_std": 0.18520035222172737, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 5.24835205078125, + "kl": 3.67578125, + "learning_rate": 3.372994098380987e-07, + "loss": 0.2491, + "reward": 0.5027901977300644, + "reward_std": 0.16960198618471622, + "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.454799123108387, + "rewards/tag_count_reward": 0.4268973395228386, "step": 1499 }, { "clip_ratio": 0.0, - "completion_length": 1628.3147888183594, + "completion_length": 1423.0603332519531, "epoch": 0.44806213128220446, - "grad_norm": 4.369704246520996, - "kl": 0.223388671875, - "learning_rate": 6.74110041764271e-08, - "loss": 0.048, - "reward": 0.518973246216774, - "reward_std": 0.16064660251140594, - "rewards/accuracy_reward": 0.06919643003493547, + "grad_norm": 19.80891227722168, + "kl": 3.4453125, + "learning_rate": 3.370550208821355e-07, + "loss": 0.2943, + "reward": 0.542410746216774, + "reward_std": 0.18154528737068176, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497767984867096, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1500 }, { "clip_ratio": 0.0, - "completion_length": 1716.2455749511719, + "completion_length": 1470.6898193359375, "epoch": 0.4483608393697259, - "grad_norm": 8.060559272766113, - "kl": 0.266845703125, - "learning_rate": 6.736210744376838e-08, - "loss": 0.0441, - "reward": 0.6344866380095482, - "reward_std": 0.12660973332822323, - "rewards/accuracy_reward": 0.16741072689183056, + "grad_norm": 29.724716186523438, + "kl": 3.375, + "learning_rate": 3.368105372188419e-07, + "loss": 0.2923, + "reward": 0.6149553805589676, + "reward_std": 0.1389238890260458, + "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.467075914144516, + "rewards/tag_count_reward": 0.4475446715950966, "step": 1501 }, { "clip_ratio": 0.0, - "completion_length": 1751.5648193359375, + "completion_length": 1558.1875915527344, "epoch": 0.4486595474572474, - "grad_norm": 4.674781322479248, - "kl": 0.271240234375, - "learning_rate": 6.731319182283844e-08, - "loss": 0.0354, - "reward": 0.517857164144516, - "reward_std": 0.18095066398382187, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 6.352499961853027, + "kl": 3.0234375, + "learning_rate": 3.3656595911419217e-07, + "loss": 0.2082, + "reward": 0.4860491380095482, + "reward_std": 0.18849169462919235, + "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4598214477300644, + "rewards/tag_count_reward": 0.4280134215950966, "step": 1502 }, { "clip_ratio": 0.0, - "completion_length": 1751.38623046875, + "completion_length": 1565.8036499023438, "epoch": 0.44895825554476887, - "grad_norm": 3.4245877265930176, - "kl": 0.251220703125, - "learning_rate": 6.726425736685265e-08, - "loss": 0.0425, - "reward": 0.5027901902794838, - "reward_std": 0.15551226399838924, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 18.030637741088867, + "kl": 4.5, + "learning_rate": 3.3632128683426323e-07, + "loss": 0.3112, + "reward": 0.4592634066939354, + "reward_std": 0.15346484631299973, + "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.451450914144516, + "rewards/tag_count_reward": 0.4324776902794838, "step": 1503 }, { "clip_ratio": 0.0, - "completion_length": 1767.430908203125, + "completion_length": 1601.0759582519531, "epoch": 0.44925696363229034, - "grad_norm": 1.7648547887802124, - "kl": 0.21142578125, - "learning_rate": 6.721530412904693e-08, - "loss": 0.0415, - "reward": 0.4843750149011612, - "reward_std": 0.1491565089672804, - "rewards/accuracy_reward": 0.024553572293370962, + "grad_norm": 43.94953155517578, + "kl": 5.046875, + "learning_rate": 3.360765206452346e-07, + "loss": 0.3307, + "reward": 0.4525669887661934, + "reward_std": 0.14980255626142025, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459821455180645, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1504 }, { "clip_ratio": 0.0, - "completion_length": 1714.2813110351562, + "completion_length": 1543.4889221191406, "epoch": 0.4495556717198118, - "grad_norm": 4.464597702026367, - "kl": 0.212646484375, - "learning_rate": 6.71663321626776e-08, - "loss": 0.0517, - "reward": 0.5658482313156128, - "reward_std": 0.130269268527627, - "rewards/accuracy_reward": 0.10044643259607255, + "grad_norm": 30.042516708374023, + "kl": 4.8359375, + "learning_rate": 3.35831660813388e-07, + "loss": 0.3344, + "reward": 0.5161830633878708, + "reward_std": 0.1402545627206564, + "rewards/accuracy_reward": 0.0848214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4654017984867096, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1505 }, { "clip_ratio": 0.0, - "completion_length": 1812.180908203125, + "completion_length": 1676.6049499511719, "epoch": 0.4498543798073333, - "grad_norm": 5.551121234893799, - "kl": 0.2646484375, - "learning_rate": 6.711734152102136e-08, - "loss": 0.0304, - "reward": 0.5418526977300644, - "reward_std": 0.1599228847771883, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 42.993568420410156, + "kl": 3.1953125, + "learning_rate": 3.355867076051068e-07, + "loss": 0.2286, + "reward": 0.4927455633878708, + "reward_std": 0.1646582894027233, + "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669887661934, + "rewards/tag_count_reward": 0.4146205559372902, "step": 1506 }, { "clip_ratio": 0.0, - "completion_length": 1738.5536499023438, + "completion_length": 1582.7590026855469, "epoch": 0.45015308789485475, - "grad_norm": 5.085516929626465, - "kl": 0.252197265625, - "learning_rate": 6.706833225737522e-08, - "loss": 0.0462, - "reward": 0.5106026977300644, - "reward_std": 0.13200715370476246, - "rewards/accuracy_reward": 0.05803571571595967, + "grad_norm": 13.065696716308594, + "kl": 4.1015625, + "learning_rate": 3.353416612868761e-07, + "loss": 0.2979, + "reward": 0.4949776902794838, + "reward_std": 0.16566613502800465, + "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669813156128, + "rewards/tag_count_reward": 0.4302455484867096, "step": 1507 }, { "clip_ratio": 0.0, - "completion_length": 1780.5179443359375, + "completion_length": 1593.8572387695312, "epoch": 0.4504517959823762, - "grad_norm": 3.296800374984741, - "kl": 0.254150390625, - "learning_rate": 6.701930442505648e-08, - "loss": 0.0347, - "reward": 0.5647321715950966, - "reward_std": 0.1523994691669941, - "rewards/accuracy_reward": 0.11383929289877415, + "grad_norm": 36.64884948730469, + "kl": 3.48828125, + "learning_rate": 3.350965221252824e-07, + "loss": 0.2778, + "reward": 0.548549123108387, + "reward_std": 0.14168071933090687, + "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.443638414144516, "step": 1508 }, { "clip_ratio": 0.0, - "completion_length": 1762.0670776367188, + "completion_length": 1591.6027526855469, "epoch": 0.4507505040698977, - "grad_norm": 5.8201518058776855, - "kl": 0.24560546875, - "learning_rate": 6.697025807740258e-08, - "loss": 0.0541, - "reward": 0.494419664144516, - "reward_std": 0.15148286893963814, - "rewards/accuracy_reward": 0.03571428684517741, + "grad_norm": 36.84761428833008, + "kl": 4.8046875, + "learning_rate": 3.348512903870129e-07, + "loss": 0.3053, + "reward": 0.4497767984867096, + "reward_std": 0.1509563997387886, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4587053805589676, + "rewards/tag_count_reward": 0.4274553805589676, "step": 1509 }, { "clip_ratio": 0.0, - "completion_length": 1752.6920471191406, + "completion_length": 1587.0179138183594, "epoch": 0.45104921215741917, - "grad_norm": 104.9288101196289, - "kl": 0.98681640625, - "learning_rate": 6.692119326777116e-08, - "loss": 0.0815, - "reward": 0.5781250149011612, - "reward_std": 0.13811324536800385, - "rewards/accuracy_reward": 0.12500000465661287, + "grad_norm": 30.43524169921875, + "kl": 3.6328125, + "learning_rate": 3.346059663388558e-07, + "loss": 0.2904, + "reward": 0.544642873108387, + "reward_std": 0.13620512187480927, + "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4531250223517418, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1510 }, { "clip_ratio": 0.0, - "completion_length": 1690.7456359863281, + "completion_length": 1455.8616943359375, "epoch": 0.45134792024494064, - "grad_norm": 5.091546535491943, - "kl": 0.228271484375, - "learning_rate": 6.687211004953991e-08, - "loss": 0.066, - "reward": 0.4469866305589676, - "reward_std": 0.12991252727806568, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 33.104095458984375, + "kl": 2.546875, + "learning_rate": 3.3436055024769956e-07, + "loss": 0.215, + "reward": 0.4609375223517418, + "reward_std": 0.10308028198778629, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4402901977300644, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1511 }, { "clip_ratio": 0.0, - "completion_length": 1740.0893859863281, + "completion_length": 1600.2656860351562, "epoch": 0.4516466283324621, - "grad_norm": 4.242791175842285, - "kl": 0.28515625, - "learning_rate": 6.682300847610658e-08, - "loss": 0.0468, - "reward": 0.521763414144516, - "reward_std": 0.18117473274469376, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 23.384510040283203, + "kl": 4.92578125, + "learning_rate": 3.341150423805329e-07, + "loss": 0.3487, + "reward": 0.4988839626312256, + "reward_std": 0.1812890563160181, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4369419887661934, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1512 }, { "clip_ratio": 0.0, - "completion_length": 1608.3036499023438, + "completion_length": 1417.7567749023438, "epoch": 0.4519453364199836, - "grad_norm": 3.3788812160491943, - "kl": 0.23974609375, - "learning_rate": 6.677388860088886e-08, - "loss": 0.0567, - "reward": 0.694754496216774, - "reward_std": 0.1574759017676115, - "rewards/accuracy_reward": 0.2433035783469677, + "grad_norm": 27.088300704956055, + "kl": 2.89453125, + "learning_rate": 3.3386944300444433e-07, + "loss": 0.2482, + "reward": 0.6875000447034836, + "reward_std": 0.18422217667102814, + "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.451450914144516, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1513 }, { "clip_ratio": 0.0, - "completion_length": 1677.2835693359375, + "completion_length": 1489.8304443359375, "epoch": 0.45224404450750505, - "grad_norm": 36.08264923095703, - "kl": 0.477294921875, - "learning_rate": 6.672475047732435e-08, - "loss": 0.0752, - "reward": 0.6372768133878708, - "reward_std": 0.13933584466576576, - "rewards/accuracy_reward": 0.1830357275903225, + "grad_norm": 9.002484321594238, + "kl": 3.859375, + "learning_rate": 3.336237523866218e-07, + "loss": 0.2967, + "reward": 0.6060268133878708, + "reward_std": 0.1347727421671152, + "rewards/accuracy_reward": 0.16964286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410895228386, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1514 }, { "clip_ratio": 0.0, - "completion_length": 1691.7277526855469, + "completion_length": 1484.7634582519531, "epoch": 0.4525427525950265, - "grad_norm": 5.863283157348633, - "kl": 0.24365234375, - "learning_rate": 6.667559415887054e-08, - "loss": 0.0524, - "reward": 0.5189732238650322, - "reward_std": 0.14016072638332844, - "rewards/accuracy_reward": 0.06250000093132257, + "grad_norm": 27.8223876953125, + "kl": 3.33203125, + "learning_rate": 3.333779707943527e-07, + "loss": 0.2867, + "reward": 0.5011160895228386, + "reward_std": 0.1420827228575945, + "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4564732313156128, + "rewards/tag_count_reward": 0.443080373108387, "step": 1515 }, { "clip_ratio": 0.0, - "completion_length": 1658.4822387695312, + "completion_length": 1449.5201416015625, "epoch": 0.452841460682548, - "grad_norm": 6.656203269958496, - "kl": 0.330810546875, - "learning_rate": 6.662641969900466e-08, - "loss": 0.0535, - "reward": 0.5809152126312256, - "reward_std": 0.17422176524996758, - "rewards/accuracy_reward": 0.12500000465661287, + "grad_norm": 52.28901672363281, + "kl": 2.767578125, + "learning_rate": 3.331320984950233e-07, + "loss": 0.2475, + "reward": 0.5719866305589676, + "reward_std": 0.1677960641682148, + "rewards/accuracy_reward": 0.12946429662406445, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.455915205180645, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1516 }, { "clip_ratio": 0.0, - "completion_length": 1816.4732971191406, + "completion_length": 1625.49560546875, "epoch": 0.45314016877006946, - "grad_norm": 4.262150287628174, - "kl": 0.276611328125, - "learning_rate": 6.657722715122371e-08, - "loss": 0.0534, - "reward": 0.5139509215950966, - "reward_std": 0.14562761411070824, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 60.30420684814453, + "kl": 6.2890625, + "learning_rate": 3.328861357561186e-07, + "loss": 0.4202, + "reward": 0.4910714477300644, + "reward_std": 0.13211988098919392, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4335937723517418, + "rewards/tag_count_reward": 0.4174107313156128, "step": 1517 }, { "clip_ratio": 0.0, - "completion_length": 1745.3661804199219, + "completion_length": 1580.3616638183594, "epoch": 0.45343887685759093, - "grad_norm": 3.7692458629608154, - "kl": 0.2216796875, - "learning_rate": 6.652801656904436e-08, - "loss": 0.04, - "reward": 0.581473246216774, - "reward_std": 0.13311555795371532, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 9.959226608276367, + "kl": 3.390625, + "learning_rate": 3.326400828452218e-07, + "loss": 0.2469, + "reward": 0.5703125223517418, + "reward_std": 0.14995422959327698, + "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.447544664144516, + "rewards/tag_count_reward": 0.431919664144516, "step": 1518 }, { "clip_ratio": 0.0, - "completion_length": 1792.0848999023438, + "completion_length": 1618.0402526855469, "epoch": 0.4537375849451124, - "grad_norm": 2.9019274711608887, - "kl": 0.31689453125, - "learning_rate": 6.64787880060029e-08, - "loss": 0.0488, - "reward": 0.4771205633878708, - "reward_std": 0.14316329918801785, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 15.71210765838623, + "kl": 4.37109375, + "learning_rate": 3.3239394003001453e-07, + "loss": 0.3111, + "reward": 0.4386160895228386, + "reward_std": 0.15605469979345798, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4436384066939354, + "rewards/tag_count_reward": 0.4140625223517418, "step": 1519 }, { "clip_ratio": 0.0, - "completion_length": 1777.5201721191406, + "completion_length": 1550.0982971191406, "epoch": 0.4540362930326339, - "grad_norm": 4.189798831939697, - "kl": 0.2255859375, - "learning_rate": 6.64295415156552e-08, - "loss": 0.0414, - "reward": 0.494419664144516, - "reward_std": 0.14198914170265198, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 53.627159118652344, + "kl": 4.71484375, + "learning_rate": 3.32147707578276e-07, + "loss": 0.3368, + "reward": 0.4765625223517418, + "reward_std": 0.15795458853244781, + "rewards/accuracy_reward": 0.042410714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768059372902, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1520 }, { "clip_ratio": 0.0, - "completion_length": 1736.1652526855469, + "completion_length": 1599.47998046875, "epoch": 0.45433500112015535, - "grad_norm": 10.14603328704834, - "kl": 0.37939453125, - "learning_rate": 6.638027715157659e-08, - "loss": 0.0496, - "reward": 0.463727705180645, - "reward_std": 0.11404452286660671, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 92.64781951904297, + "kl": 5.8515625, + "learning_rate": 3.3190138575788294e-07, + "loss": 0.3588, + "reward": 0.4285714477300644, + "reward_std": 0.1163560189306736, + "rewards/accuracy_reward": 0.0066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4503348395228386, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1521 }, { "clip_ratio": 0.0, - "completion_length": 1749.0647888183594, + "completion_length": 1469.4152221679688, "epoch": 0.4546337092076768, - "grad_norm": 4.326074600219727, - "kl": 0.3056640625, - "learning_rate": 6.633099496736187e-08, - "loss": 0.0558, + "grad_norm": 38.57954406738281, + "kl": 3.0703125, + "learning_rate": 3.316549748368094e-07, + "loss": 0.2453, "reward": 0.4938616305589676, - "reward_std": 0.14635180868208408, - "rewards/accuracy_reward": 0.051339287078008056, + "reward_std": 0.13310185261070728, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4425223395228386, + "rewards/tag_count_reward": 0.4402901977300644, "step": 1522 }, { "clip_ratio": 0.0, - "completion_length": 1746.4085388183594, + "completion_length": 1576.7790832519531, "epoch": 0.4549324172951983, - "grad_norm": 6.320223331451416, - "kl": 0.3095703125, - "learning_rate": 6.628169501662526e-08, - "loss": 0.0362, - "reward": 0.5106026902794838, - "reward_std": 0.17062296718358994, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 37.82828903198242, + "kl": 5.26171875, + "learning_rate": 3.314084750831263e-07, + "loss": 0.3584, + "reward": 0.4720982387661934, + "reward_std": 0.16154007986187935, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4525669887661934, + "rewards/tag_count_reward": 0.4296875298023224, "step": 1523 }, { "clip_ratio": 0.0, - "completion_length": 1785.0335693359375, + "completion_length": 1543.19873046875, "epoch": 0.45523112538271976, - "grad_norm": 29.362642288208008, - "kl": 0.52099609375, - "learning_rate": 6.623237735300024e-08, - "loss": 0.0801, - "reward": 0.5362723544239998, - "reward_std": 0.18161597102880478, + "grad_norm": 72.25894165039062, + "kl": 5.796875, + "learning_rate": 3.3116188676500116e-07, + "loss": 0.4166, + "reward": 0.528459832072258, + "reward_std": 0.16235975921154022, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4313616305589676, + "rewards/tag_count_reward": 0.4235491156578064, "step": 1524 }, { "clip_ratio": 0.0, - "completion_length": 1741.9331359863281, + "completion_length": 1505.185302734375, "epoch": 0.45552983347024123, - "grad_norm": 4.49923849105835, - "kl": 0.28173828125, - "learning_rate": 6.618304203013957e-08, - "loss": 0.0481, - "reward": 0.5273437574505806, - "reward_std": 0.1418753769248724, - "rewards/accuracy_reward": 0.07366071850992739, + "grad_norm": 18.44721031188965, + "kl": 4.01953125, + "learning_rate": 3.309152101506979e-07, + "loss": 0.2914, + "reward": 0.4994419887661934, + "reward_std": 0.12928788736462593, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4536830484867096, + "rewards/tag_count_reward": 0.4347098469734192, "step": 1525 }, { "clip_ratio": 0.0, - "completion_length": 1743.4710693359375, + "completion_length": 1553.6607971191406, "epoch": 0.4558285415577627, - "grad_norm": 4.344564437866211, - "kl": 0.22119140625, - "learning_rate": 6.613368910171531e-08, - "loss": 0.052, - "reward": 0.5279018133878708, - "reward_std": 0.15228487737476826, - "rewards/accuracy_reward": 0.07142857485450804, + "grad_norm": 35.04893493652344, + "kl": 2.89453125, + "learning_rate": 3.306684455085766e-07, + "loss": 0.2387, + "reward": 0.5279018208384514, + "reward_std": 0.1604275368154049, + "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4564732313156128, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1526 }, { "clip_ratio": 0.0, - "completion_length": 1754.2991943359375, + "completion_length": 1548.3014221191406, "epoch": 0.45612724964528417, - "grad_norm": 4.315561771392822, - "kl": 0.3037109375, - "learning_rate": 6.608431862141859e-08, - "loss": 0.0603, - "reward": 0.6093750298023224, - "reward_std": 0.15723979473114014, - "rewards/accuracy_reward": 0.16741072130389512, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4419643059372902, + "grad_norm": 11.50898551940918, + "kl": 4.09765625, + "learning_rate": 3.3042159310709294e-07, + "loss": 0.3116, + "reward": 0.6177455559372902, + "reward_std": 0.159631185233593, + "rewards/accuracy_reward": 0.18526786309666932, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4324776902794838, "step": 1527 }, { "clip_ratio": 0.0, - "completion_length": 1727.9554443359375, + "completion_length": 1595.4130249023438, "epoch": 0.45642595773280564, - "grad_norm": 5.377536296844482, - "kl": 0.316650390625, - "learning_rate": 6.603493064295962e-08, - "loss": 0.0556, - "reward": 0.5435268133878708, - "reward_std": 0.1838950328528881, - "rewards/accuracy_reward": 0.09375000861473382, + "grad_norm": 18.10394287109375, + "kl": 3.9296875, + "learning_rate": 3.3017465321479807e-07, + "loss": 0.3052, + "reward": 0.5133928805589676, + "reward_std": 0.19171272590756416, + "rewards/accuracy_reward": 0.08705357811413705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768133878708, + "rewards/tag_count_reward": 0.4263392984867096, "step": 1528 }, { "clip_ratio": 0.0, - "completion_length": 1746.0737609863281, + "completion_length": 1528.7857666015625, "epoch": 0.45672466582032706, - "grad_norm": 4.988655090332031, - "kl": 0.289794921875, - "learning_rate": 6.598552522006771e-08, - "loss": 0.0536, - "reward": 0.5033482387661934, - "reward_std": 0.1800468172878027, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 29.602020263671875, + "kl": 3.06640625, + "learning_rate": 3.299276261003386e-07, + "loss": 0.2645, + "reward": 0.494977705180645, + "reward_std": 0.19291643798351288, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4542410895228386, + "rewards/tag_count_reward": 0.443638414144516, "step": 1529 }, { "clip_ratio": 0.0, - "completion_length": 1701.1273193359375, + "completion_length": 1509.0514221191406, "epoch": 0.45702337390784853, - "grad_norm": 6.007601261138916, - "kl": 0.203369140625, - "learning_rate": 6.593610240649113e-08, - "loss": 0.0508, - "reward": 0.5982143133878708, - "reward_std": 0.18733007088303566, - "rewards/accuracy_reward": 0.1383928619325161, + "grad_norm": 10.063702583312988, + "kl": 2.4765625, + "learning_rate": 3.2968051203245563e-07, + "loss": 0.1769, + "reward": 0.6032366305589676, + "reward_std": 0.21050818637013435, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4598214477300644, + "rewards/tag_count_reward": 0.4514509066939354, "step": 1530 }, { "clip_ratio": 0.0, - "completion_length": 1758.1429748535156, + "completion_length": 1581.0714721679688, "epoch": 0.45732208199537, - "grad_norm": 4.976958274841309, - "kl": 0.29248046875, - "learning_rate": 6.588666225599701e-08, - "loss": 0.0547, - "reward": 0.5100446715950966, - "reward_std": 0.15009317360818386, - "rewards/accuracy_reward": 0.06250000186264515, + "grad_norm": 30.84069061279297, + "kl": 3.8046875, + "learning_rate": 3.294333112799851e-07, + "loss": 0.2768, + "reward": 0.4726562574505806, + "reward_std": 0.12849678099155426, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.447544664144516, + "rewards/tag_count_reward": 0.4257812649011612, "step": 1531 }, { "clip_ratio": 0.0, - "completion_length": 1684.7746276855469, + "completion_length": 1515.9844360351562, "epoch": 0.45762079008289147, - "grad_norm": 51.1270637512207, - "kl": 0.49365234375, - "learning_rate": 6.583720482237142e-08, - "loss": 0.0672, - "reward": 0.6316964626312256, - "reward_std": 0.21022489666938782, - "rewards/accuracy_reward": 0.196428582072258, + "grad_norm": 11.016000747680664, + "kl": 3.521484375, + "learning_rate": 3.291860241118571e-07, + "loss": 0.2573, + "reward": 0.6132812798023224, + "reward_std": 0.19825753197073936, + "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4352678805589676, + "rewards/tag_count_reward": 0.432477705180645, "step": 1532 }, { "clip_ratio": 0.0, - "completion_length": 1833.2232971191406, + "completion_length": 1682.74560546875, "epoch": 0.45791949817041294, - "grad_norm": 3.832747220993042, - "kl": 0.284912109375, - "learning_rate": 6.57877301594192e-08, - "loss": 0.0493, - "reward": 0.4838169887661934, - "reward_std": 0.1346392072737217, + "grad_norm": 22.07780647277832, + "kl": 3.6015625, + "learning_rate": 3.28938650797096e-07, + "loss": 0.2411, + "reward": 0.4570312798023224, + "reward_std": 0.13189779594540596, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4391741305589676, + "rewards/tag_count_reward": 0.412388414144516, "step": 1533 }, { "clip_ratio": 0.0, - "completion_length": 1796.6920471191406, + "completion_length": 1622.18310546875, "epoch": 0.4582182062579344, - "grad_norm": 7.391679763793945, - "kl": 0.269775390625, - "learning_rate": 6.573823832096394e-08, - "loss": 0.0521, - "reward": 0.498325914144516, - "reward_std": 0.1873866803944111, - "rewards/accuracy_reward": 0.049107145983725786, + "grad_norm": 5.965703964233398, + "kl": 3.65625, + "learning_rate": 3.286911916048197e-07, + "loss": 0.2527, + "reward": 0.4793526902794838, + "reward_std": 0.1735621690750122, + "rewards/accuracy_reward": 0.04687500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4492187723517418, + "rewards/tag_count_reward": 0.432477705180645, "step": 1534 }, { "clip_ratio": 0.0, - "completion_length": 1730.07373046875, + "completion_length": 1618.3572082519531, "epoch": 0.4585169143454559, - "grad_norm": 4.622313022613525, - "kl": 0.295166015625, - "learning_rate": 6.568872936084788e-08, - "loss": 0.0688, - "reward": 0.5853794887661934, - "reward_std": 0.16905754432082176, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 72.41387176513672, + "kl": 4.8203125, + "learning_rate": 3.284436468042394e-07, + "loss": 0.3124, + "reward": 0.5680803805589676, + "reward_std": 0.15009262412786484, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4469866305589676, + "rewards/tag_count_reward": 0.431919664144516, "step": 1535 }, { "clip_ratio": 0.0, - "completion_length": 1732.8773193359375, + "completion_length": 1566.0513916015625, "epoch": 0.45881562243297735, - "grad_norm": 2.1004316806793213, - "kl": 0.294677734375, - "learning_rate": 6.563920333293194e-08, - "loss": 0.0406, - "reward": 0.5597098544239998, - "reward_std": 0.16391370445489883, - "rewards/accuracy_reward": 0.10491071874275804, + "grad_norm": 6.137658596038818, + "kl": 4.25390625, + "learning_rate": 3.2819601666465975e-07, + "loss": 0.307, + "reward": 0.5340401902794838, + "reward_std": 0.1548993308097124, + "rewards/accuracy_reward": 0.10044643189758062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4547991305589676, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1536 }, { "clip_ratio": 0.0, - "completion_length": 1679.8594665527344, + "completion_length": 1534.4554443359375, "epoch": 0.4591143305204988, - "grad_norm": 3.4295976161956787, - "kl": 0.270751953125, - "learning_rate": 6.558966029109561e-08, - "loss": 0.0485, - "reward": 0.5658482313156128, - "reward_std": 0.1517888680100441, - "rewards/accuracy_reward": 0.11607143771834671, + "grad_norm": 54.062076568603516, + "kl": 4.828125, + "learning_rate": 3.2794830145547804e-07, + "loss": 0.3313, + "reward": 0.5401785895228386, + "reward_std": 0.16778460703790188, + "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4497768133878708, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1537 }, { "clip_ratio": 0.0, - "completion_length": 1812.7188415527344, + "completion_length": 1638.7254943847656, "epoch": 0.4594130386080203, - "grad_norm": 4.3170013427734375, - "kl": 0.3125, - "learning_rate": 6.554010028923681e-08, - "loss": 0.043, - "reward": 0.5786830633878708, - "reward_std": 0.14550139755010605, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 89.19023132324219, + "kl": 5.52734375, + "learning_rate": 3.2770050144618405e-07, + "loss": 0.334, + "reward": 0.553571455180645, + "reward_std": 0.1346146371215582, + "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4402901977300644, + "rewards/tag_count_reward": 0.419642873108387, "step": 1538 }, { "clip_ratio": 0.0, - "completion_length": 1705.1407165527344, + "completion_length": 1447.7433776855469, "epoch": 0.45971174669554177, - "grad_norm": 7.492647171020508, - "kl": 0.352294921875, - "learning_rate": 6.549052338127198e-08, - "loss": 0.0459, - "reward": 0.5831473395228386, - "reward_std": 0.15557076409459114, - "rewards/accuracy_reward": 0.12276785937137902, + "grad_norm": 12.270461082458496, + "kl": 3.30859375, + "learning_rate": 3.2745261690635987e-07, + "loss": 0.2763, + "reward": 0.5664062798023224, + "reward_std": 0.16433368995785713, + "rewards/accuracy_reward": 0.11607143632136285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4603794813156128, + "rewards/tag_count_reward": 0.4503348469734192, "step": 1539 }, { "clip_ratio": 0.0, - "completion_length": 1754.3505249023438, + "completion_length": 1611.8148193359375, "epoch": 0.46001045478306324, - "grad_norm": 3.0670812129974365, - "kl": 0.255126953125, - "learning_rate": 6.544092962113596e-08, - "loss": 0.046, - "reward": 0.4905134215950966, - "reward_std": 0.1511122677475214, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 11.946927070617676, + "kl": 3.517578125, + "learning_rate": 3.2720464810567985e-07, + "loss": 0.2456, + "reward": 0.4743303880095482, + "reward_std": 0.16173738799989223, + "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.459263414144516, + "rewards/tag_count_reward": 0.4408482238650322, "step": 1540 }, { "clip_ratio": 0.0, - "completion_length": 1740.9911193847656, + "completion_length": 1504.0357666015625, "epoch": 0.4603091628705847, - "grad_norm": 7.682904243469238, - "kl": 0.259765625, - "learning_rate": 6.539131906278188e-08, - "loss": 0.0654, - "reward": 0.5390625298023224, - "reward_std": 0.180904658511281, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 19.580068588256836, + "kl": 3.3203125, + "learning_rate": 3.269565953139094e-07, + "loss": 0.2606, + "reward": 0.522321455180645, + "reward_std": 0.17548412643373013, + "rewards/accuracy_reward": 0.08482143585570157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4430803805589676, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1541 }, { "clip_ratio": 0.0, - "completion_length": 1829.0491943359375, + "completion_length": 1650.96435546875, "epoch": 0.4606078709581062, - "grad_norm": 4.020365238189697, - "kl": 0.341064453125, - "learning_rate": 6.534169176018117e-08, - "loss": 0.0519, - "reward": 0.5245535969734192, - "reward_std": 0.1716657616198063, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 28.742706298828125, + "kl": 4.46484375, + "learning_rate": 3.2670845880090587e-07, + "loss": 0.2845, + "reward": 0.5133928880095482, + "reward_std": 0.14286575093865395, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4308035895228386, + "rewards/tag_count_reward": 0.4174107387661934, "step": 1542 }, { "clip_ratio": 0.0, - "completion_length": 1829.2657165527344, + "completion_length": 1719.8438415527344, "epoch": 0.46090657904562765, - "grad_norm": 13.73774242401123, - "kl": 0.379638671875, - "learning_rate": 6.529204776732348e-08, - "loss": 0.0566, - "reward": 0.4497768059372902, - "reward_std": 0.15785526297986507, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 40.35120391845703, + "kl": 5.37109375, + "learning_rate": 3.264602388366174e-07, + "loss": 0.3464, + "reward": 0.4051339402794838, + "reward_std": 0.1628190465271473, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4296875298023224, + "rewards/tag_count_reward": 0.3917410895228386, "step": 1543 }, { "clip_ratio": 0.0, - "completion_length": 1751.0781860351562, + "completion_length": 1552.8527221679688, "epoch": 0.4612052871331491, - "grad_norm": 5.338003635406494, - "kl": 0.197998046875, - "learning_rate": 6.524238713821661e-08, - "loss": 0.0418, - "reward": 0.5234375223517418, - "reward_std": 0.13006016984581947, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 89.93563842773438, + "kl": 1.576171875, + "learning_rate": 3.26211935691083e-07, + "loss": 0.1509, + "reward": 0.5440848544239998, + "reward_std": 0.1459367722272873, + "rewards/accuracy_reward": 0.09375000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4587053805589676, + "rewards/tag_count_reward": 0.4503348544239998, "step": 1544 }, { "clip_ratio": 0.0, - "completion_length": 1699.8995971679688, + "completion_length": 1517.7723999023438, "epoch": 0.4615039952206706, - "grad_norm": 6.159023761749268, - "kl": 0.370849609375, - "learning_rate": 6.519270992688642e-08, - "loss": 0.0591, - "reward": 0.5435268133878708, - "reward_std": 0.20818424224853516, - "rewards/accuracy_reward": 0.10491071757860482, + "grad_norm": 85.09063720703125, + "kl": 2.30859375, + "learning_rate": 3.259635496344321e-07, + "loss": 0.2216, + "reward": 0.5558036044239998, + "reward_std": 0.1861688680946827, + "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4386160969734192, + "rewards/tag_count_reward": 0.435267873108387, "step": 1545 }, { "clip_ratio": 0.0, - "completion_length": 1757.8951721191406, + "completion_length": 1596.4263916015625, "epoch": 0.46180270330819206, - "grad_norm": 4.535447597503662, - "kl": 0.326171875, - "learning_rate": 6.514301618737689e-08, - "loss": 0.0666, - "reward": 0.471540205180645, - "reward_std": 0.15616635233163834, - "rewards/accuracy_reward": 0.026785715948790312, + "grad_norm": 34.276458740234375, + "kl": 3.40625, + "learning_rate": 3.2571508093688443e-07, + "loss": 0.2693, + "reward": 0.458705373108387, + "reward_std": 0.14789941906929016, + "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4447544887661934, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1546 }, { "clip_ratio": 0.0, - "completion_length": 1765.2857971191406, + "completion_length": 1615.2925109863281, "epoch": 0.46210141139571353, - "grad_norm": 5.503843307495117, - "kl": 0.321044921875, - "learning_rate": 6.509330597374992e-08, - "loss": 0.0524, - "reward": 0.459821455180645, - "reward_std": 0.1394188068807125, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 33.636871337890625, + "kl": 3.41796875, + "learning_rate": 3.254665298687496e-07, + "loss": 0.2709, + "reward": 0.4375000149011612, + "reward_std": 0.13667511753737926, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4464285895228386, + "rewards/tag_count_reward": 0.4285714402794838, "step": 1547 }, { "clip_ratio": 0.0, - "completion_length": 1802.2701721191406, + "completion_length": 1665.727783203125, "epoch": 0.462400119483235, - "grad_norm": 7.010849952697754, - "kl": 0.3447265625, - "learning_rate": 6.504357934008536e-08, - "loss": 0.0722, - "reward": 0.5379464402794838, - "reward_std": 0.17844453267753124, - "rewards/accuracy_reward": 0.11607143376022577, + "grad_norm": 17.28813362121582, + "kl": 4.625, + "learning_rate": 3.2521789670042683e-07, + "loss": 0.2981, + "reward": 0.5340401902794838, + "reward_std": 0.18080026656389236, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4218750223517418, + "rewards/tag_count_reward": 0.4112723469734192, "step": 1548 }, { "clip_ratio": 0.0, - "completion_length": 1713.5335388183594, + "completion_length": 1516.3594360351562, "epoch": 0.4626988275707565, - "grad_norm": 4.498828411102295, - "kl": 0.249755859375, - "learning_rate": 6.499383634048091e-08, - "loss": 0.0536, - "reward": 0.5217633992433548, - "reward_std": 0.13293885439634323, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 22.431278228759766, + "kl": 4.3359375, + "learning_rate": 3.249691817024046e-07, + "loss": 0.2993, + "reward": 0.4960937723517418, + "reward_std": 0.1388443373143673, + "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4436384066939354, + "rewards/tag_count_reward": 0.4268973395228386, "step": 1549 }, { "clip_ratio": 0.0, - "completion_length": 1753.2969360351562, + "completion_length": 1526.3348999023438, "epoch": 0.46299753565827795, - "grad_norm": 9.077760696411133, - "kl": 0.349609375, - "learning_rate": 6.494407702905207e-08, - "loss": 0.0662, - "reward": 0.4944196566939354, - "reward_std": 0.17740370333194733, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 30.365924835205078, + "kl": 4.6171875, + "learning_rate": 3.2472038514526036e-07, + "loss": 0.3297, + "reward": 0.494419664144516, + "reward_std": 0.15363386273384094, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.431919664144516, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1550 }, { "clip_ratio": 0.0, - "completion_length": 1778.8683776855469, + "completion_length": 1559.0045471191406, "epoch": 0.4632962437457994, - "grad_norm": 5.229265213012695, - "kl": 0.293701171875, - "learning_rate": 6.489430145993209e-08, - "loss": 0.0493, - "reward": 0.5262276977300644, - "reward_std": 0.13358874432742596, + "grad_norm": 38.4730224609375, + "kl": 2.8125, + "learning_rate": 3.244715072996605e-07, + "loss": 0.2349, + "reward": 0.5323661044239998, + "reward_std": 0.12168741971254349, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062649011612, + "rewards/tag_count_reward": 0.447544664144516, "step": 1551 }, { "clip_ratio": 0.0, - "completion_length": 1750.6451721191406, + "completion_length": 1571.024658203125, "epoch": 0.4635949518333209, - "grad_norm": 5.818874359130859, - "kl": 0.3056640625, - "learning_rate": 6.484450968727195e-08, - "loss": 0.0598, - "reward": 0.5496651977300644, - "reward_std": 0.18730313703417778, - "rewards/accuracy_reward": 0.11160715017467737, + "grad_norm": 10.339950561523438, + "kl": 3.9140625, + "learning_rate": 3.242225484363597e-07, + "loss": 0.3001, + "reward": 0.5385044887661934, + "reward_std": 0.199076347053051, + "rewards/accuracy_reward": 0.10491071990691125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4380580559372902, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1552 }, { "clip_ratio": 0.0, - "completion_length": 1748.5000915527344, + "completion_length": 1556.0134582519531, "epoch": 0.46389365992084236, - "grad_norm": 4.385819911956787, - "kl": 0.29638671875, - "learning_rate": 6.479470176524014e-08, - "loss": 0.0604, - "reward": 0.4960937723517418, - "reward_std": 0.14529666863381863, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 9.494263648986816, + "kl": 3.9296875, + "learning_rate": 3.2397350882620073e-07, + "loss": 0.3112, + "reward": 0.4838169887661934, + "reward_std": 0.13662613928318024, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4425223395228386, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1553 }, { "clip_ratio": 0.0, - "completion_length": 1759.5134887695312, + "completion_length": 1632.8192749023438, "epoch": 0.46419236800836383, - "grad_norm": 2.454691171646118, - "kl": 0.365234375, - "learning_rate": 6.474487774802289e-08, - "loss": 0.0568, - "reward": 0.4815848469734192, - "reward_std": 0.14757275953888893, - "rewards/accuracy_reward": 0.04910714505240321, + "grad_norm": 10.194806098937988, + "kl": 4.4453125, + "learning_rate": 3.2372438874011444e-07, + "loss": 0.3105, + "reward": 0.4598214477300644, + "reward_std": 0.12361835315823555, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4324776977300644, + "rewards/tag_count_reward": 0.4174107313156128, "step": 1554 }, { "clip_ratio": 0.0, - "completion_length": 1783.0134887695312, + "completion_length": 1631.2791137695312, "epoch": 0.4644910760958853, - "grad_norm": 2.363880157470703, - "kl": 0.2734375, - "learning_rate": 6.469503768982378e-08, - "loss": 0.048, - "reward": 0.5195312798023224, - "reward_std": 0.1504480503499508, - "rewards/accuracy_reward": 0.06473214784637094, + "grad_norm": 5.9457268714904785, + "kl": 4.29296875, + "learning_rate": 3.2347518844911894e-07, + "loss": 0.3065, + "reward": 0.494977705180645, + "reward_std": 0.1569997202605009, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.454799123108387, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1555 }, { "clip_ratio": 0.0, - "completion_length": 1765.5112609863281, + "completion_length": 1629.4732971191406, "epoch": 0.4647897841834068, - "grad_norm": 2.9426026344299316, - "kl": 0.271728515625, - "learning_rate": 6.464518164486395e-08, - "loss": 0.0575, - "reward": 0.5887277126312256, - "reward_std": 0.24885709397494793, - "rewards/accuracy_reward": 0.1517857201397419, + "grad_norm": 56.91993713378906, + "kl": 4.5078125, + "learning_rate": 3.2322590822431973e-07, + "loss": 0.2911, + "reward": 0.5837053954601288, + "reward_std": 0.23714406602084637, + "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4369419813156128, + "rewards/tag_count_reward": 0.427455373108387, "step": 1556 }, { "clip_ratio": 0.0, - "completion_length": 1719.9933776855469, + "completion_length": 1543.5692749023438, "epoch": 0.46508849227092824, - "grad_norm": 3.367624521255493, - "kl": 0.267822265625, - "learning_rate": 6.459530966738185e-08, - "loss": 0.0537, - "reward": 0.526227705180645, - "reward_std": 0.1424461118876934, - "rewards/accuracy_reward": 0.06919643399305642, + "grad_norm": 29.340484619140625, + "kl": 4.2109375, + "learning_rate": 3.2297654833690923e-07, + "loss": 0.2816, + "reward": 0.5083705559372902, + "reward_std": 0.14019900187849998, + "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4570312723517418, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1557 }, { "clip_ratio": 0.0, - "completion_length": 1746.6161499023438, + "completion_length": 1581.0335693359375, "epoch": 0.4653872003584497, - "grad_norm": 2.040289878845215, - "kl": 0.27783203125, - "learning_rate": 6.454542181163334e-08, - "loss": 0.0501, - "reward": 0.5558035969734192, - "reward_std": 0.16366050951182842, - "rewards/accuracy_reward": 0.10491071874275804, + "grad_norm": 90.05815887451172, + "kl": 2.0546875, + "learning_rate": 3.227271090581667e-07, + "loss": 0.1945, + "reward": 0.5758928880095482, + "reward_std": 0.17292364686727524, + "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.444196455180645, "step": 1558 }, { "clip_ratio": 0.0, - "completion_length": 1747.7902526855469, + "completion_length": 1599.9576721191406, "epoch": 0.4656859084459712, - "grad_norm": 4.2875752449035645, - "kl": 0.35888671875, - "learning_rate": 6.44955181318915e-08, - "loss": 0.0513, - "reward": 0.486607164144516, - "reward_std": 0.19215291179716587, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 65.6618881225586, + "kl": 2.81640625, + "learning_rate": 3.2247759065945745e-07, + "loss": 0.2279, + "reward": 0.4419643059372902, + "reward_std": 0.16049774549901485, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.439732164144516, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1559 }, { "clip_ratio": 0.0, - "completion_length": 1851.7813415527344, + "completion_length": 1693.5536804199219, "epoch": 0.46598461653349266, - "grad_norm": 23.876989364624023, - "kl": 0.5634765625, - "learning_rate": 6.444559868244664e-08, - "loss": 0.0626, - "reward": 0.504464291036129, - "reward_std": 0.20746169239282608, - "rewards/accuracy_reward": 0.07812500209547579, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.424107164144516, + "grad_norm": 37.3591423034668, + "kl": 4.9921875, + "learning_rate": 3.222279934122332e-07, + "loss": 0.3103, + "reward": 0.4642857313156128, + "reward_std": 0.1592205073684454, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500149011612, "step": 1560 }, { "clip_ratio": 0.0, - "completion_length": 1785.8505249023438, + "completion_length": 1608.9241638183594, "epoch": 0.46628332462101413, - "grad_norm": 16.395713806152344, - "kl": 0.552734375, - "learning_rate": 6.439566351760624e-08, - "loss": 0.0628, - "reward": 0.4893973469734192, - "reward_std": 0.15524799190461636, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 61.330169677734375, + "kl": 2.08203125, + "learning_rate": 3.219783175880312e-07, + "loss": 0.1713, + "reward": 0.5217634290456772, + "reward_std": 0.1701500378549099, + "rewards/accuracy_reward": 0.08482143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4335937723517418, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1561 }, { "clip_ratio": 0.0, - "completion_length": 1793.305908203125, + "completion_length": 1619.2232666015625, "epoch": 0.4665820327085356, - "grad_norm": 4.487662315368652, - "kl": 0.323974609375, - "learning_rate": 6.434571269169486e-08, - "loss": 0.0515, - "reward": 0.5044643133878708, - "reward_std": 0.20387940853834152, - "rewards/accuracy_reward": 0.06473214505240321, + "grad_norm": 48.30647277832031, + "kl": 2.8671875, + "learning_rate": 3.217285634584743e-07, + "loss": 0.2203, + "reward": 0.4732143059372902, + "reward_std": 0.17660940252244473, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.439732164144516, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1562 }, { "clip_ratio": 0.0, - "completion_length": 1638.21435546875, + "completion_length": 1475.6384582519531, "epoch": 0.46688074079605707, - "grad_norm": 8.489108085632324, - "kl": 0.235107421875, - "learning_rate": 6.429574625905411e-08, - "loss": 0.0493, - "reward": 0.5552455708384514, - "reward_std": 0.17934800684452057, - "rewards/accuracy_reward": 0.1004464365541935, + "grad_norm": 42.27071762084961, + "kl": 2.3984375, + "learning_rate": 3.2147873129527054e-07, + "loss": 0.1991, + "reward": 0.5552455633878708, + "reward_std": 0.18660282716155052, + "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4547991305589676, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1563 }, { "clip_ratio": 0.0, - "completion_length": 1712.4219360351562, + "completion_length": 1567.2545471191406, "epoch": 0.46717944888357854, - "grad_norm": 9.873926162719727, - "kl": 0.369384765625, - "learning_rate": 6.424576427404255e-08, - "loss": 0.0768, - "reward": 0.4866071790456772, - "reward_std": 0.19306475669145584, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 62.49799346923828, + "kl": 4.78125, + "learning_rate": 3.2122882137021273e-07, + "loss": 0.3108, + "reward": 0.470982164144516, + "reward_std": 0.18207499384880066, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4441964402794838, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1564 }, { "clip_ratio": 0.0, - "completion_length": 1772.60498046875, + "completion_length": 1621.69873046875, "epoch": 0.4674781569711, - "grad_norm": 3.071673631668091, - "kl": 0.384765625, - "learning_rate": 6.41957667910357e-08, - "loss": 0.059, - "reward": 0.543526791036129, - "reward_std": 0.17389465868473053, - "rewards/accuracy_reward": 0.10937500651925802, + "grad_norm": 61.61653137207031, + "kl": 5.3984375, + "learning_rate": 3.209788339551785e-07, + "loss": 0.3547, + "reward": 0.5156250223517418, + "reward_std": 0.15292499586939812, + "rewards/accuracy_reward": 0.10044643492437899, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4341518059372902, + "rewards/tag_count_reward": 0.4151785969734192, "step": 1565 }, { "clip_ratio": 0.0, - "completion_length": 1799.1920471191406, + "completion_length": 1650.9040832519531, "epoch": 0.4677768650586215, - "grad_norm": 5.487895965576172, - "kl": 0.36474609375, - "learning_rate": 6.414575386442592e-08, - "loss": 0.0529, - "reward": 0.4949776977300644, - "reward_std": 0.15885735116899014, - "rewards/accuracy_reward": 0.06250000186264515, + "grad_norm": 103.44009399414062, + "kl": 5.984375, + "learning_rate": 3.207287693221296e-07, + "loss": 0.3605, + "reward": 0.479352705180645, + "reward_std": 0.1632378101348877, + "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4324776977300644, + "rewards/tag_count_reward": 0.4190848469734192, "step": 1566 }, { "clip_ratio": 0.0, - "completion_length": 1587.868408203125, - "epoch": 0.46807557314614295, - "grad_norm": 6.504334926605225, - "kl": 0.345703125, - "learning_rate": 6.409572554862237e-08, - "loss": 0.0968, - "reward": 0.5239955708384514, - "reward_std": 0.20595330372452736, - "rewards/accuracy_reward": 0.09151786006987095, + "completion_length": 1393.1496276855469, + "epoch": 0.46807557314614295, + "grad_norm": 38.79689407348633, + "kl": 4.21875, + "learning_rate": 3.2047862774311187e-07, + "loss": 0.3088, + "reward": 0.5390625298023224, + "reward_std": 0.18461522832512856, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4324776977300644, + "rewards/tag_count_reward": 0.4453125298023224, "step": 1567 }, { "clip_ratio": 0.0, - "completion_length": 1679.3460388183594, + "completion_length": 1518.2344360351562, "epoch": 0.4683742812336644, - "grad_norm": 2.0752196311950684, - "kl": 0.29833984375, - "learning_rate": 6.404568189805094e-08, - "loss": 0.0568, - "reward": 0.6568080708384514, - "reward_std": 0.15356515534222126, - "rewards/accuracy_reward": 0.2053571492433548, + "grad_norm": 33.23543930053711, + "kl": 3.984375, + "learning_rate": 3.202284094902547e-07, + "loss": 0.2903, + "reward": 0.6383928805589676, + "reward_std": 0.16594479978084564, + "rewards/accuracy_reward": 0.19196429592557251, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4514509215950966, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1568 }, { "clip_ratio": 0.0, - "completion_length": 1731.90185546875, + "completion_length": 1589.930908203125, "epoch": 0.4686729893211859, - "grad_norm": 4.472391605377197, - "kl": 0.40283203125, - "learning_rate": 6.399562296715424e-08, - "loss": 0.0618, - "reward": 0.5184152126312256, - "reward_std": 0.20998453721404076, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 25.382930755615234, + "kl": 4.2109375, + "learning_rate": 3.199781148357712e-07, + "loss": 0.3079, + "reward": 0.5027901977300644, + "reward_std": 0.22245344147086143, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4246651902794838, + "rewards/tag_count_reward": 0.420200914144516, "step": 1569 }, { "clip_ratio": 0.0, - "completion_length": 1735.5089721679688, + "completion_length": 1608.415283203125, "epoch": 0.46897169740870737, - "grad_norm": 11.224042892456055, - "kl": 0.37158203125, - "learning_rate": 6.394554881039146e-08, - "loss": 0.0504, - "reward": 0.523437537252903, - "reward_std": 0.12735274992883205, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 19.218530654907227, + "kl": 3.4140625, + "learning_rate": 3.1972774405195736e-07, + "loss": 0.2359, + "reward": 0.5251116454601288, + "reward_std": 0.17153823748230934, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4564732313156128, + "rewards/tag_count_reward": 0.4335937723517418, "step": 1570 }, { "clip_ratio": 0.0, - "completion_length": 1765.3728332519531, + "completion_length": 1629.10498046875, "epoch": 0.46927040549622884, - "grad_norm": 5.138625144958496, - "kl": 0.42724609375, - "learning_rate": 6.38954594822384e-08, - "loss": 0.0664, - "reward": 0.5474330633878708, - "reward_std": 0.1920524314045906, - "rewards/accuracy_reward": 0.11607143841683865, + "grad_norm": 34.38093185424805, + "kl": 3.37890625, + "learning_rate": 3.19477297411192e-07, + "loss": 0.2601, + "reward": 0.5418527126312256, + "reward_std": 0.1840256005525589, + "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4313616305589676, + "rewards/tag_count_reward": 0.4235491305589676, "step": 1571 }, { "clip_ratio": 0.0, - "completion_length": 1796.8951721191406, + "completion_length": 1644.6473999023438, "epoch": 0.46956911358375025, - "grad_norm": 3.310072183609009, - "kl": 0.37109375, - "learning_rate": 6.384535503718732e-08, - "loss": 0.0557, - "reward": 0.479910746216774, - "reward_std": 0.14197684079408646, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 31.99071502685547, + "kl": 3.2109375, + "learning_rate": 3.192267751859366e-07, + "loss": 0.2251, + "reward": 0.4681919887661934, + "reward_std": 0.11340091750025749, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4263393133878708, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1572 }, { "clip_ratio": 0.0, - "completion_length": 1741.6451721191406, + "completion_length": 1619.4263916015625, "epoch": 0.4698678216712717, - "grad_norm": 9.207204818725586, - "kl": 0.352294921875, - "learning_rate": 6.379523552974695e-08, - "loss": 0.06, - "reward": 0.5212053805589676, - "reward_std": 0.11646961979568005, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 44.48149871826172, + "kl": 3.2265625, + "learning_rate": 3.1897617764873477e-07, + "loss": 0.2344, + "reward": 0.5279018059372902, + "reward_std": 0.1346139870584011, + "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.443080373108387, + "rewards/tag_count_reward": 0.4408482313156128, "step": 1573 }, { "clip_ratio": 0.0, - "completion_length": 1636.2656860351562, + "completion_length": 1505.6206359863281, "epoch": 0.4701665297587932, - "grad_norm": 3.1511409282684326, - "kl": 0.348876953125, - "learning_rate": 6.374510101444242e-08, - "loss": 0.0767, - "reward": 0.5803571790456772, - "reward_std": 0.15102360770106316, - "rewards/accuracy_reward": 0.14285714738070965, + "grad_norm": 31.10768699645996, + "kl": 2.376953125, + "learning_rate": 3.187255050722121e-07, + "loss": 0.2023, + "reward": 0.6116071790456772, + "reward_std": 0.14574355073273182, + "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4375000223517418, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1574 }, { "clip_ratio": 0.0, - "completion_length": 1739.3616638183594, + "completion_length": 1603.7388916015625, "epoch": 0.47046523784631467, - "grad_norm": 2.8934614658355713, - "kl": 0.31298828125, - "learning_rate": 6.369495154581511e-08, - "loss": 0.043, - "reward": 0.5111607387661934, - "reward_std": 0.11986970342695713, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 23.50874137878418, + "kl": 2.97265625, + "learning_rate": 3.184747577290756e-07, + "loss": 0.2236, + "reward": 0.5027901977300644, + "reward_std": 0.11774946935474873, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4508928805589676, + "rewards/tag_count_reward": 0.4492187723517418, "step": 1575 }, { "clip_ratio": 0.0, - "completion_length": 1735.5804138183594, + "completion_length": 1535.8661499023438, "epoch": 0.47076394593383614, - "grad_norm": 17.844751358032227, - "kl": 0.53369140625, - "learning_rate": 6.36447871784228e-08, - "loss": 0.0534, - "reward": 0.511160746216774, - "reward_std": 0.17645346000790596, - "rewards/accuracy_reward": 0.06919643189758062, + "grad_norm": 10.750513076782227, + "kl": 2.921875, + "learning_rate": 3.18223935892114e-07, + "loss": 0.2114, + "reward": 0.5312500223517418, + "reward_std": 0.1793605498969555, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4419643059372902, + "rewards/tag_count_reward": 0.4441964477300644, "step": 1576 }, { "clip_ratio": 0.0, - "completion_length": 1732.399658203125, + "completion_length": 1622.2723999023438, "epoch": 0.4710626540213576, - "grad_norm": 4.609043598175049, - "kl": 0.47607421875, - "learning_rate": 6.359460796683936e-08, - "loss": 0.0709, - "reward": 0.5044643133878708, - "reward_std": 0.1358845140784979, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 9.507172584533691, + "kl": 4.21875, + "learning_rate": 3.1797303983419684e-07, + "loss": 0.2863, + "reward": 0.4888393059372902, + "reward_std": 0.12348124012351036, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4263393059372902, + "rewards/tag_count_reward": 0.4263393133878708, "step": 1577 }, { "clip_ratio": 0.0, - "completion_length": 1658.6094360351562, + "completion_length": 1501.0447082519531, "epoch": 0.4713613621088791, - "grad_norm": 7.840850353240967, - "kl": 0.400390625, - "learning_rate": 6.354441396565486e-08, - "loss": 0.07, - "reward": 0.5178571715950966, - "reward_std": 0.18210314959287643, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 17.04880714416504, + "kl": 3.32421875, + "learning_rate": 3.177220698282743e-07, + "loss": 0.2475, + "reward": 0.5206473544239998, + "reward_std": 0.1659452598541975, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4375000223517418, + "rewards/tag_count_reward": 0.4358259215950966, "step": 1578 }, { "clip_ratio": 0.0, - "completion_length": 1790.6607971191406, + "completion_length": 1623.4844665527344, "epoch": 0.47166007019640055, - "grad_norm": 8.148682594299316, - "kl": 0.5263671875, - "learning_rate": 6.349420522947547e-08, - "loss": 0.0704, - "reward": 0.4637276977300644, - "reward_std": 0.13106250017881393, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 19.65870475769043, + "kl": 4.984375, + "learning_rate": 3.1747102614737734e-07, + "loss": 0.3338, + "reward": 0.4648437798023224, + "reward_std": 0.10936222970485687, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4257812723517418, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1579 }, { "clip_ratio": 0.0, - "completion_length": 1704.0916137695312, + "completion_length": 1555.9085388183594, "epoch": 0.471958778283922, - "grad_norm": 2.569093942642212, - "kl": 0.33837890625, - "learning_rate": 6.344398181292337e-08, - "loss": 0.0662, - "reward": 0.5664062649011612, - "reward_std": 0.16480090655386448, - "rewards/accuracy_reward": 0.12723214738070965, + "grad_norm": 36.362430572509766, + "kl": 2.65625, + "learning_rate": 3.172199090646169e-07, + "loss": 0.2174, + "reward": 0.5513393059372902, + "reward_std": 0.1377736497670412, + "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4391741305589676, + "rewards/tag_count_reward": 0.4531250223517418, "step": 1580 }, { "clip_ratio": 0.0, - "completion_length": 1661.524658203125, + "completion_length": 1509.3147888183594, "epoch": 0.4722574863714435, - "grad_norm": 3.2620863914489746, - "kl": 0.37353515625, - "learning_rate": 6.339374377063672e-08, - "loss": 0.0595, - "reward": 0.5468750223517418, - "reward_std": 0.1743851602077484, - "rewards/accuracy_reward": 0.098214291036129, + "grad_norm": 14.71524429321289, + "kl": 3.98046875, + "learning_rate": 3.1696871885318354e-07, + "loss": 0.2729, + "reward": 0.5301339626312256, + "reward_std": 0.14458046853542328, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4486607313156128, + "rewards/tag_count_reward": 0.4408482313156128, "step": 1581 }, { "clip_ratio": 0.0, - "completion_length": 1694.9130249023438, + "completion_length": 1516.2054443359375, "epoch": 0.47255619445896496, - "grad_norm": 3.901660680770874, - "kl": 0.41943359375, - "learning_rate": 6.334349115726954e-08, - "loss": 0.072, - "reward": 0.6049107387661934, - "reward_std": 0.17706026323139668, - "rewards/accuracy_reward": 0.16294643515720963, + "grad_norm": 98.69776916503906, + "kl": 4.51171875, + "learning_rate": 3.1671745578634767e-07, + "loss": 0.3097, + "reward": 0.5904017984867096, + "reward_std": 0.14577055722475052, + "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4419643059372902, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1582 }, { "clip_ratio": 0.0, - "completion_length": 1706.5870971679688, + "completion_length": 1521.0313110351562, "epoch": 0.47285490254648643, - "grad_norm": 4.158044815063477, - "kl": 0.40478515625, - "learning_rate": 6.329322402749179e-08, - "loss": 0.0739, - "reward": 0.546316996216774, - "reward_std": 0.16958929784595966, - "rewards/accuracy_reward": 0.10714286123402417, + "grad_norm": 12.08185863494873, + "kl": 3.98828125, + "learning_rate": 3.16466120137459e-07, + "loss": 0.3266, + "reward": 0.559709832072258, + "reward_std": 0.1821522656828165, + "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4391741305589676, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1583 }, { "clip_ratio": 0.0, - "completion_length": 1765.3772888183594, + "completion_length": 1546.9040832519531, "epoch": 0.4731536106340079, - "grad_norm": 27.7097225189209, - "kl": 0.58251953125, - "learning_rate": 6.32429424359892e-08, - "loss": 0.0714, - "reward": 0.5172991305589676, - "reward_std": 0.15652623400092125, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 10.836796760559082, + "kl": 4.203125, + "learning_rate": 3.16214712179946e-07, + "loss": 0.3069, + "reward": 0.4977678880095482, + "reward_std": 0.13999264873564243, + "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4458705559372902, + "rewards/tag_count_reward": 0.4397321715950966, "step": 1584 }, { "clip_ratio": 0.0, - "completion_length": 1734.1875610351562, + "completion_length": 1560.3795471191406, "epoch": 0.4734523187215294, - "grad_norm": 67.55708312988281, - "kl": 1.052734375, - "learning_rate": 6.319264643746318e-08, - "loss": 0.1201, - "reward": 0.5513393208384514, - "reward_std": 0.15244140475988388, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 32.49421691894531, + "kl": 3.6953125, + "learning_rate": 3.1596323218731583e-07, + "loss": 0.3059, + "reward": 0.5546875298023224, + "reward_std": 0.15854356437921524, + "rewards/accuracy_reward": 0.12723214854486287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.424107164144516, + "rewards/tag_count_reward": 0.427455373108387, "step": 1585 }, { "clip_ratio": 0.0, - "completion_length": 1632.2634582519531, + "completion_length": 1468.0915832519531, "epoch": 0.47375102680905085, - "grad_norm": 10.303548812866211, - "kl": 0.47216796875, - "learning_rate": 6.314233608663085e-08, - "loss": 0.0766, - "reward": 0.6021205633878708, - "reward_std": 0.18224189057946205, - "rewards/accuracy_reward": 0.15625000488944352, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4458705559372902, + "grad_norm": 16.733299255371094, + "kl": 3.361328125, + "learning_rate": 3.157116804331542e-07, + "loss": 0.2518, + "reward": 0.585937537252903, + "reward_std": 0.1736029703170061, + "rewards/accuracy_reward": 0.14508929592557251, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1586 }, { "clip_ratio": 0.0, - "completion_length": 1706.5223999023438, + "completion_length": 1500.790283203125, "epoch": 0.4740497348965723, - "grad_norm": 2.7993016242980957, - "kl": 0.348876953125, - "learning_rate": 6.309201143822493e-08, - "loss": 0.0735, - "reward": 0.525111623108387, - "reward_std": 0.13789206743240356, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 25.521772384643555, + "kl": 3.15625, + "learning_rate": 3.1546005719112466e-07, + "loss": 0.264, + "reward": 0.5379464626312256, + "reward_std": 0.11564843729138374, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4380580559372902, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1587 }, { "clip_ratio": 0.0, - "completion_length": 1682.2590026855469, + "completion_length": 1544.05810546875, "epoch": 0.4743484429840938, - "grad_norm": 3.1041646003723145, - "kl": 0.31494140625, - "learning_rate": 6.304167254699375e-08, - "loss": 0.0569, - "reward": 0.5552455633878708, - "reward_std": 0.1789007242769003, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 14.779484748840332, + "kl": 3.859375, + "learning_rate": 3.1520836273496876e-07, + "loss": 0.2732, + "reward": 0.5546875149011612, + "reward_std": 0.15612070448696613, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062649011612, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1588 }, { "clip_ratio": 0.0, - "completion_length": 1618.3728332519531, + "completion_length": 1464.6496276855469, "epoch": 0.47464715107161526, - "grad_norm": 10.295415878295898, - "kl": 0.50244140625, - "learning_rate": 6.299131946770104e-08, - "loss": 0.0709, - "reward": 0.5368303805589676, - "reward_std": 0.11305011808872223, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 63.947322845458984, + "kl": 3.31640625, + "learning_rate": 3.149565973385052e-07, + "loss": 0.3219, + "reward": 0.5312500298023224, + "reward_std": 0.12138961628079414, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4430803805589676, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1589 }, { "clip_ratio": 0.0, - "completion_length": 1721.8103332519531, + "completion_length": 1566.8594665527344, "epoch": 0.47494585915913673, - "grad_norm": 104.73397827148438, - "kl": 0.85791015625, - "learning_rate": 6.294095225512604e-08, - "loss": 0.091, - "reward": 0.5083705484867096, - "reward_std": 0.15028479136526585, - "rewards/accuracy_reward": 0.07142857694998384, + "grad_norm": 23.637889862060547, + "kl": 3.86328125, + "learning_rate": 3.147047612756302e-07, + "loss": 0.3248, + "reward": 0.4888393059372902, + "reward_std": 0.15691402554512024, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4369419738650322, + "rewards/tag_count_reward": 0.428571455180645, "step": 1590 }, { "clip_ratio": 0.0, - "completion_length": 1721.2478332519531, + "completion_length": 1572.9978332519531, "epoch": 0.4752445672466582, - "grad_norm": 11.208069801330566, - "kl": 0.57275390625, - "learning_rate": 6.289057096406334e-08, - "loss": 0.0658, - "reward": 0.5306920036673546, - "reward_std": 0.1912027932703495, - "rewards/accuracy_reward": 0.12053571571595967, + "grad_norm": 50.00650405883789, + "kl": 4.8515625, + "learning_rate": 3.1445285482031667e-07, + "loss": 0.3137, + "reward": 0.5468750074505806, + "reward_std": 0.171761654317379, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4101562649011612, + "rewards/tag_count_reward": 0.4218750223517418, "step": 1591 }, { "clip_ratio": 0.0, - "completion_length": 1681.0290832519531, + "completion_length": 1514.8504943847656, "epoch": 0.47554327533417967, - "grad_norm": 6.741684436798096, - "kl": 0.48779296875, - "learning_rate": 6.284017564932283e-08, - "loss": 0.0684, - "reward": 0.5719866305589676, - "reward_std": 0.16540068574249744, - "rewards/accuracy_reward": 0.13616071874275804, + "grad_norm": 12.264228820800781, + "kl": 3.80078125, + "learning_rate": 3.142008782466142e-07, + "loss": 0.2759, + "reward": 0.569754496216774, + "reward_std": 0.16302416287362576, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.435825914144516, + "rewards/tag_count_reward": 0.4268973395228386, "step": 1592 }, { "clip_ratio": 0.0, - "completion_length": 1675.5268859863281, + "completion_length": 1558.05810546875, "epoch": 0.47584198342170114, - "grad_norm": 4.316386699676514, - "kl": 0.41748046875, - "learning_rate": 6.27897663657297e-08, - "loss": 0.0565, - "reward": 0.5440848469734192, - "reward_std": 0.19970395043492317, - "rewards/accuracy_reward": 0.1026785783469677, + "grad_norm": 21.55459213256836, + "kl": 3.45703125, + "learning_rate": 3.1394883182864844e-07, + "loss": 0.222, + "reward": 0.546316996216774, + "reward_std": 0.17352686449885368, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062723517418, + "rewards/tag_count_reward": 0.443638414144516, "step": 1593 }, { "clip_ratio": 0.0, - "completion_length": 1785.9889221191406, + "completion_length": 1635.2232971191406, "epoch": 0.4761406915092226, - "grad_norm": 17.980981826782227, - "kl": 0.60888671875, - "learning_rate": 6.273934316812428e-08, - "loss": 0.0774, - "reward": 0.5078125223517418, - "reward_std": 0.16190063580870628, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 15.093851089477539, + "kl": 3.3828125, + "learning_rate": 3.136967158406214e-07, + "loss": 0.2464, + "reward": 0.5133928805589676, + "reward_std": 0.10575600154697895, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4207589477300644, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1594 }, { "clip_ratio": 0.0, - "completion_length": 1803.0469665527344, + "completion_length": 1640.8527526855469, "epoch": 0.4764393995967441, - "grad_norm": 6.21627140045166, - "kl": 0.50634765625, - "learning_rate": 6.268890611136211e-08, - "loss": 0.0707, - "reward": 0.466517873108387, - "reward_std": 0.17311358451843262, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 40.03180694580078, + "kl": 5.03125, + "learning_rate": 3.134445305568105e-07, + "loss": 0.3387, + "reward": 0.4810268059372902, + "reward_std": 0.191373523324728, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4062500149011612, + "rewards/tag_count_reward": 0.4095982387661934, "step": 1595 }, { "clip_ratio": 0.0, - "completion_length": 1796.1384887695312, + "completion_length": 1643.66748046875, "epoch": 0.47673810768426556, - "grad_norm": 19.383399963378906, - "kl": 0.5322265625, - "learning_rate": 6.26384552503137e-08, - "loss": 0.0569, - "reward": 0.5703125223517418, - "reward_std": 0.20387874729931355, - "rewards/accuracy_reward": 0.13392857648432255, + "grad_norm": 23.007722854614258, + "kl": 3.248046875, + "learning_rate": 3.1319227625156853e-07, + "loss": 0.2371, + "reward": 0.5518973469734192, + "reward_std": 0.1575364824384451, + "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4363839402794838, + "rewards/tag_count_reward": 0.4358259215950966, "step": 1596 }, { "clip_ratio": 0.0, - "completion_length": 1801.7969665527344, + "completion_length": 1572.9152221679688, "epoch": 0.477036815771787, - "grad_norm": 5.77317476272583, - "kl": 0.47412109375, - "learning_rate": 6.258799063986471e-08, - "loss": 0.0697, - "reward": 0.5133928805589676, - "reward_std": 0.1526054348796606, - "rewards/accuracy_reward": 0.08258928940631449, + "grad_norm": 74.79681396484375, + "kl": 2.078125, + "learning_rate": 3.129399531993235e-07, + "loss": 0.187, + "reward": 0.5290178805589676, + "reward_std": 0.1349135059863329, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4308035895228386, + "rewards/tag_count_reward": 0.439732164144516, "step": 1597 }, { "clip_ratio": 0.0, - "completion_length": 1638.8928833007812, + "completion_length": 1485.5804443359375, "epoch": 0.4773355238593085, - "grad_norm": 5.524146556854248, - "kl": 0.49755859375, - "learning_rate": 6.253751233491564e-08, - "loss": 0.0878, - "reward": 0.5803571715950966, - "reward_std": 0.20217519253492355, - "rewards/accuracy_reward": 0.1495535746216774, + "grad_norm": 27.57383918762207, + "kl": 4.3203125, + "learning_rate": 3.1268756167457823e-07, + "loss": 0.3266, + "reward": 0.5993303880095482, + "reward_std": 0.16706236079335213, + "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4308035969734192, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1598 }, { "clip_ratio": 0.0, - "completion_length": 1817.4040832519531, + "completion_length": 1646.9085693359375, "epoch": 0.47763423194682997, - "grad_norm": 8.294092178344727, - "kl": 0.55078125, - "learning_rate": 6.248702039038198e-08, - "loss": 0.0696, - "reward": 0.4547991305589676, - "reward_std": 0.18450363352894783, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 18.822795867919922, + "kl": 3.375, + "learning_rate": 3.1243510195190984e-07, + "loss": 0.2399, + "reward": 0.4804687723517418, + "reward_std": 0.14158467017114162, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4168526977300644, + "rewards/tag_count_reward": 0.4402901977300644, "step": 1599 }, { "clip_ratio": 0.0, - "completion_length": 1782.5513916015625, + "completion_length": 1611.4822082519531, "epoch": 0.47793294003435144, - "grad_norm": 7.082760334014893, - "kl": 0.4892578125, - "learning_rate": 6.243651486119396e-08, - "loss": 0.0679, - "reward": 0.5117187574505806, - "reward_std": 0.19003254547715187, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 15.994298934936523, + "kl": 3.2890625, + "learning_rate": 3.1218257430596985e-07, + "loss": 0.2325, + "reward": 0.5239955484867096, + "reward_std": 0.16436783596873283, + "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4268973469734192, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1600 }, { "clip_ratio": 0.0, - "completion_length": 1755.5023193359375, + "completion_length": 1586.5134582519531, "epoch": 0.4782316481218729, - "grad_norm": 4.070363998413086, - "kl": 0.45703125, - "learning_rate": 6.238599580229673e-08, - "loss": 0.0643, - "reward": 0.5541295036673546, - "reward_std": 0.1804242990911007, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 31.604032516479492, + "kl": 3.59765625, + "learning_rate": 3.119299790114836e-07, + "loss": 0.2808, + "reward": 0.5145089402794838, + "reward_std": 0.12953691743314266, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.435825914144516, + "rewards/tag_count_reward": 0.431919664144516, "step": 1601 }, { "clip_ratio": 0.0, - "completion_length": 1769.4375610351562, + "completion_length": 1630.5000610351562, "epoch": 0.4785303562093944, - "grad_norm": 3.5379927158355713, - "kl": 0.45361328125, - "learning_rate": 6.233546326864999e-08, - "loss": 0.0713, - "reward": 0.5206473395228386, - "reward_std": 0.1439653132110834, - "rewards/accuracy_reward": 0.08928571850992739, + "grad_norm": 11.223973274230957, + "kl": 3.634765625, + "learning_rate": 3.1167731634324994e-07, + "loss": 0.2538, + "reward": 0.5306919887661934, + "reward_std": 0.12438629567623138, + "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.431361623108387, + "rewards/tag_count_reward": 0.439174123108387, "step": 1602 }, { "clip_ratio": 0.0, - "completion_length": 1817.8304443359375, + "completion_length": 1746.6563110351562, "epoch": 0.47882906429691585, - "grad_norm": 16.082406997680664, - "kl": 0.5703125, - "learning_rate": 6.228491731522822e-08, - "loss": 0.0601, - "reward": 0.4754464477300644, - "reward_std": 0.16443906538188457, - "rewards/accuracy_reward": 0.05580357415601611, + "grad_norm": 82.99742889404297, + "kl": 5.9453125, + "learning_rate": 3.1142458657614104e-07, + "loss": 0.3618, + "reward": 0.4603794887661934, + "reward_std": 0.1524161472916603, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4196428656578064, + "rewards/tag_count_reward": 0.4045759066939354, "step": 1603 }, { "clip_ratio": 0.0, - "completion_length": 1797.9219970703125, + "completion_length": 1662.8013916015625, "epoch": 0.4791277723844373, - "grad_norm": 55.410926818847656, - "kl": 0.708984375, - "learning_rate": 6.223435799702047e-08, - "loss": 0.0827, - "reward": 0.446986623108387, - "reward_std": 0.17786629125475883, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 88.9205322265625, + "kl": 6.0234375, + "learning_rate": 3.1117178998510237e-07, + "loss": 0.348, + "reward": 0.4603794813156128, + "reward_std": 0.1535599809139967, + "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4135044813156128, + "rewards/tag_count_reward": 0.4202009215950966, "step": 1604 }, { "clip_ratio": 0.0, - "completion_length": 1713.6273193359375, + "completion_length": 1580.5402221679688, "epoch": 0.4794264804719588, - "grad_norm": 3.1916909217834473, - "kl": 0.4091796875, - "learning_rate": 6.218378536903032e-08, - "loss": 0.0581, - "reward": 0.513950914144516, - "reward_std": 0.14392922818660736, - "rewards/accuracy_reward": 0.08482143096625805, + "grad_norm": 27.044967651367188, + "kl": 3.75390625, + "learning_rate": 3.109189268451516e-07, + "loss": 0.2562, + "reward": 0.545200914144516, + "reward_std": 0.12642978690564632, + "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4291294813156128, + "rewards/tag_count_reward": 0.4536830559372902, "step": 1605 }, { "clip_ratio": 0.0, - "completion_length": 1752.0246276855469, + "completion_length": 1582.51123046875, "epoch": 0.47972518855948026, - "grad_norm": 8.17403793334961, - "kl": 0.525390625, - "learning_rate": 6.213319948627582e-08, - "loss": 0.086, - "reward": 0.5546875223517418, - "reward_std": 0.1699029766023159, - "rewards/accuracy_reward": 0.133928582072258, + "grad_norm": 21.528409957885742, + "kl": 4.34375, + "learning_rate": 3.106659974313791e-07, + "loss": 0.2878, + "reward": 0.581473246216774, + "reward_std": 0.13880587741732597, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4207589477300644, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1606 }, { "clip_ratio": 0.0, - "completion_length": 1731.1697082519531, + "completion_length": 1566.4241638183594, "epoch": 0.48002389664700174, - "grad_norm": 25.722646713256836, - "kl": 0.763671875, - "learning_rate": 6.208260040378945e-08, - "loss": 0.0824, - "reward": 0.5491071715950966, - "reward_std": 0.22282039374113083, - "rewards/accuracy_reward": 0.12500000419095159, + "grad_norm": 26.889516830444336, + "kl": 3.6953125, + "learning_rate": 3.1041300201894725e-07, + "loss": 0.2483, + "reward": 0.5373884215950966, + "reward_std": 0.18998852744698524, + "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.424107164144516, + "rewards/tag_count_reward": 0.4347098469734192, "step": 1607 }, { "clip_ratio": 0.0, - "completion_length": 1768.9598999023438, + "completion_length": 1631.82373046875, "epoch": 0.4803226047345232, - "grad_norm": 384.60382080078125, - "kl": 3.41162109375, - "learning_rate": 6.203198817661807e-08, - "loss": 0.2166, - "reward": 0.5139509215950966, - "reward_std": 0.2136705070734024, - "rewards/accuracy_reward": 0.10044643469154835, + "grad_norm": 14.014554023742676, + "kl": 3.83203125, + "learning_rate": 3.101599408830904e-07, + "loss": 0.2543, + "reward": 0.5329241454601288, + "reward_std": 0.18147086165845394, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4135044887661934, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1608 }, { "clip_ratio": 0.0, - "completion_length": 1644.3036499023438, + "completion_length": 1539.5223999023438, "epoch": 0.4806213128220447, - "grad_norm": 18.917036056518555, - "kl": 0.5419921875, - "learning_rate": 6.198136285982283e-08, - "loss": 0.0727, - "reward": 0.5385044887661934, - "reward_std": 0.20567690208554268, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 32.147117614746094, + "kl": 2.77734375, + "learning_rate": 3.099068142991142e-07, + "loss": 0.2087, + "reward": 0.5044643059372902, + "reward_std": 0.18120743706822395, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.440290205180645, + "rewards/tag_count_reward": 0.4397321566939354, "step": 1609 }, { "clip_ratio": 0.0, - "completion_length": 1704.0804748535156, + "completion_length": 1521.1585388183594, "epoch": 0.48092002090956615, - "grad_norm": 5.697133541107178, - "kl": 0.43359375, - "learning_rate": 6.193072450847908e-08, - "loss": 0.0878, - "reward": 0.4486607313156128, - "reward_std": 0.16183778084814548, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 14.000472068786621, + "kl": 2.986328125, + "learning_rate": 3.096536225423954e-07, + "loss": 0.2222, + "reward": 0.4737723469734192, + "reward_std": 0.13127341866493225, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.424107164144516, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1610 }, { "clip_ratio": 0.0, - "completion_length": 1729.0201721191406, + "completion_length": 1623.7188415527344, "epoch": 0.4812187289970876, - "grad_norm": 5.445528030395508, - "kl": 0.49462890625, - "learning_rate": 6.188007317767642e-08, - "loss": 0.0785, - "reward": 0.5758928805589676, - "reward_std": 0.20779858902096748, - "rewards/accuracy_reward": 0.13616072200238705, + "grad_norm": 37.74376678466797, + "kl": 3.640625, + "learning_rate": 3.094003658883821e-07, + "loss": 0.2782, + "reward": 0.5569196790456772, + "reward_std": 0.17462435364723206, + "rewards/accuracy_reward": 0.12276786682195961, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.439732164144516, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1611 }, { "clip_ratio": 0.0, - "completion_length": 1765.9911804199219, + "completion_length": 1657.6228332519531, "epoch": 0.4815174370846091, - "grad_norm": 15.061380386352539, - "kl": 0.6142578125, - "learning_rate": 6.182940892251851e-08, - "loss": 0.0775, - "reward": 0.533482164144516, - "reward_std": 0.17306295968592167, - "rewards/accuracy_reward": 0.10714286100119352, + "grad_norm": 29.947187423706055, + "kl": 3.64453125, + "learning_rate": 3.0914704461259255e-07, + "loss": 0.2558, + "reward": 0.5212053805589676, + "reward_std": 0.1573030073195696, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4263393059372902, + "rewards/tag_count_reward": 0.4252232387661934, "step": 1612 }, { "clip_ratio": 0.0, - "completion_length": 1815.8460998535156, + "completion_length": 1693.9107666015625, "epoch": 0.48181614517213056, - "grad_norm": 5.426417350769043, - "kl": 0.4892578125, - "learning_rate": 6.17787317981231e-08, - "loss": 0.055, - "reward": 0.4564732387661934, - "reward_std": 0.15321358293294907, - "rewards/accuracy_reward": 0.03571428591385484, + "grad_norm": 15.240757942199707, + "kl": 4.29296875, + "learning_rate": 3.088936589906155e-07, + "loss": 0.2749, + "reward": 0.4352678805589676, + "reward_std": 0.1524873785674572, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4207589477300644, + "rewards/tag_count_reward": 0.408482164144516, "step": 1613 }, { "clip_ratio": 0.0, - "completion_length": 1782.0625610351562, + "completion_length": 1712.1139221191406, "epoch": 0.48211485325965203, - "grad_norm": 3.61557674407959, - "kl": 0.43505859375, - "learning_rate": 6.172804185962192e-08, - "loss": 0.0659, - "reward": 0.5373884215950966, - "reward_std": 0.15995229221880436, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 10.832959175109863, + "kl": 3.5625, + "learning_rate": 3.086402092981096e-07, + "loss": 0.2321, + "reward": 0.5385044887661934, + "reward_std": 0.15707102604210377, + "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4257812723517418, + "rewards/tag_count_reward": 0.420200914144516, "step": 1614 }, { "clip_ratio": 0.0, - "completion_length": 1705.3773193359375, + "completion_length": 1564.9263916015625, "epoch": 0.48241356134717345, - "grad_norm": 4.914855480194092, - "kl": 0.51220703125, - "learning_rate": 6.167733916216068e-08, - "loss": 0.0944, - "reward": 0.4960937798023224, - "reward_std": 0.19483665749430656, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 7.38329553604126, + "kl": 3.388671875, + "learning_rate": 3.0838669581080334e-07, + "loss": 0.2406, + "reward": 0.5318080633878708, + "reward_std": 0.15097877755761147, + "rewards/accuracy_reward": 0.08482143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4246651977300644, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1615 }, { "clip_ratio": 0.0, - "completion_length": 1774.9889221191406, + "completion_length": 1668.0603332519531, "epoch": 0.4827122694346949, - "grad_norm": 33.073944091796875, - "kl": 0.859375, - "learning_rate": 6.162662376089893e-08, - "loss": 0.0861, - "reward": 0.4469866305589676, - "reward_std": 0.17271001636981964, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 8.034990310668945, + "kl": 3.376953125, + "learning_rate": 3.0813311880449466e-07, + "loss": 0.2071, + "reward": 0.474888414144516, + "reward_std": 0.15962528437376022, + "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366305589676, + "rewards/tag_count_reward": 0.4414062723517418, "step": 1616 }, { "clip_ratio": 0.0, - "completion_length": 1710.82373046875, + "completion_length": 1559.60498046875, "epoch": 0.4830109775222164, - "grad_norm": 12.482277870178223, - "kl": 0.6796875, - "learning_rate": 6.157589571101005e-08, - "loss": 0.1069, - "reward": 0.6032366305589676, - "reward_std": 0.18625910952687263, - "rewards/accuracy_reward": 0.1718750074505806, + "grad_norm": 37.72300720214844, + "kl": 4.8515625, + "learning_rate": 3.0787947855505025e-07, + "loss": 0.3389, + "reward": 0.5864955484867096, + "reward_std": 0.17711183056235313, + "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4313616305589676, + "rewards/tag_count_reward": 0.423549123108387, "step": 1617 }, { "clip_ratio": 0.0, - "completion_length": 1685.8527526855469, + "completion_length": 1595.3348999023438, "epoch": 0.48330968560973786, - "grad_norm": 81.19678497314453, - "kl": 0.87060546875, - "learning_rate": 6.152515506768121e-08, - "loss": 0.0973, - "reward": 0.5474330708384514, - "reward_std": 0.15721174702048302, - "rewards/accuracy_reward": 0.1160714365541935, + "grad_norm": 31.655437469482422, + "kl": 4.26953125, + "learning_rate": 3.0762577533840606e-07, + "loss": 0.2795, + "reward": 0.5396205484867096, + "reward_std": 0.1576180700212717, + "rewards/accuracy_reward": 0.11160714854486287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4313616305589676, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1618 }, { "clip_ratio": 0.0, - "completion_length": 1759.790283203125, + "completion_length": 1615.43310546875, "epoch": 0.48360839369725933, - "grad_norm": 12.79055118560791, - "kl": 0.5419921875, - "learning_rate": 6.147440188611323e-08, - "loss": 0.0731, - "reward": 0.577566996216774, - "reward_std": 0.2085178941488266, - "rewards/accuracy_reward": 0.1562500037252903, + "grad_norm": 10.194987297058105, + "kl": 4.078125, + "learning_rate": 3.073720094305662e-07, + "loss": 0.2891, + "reward": 0.5686384215950966, + "reward_std": 0.15565957874059677, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4213169813156128, + "rewards/tag_count_reward": 0.4257812649011612, "step": 1619 }, { "clip_ratio": 0.0, - "completion_length": 1663.5045166015625, + "completion_length": 1561.27685546875, "epoch": 0.4839071017847808, - "grad_norm": 32.05697250366211, - "kl": 0.62451171875, - "learning_rate": 6.142363622152062e-08, - "loss": 0.0902, - "reward": 0.519531287252903, - "reward_std": 0.17043817602097988, - "rewards/accuracy_reward": 0.08258929010480642, + "grad_norm": 10.495670318603516, + "kl": 3.4921875, + "learning_rate": 3.0711818110760313e-07, + "loss": 0.2713, + "reward": 0.5217634066939354, + "reward_std": 0.18035482801496983, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4369419887661934, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1620 }, { "clip_ratio": 0.0, - "completion_length": 1840.7366943359375, + "completion_length": 1681.0313110351562, "epoch": 0.4842058098723023, - "grad_norm": 4.3837809562683105, - "kl": 0.46630859375, - "learning_rate": 6.137285812913145e-08, - "loss": 0.0695, - "reward": 0.4743303880095482, - "reward_std": 0.22579462826251984, + "grad_norm": 43.72733688354492, + "kl": 2.89453125, + "learning_rate": 3.068642906456572e-07, + "loss": 0.2237, + "reward": 0.4676339477300644, + "reward_std": 0.20020072534680367, "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.416294664144516, + "rewards/tag_count_reward": 0.4095982313156128, "step": 1621 }, { "clip_ratio": 0.0, - "completion_length": 1752.2300109863281, + "completion_length": 1608.6429443359375, "epoch": 0.48450451795982374, - "grad_norm": 2.878976821899414, - "kl": 0.3955078125, - "learning_rate": 6.132206766418727e-08, - "loss": 0.0614, - "reward": 0.5809152126312256, - "reward_std": 0.20084932073950768, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 80.25322723388672, + "kl": 1.953125, + "learning_rate": 3.0661033832093636e-07, + "loss": 0.1704, + "reward": 0.5468750149011612, + "reward_std": 0.16547834873199463, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4358259066939354, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1622 }, { "clip_ratio": 0.0, - "completion_length": 1621.3438110351562, + "completion_length": 1448.5915832519531, "epoch": 0.4848032260473452, - "grad_norm": 1.7370119094848633, - "kl": 0.338134765625, - "learning_rate": 6.127126488194317e-08, - "loss": 0.0628, - "reward": 0.662388414144516, - "reward_std": 0.19780636578798294, - "rewards/accuracy_reward": 0.2232142984867096, + "grad_norm": 15.838859558105469, + "kl": 3.26171875, + "learning_rate": 3.063563244097159e-07, + "loss": 0.2468, + "reward": 0.6668527200818062, + "reward_std": 0.16469843313097954, + "rewards/accuracy_reward": 0.21651787403970957, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4391741305589676, + "rewards/tag_count_reward": 0.4503348395228386, "step": 1623 }, { "clip_ratio": 0.0, - "completion_length": 1672.15185546875, + "completion_length": 1540.9219055175781, "epoch": 0.4851019341348667, - "grad_norm": 42.609649658203125, - "kl": 0.627197265625, - "learning_rate": 6.122044983766757e-08, - "loss": 0.0767, - "reward": 0.537946455180645, - "reward_std": 0.17945720627903938, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 10.395541191101074, + "kl": 3.39453125, + "learning_rate": 3.061022491883378e-07, + "loss": 0.2279, + "reward": 0.5279018133878708, + "reward_std": 0.15787720493972301, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4486607313156128, + "rewards/tag_count_reward": 0.447544664144516, "step": 1624 }, { "clip_ratio": 0.0, - "completion_length": 1792.3348999023438, + "completion_length": 1614.696533203125, "epoch": 0.48540064222238816, - "grad_norm": 7.707005977630615, - "kl": 0.55029296875, - "learning_rate": 6.116962258664228e-08, - "loss": 0.0803, - "reward": 0.4380580559372902, - "reward_std": 0.17013679072260857, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 31.912437438964844, + "kl": 3.84765625, + "learning_rate": 3.058481129332114e-07, + "loss": 0.2524, + "reward": 0.4525669813156128, + "reward_std": 0.14270638301968575, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4224330559372902, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1625 }, { "clip_ratio": 0.0, - "completion_length": 1713.8416137695312, + "completion_length": 1548.9241943359375, "epoch": 0.4856993503099096, - "grad_norm": 15.0860595703125, - "kl": 0.64111328125, - "learning_rate": 6.111878318416235e-08, - "loss": 0.0938, - "reward": 0.5189732387661934, - "reward_std": 0.17481471225619316, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 41.06375503540039, + "kl": 4.67578125, + "learning_rate": 3.0559391592081173e-07, + "loss": 0.3377, + "reward": 0.5279018133878708, + "reward_std": 0.17372934147715569, + "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.427455373108387, + "rewards/tag_count_reward": 0.4252232238650322, "step": 1626 }, { "clip_ratio": 0.0, - "completion_length": 1728.5960693359375, + "completion_length": 1635.6384582519531, "epoch": 0.4859980583974311, - "grad_norm": 13.14074420928955, - "kl": 0.41162109375, - "learning_rate": 6.106793168553607e-08, - "loss": 0.0486, - "reward": 0.5066964626312256, - "reward_std": 0.17504486069083214, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 21.54033660888672, + "kl": 3.703125, + "learning_rate": 3.0533965842768037e-07, + "loss": 0.2198, + "reward": 0.5150669887661934, + "reward_std": 0.19958655163645744, + "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.444196455180645, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1627 }, { "clip_ratio": 0.0, - "completion_length": 1749.0067749023438, + "completion_length": 1596.5313110351562, "epoch": 0.48629676648495257, - "grad_norm": 5.4552717208862305, - "kl": 0.387451171875, - "learning_rate": 6.101706814608489e-08, - "loss": 0.0767, - "reward": 0.4849330633878708, - "reward_std": 0.17185164988040924, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 100.40029907226562, + "kl": 1.890625, + "learning_rate": 3.050853407304245e-07, + "loss": 0.1781, + "reward": 0.506138414144516, + "reward_std": 0.17503629811108112, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4268973469734192, + "rewards/tag_count_reward": 0.4391741305589676, "step": 1628 }, { "clip_ratio": 0.0, - "completion_length": 1667.77685546875, + "completion_length": 1529.1295166015625, "epoch": 0.48659547457247404, - "grad_norm": 4.6129045486450195, - "kl": 0.3505859375, - "learning_rate": 6.096619262114338e-08, - "loss": 0.0693, - "reward": 0.6054687798023224, - "reward_std": 0.21003437414765358, - "rewards/accuracy_reward": 0.15848215203732252, + "grad_norm": 111.42853546142578, + "kl": 1.544921875, + "learning_rate": 3.0483096310571687e-07, + "loss": 0.1891, + "reward": 0.5825893133878708, + "reward_std": 0.19627559185028076, + "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4469866305589676, + "rewards/tag_count_reward": 0.4464285969734192, "step": 1629 }, { "clip_ratio": 0.0, - "completion_length": 1753.2880249023438, + "completion_length": 1611.6652526855469, "epoch": 0.4868941826599955, - "grad_norm": 4.990596294403076, - "kl": 0.50341796875, - "learning_rate": 6.091530516605907e-08, - "loss": 0.0721, - "reward": 0.4921875149011612, - "reward_std": 0.15482242032885551, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 117.06646728515625, + "kl": 1.939453125, + "learning_rate": 3.0457652583029535e-07, + "loss": 0.1851, + "reward": 0.4799107387661934, + "reward_std": 0.1327403448522091, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4296875149011612, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1630 }, { "clip_ratio": 0.0, - "completion_length": 1660.4978332519531, + "completion_length": 1490.8237609863281, "epoch": 0.487192890747517, - "grad_norm": 4.324692726135254, - "kl": 0.48486328125, - "learning_rate": 6.086440583619256e-08, - "loss": 0.072, - "reward": 0.4765625298023224, - "reward_std": 0.16289180517196655, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 110.13700103759766, + "kl": 1.658203125, + "learning_rate": 3.0432202918096283e-07, + "loss": 0.2052, + "reward": 0.4983259066939354, + "reward_std": 0.13787688687443733, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4185268059372902, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1631 }, { "clip_ratio": 0.0, - "completion_length": 1763.8973693847656, + "completion_length": 1647.9263916015625, "epoch": 0.48749159883503845, - "grad_norm": 10.922829627990723, - "kl": 0.44287109375, - "learning_rate": 6.081349468691734e-08, - "loss": 0.0607, - "reward": 0.524553582072258, - "reward_std": 0.1738486923277378, - "rewards/accuracy_reward": 0.10491072130389512, + "grad_norm": 40.727535247802734, + "kl": 2.99609375, + "learning_rate": 3.040674734345867e-07, + "loss": 0.2216, + "reward": 0.5385044813156128, + "reward_std": 0.1657584086060524, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.419642873108387, + "rewards/tag_count_reward": 0.4335937798023224, "step": 1632 }, { "clip_ratio": 0.0, - "completion_length": 1768.5826416015625, + "completion_length": 1564.6630249023438, "epoch": 0.4877903069225599, - "grad_norm": 4.357282638549805, - "kl": 0.5634765625, - "learning_rate": 6.076257177361973e-08, - "loss": 0.0917, - "reward": 0.4408482313156128, - "reward_std": 0.18168465048074722, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 42.8799934387207, + "kl": 3.125, + "learning_rate": 3.0381285886809867e-07, + "loss": 0.2713, + "reward": 0.4687500149011612, + "reward_std": 0.16412829980254173, + "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4095982238650322, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1633 }, { "clip_ratio": 0.0, - "completion_length": 1740.8661499023438, + "completion_length": 1620.4152221679688, "epoch": 0.4880890150100814, - "grad_norm": 26.61054801940918, - "kl": 0.65576171875, - "learning_rate": 6.071163715169888e-08, - "loss": 0.1071, - "reward": 0.5189732313156128, - "reward_std": 0.1786775179207325, - "rewards/accuracy_reward": 0.09598214575089514, + "grad_norm": 28.52981948852539, + "kl": 3.2109375, + "learning_rate": 3.0355818575849443e-07, + "loss": 0.2458, + "reward": 0.5418526977300644, + "reward_std": 0.1686634048819542, + "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4229910895228386, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1634 }, { "clip_ratio": 0.0, - "completion_length": 1793.8951721191406, + "completion_length": 1640.69873046875, "epoch": 0.48838772309760287, - "grad_norm": 3.9609267711639404, - "kl": 0.37744140625, - "learning_rate": 6.066069087656665e-08, - "loss": 0.0657, - "reward": 0.4715401977300644, - "reward_std": 0.22470982745289803, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 21.642587661743164, + "kl": 3.24609375, + "learning_rate": 3.033034543828332e-07, + "loss": 0.2368, + "reward": 0.503906287252903, + "reward_std": 0.21493875980377197, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4135044887661934, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1635 }, { "clip_ratio": 0.0, - "completion_length": 1768.2455749511719, + "completion_length": 1591.6228332519531, "epoch": 0.48868643118512434, - "grad_norm": 6.364513874053955, - "kl": 0.47705078125, - "learning_rate": 6.060973300364761e-08, - "loss": 0.0935, - "reward": 0.5513393059372902, - "reward_std": 0.22203736007213593, - "rewards/accuracy_reward": 0.12723215110599995, + "grad_norm": 49.88224411010742, + "kl": 4.1171875, + "learning_rate": 3.030486650182381e-07, + "loss": 0.2587, + "reward": 0.557477705180645, + "reward_std": 0.18476995266973972, + "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.424107164144516, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1636 }, { "clip_ratio": 0.0, - "completion_length": 1716.4822387695312, + "completion_length": 1551.6205749511719, "epoch": 0.4889851392726458, - "grad_norm": 8.353410720825195, - "kl": 0.55908203125, - "learning_rate": 6.055876358837894e-08, - "loss": 0.0806, - "reward": 0.4598214477300644, - "reward_std": 0.17935436218976974, - "rewards/accuracy_reward": 0.04017857392318547, + "grad_norm": 60.93635940551758, + "kl": 4.5625, + "learning_rate": 3.0279381794189466e-07, + "loss": 0.3116, + "reward": 0.474888414144516, + "reward_std": 0.1465768702328205, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.419642873108387, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1637 }, { "clip_ratio": 0.0, - "completion_length": 1689.7388916015625, + "completion_length": 1541.1429138183594, "epoch": 0.4892838473601673, - "grad_norm": 4.713935852050781, - "kl": 0.4873046875, - "learning_rate": 6.050778268621034e-08, - "loss": 0.0932, - "reward": 0.4654018208384514, - "reward_std": 0.19227033481001854, - "rewards/accuracy_reward": 0.046875002793967724, + "grad_norm": 41.259971618652344, + "kl": 4.62109375, + "learning_rate": 3.025389134310517e-07, + "loss": 0.3341, + "reward": 0.5083705559372902, + "reward_std": 0.17395376414060593, + "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4185268059372902, + "rewards/tag_count_reward": 0.434709832072258, "step": 1638 }, { "clip_ratio": 0.0, - "completion_length": 1677.7344665527344, + "completion_length": 1559.9978332519531, "epoch": 0.48958255544768875, - "grad_norm": 7.4252848625183105, - "kl": 0.396728515625, - "learning_rate": 6.045679035260406e-08, - "loss": 0.0763, - "reward": 0.5306919887661934, - "reward_std": 0.15078383684158325, - "rewards/accuracy_reward": 0.08928571571595967, + "grad_norm": 16.896432876586914, + "kl": 3.431640625, + "learning_rate": 3.022839517630203e-07, + "loss": 0.2416, + "reward": 0.5279017984867096, + "reward_std": 0.11502685211598873, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062723517418, + "rewards/tag_count_reward": 0.443080373108387, "step": 1639 }, { "clip_ratio": 0.0, - "completion_length": 1765.9063110351562, + "completion_length": 1565.7545471191406, "epoch": 0.4898812635352102, - "grad_norm": 5.294791221618652, - "kl": 0.59033203125, - "learning_rate": 6.040578664303475e-08, - "loss": 0.0895, - "reward": 0.4804687649011612, - "reward_std": 0.2337493672966957, - "rewards/accuracy_reward": 0.06473214458674192, + "grad_norm": 111.08955383300781, + "kl": 6.1953125, + "learning_rate": 3.0202893321517374e-07, + "loss": 0.416, + "reward": 0.506138414144516, + "reward_std": 0.21261441707611084, + "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.415736623108387, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1640 }, { "clip_ratio": 0.0, - "completion_length": 1730.5380249023438, + "completion_length": 1600.1384582519531, "epoch": 0.4901799716227317, - "grad_norm": 18.846065521240234, - "kl": 0.65380859375, - "learning_rate": 6.035477161298945e-08, - "loss": 0.0817, - "reward": 0.5675223469734192, - "reward_std": 0.19691664539277554, - "rewards/accuracy_reward": 0.1651785783469677, + "grad_norm": 41.925968170166016, + "kl": 4.67578125, + "learning_rate": 3.0177385806494726e-07, + "loss": 0.2947, + "reward": 0.5797991305589676, + "reward_std": 0.16164417937397957, + "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437649011612, + "rewards/tag_count_reward": 0.4302455484867096, "step": 1641 }, { "clip_ratio": 0.0, - "completion_length": 1696.71435546875, + "completion_length": 1529.8639221191406, "epoch": 0.49047867971025316, - "grad_norm": 5.42765998840332, - "kl": 0.355712890625, - "learning_rate": 6.03037453179675e-08, - "loss": 0.0728, - "reward": 0.5267857387661934, - "reward_std": 0.1813175193965435, - "rewards/accuracy_reward": 0.08928572107106447, + "grad_norm": 56.106483459472656, + "kl": 4.5703125, + "learning_rate": 3.0151872658983753e-07, + "loss": 0.2975, + "reward": 0.5591517984867096, + "reward_std": 0.204350546002388, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4375000223517418, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1642 }, { "clip_ratio": 0.0, - "completion_length": 1772.3795776367188, + "completion_length": 1635.0983276367188, "epoch": 0.49077738779777463, - "grad_norm": 5.826740741729736, - "kl": 0.5791015625, - "learning_rate": 6.025270781348054e-08, - "loss": 0.0694, - "reward": 0.5708705633878708, - "reward_std": 0.2134014330804348, - "rewards/accuracy_reward": 0.16294643399305642, + "grad_norm": 59.32293701171875, + "kl": 5.03125, + "learning_rate": 3.012635390674027e-07, + "loss": 0.3155, + "reward": 0.595982164144516, + "reward_std": 0.1909337416291237, + "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4079241305589676, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1643 }, { "clip_ratio": 0.0, - "completion_length": 1708.7590026855469, + "completion_length": 1551.1451416015625, "epoch": 0.4910760958852961, - "grad_norm": 9.371573448181152, - "kl": 0.591796875, - "learning_rate": 6.020165915505234e-08, - "loss": 0.0674, - "reward": 0.4799107238650322, - "reward_std": 0.23036371171474457, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 10.997405052185059, + "kl": 3.255859375, + "learning_rate": 3.010082957752617e-07, + "loss": 0.2317, + "reward": 0.5217634215950966, + "reward_std": 0.1865183226764202, + "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.415178582072258, + "rewards/tag_count_reward": 0.4503348469734192, "step": 1644 }, { "clip_ratio": 0.0, - "completion_length": 1765.8326721191406, + "completion_length": 1654.7791137695312, "epoch": 0.4913748039728176, - "grad_norm": 17.154138565063477, - "kl": 0.6689453125, - "learning_rate": 6.015059939821885e-08, - "loss": 0.0933, - "reward": 0.4419643059372902, - "reward_std": 0.17402052879333496, - "rewards/accuracy_reward": 0.026785715017467737, + "grad_norm": 23.717939376831055, + "kl": 3.98828125, + "learning_rate": 3.0075299699109425e-07, + "loss": 0.262, + "reward": 0.4614955633878708, + "reward_std": 0.14975935220718384, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.415178582072258, + "rewards/tag_count_reward": 0.4391741305589676, "step": 1645 }, { "clip_ratio": 0.0, - "completion_length": 1691.3750915527344, + "completion_length": 1510.0848693847656, "epoch": 0.49167351206033905, - "grad_norm": 34.059329986572266, - "kl": 0.74365234375, - "learning_rate": 6.009952859852808e-08, - "loss": 0.0875, - "reward": 0.5731026902794838, - "reward_std": 0.2100190743803978, - "rewards/accuracy_reward": 0.15625000605359674, + "grad_norm": 21.089120864868164, + "kl": 2.7109375, + "learning_rate": 3.004976429926404e-07, + "loss": 0.2141, + "reward": 0.5909598469734192, + "reward_std": 0.17239592224359512, + "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.416852705180645, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1646 }, { "clip_ratio": 0.0, - "completion_length": 1647.3750915527344, + "completion_length": 1519.7411193847656, "epoch": 0.4919722201478605, - "grad_norm": 20.188562393188477, - "kl": 0.49609375, - "learning_rate": 6.004844681154007e-08, - "loss": 0.0923, - "reward": 0.604910746216774, - "reward_std": 0.1528300978243351, - "rewards/accuracy_reward": 0.1785714365541935, + "grad_norm": 12.201148986816406, + "kl": 2.859375, + "learning_rate": 3.002422340577003e-07, + "loss": 0.2377, + "reward": 0.6266741380095482, + "reward_std": 0.1403534021228552, + "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4263393133878708, + "rewards/tag_count_reward": 0.443638414144516, "step": 1647 }, { "clip_ratio": 0.0, - "completion_length": 1656.8572082519531, + "completion_length": 1542.9419860839844, "epoch": 0.492270928235382, - "grad_norm": 3.0236032009124756, - "kl": 0.4091796875, - "learning_rate": 5.999735409282677e-08, - "loss": 0.0677, - "reward": 0.5558035969734192, - "reward_std": 0.2005566991865635, - "rewards/accuracy_reward": 0.12053571757860482, + "grad_norm": 10.617005348205566, + "kl": 3.63671875, + "learning_rate": 2.9998677046413383e-07, + "loss": 0.2845, + "reward": 0.5675223469734192, + "reward_std": 0.153344189748168, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4352678805589676, + "rewards/tag_count_reward": 0.442522332072258, "step": 1648 }, { "clip_ratio": 0.0, - "completion_length": 1719.2232971191406, + "completion_length": 1626.1295471191406, "epoch": 0.49256963632290346, - "grad_norm": 4.87608528137207, - "kl": 0.484375, - "learning_rate": 5.994625049797205e-08, - "loss": 0.0649, - "reward": 0.5329241380095482, - "reward_std": 0.21102599054574966, - "rewards/accuracy_reward": 0.0982142873108387, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4324776977300644, + "grad_norm": 17.984851837158203, + "kl": 3.4609375, + "learning_rate": 2.9973125248986026e-07, + "loss": 0.2449, + "reward": 0.5172991380095482, + "reward_std": 0.18670892529189587, + "rewards/accuracy_reward": 0.08258928917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1649 }, { "clip_ratio": 0.0, - "completion_length": 1771.66748046875, + "completion_length": 1656.1920166015625, "epoch": 0.49286834441042493, - "grad_norm": 5.949090957641602, - "kl": 0.416015625, - "learning_rate": 5.989513608257164e-08, - "loss": 0.0687, - "reward": 0.467075914144516, - "reward_std": 0.1593930423259735, - "rewards/accuracy_reward": 0.03571428754366934, + "grad_norm": 17.57750129699707, + "kl": 3.39453125, + "learning_rate": 2.994756804128582e-07, + "loss": 0.2218, + "reward": 0.4988839626312256, + "reward_std": 0.1599156092852354, + "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4313616305589676, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1650 }, { "clip_ratio": 0.0, - "completion_length": 1706.15185546875, + "completion_length": 1470.9509887695312, "epoch": 0.4931670524979464, - "grad_norm": 6.32806921005249, - "kl": 0.45556640625, - "learning_rate": 5.9844010902233e-08, - "loss": 0.0677, - "reward": 0.5189732238650322, - "reward_std": 0.19353551417589188, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 61.94028091430664, + "kl": 2.546875, + "learning_rate": 2.99220054511165e-07, + "loss": 0.25, + "reward": 0.5295759215950966, + "reward_std": 0.1381382755935192, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4296875223517418, + "rewards/tag_count_reward": 0.4581473395228386, "step": 1651 }, { "clip_ratio": 0.0, - "completion_length": 1748.4063110351562, + "completion_length": 1577.7902526855469, "epoch": 0.49346576058546787, - "grad_norm": 23.076936721801758, - "kl": 0.6923828125, - "learning_rate": 5.979287501257531e-08, - "loss": 0.0909, - "reward": 0.554687537252903, - "reward_std": 0.20327558740973473, - "rewards/accuracy_reward": 0.1473214328289032, + "grad_norm": 32.88896942138672, + "kl": 4.3984375, + "learning_rate": 2.9896437506287654e-07, + "loss": 0.2957, + "reward": 0.5825893208384514, + "reward_std": 0.14154371060431004, + "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4073660895228386, + "rewards/tag_count_reward": 0.4263393059372902, "step": 1652 }, { "clip_ratio": 0.0, - "completion_length": 1736.5134887695312, + "completion_length": 1583.4911499023438, "epoch": 0.49376446867298934, - "grad_norm": 4.717783451080322, - "kl": 0.53271484375, - "learning_rate": 5.974172846922941e-08, - "loss": 0.0893, - "reward": 0.4375000223517418, - "reward_std": 0.19234313443303108, - "rewards/accuracy_reward": 0.029017857974395156, + "grad_norm": 15.538206100463867, + "kl": 3.1875, + "learning_rate": 2.9870864234614706e-07, + "loss": 0.2173, + "reward": 0.4681919813156128, + "reward_std": 0.14478963240981102, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4084821566939354, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1653 }, { "clip_ratio": 0.0, - "completion_length": 1842.0112609863281, + "completion_length": 1710.3884582519531, "epoch": 0.4940631767605108, - "grad_norm": 4.555828094482422, - "kl": 0.599609375, - "learning_rate": 5.969057132783774e-08, - "loss": 0.0697, - "reward": 0.4687500223517418, - "reward_std": 0.16255217418074608, - "rewards/accuracy_reward": 0.0535714291036129, + "grad_norm": 11.06654167175293, + "kl": 4.0625, + "learning_rate": 2.9845285663918876e-07, + "loss": 0.2636, + "reward": 0.4877232313156128, + "reward_std": 0.15105099231004715, + "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.415178582072258, + "rewards/tag_count_reward": 0.4341518059372902, "step": 1654 }, { "clip_ratio": 0.0, - "completion_length": 1647.6384887695312, + "completion_length": 1520.5692749023438, "epoch": 0.4943618848480323, - "grad_norm": 4.375738143920898, - "kl": 0.42822265625, - "learning_rate": 5.963940364405425e-08, - "loss": 0.0825, - "reward": 0.5139509215950966, - "reward_std": 0.20753148943185806, - "rewards/accuracy_reward": 0.09151786006987095, + "grad_norm": 9.066946029663086, + "kl": 3.09765625, + "learning_rate": 2.9819701822027123e-07, + "loss": 0.2356, + "reward": 0.5318080633878708, + "reward_std": 0.16076518595218658, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4224330633878708, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1655 }, { "clip_ratio": 0.0, - "completion_length": 1802.9777526855469, + "completion_length": 1633.97998046875, "epoch": 0.49466059293555376, - "grad_norm": 22.651226043701172, - "kl": 0.64501953125, - "learning_rate": 5.958822547354433e-08, - "loss": 0.0798, - "reward": 0.454799123108387, - "reward_std": 0.17427783645689487, - "rewards/accuracy_reward": 0.03571428661234677, + "grad_norm": 44.1402587890625, + "kl": 4.42578125, + "learning_rate": 2.9794112736772166e-07, + "loss": 0.2724, + "reward": 0.4626116305589676, + "reward_std": 0.15510892868041992, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4190848395228386, + "rewards/tag_count_reward": 0.435825914144516, "step": 1656 }, { "clip_ratio": 0.0, - "completion_length": 1758.6317749023438, + "completion_length": 1607.6585388183594, "epoch": 0.4949593010230752, - "grad_norm": 6.204532623291016, - "kl": 0.5810546875, - "learning_rate": 5.953703687198486e-08, - "loss": 0.0887, - "reward": 0.5340401977300644, - "reward_std": 0.19181105121970177, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 21.735055923461914, + "kl": 3.828125, + "learning_rate": 2.9768518435992427e-07, + "loss": 0.296, + "reward": 0.5357143059372902, + "reward_std": 0.16888817213475704, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366305589676, + "rewards/tag_count_reward": 0.430803582072258, "step": 1657 }, { "clip_ratio": 0.0, - "completion_length": 1611.82373046875, + "completion_length": 1437.6451416015625, "epoch": 0.49525800911059664, - "grad_norm": 26.897924423217773, - "kl": 0.61279296875, - "learning_rate": 5.9485837895063995e-08, - "loss": 0.1106, - "reward": 0.5474330633878708, - "reward_std": 0.1510153654962778, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 23.107942581176758, + "kl": 3.28515625, + "learning_rate": 2.9742918947531996e-07, + "loss": 0.2672, + "reward": 0.5680803880095482, + "reward_std": 0.13175320997834206, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4335937649011612, + "rewards/tag_count_reward": 0.4564732387661934, "step": 1658 }, { "clip_ratio": 0.0, - "completion_length": 1798.5000915527344, + "completion_length": 1615.8616638183594, "epoch": 0.4955567171981181, - "grad_norm": 24.521799087524414, - "kl": 0.783203125, - "learning_rate": 5.9434628598481204e-08, - "loss": 0.1006, - "reward": 0.4101562723517418, - "reward_std": 0.19064077362418175, - "rewards/accuracy_reward": 0.022321430034935474, + "grad_norm": 54.83425521850586, + "kl": 3.30859375, + "learning_rate": 2.9717314299240606e-07, + "loss": 0.2688, + "reward": 0.4492187649011612, + "reward_std": 0.14813455007970333, + "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3878348395228386, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1659 }, { "clip_ratio": 0.0, - "completion_length": 1640.66748046875, + "completion_length": 1523.9710388183594, "epoch": 0.4958554252856396, - "grad_norm": 15.229390144348145, - "kl": 0.62255859375, - "learning_rate": 5.9383409037947206e-08, - "loss": 0.0837, - "reward": 0.5005580559372902, - "reward_std": 0.17149320617318153, - "rewards/accuracy_reward": 0.06473214528523386, + "grad_norm": 10.82107925415039, + "kl": 2.865234375, + "learning_rate": 2.9691704518973603e-07, + "loss": 0.1888, + "reward": 0.5228794887661934, + "reward_std": 0.15354564413428307, + "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4358259066939354, + "rewards/tag_count_reward": 0.446986623108387, "step": 1660 }, { "clip_ratio": 0.0, - "completion_length": 1755.665283203125, + "completion_length": 1639.6563110351562, "epoch": 0.49615413337316105, - "grad_norm": 2626.53076171875, - "kl": 18.29296875, - "learning_rate": 5.933217926918386e-08, - "loss": 0.8316, - "reward": 0.4765625223517418, - "reward_std": 0.18247055634856224, - "rewards/accuracy_reward": 0.07366071850992739, + "grad_norm": 68.94493865966797, + "kl": 4.54296875, + "learning_rate": 2.966608963459193e-07, + "loss": 0.2543, + "reward": 0.501674123108387, + "reward_std": 0.1595360618084669, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029018059372902, + "rewards/tag_count_reward": 0.412388414144516, "step": 1661 }, { "clip_ratio": 0.0, - "completion_length": 1801.5625915527344, + "completion_length": 1690.2478332519531, "epoch": 0.4964528414606825, - "grad_norm": 5.070015907287598, - "kl": 0.5126953125, - "learning_rate": 5.9280939347924094e-08, - "loss": 0.0723, - "reward": 0.5039062574505806, - "reward_std": 0.15643328055739403, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 51.9144401550293, + "kl": 4.859375, + "learning_rate": 2.964046967396205e-07, + "loss": 0.2756, + "reward": 0.5172991380095482, + "reward_std": 0.14233889803290367, + "rewards/accuracy_reward": 0.08928572130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4168526977300644, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1662 }, { "clip_ratio": 0.0, - "completion_length": 1700.7366943359375, + "completion_length": 1574.1340026855469, "epoch": 0.496751549548204, - "grad_norm": 3.789243698120117, - "kl": 0.4267578125, - "learning_rate": 5.922968932991196e-08, - "loss": 0.0578, - "reward": 0.4369419887661934, - "reward_std": 0.13482568599283695, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 36.03261184692383, + "kl": 3.80078125, + "learning_rate": 2.961484466495598e-07, + "loss": 0.2245, + "reward": 0.4648437649011612, + "reward_std": 0.12114820070564747, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4302455633878708, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1663 }, { "clip_ratio": 0.0, - "completion_length": 1765.4889221191406, + "completion_length": 1650.4040832519531, "epoch": 0.49705025763572547, - "grad_norm": 6.950103282928467, - "kl": 0.50146484375, - "learning_rate": 5.917842927090244e-08, - "loss": 0.0706, - "reward": 0.4949776977300644, - "reward_std": 0.219305157661438, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 11.337340354919434, + "kl": 4.05859375, + "learning_rate": 2.958921463545122e-07, + "loss": 0.2739, + "reward": 0.486049123108387, + "reward_std": 0.2034171111881733, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4190848395228386, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1664 }, { "clip_ratio": 0.0, - "completion_length": 1819.0536193847656, + "completion_length": 1675.9442443847656, "epoch": 0.49734896572324694, - "grad_norm": 8.978273391723633, - "kl": 0.72265625, - "learning_rate": 5.912715922666146e-08, - "loss": 0.0923, - "reward": 0.4531250298023224, - "reward_std": 0.20954928919672966, - "rewards/accuracy_reward": 0.05357143213041127, + "grad_norm": 21.9161319732666, + "kl": 3.52734375, + "learning_rate": 2.956357961333073e-07, + "loss": 0.2511, + "reward": 0.5117187723517418, + "reward_std": 0.16327261365950108, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3995535969734192, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1665 }, { "clip_ratio": 0.0, - "completion_length": 1800.1004943847656, + "completion_length": 1733.4398498535156, "epoch": 0.4976476738107684, - "grad_norm": 11.274443626403809, - "kl": 0.64892578125, - "learning_rate": 5.907587925296579e-08, - "loss": 0.0876, - "reward": 0.5083705633878708, - "reward_std": 0.22375128790736198, - "rewards/accuracy_reward": 0.11830357275903225, + "grad_norm": 32.65961456298828, + "kl": 4.6953125, + "learning_rate": 2.9537939626482895e-07, + "loss": 0.2837, + "reward": 0.5279018133878708, + "reward_std": 0.18383228033781052, + "rewards/accuracy_reward": 0.11160714598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669813156128, + "rewards/tag_count_reward": 0.4162946566939354, "step": 1666 }, { "clip_ratio": 0.0, - "completion_length": 1778.6005554199219, + "completion_length": 1657.2099304199219, "epoch": 0.4979463818982899, - "grad_norm": 4.53627347946167, - "kl": 0.59033203125, - "learning_rate": 5.902458940560303e-08, - "loss": 0.0832, - "reward": 0.542410746216774, - "reward_std": 0.21412479132413864, - "rewards/accuracy_reward": 0.1339285746216774, + "grad_norm": 21.11324691772461, + "kl": 4.046875, + "learning_rate": 2.9512294702801515e-07, + "loss": 0.2521, + "reward": 0.5474330633878708, + "reward_std": 0.18474116921424866, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4084821566939354, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1667 }, { "clip_ratio": 0.0, - "completion_length": 1765.0402526855469, + "completion_length": 1638.196533203125, "epoch": 0.49824508998581135, - "grad_norm": 12.456587791442871, - "kl": 0.63525390625, - "learning_rate": 5.89732897403715e-08, - "loss": 0.0796, - "reward": 0.4564732238650322, - "reward_std": 0.17004967480897903, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 33.767112731933594, + "kl": 4.009765625, + "learning_rate": 2.948664487018575e-07, + "loss": 0.2482, + "reward": 0.497209832072258, + "reward_std": 0.16728181391954422, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4095982313156128, + "rewards/tag_count_reward": 0.4347098469734192, "step": 1668 }, { "clip_ratio": 0.0, - "completion_length": 1702.49560546875, + "completion_length": 1588.3572082519531, "epoch": 0.4985437980733328, - "grad_norm": 3.8181681632995605, - "kl": 0.46875, - "learning_rate": 5.892198031308022e-08, - "loss": 0.0637, - "reward": 0.4927455708384514, - "reward_std": 0.18562466651201248, - "rewards/accuracy_reward": 0.06696428940631449, + "grad_norm": 35.746307373046875, + "kl": 2.53125, + "learning_rate": 2.946099015654011e-07, + "loss": 0.1953, + "reward": 0.5228794887661934, + "reward_std": 0.1865379586815834, + "rewards/accuracy_reward": 0.08035714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4257812723517418, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1669 }, { "clip_ratio": 0.0, - "completion_length": 1674.7366943359375, + "completion_length": 1573.3371276855469, "epoch": 0.4988425061608543, - "grad_norm": 4.616519451141357, - "kl": 0.44970703125, - "learning_rate": 5.8870661179548796e-08, - "loss": 0.0796, - "reward": 0.5000000149011612, - "reward_std": 0.14474791660904884, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 60.449493408203125, + "kl": 2.484375, + "learning_rate": 2.9435330589774397e-07, + "loss": 0.2002, + "reward": 0.5295759215950966, + "reward_std": 0.1064897608011961, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4129464477300644, + "rewards/tag_count_reward": 0.446986623108387, "step": 1670 }, { "clip_ratio": 0.0, - "completion_length": 1752.7813415527344, + "completion_length": 1640.2723999023438, "epoch": 0.49914121424837576, - "grad_norm": 7.0126752853393555, - "kl": 0.44189453125, - "learning_rate": 5.8819332395607425e-08, - "loss": 0.0758, - "reward": 0.4497768059372902, - "reward_std": 0.20002481341362, - "rewards/accuracy_reward": 0.04464286006987095, + "grad_norm": 52.75844955444336, + "kl": 2.3583984375, + "learning_rate": 2.940966619780371e-07, + "loss": 0.1735, + "reward": 0.4776785969734192, + "reward_std": 0.15617157891392708, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4051339477300644, + "rewards/tag_count_reward": 0.439732164144516, "step": 1671 }, { "clip_ratio": 0.0, - "completion_length": 1694.8125610351562, + "completion_length": 1547.63623046875, "epoch": 0.49943992233589724, - "grad_norm": 3.3934783935546875, - "kl": 0.352294921875, - "learning_rate": 5.87679940170968e-08, - "loss": 0.0659, - "reward": 0.4782366305589676, - "reward_std": 0.18657098710536957, - "rewards/accuracy_reward": 0.049107145285233855, + "grad_norm": 19.882774353027344, + "kl": 2.62109375, + "learning_rate": 2.93839970085484e-07, + "loss": 0.1842, + "reward": 0.5094866380095482, + "reward_std": 0.17309125140309334, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4291294887661934, + "rewards/tag_count_reward": 0.451450914144516, "step": 1672 }, { "clip_ratio": 0.0, - "completion_length": 1786.7635192871094, + "completion_length": 1643.51123046875, "epoch": 0.4997386304234187, - "grad_norm": 4.240797996520996, - "kl": 0.43896484375, - "learning_rate": 5.871664609986804e-08, - "loss": 0.087, - "reward": 0.4626116380095482, - "reward_std": 0.1858838051557541, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 25.425533294677734, + "kl": 4.45703125, + "learning_rate": 2.935832304993402e-07, + "loss": 0.287, + "reward": 0.4771205484867096, + "reward_std": 0.12992664612829685, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.404575914144516, + "rewards/tag_count_reward": 0.4213169813156128, "step": 1673 }, { "clip_ratio": 0.0, - "completion_length": 1741.4420776367188, + "completion_length": 1586.6942443847656, "epoch": 0.5000373385109402, - "grad_norm": 5.75970983505249, - "kl": 0.369140625, - "learning_rate": 5.8665288699782644e-08, - "loss": 0.072, - "reward": 0.5039062798023224, - "reward_std": 0.21580183133482933, - "rewards/accuracy_reward": 0.08705357392318547, + "grad_norm": 12.34133529663086, + "kl": 3.01953125, + "learning_rate": 2.933264434989132e-07, + "loss": 0.1938, + "reward": 0.5452009290456772, + "reward_std": 0.16517481207847595, + "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4168526977300644, + "rewards/tag_count_reward": 0.446986623108387, "step": 1674 }, { "clip_ratio": 0.0, - "completion_length": 1788.3773193359375, + "completion_length": 1694.665283203125, "epoch": 0.5003360465984616, - "grad_norm": 16.08133888244629, - "kl": 0.5673828125, - "learning_rate": 5.8613921872712434e-08, - "loss": 0.0785, - "reward": 0.4966518208384514, - "reward_std": 0.26300402730703354, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 13.038610458374023, + "kl": 3.54296875, + "learning_rate": 2.930696093635622e-07, + "loss": 0.2234, + "reward": 0.504464328289032, + "reward_std": 0.21597828716039658, + "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.407366082072258, + "rewards/tag_count_reward": 0.4397321566939354, "step": 1675 }, { "clip_ratio": 0.0, - "completion_length": 1709.5558471679688, + "completion_length": 1621.8482666015625, "epoch": 0.5006347546859832, - "grad_norm": 3.3101046085357666, - "kl": 0.4287109375, - "learning_rate": 5.856254567453949e-08, - "loss": 0.0693, - "reward": 0.521763414144516, - "reward_std": 0.1228205468505621, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 25.91422462463379, + "kl": 2.87109375, + "learning_rate": 2.9281272837269744e-07, + "loss": 0.1909, + "reward": 0.546316996216774, + "reward_std": 0.11880997382104397, + "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4414062723517418, + "rewards/tag_count_reward": 0.448102705180645, "step": 1676 }, { "clip_ratio": 0.0, - "completion_length": 1627.6518249511719, + "completion_length": 1527.2098693847656, "epoch": 0.5009334627735046, - "grad_norm": 3.6088690757751465, - "kl": 0.51416015625, - "learning_rate": 5.851116016115606e-08, - "loss": 0.084, - "reward": 0.4916294887661934, - "reward_std": 0.17392845638096333, - "rewards/accuracy_reward": 0.06473214528523386, + "grad_norm": 10.468071937561035, + "kl": 3.12109375, + "learning_rate": 2.925558008057803e-07, + "loss": 0.2032, + "reward": 0.5195312723517418, + "reward_std": 0.1293280217796564, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4268973395228386, + "rewards/tag_count_reward": 0.4525669813156128, "step": 1677 }, { "clip_ratio": 0.0, - "completion_length": 1798.5447387695312, + "completion_length": 1625.3884887695312, "epoch": 0.5012321708610261, - "grad_norm": 7.0432658195495605, - "kl": 0.6376953125, - "learning_rate": 5.8459765388464566e-08, - "loss": 0.1007, - "reward": 0.4581473469734192, - "reward_std": 0.18804604560136795, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 37.449989318847656, + "kl": 3.080078125, + "learning_rate": 2.9229882694232285e-07, + "loss": 0.2147, + "reward": 0.495535746216774, + "reward_std": 0.1395468283444643, + "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.435267873108387, "step": 1678 }, { "clip_ratio": 0.0, - "completion_length": 1735.1741943359375, + "completion_length": 1586.0536499023438, "epoch": 0.5015308789485475, - "grad_norm": 4.378961563110352, - "kl": 0.56396484375, - "learning_rate": 5.840836141237747e-08, - "loss": 0.0834, - "reward": 0.4893973395228386, - "reward_std": 0.17666474729776382, - "rewards/accuracy_reward": 0.07366071618162096, + "grad_norm": 54.981346130371094, + "kl": 2.37890625, + "learning_rate": 2.9204180706188735e-07, + "loss": 0.1903, + "reward": 0.5089285895228386, + "reward_std": 0.13314694166183472, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.415736623108387, + "rewards/tag_count_reward": 0.450892873108387, "step": 1679 }, { "clip_ratio": 0.0, - "completion_length": 1766.5223999023438, + "completion_length": 1674.2054443359375, "epoch": 0.501829587036069, - "grad_norm": 3.6837942600250244, - "kl": 0.50537109375, - "learning_rate": 5.835694828881728e-08, - "loss": 0.065, - "reward": 0.460379496216774, - "reward_std": 0.18324894830584526, - "rewards/accuracy_reward": 0.04464285960420966, + "grad_norm": 6.192504405975342, + "kl": 3.515625, + "learning_rate": 2.917847414440864e-07, + "loss": 0.2105, + "reward": 0.459263414144516, + "reward_std": 0.1515572890639305, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366305589676, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1680 }, { "clip_ratio": 0.0, - "completion_length": 1775.9130249023438, + "completion_length": 1641.3616943359375, "epoch": 0.5021282951235905, - "grad_norm": 5.9482808113098145, - "kl": 0.54541015625, - "learning_rate": 5.830552607371639e-08, - "loss": 0.0763, - "reward": 0.4888393133878708, - "reward_std": 0.1790812648832798, - "rewards/accuracy_reward": 0.08705357694998384, + "grad_norm": 6.696020603179932, + "kl": 3.6875, + "learning_rate": 2.91527630368582e-07, + "loss": 0.2436, + "reward": 0.521763414144516, + "reward_std": 0.15224834345281124, + "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4017857313156128, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1681 }, { "clip_ratio": 0.0, - "completion_length": 1782.54248046875, + "completion_length": 1638.30810546875, "epoch": 0.5024270032111119, - "grad_norm": 8.311044692993164, - "kl": 0.552734375, - "learning_rate": 5.8254094823017184e-08, - "loss": 0.0739, - "reward": 0.4614955633878708, - "reward_std": 0.16970830783247948, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 17.795812606811523, + "kl": 3.2421875, + "learning_rate": 2.9127047411508596e-07, + "loss": 0.2034, + "reward": 0.4849330559372902, + "reward_std": 0.11843020841479301, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4101562649011612, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1682 }, { "clip_ratio": 0.0, - "completion_length": 1692.7054138183594, + "completion_length": 1576.88623046875, "epoch": 0.5027257112986334, - "grad_norm": 9.7755126953125, - "kl": 0.314453125, - "learning_rate": 5.8202654592671796e-08, - "loss": 0.0712, - "reward": 0.5078125298023224, - "reward_std": 0.19641589373350143, - "rewards/accuracy_reward": 0.08258929033763707, + "grad_norm": 48.165035247802734, + "kl": 2.90625, + "learning_rate": 2.9101327296335897e-07, + "loss": 0.231, + "reward": 0.5156250223517418, + "reward_std": 0.15507948398590088, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4252232313156128, + "rewards/tag_count_reward": 0.444196455180645, "step": 1683 }, { "clip_ratio": 0.0, - "completion_length": 1734.2054443359375, + "completion_length": 1650.544677734375, "epoch": 0.5030244193861548, - "grad_norm": 6.432696342468262, - "kl": 0.5048828125, - "learning_rate": 5.815120543864215e-08, - "loss": 0.0792, - "reward": 0.580357164144516, - "reward_std": 0.18288211524486542, - "rewards/accuracy_reward": 0.15401786309666932, + "grad_norm": 34.356109619140625, + "kl": 4.14453125, + "learning_rate": 2.9075602719321073e-07, + "loss": 0.2419, + "reward": 0.5864955633878708, + "reward_std": 0.17027808167040348, + "rewards/accuracy_reward": 0.15848215040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4263393059372902, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1684 }, { "clip_ratio": 0.0, - "completion_length": 1749.0715026855469, + "completion_length": 1585.9755554199219, "epoch": 0.5033231274736764, - "grad_norm": 9.884683609008789, - "kl": 0.40087890625, - "learning_rate": 5.809974741689989e-08, - "loss": 0.0891, - "reward": 0.4631696715950966, - "reward_std": 0.20654841139912605, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 35.28438949584961, + "kl": 3.92578125, + "learning_rate": 2.9049873708449946e-07, + "loss": 0.2574, + "reward": 0.471540205180645, + "reward_std": 0.151686517521739, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.411830373108387, + "rewards/tag_count_reward": 0.4358259215950966, "step": 1685 }, { "clip_ratio": 0.0, - "completion_length": 1775.85498046875, + "completion_length": 1594.3482971191406, "epoch": 0.5036218355611978, - "grad_norm": 6.274776458740234, - "kl": 0.5009765625, - "learning_rate": 5.804828058342631e-08, - "loss": 0.1001, - "reward": 0.4871651977300644, - "reward_std": 0.20861174911260605, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 38.939884185791016, + "kl": 4.8359375, + "learning_rate": 2.902414029171316e-07, + "loss": 0.3417, + "reward": 0.5106027126312256, + "reward_std": 0.18022429384291172, + "rewards/accuracy_reward": 0.08035714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1686 }, { "clip_ratio": 0.0, - "completion_length": 1799.7188415527344, + "completion_length": 1686.5938110351562, "epoch": 0.5039205436487193, - "grad_norm": 14.509649276733398, - "kl": 0.673828125, - "learning_rate": 5.7996804994212264e-08, - "loss": 0.086, - "reward": 0.4614955559372902, - "reward_std": 0.19297377206385136, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 29.193008422851562, + "kl": 4.82421875, + "learning_rate": 2.8998402497106133e-07, + "loss": 0.3009, + "reward": 0.4776785969734192, + "reward_std": 0.17025581002235413, + "rewards/accuracy_reward": 0.05357143050059676, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4034598395228386, + "rewards/tag_count_reward": 0.424107164144516, "step": 1687 }, { "clip_ratio": 0.0, - "completion_length": 1621.4666137695312, + "completion_length": 1507.62060546875, "epoch": 0.5042192517362407, - "grad_norm": 4.911339282989502, - "kl": 0.596435546875, - "learning_rate": 5.794532070525816e-08, - "loss": 0.1041, - "reward": 0.5234375298023224, - "reward_std": 0.19026800990104675, - "rewards/accuracy_reward": 0.10937500605359674, + "grad_norm": 30.875288009643555, + "kl": 3.630859375, + "learning_rate": 2.897266035262908e-07, + "loss": 0.2258, + "reward": 0.564732164144516, + "reward_std": 0.13637754693627357, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4140625149011612, + "rewards/tag_count_reward": 0.4486607387661934, "step": 1688 }, { "clip_ratio": 0.0, - "completion_length": 1739.9219665527344, + "completion_length": 1615.6452026367188, "epoch": 0.5045179598237622, - "grad_norm": 6.683948516845703, - "kl": 0.43017578125, - "learning_rate": 5.789382777257386e-08, - "loss": 0.0709, - "reward": 0.502232164144516, - "reward_std": 0.16857220232486725, - "rewards/accuracy_reward": 0.08035714481957257, + "grad_norm": 16.45070457458496, + "kl": 3.7109375, + "learning_rate": 2.894691388628693e-07, + "loss": 0.2348, + "reward": 0.510044664144516, + "reward_std": 0.14795215614140034, + "rewards/accuracy_reward": 0.07366071711294353, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4218750149011612, + "rewards/tag_count_reward": 0.436383955180645, "step": 1689 }, { "clip_ratio": 0.0, - "completion_length": 1746.4978332519531, + "completion_length": 1621.13623046875, "epoch": 0.5048166679112837, - "grad_norm": 14.69896411895752, - "kl": 0.650390625, - "learning_rate": 5.784232625217862e-08, - "loss": 0.092, - "reward": 0.4832589477300644, - "reward_std": 0.15229958482086658, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 32.02790069580078, + "kl": 3.86328125, + "learning_rate": 2.892116312608931e-07, + "loss": 0.2311, + "reward": 0.5167410969734192, + "reward_std": 0.13097978569567204, + "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4051339477300644, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1690 }, { "clip_ratio": 0.0, - "completion_length": 1766.6451416015625, + "completion_length": 1663.7880249023438, "epoch": 0.5051153759988052, - "grad_norm": 5.362015247344971, - "kl": 0.58154296875, - "learning_rate": 5.779081620010103e-08, - "loss": 0.0747, - "reward": 0.4938616380095482, - "reward_std": 0.17478568851947784, - "rewards/accuracy_reward": 0.098214291036129, + "grad_norm": 30.566143035888672, + "kl": 3.53125, + "learning_rate": 2.889540810005052e-07, + "loss": 0.2019, + "reward": 0.5435268133878708, + "reward_std": 0.13941910490393639, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.395647332072258, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1691 }, { "clip_ratio": 0.0, - "completion_length": 1777.27685546875, + "completion_length": 1683.3817749023438, "epoch": 0.5054140840863266, - "grad_norm": 14.879669189453125, - "kl": 0.5537109375, - "learning_rate": 5.773929767237902e-08, - "loss": 0.0865, - "reward": 0.4877232387661934, - "reward_std": 0.18643613904714584, - "rewards/accuracy_reward": 0.07589286006987095, + "grad_norm": 8.324190139770508, + "kl": 3.95703125, + "learning_rate": 2.886964883618951e-07, + "loss": 0.2567, + "reward": 0.4966518059372902, + "reward_std": 0.12623316049575806, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.411830373108387, + "rewards/tag_count_reward": 0.4408482313156128, "step": 1692 }, { "clip_ratio": 0.0, - "completion_length": 1750.0469360351562, + "completion_length": 1636.5134887695312, "epoch": 0.5057127921738481, - "grad_norm": 4.11676025390625, - "kl": 0.47900390625, - "learning_rate": 5.768777072505966e-08, - "loss": 0.0879, - "reward": 0.4414062798023224, - "reward_std": 0.17344310507178307, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 14.826581954956055, + "kl": 3.2734375, + "learning_rate": 2.884388536252983e-07, + "loss": 0.2329, + "reward": 0.471540205180645, + "reward_std": 0.14235100150108337, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.412388414144516, + "rewards/tag_count_reward": 0.4402901977300644, "step": 1693 }, { "clip_ratio": 0.0, - "completion_length": 1696.7500915527344, + "completion_length": 1582.1005249023438, "epoch": 0.5060115002613695, - "grad_norm": 2.993546485900879, - "kl": 0.47705078125, - "learning_rate": 5.763623541419924e-08, - "loss": 0.0756, - "reward": 0.592075914144516, - "reward_std": 0.19416572898626328, - "rewards/accuracy_reward": 0.1919642984867096, + "grad_norm": 28.0665283203125, + "kl": 2.76953125, + "learning_rate": 2.881811770709962e-07, + "loss": 0.1933, + "reward": 0.6473214328289032, + "reward_std": 0.15412876196205616, + "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.455357164144516, "step": 1694 }, { "clip_ratio": 0.0, - "completion_length": 1752.8125915527344, + "completion_length": 1661.47998046875, "epoch": 0.5063102083488911, - "grad_norm": 10.725166320800781, - "kl": 0.51953125, - "learning_rate": 5.758469179586312e-08, - "loss": 0.0868, - "reward": 0.5039062798023224, - "reward_std": 0.20096562057733536, - "rewards/accuracy_reward": 0.0892857180442661, + "grad_norm": 9.365323066711426, + "kl": 2.41796875, + "learning_rate": 2.8792345897931556e-07, + "loss": 0.157, + "reward": 0.5279018208384514, + "reward_std": 0.18057582527399063, + "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4146205484867096, + "rewards/tag_count_reward": 0.443080373108387, "step": 1695 }, { "clip_ratio": 0.0, - "completion_length": 1754.62060546875, + "completion_length": 1659.08935546875, "epoch": 0.5066089164364125, - "grad_norm": 5.36501932144165, - "kl": 0.513671875, - "learning_rate": 5.75331399261257e-08, - "loss": 0.0897, - "reward": 0.5072544738650322, - "reward_std": 0.20028329640626907, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 20.349254608154297, + "kl": 3.435546875, + "learning_rate": 2.8766569963062847e-07, + "loss": 0.2295, + "reward": 0.5340401977300644, + "reward_std": 0.16161047108471394, + "rewards/accuracy_reward": 0.09598214831203222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4068080559372902, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1696 }, { "clip_ratio": 0.0, - "completion_length": 1695.6451721191406, + "completion_length": 1603.1138916015625, "epoch": 0.506907624523934, - "grad_norm": 5.457496166229248, - "kl": 0.43212890625, - "learning_rate": 5.7481579861070374e-08, - "loss": 0.1018, - "reward": 0.5145089477300644, - "reward_std": 0.20478496700525284, - "rewards/accuracy_reward": 0.09151786169968545, + "grad_norm": 17.645732879638672, + "kl": 3.4453125, + "learning_rate": 2.874078993053519e-07, + "loss": 0.2271, + "reward": 0.5597098469734192, + "reward_std": 0.18300314620137215, + "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4229910895228386, + "rewards/tag_count_reward": 0.4481026977300644, "step": 1697 }, { "clip_ratio": 0.0, - "completion_length": 1759.1295166015625, + "completion_length": 1599.93310546875, "epoch": 0.5072063326114554, - "grad_norm": 5.945630073547363, - "kl": 0.5166015625, - "learning_rate": 5.7430011656789426e-08, - "loss": 0.0879, - "reward": 0.5044643059372902, - "reward_std": 0.21526460722088814, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 7.805138111114502, + "kl": 3.078125, + "learning_rate": 2.871500582839471e-07, + "loss": 0.2113, + "reward": 0.5463169887661934, + "reward_std": 0.15872856229543686, + "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4084821492433548, + "rewards/tag_count_reward": 0.4458705633878708, "step": 1698 }, { "clip_ratio": 0.0, - "completion_length": 1737.4331359863281, + "completion_length": 1602.9286499023438, "epoch": 0.507505040698977, - "grad_norm": 3.9728221893310547, - "kl": 0.55078125, - "learning_rate": 5.737843536938402e-08, - "loss": 0.0669, - "reward": 0.5329241380095482, - "reward_std": 0.2052437998354435, - "rewards/accuracy_reward": 0.12946429196745157, + "grad_norm": 51.21775817871094, + "kl": 4.6875, + "learning_rate": 2.8689217684692014e-07, + "loss": 0.2819, + "reward": 0.544642873108387, + "reward_std": 0.20269916579127312, + "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4034598395228386, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1699 }, { "clip_ratio": 0.0, - "completion_length": 1696.0156860351562, + "completion_length": 1544.700927734375, "epoch": 0.5078037487864984, - "grad_norm": 5.640941143035889, - "kl": 0.55322265625, - "learning_rate": 5.73268510549641e-08, - "loss": 0.1041, - "reward": 0.4386160895228386, - "reward_std": 0.18350149877369404, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 12.312024116516113, + "kl": 3.150390625, + "learning_rate": 2.8663425527482047e-07, + "loss": 0.2183, + "reward": 0.490513414144516, + "reward_std": 0.1532817929983139, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029017984867096, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1700 }, { "clip_ratio": 0.0, - "completion_length": 1803.134033203125, + "completion_length": 1665.1094665527344, "epoch": 0.5081024568740199, - "grad_norm": 8.871408462524414, - "kl": 0.572265625, - "learning_rate": 5.727525876964834e-08, - "loss": 0.0773, - "reward": 0.5518973469734192, - "reward_std": 0.23602242767810822, - "rewards/accuracy_reward": 0.14285714831203222, + "grad_norm": 7.044699668884277, + "kl": 2.86328125, + "learning_rate": 2.863762938482417e-07, + "loss": 0.1765, + "reward": 0.588727705180645, + "reward_std": 0.19333726540207863, + "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.409040205180645, + "rewards/tag_count_reward": 0.4503348395228386, "step": 1701 }, { "clip_ratio": 0.0, - "completion_length": 1719.9754943847656, + "completion_length": 1552.8951721191406, "epoch": 0.5084011649615413, - "grad_norm": 5.793740272521973, - "kl": 0.54541015625, - "learning_rate": 5.722365856956408e-08, - "loss": 0.0854, - "reward": 0.463727705180645, - "reward_std": 0.20054088905453682, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 15.801091194152832, + "kl": 3.51953125, + "learning_rate": 2.861182928478204e-07, + "loss": 0.246, + "reward": 0.5033482387661934, + "reward_std": 0.16814018599689007, + "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4056919813156128, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1702 }, { "clip_ratio": 0.0, - "completion_length": 1779.9911499023438, + "completion_length": 1656.74560546875, "epoch": 0.5086998730490628, - "grad_norm": 7.889227867126465, - "kl": 0.61376953125, - "learning_rate": 5.7172050510847304e-08, - "loss": 0.0897, - "reward": 0.5630580484867096, - "reward_std": 0.17666624300181866, - "rewards/accuracy_reward": 0.1674107201397419, + "grad_norm": 23.930551528930664, + "kl": 3.98828125, + "learning_rate": 2.8586025255423653e-07, + "loss": 0.2494, + "reward": 0.5926339626312256, + "reward_std": 0.1494963150471449, + "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.436383955180645, "step": 1703 }, { "clip_ratio": 0.0, - "completion_length": 1736.87060546875, + "completion_length": 1576.4911193847656, "epoch": 0.5089985811365842, - "grad_norm": 4.747326374053955, - "kl": 0.6171875, - "learning_rate": 5.7120434649642504e-08, - "loss": 0.1047, - "reward": 0.431919664144516, - "reward_std": 0.1922074370086193, - "rewards/accuracy_reward": 0.04910714505240321, + "grad_norm": 32.162025451660156, + "kl": 3.05078125, + "learning_rate": 2.8560217324821253e-07, + "loss": 0.2356, + "reward": 0.4949776977300644, + "reward_std": 0.10406588017940521, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125149011612, + "rewards/tag_count_reward": 0.4525669887661934, "step": 1704 }, { "clip_ratio": 0.0, - "completion_length": 1842.7232971191406, + "completion_length": 1756.6429443359375, "epoch": 0.5092972892241058, - "grad_norm": 6.129151821136475, - "kl": 0.64453125, - "learning_rate": 5.706881104210267e-08, - "loss": 0.0863, - "reward": 0.4609375149011612, - "reward_std": 0.17204393073916435, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 58.297550201416016, + "kl": 5.296875, + "learning_rate": 2.8534405521051334e-07, + "loss": 0.3031, + "reward": 0.4977678880095482, + "reward_std": 0.13423941284418106, + "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125223517418, + "rewards/tag_count_reward": 0.4196428805589676, "step": 1705 }, { "clip_ratio": 0.0, - "completion_length": 1692.6563110351562, + "completion_length": 1571.2634582519531, "epoch": 0.5095959973116272, - "grad_norm": 3.040463447570801, - "kl": 0.5009765625, - "learning_rate": 5.7017179744389264e-08, - "loss": 0.0737, - "reward": 0.4503348395228386, - "reward_std": 0.20169005542993546, - "rewards/accuracy_reward": 0.035714288242161274, + "grad_norm": 7.30522346496582, + "kl": 2.966796875, + "learning_rate": 2.850858987219463e-07, + "loss": 0.1995, + "reward": 0.4899553805589676, + "reward_std": 0.13860627822577953, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4146205484867096, + "rewards/tag_count_reward": 0.4542410969734192, "step": 1706 }, { "clip_ratio": 0.0, - "completion_length": 1762.5291137695312, + "completion_length": 1671.4398193359375, "epoch": 0.5098947053991487, - "grad_norm": 12.697985649108887, - "kl": 0.61376953125, - "learning_rate": 5.6965540812672044e-08, - "loss": 0.0781, - "reward": 0.4609375149011612, - "reward_std": 0.19305440410971642, - "rewards/accuracy_reward": 0.06250000419095159, + "grad_norm": 34.25748062133789, + "kl": 2.87890625, + "learning_rate": 2.848277040633602e-07, + "loss": 0.2095, + "reward": 0.4793526977300644, + "reward_std": 0.12168661318719387, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3984375149011612, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1707 }, { "clip_ratio": 0.0, - "completion_length": 1705.63623046875, + "completion_length": 1629.7277526855469, "epoch": 0.5101934134866701, - "grad_norm": 6.468627452850342, - "kl": 0.478515625, - "learning_rate": 5.691389430312912e-08, - "loss": 0.0748, - "reward": 0.5262276977300644, - "reward_std": 0.19190123304724693, - "rewards/accuracy_reward": 0.1026785746216774, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.423549123108387, + "grad_norm": 9.296675682067871, + "kl": 2.958984375, + "learning_rate": 2.845694715156456e-07, + "loss": 0.19, + "reward": 0.5334821715950966, + "reward_std": 0.13663618452847004, + "rewards/accuracy_reward": 0.080357147147879, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4531250149011612, "step": 1708 }, { "clip_ratio": 0.0, - "completion_length": 1739.7277526855469, + "completion_length": 1647.4286193847656, "epoch": 0.5104921215741917, - "grad_norm": 8.60976505279541, - "kl": 0.4736328125, - "learning_rate": 5.686224027194682e-08, - "loss": 0.0887, - "reward": 0.5362723469734192, - "reward_std": 0.2040967345237732, - "rewards/accuracy_reward": 0.11383929150179029, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4202009066939354, + "grad_norm": 15.512045860290527, + "kl": 2.630859375, + "learning_rate": 2.843112013597341e-07, + "loss": 0.1849, + "reward": 0.572544664144516, + "reward_std": 0.1516516599804163, + "rewards/accuracy_reward": 0.11383929220028222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4587053805589676, "step": 1709 }, { "clip_ratio": 0.0, - "completion_length": 1646.3572082519531, + "completion_length": 1586.5782165527344, "epoch": 0.5107908296617131, - "grad_norm": 3.534381628036499, - "kl": 0.47900390625, - "learning_rate": 5.681057877531967e-08, - "loss": 0.0783, - "reward": 0.5820312723517418, - "reward_std": 0.2108813300728798, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 31.277374267578125, + "kl": 2.5703125, + "learning_rate": 2.840528938765984e-07, + "loss": 0.1766, + "reward": 0.5970982387661934, + "reward_std": 0.17505653202533722, + "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4213169887661934, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1710 }, { "clip_ratio": 0.0, - "completion_length": 1700.96435546875, + "completion_length": 1551.6094360351562, "epoch": 0.5110895377492346, - "grad_norm": 7.907309532165527, - "kl": 0.69091796875, - "learning_rate": 5.6758909869450336e-08, - "loss": 0.1116, - "reward": 0.579799123108387, - "reward_std": 0.2541929520666599, - "rewards/accuracy_reward": 0.1785714328289032, + "grad_norm": 13.63786792755127, + "kl": 3.328125, + "learning_rate": 2.8379454934725165e-07, + "loss": 0.2456, + "reward": 0.6858259290456772, + "reward_std": 0.25161556527018547, + "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4012276902794838, + "rewards/tag_count_reward": 0.446986623108387, "step": 1711 }, { "clip_ratio": 0.0, - "completion_length": 1711.05810546875, + "completion_length": 1576.1764221191406, "epoch": 0.511388245836756, - "grad_norm": 7.10408353805542, - "kl": 0.4404296875, - "learning_rate": 5.67072336105495e-08, - "loss": 0.0886, - "reward": 0.507254496216774, - "reward_std": 0.14994528517127037, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 12.431355476379395, + "kl": 2.9453125, + "learning_rate": 2.835361680527475e-07, + "loss": 0.1949, + "reward": 0.5463169813156128, + "reward_std": 0.11601202562451363, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4246651977300644, + "rewards/tag_count_reward": 0.4547991305589676, "step": 1712 }, { "clip_ratio": 0.0, - "completion_length": 1672.9129943847656, + "completion_length": 1560.6005249023438, "epoch": 0.5116869539242775, - "grad_norm": 3.6639413833618164, - "kl": 0.49658203125, - "learning_rate": 5.665555005483588e-08, - "loss": 0.0865, - "reward": 0.4810268133878708, - "reward_std": 0.18161585554480553, - "rewards/accuracy_reward": 0.07142857508733869, + "grad_norm": 33.89189147949219, + "kl": 3.048828125, + "learning_rate": 2.832777502741794e-07, + "loss": 0.2441, + "reward": 0.5301339402794838, + "reward_std": 0.18215370550751686, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4095982313156128, + "rewards/tag_count_reward": 0.4408482313156128, "step": 1713 }, { "clip_ratio": 0.0, - "completion_length": 1782.9130554199219, + "completion_length": 1664.5067749023438, "epoch": 0.511985662011799, - "grad_norm": 3.9398984909057617, - "kl": 0.482421875, - "learning_rate": 5.6603859258536114e-08, - "loss": 0.0817, - "reward": 0.493861623108387, - "reward_std": 0.22507883980870247, - "rewards/accuracy_reward": 0.09151786100119352, + "grad_norm": 19.321317672729492, + "kl": 3.712890625, + "learning_rate": 2.830192962926806e-07, + "loss": 0.2385, + "reward": 0.4955357238650322, + "reward_std": 0.14425735734403133, + "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437723517418, + "rewards/tag_count_reward": 0.4375000149011612, "step": 1714 }, { "clip_ratio": 0.0, - "completion_length": 1700.22998046875, + "completion_length": 1537.7478332519531, "epoch": 0.5122843700993205, - "grad_norm": 4.555171966552734, - "kl": 0.490234375, - "learning_rate": 5.6552161277884713e-08, - "loss": 0.0937, - "reward": 0.5301339477300644, - "reward_std": 0.22331983596086502, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 7.224479675292969, + "kl": 2.80078125, + "learning_rate": 2.827608063894236e-07, + "loss": 0.1833, + "reward": 0.537388414144516, + "reward_std": 0.14651411026716232, + "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4185268059372902, + "rewards/tag_count_reward": 0.4547991305589676, "step": 1715 }, { "clip_ratio": 0.0, - "completion_length": 1713.3327026367188, + "completion_length": 1571.1785888671875, "epoch": 0.5125830781868419, - "grad_norm": 13.010358810424805, - "kl": 0.65869140625, - "learning_rate": 5.650045616912402e-08, - "loss": 0.0749, - "reward": 0.4715401977300644, - "reward_std": 0.1861034408211708, - "rewards/accuracy_reward": 0.06696428591385484, + "grad_norm": 24.615938186645508, + "kl": 3.48828125, + "learning_rate": 2.825022808456201e-07, + "loss": 0.2104, + "reward": 0.5318080633878708, + "reward_std": 0.17444274947047234, + "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.404575914144516, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1716 }, { "clip_ratio": 0.0, - "completion_length": 1758.29248046875, + "completion_length": 1592.51123046875, "epoch": 0.5128817862743634, - "grad_norm": 5.765406131744385, - "kl": 0.5849609375, - "learning_rate": 5.6448743988504144e-08, - "loss": 0.0976, - "reward": 0.4916294887661934, - "reward_std": 0.18990936130285263, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 39.29931640625, + "kl": 3.09765625, + "learning_rate": 2.8224371994252073e-07, + "loss": 0.2395, + "reward": 0.5608259066939354, + "reward_std": 0.16877485439181328, + "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1717 }, { "clip_ratio": 0.0, - "completion_length": 1730.8415832519531, + "completion_length": 1660.5179443359375, "epoch": 0.5131804943618848, - "grad_norm": 116.10564422607422, - "kl": 1.04736328125, - "learning_rate": 5.6397024792282855e-08, - "loss": 0.1041, - "reward": 0.4720982387661934, - "reward_std": 0.1816786341369152, - "rewards/accuracy_reward": 0.06026785867288709, + "grad_norm": 11.275134086608887, + "kl": 3.24609375, + "learning_rate": 2.8198512396141425e-07, + "loss": 0.1945, + "reward": 0.5245535969734192, + "reward_std": 0.14664108119904995, + "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.411830373108387, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1718 }, { "clip_ratio": 0.0, - "completion_length": 1689.5335388183594, + "completion_length": 1545.8527221679688, "epoch": 0.5134792024494064, - "grad_norm": 5.59490966796875, - "kl": 0.482421875, - "learning_rate": 5.634529863672555e-08, - "loss": 0.0831, - "reward": 0.5535714626312256, - "reward_std": 0.19458775967359543, - "rewards/accuracy_reward": 0.1450892894063145, + "grad_norm": 29.886648178100586, + "kl": 3.39453125, + "learning_rate": 2.8172649318362775e-07, + "loss": 0.2544, + "reward": 0.577008955180645, + "reward_std": 0.15247288346290588, + "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4084821566939354, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1719 }, { "clip_ratio": 0.0, - "completion_length": 1752.6139221191406, + "completion_length": 1672.2188415527344, "epoch": 0.5137779105369278, - "grad_norm": 8.322219848632812, - "kl": 0.4794921875, - "learning_rate": 5.629356557810525e-08, - "loss": 0.0859, - "reward": 0.4441964477300644, - "reward_std": 0.19700022041797638, - "rewards/accuracy_reward": 0.03571428777649999, + "grad_norm": 18.022705078125, + "kl": 4.46484375, + "learning_rate": 2.8146782789052623e-07, + "loss": 0.2832, + "reward": 0.4531250149011612, + "reward_std": 0.16237447038292885, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4084821566939354, + "rewards/tag_count_reward": 0.4285714477300644, "step": 1720 }, { "clip_ratio": 0.0, - "completion_length": 1766.3728332519531, + "completion_length": 1701.6384582519531, "epoch": 0.5140766186244493, - "grad_norm": 3.1799659729003906, - "kl": 0.41357421875, - "learning_rate": 5.624182567270244e-08, - "loss": 0.0658, - "reward": 0.4503348395228386, - "reward_std": 0.18813712894916534, - "rewards/accuracy_reward": 0.042410716880112886, + "grad_norm": 53.16096496582031, + "kl": 4.60546875, + "learning_rate": 2.8120912836351216e-07, + "loss": 0.2732, + "reward": 0.474888414144516, + "reward_std": 0.1595217101275921, + "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4079241305589676, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1721 }, { "clip_ratio": 0.0, - "completion_length": 1767.02685546875, + "completion_length": 1698.8616943359375, "epoch": 0.5143753267119707, - "grad_norm": 33.33684158325195, - "kl": 0.7236328125, - "learning_rate": 5.6190078976805046e-08, - "loss": 0.0966, - "reward": 0.4698660895228386, - "reward_std": 0.22689533606171608, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 50.88163757324219, + "kl": 4.02734375, + "learning_rate": 2.8095039488402524e-07, + "loss": 0.2784, + "reward": 0.5011160969734192, + "reward_std": 0.18065334483981133, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.396205373108387, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1722 }, { "clip_ratio": 0.0, - "completion_length": 1716.2411499023438, + "completion_length": 1618.7545166015625, "epoch": 0.5146740347994921, - "grad_norm": 5.883179187774658, - "kl": 0.501953125, - "learning_rate": 5.613832554670842e-08, - "loss": 0.0954, - "reward": 0.525111623108387, - "reward_std": 0.15712843090295792, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 18.025619506835938, + "kl": 3.033203125, + "learning_rate": 2.806916277335421e-07, + "loss": 0.208, + "reward": 0.5491071566939354, + "reward_std": 0.12111184559762478, + "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4202009066939354, + "rewards/tag_count_reward": 0.4441964402794838, "step": 1723 }, { "clip_ratio": 0.0, - "completion_length": 1795.5246276855469, + "completion_length": 1696.9822387695312, "epoch": 0.5149727428870137, - "grad_norm": 7.587532043457031, - "kl": 0.61181640625, - "learning_rate": 5.608656543871523e-08, - "loss": 0.0762, - "reward": 0.5061384066939354, - "reward_std": 0.21833180263638496, - "rewards/accuracy_reward": 0.11607143143191934, + "grad_norm": 38.09000778198242, + "kl": 4.83984375, + "learning_rate": 2.8043282719357616e-07, + "loss": 0.2816, + "reward": 0.5156250223517418, + "reward_std": 0.1665725726634264, + "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669813156128, + "rewards/tag_count_reward": 0.4151785895228386, "step": 1724 }, { "clip_ratio": 0.0, - "completion_length": 1804.8014526367188, + "completion_length": 1734.6473693847656, "epoch": 0.5152714509745351, - "grad_norm": 3.489287853240967, - "kl": 0.63671875, - "learning_rate": 5.603479870913539e-08, - "loss": 0.0808, - "reward": 0.4648437723517418, - "reward_std": 0.1994500271975994, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 49.2379035949707, + "kl": 5.7109375, + "learning_rate": 2.801739935456769e-07, + "loss": 0.3535, + "reward": 0.4860491305589676, + "reward_std": 0.1667410135269165, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830559372902, + "rewards/tag_count_reward": 0.4190848395228386, "step": 1725 }, { "clip_ratio": 0.0, - "completion_length": 1724.9018859863281, + "completion_length": 1572.7567749023438, "epoch": 0.5155701590620566, - "grad_norm": 6.750126838684082, - "kl": 0.41845703125, - "learning_rate": 5.598302541428601e-08, - "loss": 0.0921, - "reward": 0.4425223469734192, - "reward_std": 0.20786990970373154, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 7.968415260314941, + "kl": 3.87890625, + "learning_rate": 2.7991512707143006e-07, + "loss": 0.2729, + "reward": 0.476004496216774, + "reward_std": 0.15905269049108028, + "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3978794813156128, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1726 }, { "clip_ratio": 0.0, - "completion_length": 1710.2902221679688, + "completion_length": 1592.9085388183594, "epoch": 0.515868867149578, - "grad_norm": 9.041167259216309, - "kl": 0.3916015625, - "learning_rate": 5.593124561049141e-08, - "loss": 0.0853, - "reward": 0.4536830633878708, - "reward_std": 0.14712319895625114, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 12.466891288757324, + "kl": 3.546875, + "learning_rate": 2.7965622805245705e-07, + "loss": 0.2417, + "reward": 0.4832589402794838, + "reward_std": 0.12968280166387558, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4112723395228386, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1727 }, { "clip_ratio": 0.0, - "completion_length": 1742.8058776855469, + "completion_length": 1616.85498046875, "epoch": 0.5161675752370996, - "grad_norm": 5.264516353607178, - "kl": 0.5810546875, - "learning_rate": 5.587945935408289e-08, - "loss": 0.1024, - "reward": 0.5412946566939354, - "reward_std": 0.2058468870818615, - "rewards/accuracy_reward": 0.14062500302679837, + "grad_norm": 11.699602127075195, + "kl": 4.33984375, + "learning_rate": 2.7939729677041444e-07, + "loss": 0.3027, + "reward": 0.579799123108387, + "reward_std": 0.1764991171658039, + "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4006696566939354, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1728 }, { "clip_ratio": 0.0, - "completion_length": 1733.0000305175781, + "completion_length": 1674.2411499023438, "epoch": 0.516466283324621, - "grad_norm": 3.747082471847534, - "kl": 0.49560546875, - "learning_rate": 5.582766670139885e-08, - "loss": 0.0754, - "reward": 0.4698661044239998, - "reward_std": 0.15520112961530685, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 39.4326171875, + "kl": 4.29296875, + "learning_rate": 2.791383335069942e-07, + "loss": 0.2455, + "reward": 0.4966518059372902, + "reward_std": 0.1555672474205494, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3984375223517418, + "rewards/tag_count_reward": 0.416294664144516, "step": 1729 }, { "clip_ratio": 0.0, - "completion_length": 1688.9130249023438, + "completion_length": 1530.18310546875, "epoch": 0.5167649914121425, - "grad_norm": 5.431518077850342, - "kl": 0.43505859375, - "learning_rate": 5.5775867708784594e-08, - "loss": 0.0884, - "reward": 0.6188616380095482, - "reward_std": 0.17551067098975182, - "rewards/accuracy_reward": 0.2031250074505806, + "grad_norm": 32.17528533935547, + "kl": 3.171875, + "learning_rate": 2.78879338543923e-07, + "loss": 0.2435, + "reward": 0.6607143059372902, + "reward_std": 0.13966858945786953, + "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4157366305589676, + "rewards/tag_count_reward": 0.4531250223517418, "step": 1730 }, { "clip_ratio": 0.0, - "completion_length": 1740.2991638183594, + "completion_length": 1666.1786499023438, "epoch": 0.5170636994996639, - "grad_norm": 3.4073755741119385, - "kl": 0.451171875, - "learning_rate": 5.5724062432592375e-08, - "loss": 0.0758, - "reward": 0.4927455559372902, - "reward_std": 0.21713030710816383, - "rewards/accuracy_reward": 0.09151786123402417, + "grad_norm": 34.014766693115234, + "kl": 4.53515625, + "learning_rate": 2.786203121629619e-07, + "loss": 0.2797, + "reward": 0.5323660895228386, + "reward_std": 0.18930118530988693, + "rewards/accuracy_reward": 0.0915178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4012276977300644, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1731 }, { "clip_ratio": 0.0, - "completion_length": 1731.837158203125, + "completion_length": 1638.5670471191406, "epoch": 0.5173624075871854, - "grad_norm": 6.391365051269531, - "kl": 0.50732421875, - "learning_rate": 5.5672250929181264e-08, - "loss": 0.0825, - "reward": 0.4486607387661934, - "reward_std": 0.17984088137745857, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 45.9080810546875, + "kl": 4.29296875, + "learning_rate": 2.783612546459063e-07, + "loss": 0.2373, + "reward": 0.5044643133878708, + "reward_std": 0.16251758486032486, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1732 }, { "clip_ratio": 0.0, - "completion_length": 1645.16748046875, + "completion_length": 1503.9754943847656, "epoch": 0.5176611156747069, - "grad_norm": 99.79548645019531, - "kl": 0.81201171875, - "learning_rate": 5.562043325491707e-08, - "loss": 0.1081, - "reward": 0.5245535969734192, - "reward_std": 0.19782382808625698, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 36.12447738647461, + "kl": 2.84765625, + "learning_rate": 2.781021662745853e-07, + "loss": 0.2299, + "reward": 0.557477705180645, + "reward_std": 0.17240197770297527, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4151785895228386, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1733 }, { "clip_ratio": 0.0, - "completion_length": 1738.6072387695312, + "completion_length": 1577.555908203125, "epoch": 0.5179598237622284, - "grad_norm": 6.34877347946167, - "kl": 0.60400390625, - "learning_rate": 5.5568609466172355e-08, - "loss": 0.0808, - "reward": 0.544084832072258, - "reward_std": 0.20431343838572502, - "rewards/accuracy_reward": 0.1473214365541935, + "grad_norm": 14.236021995544434, + "kl": 3.671875, + "learning_rate": 2.7784304733086176e-07, + "loss": 0.2521, + "reward": 0.5809152126312256, + "reward_std": 0.15254950150847435, + "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3967633992433548, + "rewards/tag_count_reward": 0.4358259215950966, "step": 1734 }, { "clip_ratio": 0.0, - "completion_length": 1747.3259887695312, + "completion_length": 1567.4041137695312, "epoch": 0.5182585318497498, - "grad_norm": 7.398337364196777, - "kl": 0.51416015625, - "learning_rate": 5.551677961932632e-08, - "loss": 0.0967, - "reward": 0.4687500223517418, - "reward_std": 0.1552267111837864, - "rewards/accuracy_reward": 0.07589286053553224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "grad_norm": 28.387414932250977, + "kl": 3.453125, + "learning_rate": 2.775838980966316e-07, + "loss": 0.2288, + "reward": 0.5290178805589676, + "reward_std": 0.11452163383364677, + "rewards/accuracy_reward": 0.08482143236324191, + "rewards/format_reward": 0.0022321429569274187, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1735 }, { "clip_ratio": 0.0, - "completion_length": 1772.5201721191406, + "completion_length": 1717.0915832519531, "epoch": 0.5185572399372713, - "grad_norm": 6.590037822723389, - "kl": 0.56201171875, - "learning_rate": 5.546494377076477e-08, - "loss": 0.0855, - "reward": 0.5267857238650322, - "reward_std": 0.25083496794104576, - "rewards/accuracy_reward": 0.10491071501746774, + "grad_norm": 37.269073486328125, + "kl": 3.8359375, + "learning_rate": 2.7732471885382383e-07, + "loss": 0.2295, + "reward": 0.5106027126312256, + "reward_std": 0.19032376818358898, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4218750149011612, + "rewards/tag_count_reward": 0.4391741305589676, "step": 1736 }, { "clip_ratio": 0.0, - "completion_length": 1791.7478637695312, + "completion_length": 1659.6652526855469, "epoch": 0.5188559480247927, - "grad_norm": 7.542052745819092, - "kl": 0.642578125, - "learning_rate": 5.541310197688002e-08, - "loss": 0.0793, - "reward": 0.451450914144516, - "reward_std": 0.1912882849574089, - "rewards/accuracy_reward": 0.04910714365541935, + "grad_norm": 61.883785247802734, + "kl": 5.5, + "learning_rate": 2.7706550988440006e-07, + "loss": 0.3331, + "reward": 0.435825914144516, + "reward_std": 0.15172969177365303, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437649011612, + "rewards/tag_count_reward": 0.415736623108387, "step": 1737 }, { "clip_ratio": 0.0, - "completion_length": 1708.2322082519531, + "completion_length": 1615.8996276855469, "epoch": 0.5191546561123143, - "grad_norm": 18.371646881103516, - "kl": 0.8349609375, - "learning_rate": 5.536125429407086e-08, - "loss": 0.1197, - "reward": 0.4056919813156128, - "reward_std": 0.16813630983233452, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 88.1395492553711, + "kl": 4.96875, + "learning_rate": 2.768062714703543e-07, + "loss": 0.275, + "reward": 0.4564732313156128, + "reward_std": 0.14758897572755814, + "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3878348395228386, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1738 }, { "clip_ratio": 0.0, - "completion_length": 1728.3192749023438, + "completion_length": 1563.7031860351562, "epoch": 0.5194533641998357, - "grad_norm": 29.867605209350586, - "kl": 0.873046875, - "learning_rate": 5.5309400778742475e-08, - "loss": 0.1226, - "reward": 0.4994420036673546, - "reward_std": 0.17680953815579414, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 67.6760482788086, + "kl": 2.087890625, + "learning_rate": 2.765470038937124e-07, + "loss": 0.1871, + "reward": 0.5357143059372902, + "reward_std": 0.11059441789984703, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4034598395228386, + "rewards/tag_count_reward": 0.4531250223517418, "step": 1739 }, { "clip_ratio": 0.0, - "completion_length": 1729.0246276855469, + "completion_length": 1644.3951416015625, "epoch": 0.5197520722873572, - "grad_norm": 23.154891967773438, - "kl": 0.8974609375, - "learning_rate": 5.5257541487306426e-08, - "loss": 0.1109, - "reward": 0.4832589477300644, - "reward_std": 0.17792154848575592, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 13.766127586364746, + "kl": 4.10546875, + "learning_rate": 2.7628770743653213e-07, + "loss": 0.3013, + "reward": 0.5172991305589676, + "reward_std": 0.15774049796164036, + "rewards/accuracy_reward": 0.0915178598370403, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3939732313156128, + "rewards/tag_count_reward": 0.4257812649011612, "step": 1740 }, { "clip_ratio": 0.0, - "completion_length": 1675.8728637695312, + "completion_length": 1541.0558471679688, "epoch": 0.5200507803748786, - "grad_norm": 6.514451503753662, - "kl": 0.65625, - "learning_rate": 5.520567647618054e-08, - "loss": 0.1107, - "reward": 0.5078125223517418, - "reward_std": 0.20380251482129097, - "rewards/accuracy_reward": 0.11383928917348385, + "grad_norm": 54.18669891357422, + "kl": 3.126953125, + "learning_rate": 2.760283823809027e-07, + "loss": 0.2774, + "reward": 0.5452009215950966, + "reward_std": 0.15356412529945374, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3939732238650322, + "rewards/tag_count_reward": 0.435825914144516, "step": 1741 }, { "clip_ratio": 0.0, - "completion_length": 1699.6786193847656, + "completion_length": 1541.3482971191406, "epoch": 0.5203494884624001, - "grad_norm": 6.714378833770752, - "kl": 0.6552734375, - "learning_rate": 5.5153805801788854e-08, - "loss": 0.1135, - "reward": 0.5128348544239998, - "reward_std": 0.2615843936800957, - "rewards/accuracy_reward": 0.11830357648432255, + "grad_norm": 33.631744384765625, + "kl": 3.3046875, + "learning_rate": 2.7576902900894426e-07, + "loss": 0.2612, + "reward": 0.553013414144516, + "reward_std": 0.20481225103139877, + "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3945312649011612, + "rewards/tag_count_reward": 0.4391741305589676, "step": 1742 }, { "clip_ratio": 0.0, - "completion_length": 1682.1808776855469, + "completion_length": 1546.7389221191406, "epoch": 0.5206481965499216, - "grad_norm": 25.284433364868164, - "kl": 0.587890625, - "learning_rate": 5.510192952056159e-08, - "loss": 0.0867, - "reward": 0.426897332072258, - "reward_std": 0.16536338441073895, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 62.504573822021484, + "kl": 2.25, + "learning_rate": 2.7550964760280794e-07, + "loss": 0.1813, + "reward": 0.4626116305589676, + "reward_std": 0.13229131791740656, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4090401902794838, + "rewards/tag_count_reward": 0.4425223469734192, "step": 1743 }, { "clip_ratio": 0.0, - "completion_length": 1774.7188415527344, + "completion_length": 1645.9710693359375, "epoch": 0.5209469046374431, - "grad_norm": 5.692668914794922, - "kl": 0.560546875, - "learning_rate": 5.505004768893504e-08, - "loss": 0.0888, - "reward": 0.4648437649011612, - "reward_std": 0.21675099432468414, - "rewards/accuracy_reward": 0.07366071688011289, + "grad_norm": 34.03953170776367, + "kl": 2.423828125, + "learning_rate": 2.752502384446752e-07, + "loss": 0.1714, + "reward": 0.5117187649011612, + "reward_std": 0.15512298047542572, + "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830559372902, + "rewards/tag_count_reward": 0.4425223469734192, "step": 1744 }, { "clip_ratio": 0.0, - "completion_length": 1777.0960693359375, + "completion_length": 1658.3170166015625, "epoch": 0.5212456127249645, - "grad_norm": 15.219472885131836, - "kl": 0.783203125, - "learning_rate": 5.499816036335156e-08, - "loss": 0.0856, - "reward": 0.4542410969734192, - "reward_std": 0.20848624035716057, - "rewards/accuracy_reward": 0.0669642873108387, + "grad_norm": 50.95347595214844, + "kl": 2.6328125, + "learning_rate": 2.749908018167578e-07, + "loss": 0.1869, + "reward": 0.5111607313156128, + "reward_std": 0.17601805552840233, + "rewards/accuracy_reward": 0.07366071850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3872767984867096, + "rewards/tag_count_reward": 0.4375000149011612, "step": 1745 }, { "clip_ratio": 0.0, - "completion_length": 1749.3036499023438, + "completion_length": 1671.5826416015625, "epoch": 0.521544320812486, - "grad_norm": 11.474162101745605, - "kl": 0.6953125, - "learning_rate": 5.494626760025949e-08, - "loss": 0.0894, - "reward": 0.5898437798023224, - "reward_std": 0.21363534405827522, - "rewards/accuracy_reward": 0.2031250074505806, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187649011612, + "grad_norm": 11.679443359375, + "kl": 3.56640625, + "learning_rate": 2.7473133800129745e-07, + "loss": 0.239, + "reward": 0.6015625149011612, + "reward_std": 0.1607975121587515, + "rewards/accuracy_reward": 0.1785714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1746 }, { "clip_ratio": 0.0, - "completion_length": 1684.8817749023438, + "completion_length": 1572.1741943359375, "epoch": 0.5218430289000074, - "grad_norm": 7.166556358337402, - "kl": 0.6240234375, - "learning_rate": 5.4894369456113045e-08, - "loss": 0.1186, - "reward": 0.4916294813156128, - "reward_std": 0.17229808494448662, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 33.101806640625, + "kl": 2.62109375, + "learning_rate": 2.7447184728056524e-07, + "loss": 0.2042, + "reward": 0.529575914144516, + "reward_std": 0.1589144691824913, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4068080559372902, + "rewards/tag_count_reward": 0.4447544887661934, "step": 1747 }, { "clip_ratio": 0.0, - "completion_length": 1752.1697082519531, + "completion_length": 1660.8795166015625, "epoch": 0.522141736987529, - "grad_norm": 31.309431076049805, - "kl": 0.92578125, - "learning_rate": 5.484246598737233e-08, - "loss": 0.1035, - "reward": 0.3906250149011612, - "reward_std": 0.16934703290462494, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 15.556615829467773, + "kl": 4.015625, + "learning_rate": 2.742123299368617e-07, + "loss": 0.2553, + "reward": 0.4263393059372902, + "reward_std": 0.12485009990632534, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3794642984867096, + "rewards/tag_count_reward": 0.4196428805589676, "step": 1748 }, { "clip_ratio": 0.0, - "completion_length": 1706.01123046875, + "completion_length": 1575.8304443359375, "epoch": 0.5224404450750504, - "grad_norm": 3.795792579650879, - "kl": 0.6201171875, - "learning_rate": 5.479055725050324e-08, - "loss": 0.0976, - "reward": 0.4324776977300644, - "reward_std": 0.15270678512752056, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 29.764310836791992, + "kl": 4.43359375, + "learning_rate": 2.7395278625251623e-07, + "loss": 0.3109, + "reward": 0.4804687723517418, + "reward_std": 0.11687616631388664, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3945312574505806, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1749 }, { "clip_ratio": 0.0, - "completion_length": 1688.1697082519531, + "completion_length": 1554.02685546875, "epoch": 0.5227391531625719, - "grad_norm": 2.865485906600952, - "kl": 0.52490234375, - "learning_rate": 5.47386433019774e-08, - "loss": 0.0856, - "reward": 0.4174107313156128, - "reward_std": 0.17103733867406845, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 35.57770919799805, + "kl": 4.76953125, + "learning_rate": 2.7369321650988697e-07, + "loss": 0.3418, + "reward": 0.4542410969734192, + "reward_std": 0.15828418172895908, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1750 }, { "clip_ratio": 0.0, - "completion_length": 1793.0469665527344, + "completion_length": 1601.8996276855469, "epoch": 0.5230378612500933, - "grad_norm": 12.732425689697266, - "kl": 0.7412109375, - "learning_rate": 5.468672419827207e-08, - "loss": 0.1039, - "reward": 0.380580373108387, - "reward_std": 0.16339075937867165, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 25.79292869567871, + "kl": 3.5703125, + "learning_rate": 2.7343362099136036e-07, + "loss": 0.2452, + "reward": 0.458705373108387, + "reward_std": 0.09508886840194464, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482313156128, + "rewards/tag_count_reward": 0.4542410895228386, "step": 1751 }, { "clip_ratio": 0.0, - "completion_length": 1768.7433776855469, + "completion_length": 1655.2590026855469, "epoch": 0.5233365693376149, - "grad_norm": 4.380957126617432, - "kl": 0.5986328125, - "learning_rate": 5.4634799995870154e-08, - "loss": 0.0866, - "reward": 0.5284598395228386, - "reward_std": 0.25493403524160385, - "rewards/accuracy_reward": 0.1450892947614193, + "grad_norm": 52.47477722167969, + "kl": 5.234375, + "learning_rate": 2.731739999793508e-07, + "loss": 0.3182, + "reward": 0.5680803805589676, + "reward_std": 0.2094019427895546, + "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3833705484867096, + "rewards/tag_count_reward": 0.4229910895228386, "step": 1752 }, { "clip_ratio": 0.0, - "completion_length": 1689.9353637695312, + "completion_length": 1567.3661499023438, "epoch": 0.5236352774251363, - "grad_norm": 17.14016342163086, - "kl": 0.57958984375, - "learning_rate": 5.4582870751260114e-08, - "loss": 0.0928, - "reward": 0.522879496216774, - "reward_std": 0.21201874688267708, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 130.18553161621094, + "kl": 6.23828125, + "learning_rate": 2.7291435375630057e-07, + "loss": 0.3576, + "reward": 0.5351562649011612, + "reward_std": 0.16126592084765434, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4045759066939354, + "rewards/tag_count_reward": 0.4235491305589676, "step": 1753 }, { "clip_ratio": 0.0, - "completion_length": 1741.2522888183594, + "completion_length": 1635.18310546875, "epoch": 0.5239339855126578, - "grad_norm": 4.083420276641846, - "kl": 0.55908203125, - "learning_rate": 5.4530936520935864e-08, - "loss": 0.1066, - "reward": 0.4341518133878708, - "reward_std": 0.20548857375979424, - "rewards/accuracy_reward": 0.05133928940631449, + "grad_norm": 78.28179168701172, + "kl": 5.19921875, + "learning_rate": 2.7265468260467934e-07, + "loss": 0.3046, + "reward": 0.4547991380095482, + "reward_std": 0.16429843939840794, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125149011612, + "rewards/tag_count_reward": 0.4213169813156128, "step": 1754 }, { "clip_ratio": 0.0, - "completion_length": 1756.5781860351562, + "completion_length": 1624.4264221191406, "epoch": 0.5242326936001792, - "grad_norm": 6.4811859130859375, - "kl": 0.57421875, - "learning_rate": 5.447899736139675e-08, - "loss": 0.0985, - "reward": 0.4252232387661934, - "reward_std": 0.19021673128008842, - "rewards/accuracy_reward": 0.046875003492459655, + "grad_norm": 47.4450798034668, + "kl": 4.3125, + "learning_rate": 2.7239498680698375e-07, + "loss": 0.2723, + "reward": 0.4843750149011612, + "reward_std": 0.13483248464763165, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482313156128, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1755 }, { "clip_ratio": 0.0, - "completion_length": 1748.2009887695312, + "completion_length": 1559.5692443847656, "epoch": 0.5245314016877007, - "grad_norm": 5.068997859954834, - "kl": 0.61328125, - "learning_rate": 5.442705332914751e-08, - "loss": 0.1069, - "reward": 0.4458705633878708, - "reward_std": 0.18817389011383057, - "rewards/accuracy_reward": 0.058035715483129025, + "grad_norm": 8.70786190032959, + "kl": 2.80859375, + "learning_rate": 2.7213526664573756e-07, + "loss": 0.1914, + "reward": 0.5172991380095482, + "reward_std": 0.15818733349442482, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.387834832072258, + "rewards/tag_count_reward": 0.4414062723517418, "step": 1756 }, { "clip_ratio": 0.0, - "completion_length": 1741.4375915527344, + "completion_length": 1633.9554138183594, "epoch": 0.5248301097752222, - "grad_norm": 5.30210542678833, - "kl": 0.5107421875, - "learning_rate": 5.437510448069814e-08, - "loss": 0.0778, - "reward": 0.4815848469734192, - "reward_std": 0.20469912886619568, - "rewards/accuracy_reward": 0.08258929196745157, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3967634066939354, + "grad_norm": 45.53723907470703, + "kl": 4.5390625, + "learning_rate": 2.718755224034907e-07, + "loss": 0.2902, + "reward": 0.5000000149011612, + "reward_std": 0.15746591798961163, + "rewards/accuracy_reward": 0.07589286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.424107164144516, "step": 1757 }, { "clip_ratio": 0.0, - "completion_length": 1687.0291137695312, + "completion_length": 1587.08935546875, "epoch": 0.5251288178627437, - "grad_norm": 7.627849578857422, - "kl": 0.474853515625, - "learning_rate": 5.432315087256391e-08, - "loss": 0.0888, - "reward": 0.530133955180645, - "reward_std": 0.18123888596892357, - "rewards/accuracy_reward": 0.11607143399305642, + "grad_norm": 19.14122200012207, + "kl": 3.46484375, + "learning_rate": 2.716157543628195e-07, + "loss": 0.2691, + "reward": 0.5513392984867096, + "reward_std": 0.15758421644568443, + "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4140625149011612, + "rewards/tag_count_reward": 0.4464285969734192, "step": 1758 }, { "clip_ratio": 0.0, - "completion_length": 1695.4175109863281, + "completion_length": 1595.6942749023438, "epoch": 0.5254275259502651, - "grad_norm": 3.407773971557617, - "kl": 0.58837890625, - "learning_rate": 5.4271192561265225e-08, - "loss": 0.0977, - "reward": 0.5195312723517418, - "reward_std": 0.23819676041603088, - "rewards/accuracy_reward": 0.11607143376022577, + "grad_norm": 36.462257385253906, + "kl": 4.75, + "learning_rate": 2.713559628063261e-07, + "loss": 0.3221, + "reward": 0.521205373108387, + "reward_std": 0.18046706169843674, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.403459832072258, + "rewards/tag_count_reward": 0.4252232238650322, "step": 1759 }, { "clip_ratio": 0.0, - "completion_length": 1675.0179443359375, + "completion_length": 1566.5647888183594, "epoch": 0.5257262340377866, - "grad_norm": 15.501312255859375, - "kl": 0.5703125, - "learning_rate": 5.4219229603327666e-08, - "loss": 0.0966, - "reward": 0.5050223469734192, - "reward_std": 0.22029558569192886, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 29.310579299926758, + "kl": 3.314453125, + "learning_rate": 2.710961480166383e-07, + "loss": 0.2121, + "reward": 0.5770089626312256, + "reward_std": 0.19902991876006126, + "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.4497768133878708, "step": 1760 }, { "clip_ratio": 0.0, - "completion_length": 1796.1139221191406, + "completion_length": 1639.44873046875, "epoch": 0.526024942125308, - "grad_norm": 5.08833122253418, - "kl": 0.5654296875, - "learning_rate": 5.4167262055281826e-08, - "loss": 0.0836, - "reward": 0.4341518059372902, - "reward_std": 0.18582125380635262, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 10.898287773132324, + "kl": 4.01171875, + "learning_rate": 2.7083631027640916e-07, + "loss": 0.2856, + "reward": 0.5005580708384514, + "reward_std": 0.1660266313701868, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376116082072258, + "rewards/tag_count_reward": 0.4291294887661934, "step": 1761 }, { "clip_ratio": 0.0, - "completion_length": 1728.7991638183594, + "completion_length": 1616.0335693359375, "epoch": 0.5263236502128296, - "grad_norm": 5.1948652267456055, - "kl": 0.65478515625, - "learning_rate": 5.41152899736633e-08, - "loss": 0.1149, - "reward": 0.5111607387661934, - "reward_std": 0.18828542158007622, - "rewards/accuracy_reward": 0.12276786006987095, + "grad_norm": 18.60226058959961, + "kl": 3.251953125, + "learning_rate": 2.705764498683165e-07, + "loss": 0.2423, + "reward": 0.5535714626312256, + "reward_std": 0.1172682624310255, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.388392873108387, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1762 }, { "clip_ratio": 0.0, - "completion_length": 1786.6518859863281, + "completion_length": 1685.0536499023438, "epoch": 0.526622358300351, - "grad_norm": 3.855620861053467, - "kl": 0.625, - "learning_rate": 5.4063313415012636e-08, - "loss": 0.0844, - "reward": 0.4754464477300644, - "reward_std": 0.1965351514518261, - "rewards/accuracy_reward": 0.09375000488944352, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3816964477300644, + "grad_norm": 28.885034561157227, + "kl": 4.19140625, + "learning_rate": 2.7031656707506315e-07, + "loss": 0.2606, + "reward": 0.518415205180645, + "reward_std": 0.1853452157229185, + "rewards/accuracy_reward": 0.10937500419095159, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4090401977300644, "step": 1763 }, { "clip_ratio": 0.0, - "completion_length": 1748.0357971191406, + "completion_length": 1596.6161193847656, "epoch": 0.5269210663878725, - "grad_norm": 5.168260097503662, - "kl": 0.61669921875, - "learning_rate": 5.4011332435875213e-08, - "loss": 0.1149, - "reward": 0.4168526977300644, - "reward_std": 0.17661592364311218, + "grad_norm": 78.760986328125, + "kl": 2.578125, + "learning_rate": 2.7005666217937605e-07, + "loss": 0.2221, + "reward": 0.4737723395228386, + "reward_std": 0.12881378643214703, "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419738650322, + "rewards/tag_count_reward": 0.431361623108387, "step": 1764 }, { "clip_ratio": 0.0, - "completion_length": 1763.65185546875, + "completion_length": 1719.2054443359375, "epoch": 0.5272197744753939, - "grad_norm": 9.878872871398926, - "kl": 0.67724609375, - "learning_rate": 5.395934709280127e-08, - "loss": 0.0897, - "reward": 0.446986623108387, - "reward_std": 0.16939592361450195, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 8.007554054260254, + "kl": 2.7734375, + "learning_rate": 2.6979673546400635e-07, + "loss": 0.1474, + "reward": 0.479352705180645, + "reward_std": 0.12781946547329426, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3889509066939354, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1765 }, { "clip_ratio": 0.0, - "completion_length": 1740.7433776855469, + "completion_length": 1625.5893249511719, "epoch": 0.5275184825629153, - "grad_norm": 4.2619171142578125, - "kl": 0.546875, - "learning_rate": 5.3907357442345724e-08, - "loss": 0.0913, - "reward": 0.4687500223517418, - "reward_std": 0.19137344881892204, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 31.861757278442383, + "kl": 2.54296875, + "learning_rate": 2.695367872117286e-07, + "loss": 0.1666, + "reward": 0.5117187649011612, + "reward_std": 0.129623893648386, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3861607313156128, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1766 }, { "clip_ratio": 0.0, - "completion_length": 1744.5023193359375, + "completion_length": 1652.7947387695312, "epoch": 0.5278171906504369, - "grad_norm": 9.35279369354248, - "kl": 0.59716796875, - "learning_rate": 5.3855363541068257e-08, - "loss": 0.1084, - "reward": 0.4687500149011612, - "reward_std": 0.17357774451375008, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 78.7196044921875, + "kl": 2.525390625, + "learning_rate": 2.6927681770534126e-07, + "loss": 0.1863, + "reward": 0.5189732387661934, + "reward_std": 0.12173357605934143, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3906250223517418, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1767 }, { "clip_ratio": 0.0, - "completion_length": 1796.1072082519531, + "completion_length": 1697.4420166015625, "epoch": 0.5281158987379583, - "grad_norm": 3.1474697589874268, - "kl": 0.47802734375, - "learning_rate": 5.380336544553312e-08, - "loss": 0.0734, - "reward": 0.4174107313156128, - "reward_std": 0.18328580632805824, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 58.66118621826172, + "kl": 1.916015625, + "learning_rate": 2.690168272276656e-07, + "loss": 0.141, + "reward": 0.4687500223517418, + "reward_std": 0.1304073203355074, + "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4017857313156128, + "rewards/tag_count_reward": 0.450892873108387, "step": 1768 }, { "clip_ratio": 0.0, - "completion_length": 1745.9822082519531, + "completion_length": 1627.5223999023438, "epoch": 0.5284146068254798, - "grad_norm": 7.076191425323486, - "kl": 0.564453125, - "learning_rate": 5.375136321230914e-08, - "loss": 0.0999, - "reward": 0.4581473395228386, - "reward_std": 0.19278140366077423, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 62.1063346862793, + "kl": 2.232421875, + "learning_rate": 2.687568160615457e-07, + "loss": 0.1542, + "reward": 0.4988839477300644, + "reward_std": 0.1047830106690526, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.400111623108387, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1769 }, { "clip_ratio": 0.0, - "completion_length": 1689.6228332519531, + "completion_length": 1529.4397888183594, "epoch": 0.5287133149130012, - "grad_norm": 6.8052978515625, - "kl": 0.5361328125, - "learning_rate": 5.3699356897969675e-08, - "loss": 0.113, - "reward": 0.450334832072258, - "reward_std": 0.1757761649787426, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 57.12336349487305, + "kl": 1.8828125, + "learning_rate": 2.6849678448984835e-07, + "loss": 0.1534, + "reward": 0.515066996216774, + "reward_std": 0.10974467173218727, + "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669738650322, + "rewards/tag_count_reward": 0.4547991380095482, "step": 1770 }, { "clip_ratio": 0.0, - "completion_length": 1738.4175109863281, + "completion_length": 1619.0447082519531, "epoch": 0.5290120230005227, - "grad_norm": 9.303313255310059, - "kl": 0.67822265625, - "learning_rate": 5.3647346559092476e-08, - "loss": 0.1063, - "reward": 0.4927455559372902, - "reward_std": 0.1845741979777813, - "rewards/accuracy_reward": 0.1183035783469677, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.372209832072258, + "grad_norm": 16.558101654052734, + "kl": 3.68359375, + "learning_rate": 2.682367327954624e-07, + "loss": 0.2529, + "reward": 0.550781287252903, + "reward_std": 0.14278215169906616, + "rewards/accuracy_reward": 0.1227678656578064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.428013414144516, "step": 1771 }, { "clip_ratio": 0.0, - "completion_length": 1753.3281860351562, + "completion_length": 1582.8036499023438, "epoch": 0.5293107310880442, - "grad_norm": 4.250111103057861, - "kl": 0.5048828125, - "learning_rate": 5.3595332252259704e-08, - "loss": 0.0867, - "reward": 0.5284598469734192, - "reward_std": 0.20956435799598694, - "rewards/accuracy_reward": 0.13839286006987095, + "grad_norm": 41.929752349853516, + "kl": 2.765625, + "learning_rate": 2.6797666126129855e-07, + "loss": 0.2302, + "reward": 0.6238839626312256, + "reward_std": 0.19059934839606285, + "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669813156128, + "rewards/tag_count_reward": 0.4453125298023224, "step": 1772 }, { "clip_ratio": 0.0, - "completion_length": 1707.6473999023438, + "completion_length": 1566.8058776855469, "epoch": 0.5296094391755657, - "grad_norm": 6.120192050933838, - "kl": 0.59765625, - "learning_rate": 5.3543314034057804e-08, - "loss": 0.0945, - "reward": 0.5290178805589676, - "reward_std": 0.23417676240205765, - "rewards/accuracy_reward": 0.12946429452858865, + "grad_norm": 10.776983261108398, + "kl": 3.22265625, + "learning_rate": 2.6771657017028906e-07, + "loss": 0.2202, + "reward": 0.568080373108387, + "reward_std": 0.1851207111030817, + "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3995535895228386, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1773 }, { "clip_ratio": 0.0, - "completion_length": 1703.9442749023438, + "completion_length": 1580.0781555175781, "epoch": 0.5299081472630871, - "grad_norm": 16.22130584716797, - "kl": 0.67236328125, - "learning_rate": 5.349129196107753e-08, - "loss": 0.1005, - "reward": 0.4698660895228386, - "reward_std": 0.21711378917098045, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 49.52845764160156, + "kl": 4.091796875, + "learning_rate": 2.674564598053877e-07, + "loss": 0.2446, + "reward": 0.517857164144516, + "reward_std": 0.16952192783355713, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3850446566939354, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1774 }, { "clip_ratio": 0.0, - "completion_length": 1767.9777526855469, + "completion_length": 1635.1808776855469, "epoch": 0.5302068553506086, - "grad_norm": 6.030083179473877, - "kl": 0.7490234375, - "learning_rate": 5.343926608991379e-08, - "loss": 0.1126, - "reward": 0.4760044813156128, - "reward_std": 0.22250287607312202, - "rewards/accuracy_reward": 0.08482143189758062, + "grad_norm": 25.246688842773438, + "kl": 3.8203125, + "learning_rate": 2.6719633044956897e-07, + "loss": 0.2524, + "reward": 0.5150669738650322, + "reward_std": 0.17731483653187752, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830559372902, + "rewards/tag_count_reward": 0.4414062723517418, "step": 1775 }, { "clip_ratio": 0.0, - "completion_length": 1740.96435546875, + "completion_length": 1679.2098999023438, "epoch": 0.53050556343813, - "grad_norm": 5.705425262451172, - "kl": 0.54296875, - "learning_rate": 5.338723647716561e-08, - "loss": 0.0961, - "reward": 0.4737723469734192, - "reward_std": 0.210675947368145, - "rewards/accuracy_reward": 0.0870535783469677, + "grad_norm": 11.587535858154297, + "kl": 4.140625, + "learning_rate": 2.6693618238582806e-07, + "loss": 0.2558, + "reward": 0.5401786044239998, + "reward_std": 0.19198604673147202, + "rewards/accuracy_reward": 0.10937500279396772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187649011612, + "rewards/tag_count_reward": 0.4308035969734192, "step": 1776 }, { "clip_ratio": 0.0, - "completion_length": 1678.5804443359375, + "completion_length": 1644.3304443359375, "epoch": 0.5308042715256516, - "grad_norm": 8.856090545654297, - "kl": 0.43115234375, - "learning_rate": 5.333520317943615e-08, - "loss": 0.0937, - "reward": 0.5630580484867096, - "reward_std": 0.22425556182861328, - "rewards/accuracy_reward": 0.1584821455180645, + "grad_norm": 21.94377326965332, + "kl": 3.765625, + "learning_rate": 2.6667601589718074e-07, + "loss": 0.2302, + "reward": 0.6054687798023224, + "reward_std": 0.1847195401787758, + "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4045759066939354, + "rewards/tag_count_reward": 0.4492187723517418, "step": 1777 }, { "clip_ratio": 0.0, - "completion_length": 1689.774658203125, + "completion_length": 1608.6965026855469, "epoch": 0.531102979613173, - "grad_norm": 6.1137237548828125, - "kl": 0.626953125, - "learning_rate": 5.32831662533325e-08, - "loss": 0.11, - "reward": 0.5066964477300644, - "reward_std": 0.23895793035626411, - "rewards/accuracy_reward": 0.11383928777649999, + "grad_norm": 16.150959014892578, + "kl": 3.73828125, + "learning_rate": 2.664158312666625e-07, + "loss": 0.2477, + "reward": 0.553571455180645, + "reward_std": 0.19976021721959114, + "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "rewards/tag_count_reward": 0.439732164144516, "step": 1778 }, { "clip_ratio": 0.0, - "completion_length": 1738.4085693359375, + "completion_length": 1655.8750915527344, "epoch": 0.5314016877006945, - "grad_norm": 3.736436605453491, - "kl": 0.51220703125, - "learning_rate": 5.3231125755465766e-08, - "loss": 0.0882, - "reward": 0.552455373108387, - "reward_std": 0.17856815829873085, - "rewards/accuracy_reward": 0.14062500488944352, + "grad_norm": 69.86961364746094, + "kl": 4.93359375, + "learning_rate": 2.661556287773288e-07, + "loss": 0.3018, + "reward": 0.5613839402794838, + "reward_std": 0.13027407601475716, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.411830373108387, + "rewards/tag_count_reward": 0.4252232387661934, "step": 1779 }, { "clip_ratio": 0.0, - "completion_length": 1750.2166137695312, + "completion_length": 1650.6965026855469, "epoch": 0.5317003957882159, - "grad_norm": 4.601034641265869, - "kl": 0.69921875, - "learning_rate": 5.317908174245087e-08, - "loss": 0.1198, - "reward": 0.4274553805589676, - "reward_std": 0.23352116346359253, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 22.816179275512695, + "kl": 4.3046875, + "learning_rate": 2.6589540871225437e-07, + "loss": 0.2419, + "reward": 0.4720982387661934, + "reward_std": 0.1627504676580429, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.380580373108387, + "rewards/tag_count_reward": 0.436383955180645, "step": 1780 }, { "clip_ratio": 0.0, - "completion_length": 1732.82373046875, + "completion_length": 1605.0759887695312, "epoch": 0.5319991038757375, - "grad_norm": 15.019307136535645, - "kl": 0.771484375, - "learning_rate": 5.312703427090665e-08, - "loss": 0.1037, - "reward": 0.452008955180645, - "reward_std": 0.22465864568948746, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 11.295478820800781, + "kl": 3.40625, + "learning_rate": 2.656351713545332e-07, + "loss": 0.2377, + "reward": 0.525669664144516, + "reward_std": 0.1638424601405859, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3694196566939354, + "rewards/tag_count_reward": 0.4386160969734192, "step": 1781 }, { "clip_ratio": 0.0, - "completion_length": 1685.8884887695312, + "completion_length": 1585.0469665527344, "epoch": 0.5322978119632589, - "grad_norm": 17.38130760192871, - "kl": 0.73681640625, - "learning_rate": 5.307498339745561e-08, - "loss": 0.1182, - "reward": 0.5290178880095482, - "reward_std": 0.1856893002986908, - "rewards/accuracy_reward": 0.13616072200238705, + "grad_norm": 45.36103439331055, + "kl": 3.875, + "learning_rate": 2.6537491698727804e-07, + "loss": 0.2221, + "reward": 0.5753348395228386, + "reward_std": 0.14683369547128677, + "rewards/accuracy_reward": 0.13616071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392857164144516, + "rewards/tag_count_reward": 0.439174123108387, "step": 1782 }, { "clip_ratio": 0.0, - "completion_length": 1709.6853637695312, + "completion_length": 1674.0938110351562, "epoch": 0.5325965200507804, - "grad_norm": 6.719497203826904, - "kl": 0.728515625, - "learning_rate": 5.302292917872401e-08, - "loss": 0.0974, - "reward": 0.4776785895228386, - "reward_std": 0.23751825839281082, - "rewards/accuracy_reward": 0.08035714854486287, + "grad_norm": 27.492841720581055, + "kl": 3.544921875, + "learning_rate": 2.6511464589362006e-07, + "loss": 0.2075, + "reward": 0.5033482313156128, + "reward_std": 0.1878223568201065, + "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3973214477300644, + "rewards/tag_count_reward": 0.4408482387661934, "step": 1783 }, { "clip_ratio": 0.0, - "completion_length": 1808.9420471191406, + "completion_length": 1716.3148193359375, "epoch": 0.5328952281383018, - "grad_norm": 27.81325912475586, - "kl": 0.97265625, - "learning_rate": 5.297087167134176e-08, - "loss": 0.1039, - "reward": 0.4525669813156128, - "reward_std": 0.2179212085902691, - "rewards/accuracy_reward": 0.06919643143191934, + "grad_norm": 14.676650047302246, + "kl": 3.58984375, + "learning_rate": 2.648543583567088e-07, + "loss": 0.2355, + "reward": 0.518973246216774, + "reward_std": 0.16148994863033295, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3833705559372902, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1784 }, { "clip_ratio": 0.0, - "completion_length": 1636.5201416015625, + "completion_length": 1553.9130249023438, "epoch": 0.5331939362258233, - "grad_norm": 8.1644926071167, - "kl": 0.48583984375, - "learning_rate": 5.2918810931942294e-08, - "loss": 0.1115, - "reward": 0.5518973469734192, - "reward_std": 0.17692741006612778, - "rewards/accuracy_reward": 0.1540178619325161, + "grad_norm": 7.853994369506836, + "kl": 2.873046875, + "learning_rate": 2.6459405465971146e-07, + "loss": 0.1916, + "reward": 0.6406250447034836, + "reward_std": 0.1768389791250229, + "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3978794813156128, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1785 }, { "clip_ratio": 0.0, - "completion_length": 1653.7009582519531, + "completion_length": 1546.9889221191406, "epoch": 0.5334926443133448, - "grad_norm": 6.8274712562561035, - "kl": 0.486328125, - "learning_rate": 5.2866747017162616e-08, - "loss": 0.0929, - "reward": 0.5033482387661934, - "reward_std": 0.16562660597264767, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 78.41841888427734, + "kl": 1.84375, + "learning_rate": 2.643337350858131e-07, + "loss": 0.14, + "reward": 0.552455373108387, + "reward_std": 0.12744706496596336, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4118303805589676, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1786 }, { "clip_ratio": 0.0, - "completion_length": 1690.1094360351562, + "completion_length": 1597.0246276855469, "epoch": 0.5337913524008663, - "grad_norm": 6.946990013122559, - "kl": 0.59521484375, - "learning_rate": 5.281467998364314e-08, - "loss": 0.1106, - "reward": 0.5452009066939354, - "reward_std": 0.21829812973737717, - "rewards/accuracy_reward": 0.1540178619325161, + "grad_norm": 74.7909927368164, + "kl": 1.75390625, + "learning_rate": 2.640733999182157e-07, + "loss": 0.1445, + "reward": 0.6093750447034836, + "reward_std": 0.15403219126164913, + "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830484867096, + "rewards/tag_count_reward": 0.4531250223517418, "step": 1787 }, { "clip_ratio": 0.0, - "completion_length": 1735.4666137695312, + "completion_length": 1664.8750610351562, "epoch": 0.5340900604883877, - "grad_norm": 8.034150123596191, - "kl": 0.63671875, - "learning_rate": 5.276260988802772e-08, - "loss": 0.107, - "reward": 0.5172991380095482, - "reward_std": 0.1887793131172657, - "rewards/accuracy_reward": 0.1316964328289032, + "grad_norm": 96.9845962524414, + "kl": 1.9765625, + "learning_rate": 2.638130494401386e-07, + "loss": 0.1649, + "reward": 0.5742187649011612, + "reward_std": 0.15029161609709263, + "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.440290205180645, "step": 1788 }, { "clip_ratio": 0.0, - "completion_length": 1699.6094665527344, + "completion_length": 1559.96435546875, "epoch": 0.5343887685759092, - "grad_norm": 11.982064247131348, - "kl": 0.7119140625, - "learning_rate": 5.2710536786963514e-08, - "loss": 0.0776, - "reward": 0.4659598395228386, - "reward_std": 0.14216729253530502, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 61.88276290893555, + "kl": 1.556640625, + "learning_rate": 2.6355268393481753e-07, + "loss": 0.1277, + "reward": 0.5279018208384514, + "reward_std": 0.0803921278566122, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392299123108387, + "rewards/tag_count_reward": 0.4564732387661934, "step": 1789 }, { "clip_ratio": 0.0, - "completion_length": 1706.4732971191406, + "completion_length": 1545.7857666015625, "epoch": 0.5346874766634306, - "grad_norm": 9.62502670288086, - "kl": 0.71435546875, - "learning_rate": 5.265846073710093e-08, - "loss": 0.0996, - "reward": 0.4871651977300644, - "reward_std": 0.1732640415430069, - "rewards/accuracy_reward": 0.09375000419095159, + "grad_norm": 37.51866912841797, + "kl": 1.7568359375, + "learning_rate": 2.632923036855046e-07, + "loss": 0.1401, + "reward": 0.5569196715950966, + "reward_std": 0.12648136168718338, + "rewards/accuracy_reward": 0.09821428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3934151977300644, + "rewards/tag_count_reward": 0.458705373108387, "step": 1790 }, { "clip_ratio": 0.0, - "completion_length": 1710.0357971191406, + "completion_length": 1606.5246276855469, "epoch": 0.5349861847509522, - "grad_norm": 51.51442337036133, - "kl": 0.9697265625, - "learning_rate": 5.2606381795093635e-08, - "loss": 0.118, - "reward": 0.4681919887661934, - "reward_std": 0.2028796337544918, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 78.58927154541016, + "kl": 1.5546875, + "learning_rate": 2.6303190897546816e-07, + "loss": 0.1472, + "reward": 0.5362723544239998, + "reward_std": 0.15003644116222858, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3922991156578064, + "rewards/tag_count_reward": 0.4514509066939354, "step": 1791 }, { "clip_ratio": 0.0, - "completion_length": 1678.94873046875, + "completion_length": 1623.5402221679688, "epoch": 0.5352848928384736, - "grad_norm": 95.41849517822266, - "kl": 0.8828125, - "learning_rate": 5.25543000175984e-08, - "loss": 0.1183, - "reward": 0.463727705180645, - "reward_std": 0.20345925353467464, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 31.3472957611084, + "kl": 2.47265625, + "learning_rate": 2.6277150008799196e-07, + "loss": 0.1761, + "reward": 0.4994419887661934, + "reward_std": 0.16809764504432678, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3989955484867096, + "rewards/tag_count_reward": 0.439174123108387, "step": 1792 }, { "clip_ratio": 0.0, - "completion_length": 1723.5692749023438, + "completion_length": 1642.4286499023438, "epoch": 0.5355836009259951, - "grad_norm": 6.4979352951049805, - "kl": 0.755859375, - "learning_rate": 5.250221546127508e-08, - "loss": 0.1424, - "reward": 0.5429687649011612, - "reward_std": 0.2320251427590847, - "rewards/accuracy_reward": 0.15848214970901608, + "grad_norm": 55.71942138671875, + "kl": 2.13671875, + "learning_rate": 2.625110773063754e-07, + "loss": 0.1594, + "reward": 0.6093750223517418, + "reward_std": 0.17745032906532288, + "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3844866305589676, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1793 }, { "clip_ratio": 0.0, - "completion_length": 1710.4085388183594, + "completion_length": 1618.5313110351562, "epoch": 0.5358823090135165, - "grad_norm": 18.475244522094727, - "kl": 0.65576171875, - "learning_rate": 5.245012818278659e-08, - "loss": 0.1053, - "reward": 0.4732143059372902, - "reward_std": 0.20072150975465775, - "rewards/accuracy_reward": 0.06696429010480642, + "grad_norm": 24.259843826293945, + "kl": 2.091796875, + "learning_rate": 2.6225064091393293e-07, + "loss": 0.1335, + "reward": 0.5239955559372902, + "reward_std": 0.14079313911497593, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4062500223517418, + "rewards/tag_count_reward": 0.4481026977300644, "step": 1794 }, { "clip_ratio": 0.0, - "completion_length": 1743.7723999023438, + "completion_length": 1639.94873046875, "epoch": 0.536181017101038, - "grad_norm": 10.69605827331543, - "kl": 0.830078125, - "learning_rate": 5.239803823879877e-08, - "loss": 0.1056, - "reward": 0.6065848544239998, - "reward_std": 0.23907452449202538, - "rewards/accuracy_reward": 0.20535715017467737, + "grad_norm": 64.32665252685547, + "kl": 1.8515625, + "learning_rate": 2.619901911939938e-07, + "loss": 0.1563, + "reward": 0.6729911044239998, + "reward_std": 0.20768187940120697, + "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4012276977300644, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1795 }, { "clip_ratio": 0.0, - "completion_length": 1697.7679443359375, + "completion_length": 1593.1406860351562, "epoch": 0.5364797251885595, - "grad_norm": 18.55603790283203, - "kl": 0.8515625, - "learning_rate": 5.2345945685980396e-08, - "loss": 0.1144, - "reward": 0.502790205180645, - "reward_std": 0.2607928328216076, - "rewards/accuracy_reward": 0.10044643329456449, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437649011612, + "grad_norm": 48.83095169067383, + "kl": 2.93359375, + "learning_rate": 2.61729728429902e-07, + "loss": 0.2512, + "reward": 0.541852705180645, + "reward_std": 0.2201291024684906, + "rewards/accuracy_reward": 0.09598214831203222, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4458705633878708, "step": 1796 }, { "clip_ratio": 0.0, - "completion_length": 1730.3572082519531, + "completion_length": 1636.1295471191406, "epoch": 0.536778433276081, - "grad_norm": 14.653114318847656, - "kl": 0.8115234375, - "learning_rate": 5.2293850581003017e-08, - "loss": 0.1147, - "reward": 0.491071455180645, - "reward_std": 0.22302145510911942, - "rewards/accuracy_reward": 0.11160714877769351, + "grad_norm": 24.956937789916992, + "kl": 4.33203125, + "learning_rate": 2.614692529050151e-07, + "loss": 0.2768, + "reward": 0.5306919887661934, + "reward_std": 0.16055604070425034, + "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3794643059372902, + "rewards/tag_count_reward": 0.4324777126312256, "step": 1797 }, { "clip_ratio": 0.0, - "completion_length": 1704.8281860351562, + "completion_length": 1672.77685546875, "epoch": 0.5370771413636024, - "grad_norm": 7.1379852294921875, - "kl": 0.5263671875, - "learning_rate": 5.224175298054104e-08, - "loss": 0.0856, - "reward": 0.4871651977300644, - "reward_std": 0.19382302463054657, + "grad_norm": 31.89096450805664, + "kl": 4.02734375, + "learning_rate": 2.612087649027052e-07, + "loss": 0.231, + "reward": 0.541294664144516, + "reward_std": 0.13896547071635723, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3889509066939354, + "rewards/tag_count_reward": 0.443080373108387, "step": 1798 }, { "clip_ratio": 0.0, - "completion_length": 1759.2500915527344, + "completion_length": 1641.6764221191406, "epoch": 0.5373758494511239, - "grad_norm": 8.258048057556152, - "kl": 0.49609375, - "learning_rate": 5.2189652941271544e-08, - "loss": 0.106, - "reward": 0.4603794813156128, - "reward_std": 0.22329707816243172, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 67.78101348876953, + "kl": 4.837890625, + "learning_rate": 2.6094826470635774e-07, + "loss": 0.2667, + "reward": 0.5161830708384514, + "reward_std": 0.15377869084477425, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3822544813156128, + "rewards/tag_count_reward": 0.4402901902794838, "step": 1799 }, { "clip_ratio": 0.0, - "completion_length": 1764.6831359863281, + "completion_length": 1723.4443054199219, "epoch": 0.5376745575386453, - "grad_norm": 4.882433891296387, - "kl": 0.58544921875, - "learning_rate": 5.213755051987426e-08, - "loss": 0.0917, - "reward": 0.4626116305589676, - "reward_std": 0.22807568684220314, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 16.118839263916016, + "kl": 4.24609375, + "learning_rate": 2.606877525993713e-07, + "loss": 0.2614, + "reward": 0.5061384066939354, + "reward_std": 0.17579124122858047, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.368861623108387, + "rewards/tag_count_reward": 0.4302455484867096, "step": 1800 }, { "clip_ratio": 0.0, - "completion_length": 1776.7366943359375, + "completion_length": 1683.1630249023438, "epoch": 0.5379732656261669, - "grad_norm": 5.217732906341553, - "kl": 0.5732421875, - "learning_rate": 5.208544577303149e-08, - "loss": 0.0945, - "reward": 0.4693080484867096, - "reward_std": 0.22524543479084969, - "rewards/accuracy_reward": 0.10267857508733869, + "grad_norm": 108.90744018554688, + "kl": 5.86328125, + "learning_rate": 2.6042722886515745e-07, + "loss": 0.3297, + "reward": 0.5133928880095482, + "reward_std": 0.15270156040787697, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.424107164144516, "step": 1801 }, { "clip_ratio": 0.0, - "completion_length": 1765.6742248535156, + "completion_length": 1720.5648193359375, "epoch": 0.5382719737136883, - "grad_norm": 9.284955978393555, - "kl": 0.5029296875, - "learning_rate": 5.203333875742813e-08, - "loss": 0.096, - "reward": 0.4732143133878708, - "reward_std": 0.19540031626820564, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 65.82952880859375, + "kl": 5.421875, + "learning_rate": 2.601666937871407e-07, + "loss": 0.3045, + "reward": 0.5284598469734192, + "reward_std": 0.16147832199931145, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3839285895228386, + "rewards/tag_count_reward": 0.4280134066939354, "step": 1802 }, { "clip_ratio": 0.0, - "completion_length": 1769.6630249023438, + "completion_length": 1661.1898193359375, "epoch": 0.5385706818012098, - "grad_norm": 5.78469181060791, - "kl": 0.7021484375, - "learning_rate": 5.198122952975149e-08, - "loss": 0.0962, - "reward": 0.4335937723517418, - "reward_std": 0.20844383910298347, - "rewards/accuracy_reward": 0.06696428824216127, + "grad_norm": 87.2514419555664, + "kl": 5.205078125, + "learning_rate": 2.5990614764875747e-07, + "loss": 0.3195, + "reward": 0.4949776902794838, + "reward_std": 0.1483394019305706, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294738650322, + "rewards/tag_count_reward": 0.4324776977300644, "step": 1803 }, { "clip_ratio": 0.0, - "completion_length": 1758.1206359863281, + "completion_length": 1643.3840026855469, "epoch": 0.5388693898887312, - "grad_norm": 7.625720500946045, - "kl": 0.7724609375, - "learning_rate": 5.192911814669131e-08, - "loss": 0.1216, - "reward": 0.401227705180645, - "reward_std": 0.21745198220014572, - "rewards/accuracy_reward": 0.029017859371379018, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3699776977300644, + "grad_norm": 51.15807342529297, + "kl": 4.94921875, + "learning_rate": 2.5964559073345654e-07, + "loss": 0.2927, + "reward": 0.450892873108387, + "reward_std": 0.14215877279639244, + "rewards/accuracy_reward": 0.01785714295692742, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357313156128, "step": 1804 }, { "clip_ratio": 0.0, - "completion_length": 1740.2723693847656, + "completion_length": 1587.4643249511719, "epoch": 0.5391680979762528, - "grad_norm": 11.179536819458008, - "kl": 0.853515625, - "learning_rate": 5.187700466493965e-08, - "loss": 0.1212, - "reward": 0.384486623108387, - "reward_std": 0.16160518676042557, + "grad_norm": 38.97306823730469, + "kl": 2.81640625, + "learning_rate": 2.5938502332469827e-07, + "loss": 0.2283, + "reward": 0.4525669887661934, + "reward_std": 0.10540596209466457, "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3822544813156128, + "rewards/tag_count_reward": 0.4503348395228386, "step": 1805 }, { "clip_ratio": 0.0, - "completion_length": 1805.2322082519531, + "completion_length": 1710.3460388183594, "epoch": 0.5394668060637742, - "grad_norm": 3.211925983428955, - "kl": 0.6142578125, - "learning_rate": 5.1824889141190873e-08, - "loss": 0.0845, - "reward": 0.4380580484867096, - "reward_std": 0.20161837339401245, - "rewards/accuracy_reward": 0.06026785867288709, + "grad_norm": 64.27703857421875, + "kl": 2.419921875, + "learning_rate": 2.5912444570595435e-07, + "loss": 0.1645, + "reward": 0.5111607313156128, + "reward_std": 0.14220664277672768, + "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3777901977300644, + "rewards/tag_count_reward": 0.4375000149011612, "step": 1806 }, { "clip_ratio": 0.0, - "completion_length": 1721.7723999023438, - "epoch": 0.5397655141512957, - "grad_norm": 21.115386962890625, - "kl": 0.8955078125, - "learning_rate": 5.177277163214159e-08, - "loss": 0.1085, - "reward": 0.4771205559372902, - "reward_std": 0.24780837073922157, - "rewards/accuracy_reward": 0.09598215040750802, + "completion_length": 1712.930908203125, + "epoch": 0.5397655141512957, + "grad_norm": 49.46287155151367, + "kl": 3.228515625, + "learning_rate": 2.5886385816070793e-07, + "loss": 0.2212, + "reward": 0.5033482313156128, + "reward_std": 0.2209962196648121, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3811384066939354, + "rewards/tag_count_reward": 0.4118303805589676, "step": 1807 }, { "clip_ratio": 0.0, - "completion_length": 1700.16748046875, + "completion_length": 1676.7009582519531, "epoch": 0.5400642222388171, - "grad_norm": 16.138547897338867, - "kl": 0.6669921875, - "learning_rate": 5.17206521944905e-08, - "loss": 0.1176, - "reward": 0.4648437649011612, - "reward_std": 0.221311267465353, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 42.49440002441406, + "kl": 4.38671875, + "learning_rate": 2.586032609724525e-07, + "loss": 0.2492, + "reward": 0.478794664144516, + "reward_std": 0.16028334572911263, + "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3688616305589676, + "rewards/tag_count_reward": 0.4006696566939354, "step": 1808 }, { "clip_ratio": 0.0, - "completion_length": 1637.9688415527344, + "completion_length": 1540.04248046875, "epoch": 0.5403629303263385, - "grad_norm": 13.798367500305176, - "kl": 0.826171875, - "learning_rate": 5.166853088493848e-08, - "loss": 0.1426, - "reward": 0.4285714477300644, - "reward_std": 0.18568534776568413, - "rewards/accuracy_reward": 0.044642860535532236, + "grad_norm": 54.58527374267578, + "kl": 2.5537109375, + "learning_rate": 2.583426544246924e-07, + "loss": 0.2039, + "reward": 0.494419664144516, + "reward_std": 0.11774345673620701, + "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3839285895228386, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1809 }, { "clip_ratio": 0.0, - "completion_length": 1668.0915832519531, + "completion_length": 1564.2701416015625, "epoch": 0.5406616384138601, - "grad_norm": 5.310388088226318, - "kl": 0.6005859375, - "learning_rate": 5.1616407760188384e-08, - "loss": 0.0912, - "reward": 0.431361623108387, - "reward_std": 0.1808728091418743, - "rewards/accuracy_reward": 0.03125000232830644, + "grad_norm": 10.956871032714844, + "kl": 2.77734375, + "learning_rate": 2.5808203880094194e-07, + "loss": 0.1744, + "reward": 0.4877232387661934, + "reward_std": 0.13426114991307259, + "rewards/accuracy_reward": 0.035714286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4001116305589676, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1810 }, { "clip_ratio": 0.0, - "completion_length": 1765.6540832519531, + "completion_length": 1650.5112609863281, "epoch": 0.5409603465013815, - "grad_norm": 4.478755474090576, - "kl": 0.6279296875, - "learning_rate": 5.156428287694508e-08, - "loss": 0.1001, - "reward": 0.491071455180645, - "reward_std": 0.1982564851641655, - "rewards/accuracy_reward": 0.10937500791624188, + "grad_norm": 45.98472595214844, + "kl": 2.306640625, + "learning_rate": 2.578214143847254e-07, + "loss": 0.1646, + "reward": 0.5591518133878708, + "reward_std": 0.11571215279400349, + "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3816964477300644, + "rewards/tag_count_reward": 0.447544664144516, "step": 1811 }, { "clip_ratio": 0.0, - "completion_length": 1701.0603332519531, + "completion_length": 1584.0246276855469, "epoch": 0.541259054588903, - "grad_norm": 4.564516067504883, - "kl": 0.521484375, - "learning_rate": 5.1512156291915296e-08, - "loss": 0.0938, - "reward": 0.4101562723517418, - "reward_std": 0.19633417204022408, - "rewards/accuracy_reward": 0.022321430034935474, + "grad_norm": 23.514137268066406, + "kl": 2.82421875, + "learning_rate": 2.575607814595765e-07, + "loss": 0.1945, + "reward": 0.4765625223517418, + "reward_std": 0.12958474270999432, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.387834832072258, + "rewards/tag_count_reward": 0.4497768133878708, "step": 1812 }, { "clip_ratio": 0.0, - "completion_length": 1647.3482971191406, + "completion_length": 1564.1250610351562, "epoch": 0.5415577626764244, - "grad_norm": 14.698564529418945, - "kl": 0.755859375, - "learning_rate": 5.1460028061807704e-08, - "loss": 0.1458, - "reward": 0.4659598395228386, - "reward_std": 0.20159431919455528, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 12.7684965133667, + "kl": 3.080078125, + "learning_rate": 2.5730014030903853e-07, + "loss": 0.2152, + "reward": 0.5251116305589676, + "reward_std": 0.1270734705030918, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3945312649011612, + "rewards/tag_count_reward": 0.451450914144516, "step": 1813 }, { "clip_ratio": 0.0, - "completion_length": 1746.29248046875, + "completion_length": 1623.2611999511719, "epoch": 0.5418564707639459, - "grad_norm": 6.253147602081299, - "kl": 0.611328125, - "learning_rate": 5.140789824333266e-08, - "loss": 0.1115, - "reward": 0.4174107238650322, - "reward_std": 0.17439532279968262, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 6.009121894836426, + "kl": 2.923828125, + "learning_rate": 2.570394912166633e-07, + "loss": 0.1813, + "reward": 0.4916294813156128, + "reward_std": 0.12303758598864079, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000149011612, + "rewards/tag_count_reward": 0.4380580633878708, "step": 1814 }, { "clip_ratio": 0.0, - "completion_length": 1631.1585388183594, + "completion_length": 1545.8683776855469, "epoch": 0.5421551788514674, - "grad_norm": 7.386971473693848, - "kl": 0.6064453125, - "learning_rate": 5.135576689320231e-08, - "loss": 0.1179, - "reward": 0.5055803880095482, - "reward_std": 0.24121849611401558, - "rewards/accuracy_reward": 0.11607143376022577, + "grad_norm": 26.26144027709961, + "kl": 3.640625, + "learning_rate": 2.5677883446601154e-07, + "loss": 0.2689, + "reward": 0.5535714477300644, + "reward_std": 0.19417370855808258, + "rewards/accuracy_reward": 0.10937500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3895089477300644, + "rewards/tag_count_reward": 0.4441964477300644, "step": 1815 }, { "clip_ratio": 0.0, - "completion_length": 1757.9152526855469, + "completion_length": 1677.259033203125, "epoch": 0.5424538869389889, - "grad_norm": 4.201247692108154, - "kl": 0.50048828125, - "learning_rate": 5.130363406813048e-08, - "loss": 0.0736, - "reward": 0.4464285895228386, - "reward_std": 0.22180605307221413, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 71.92900848388672, + "kl": 1.599609375, + "learning_rate": 2.565181703406524e-07, + "loss": 0.1251, + "reward": 0.4871651977300644, + "reward_std": 0.14847573451697826, + "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3995535895228386, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1816 }, { "clip_ratio": 0.0, - "completion_length": 1725.3415832519531, + "completion_length": 1644.9688415527344, "epoch": 0.5427525950265103, - "grad_norm": 5.131992340087891, - "kl": 0.6083984375, - "learning_rate": 5.125149982483254e-08, - "loss": 0.0917, - "reward": 0.4665178805589676, - "reward_std": 0.18896405026316643, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 10.076800346374512, + "kl": 3.14453125, + "learning_rate": 2.562574991241627e-07, + "loss": 0.1983, + "reward": 0.5295759215950966, + "reward_std": 0.12321187369525433, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3839285895228386, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1817 }, { "clip_ratio": 0.0, - "completion_length": 1788.5692749023438, + "completion_length": 1635.9040832519531, "epoch": 0.5430513031140318, - "grad_norm": 4.610702991485596, - "kl": 0.7119140625, - "learning_rate": 5.119936422002547e-08, - "loss": 0.0976, - "reward": 0.5212053805589676, - "reward_std": 0.2066245637834072, - "rewards/accuracy_reward": 0.13839286472648382, + "grad_norm": 34.617576599121094, + "kl": 2.5078125, + "learning_rate": 2.559968211001273e-07, + "loss": 0.1768, + "reward": 0.6021205559372902, + "reward_std": 0.1867583505809307, + "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125149011612, + "rewards/tag_count_reward": 0.4414062723517418, "step": 1818 }, { "clip_ratio": 0.0, - "completion_length": 1707.80810546875, + "completion_length": 1648.9397888183594, "epoch": 0.5433500112015532, - "grad_norm": 28.93818473815918, - "kl": 0.8046875, - "learning_rate": 5.114722731042765e-08, - "loss": 0.1058, - "reward": 0.4441964402794838, - "reward_std": 0.20058098062872887, - "rewards/accuracy_reward": 0.06026785937137902, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3816964402794838, + "grad_norm": 38.862770080566406, + "kl": 2.53125, + "learning_rate": 2.5573613655213827e-07, + "loss": 0.1806, + "reward": 0.5033482313156128, + "reward_std": 0.1319346632808447, + "rewards/accuracy_reward": 0.058035716880112886, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1819 }, { "clip_ratio": 0.0, - "completion_length": 1735.8237609863281, + "completion_length": 1646.62060546875, "epoch": 0.5436487192890748, - "grad_norm": 5.740725994110107, - "kl": 0.55859375, - "learning_rate": 5.109508915275897e-08, - "loss": 0.1009, - "reward": 0.4559151977300644, - "reward_std": 0.179160475730896, - "rewards/accuracy_reward": 0.06026785867288709, + "grad_norm": 15.36838436126709, + "kl": 2.6953125, + "learning_rate": 2.5547544576379486e-07, + "loss": 0.178, + "reward": 0.5005580559372902, + "reward_std": 0.11358188092708588, + "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.4536830559372902, "step": 1820 }, { "clip_ratio": 0.0, - "completion_length": 1759.3237609863281, + "completion_length": 1661.5603637695312, "epoch": 0.5439474273765962, - "grad_norm": 5.280090808868408, - "kl": 0.55029296875, - "learning_rate": 5.104294980374063e-08, - "loss": 0.1046, - "reward": 0.5435268133878708, - "reward_std": 0.2136024534702301, - "rewards/accuracy_reward": 0.15848214854486287, + "grad_norm": 27.059324264526367, + "kl": 2.822265625, + "learning_rate": 2.5521474901870316e-07, + "loss": 0.1949, + "reward": 0.601562537252903, + "reward_std": 0.1394155267626047, + "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3850446566939354, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1821 }, { "clip_ratio": 0.0, - "completion_length": 1713.26123046875, + "completion_length": 1695.7925109863281, "epoch": 0.5442461354641177, - "grad_norm": 70.28608703613281, - "kl": 1.017578125, - "learning_rate": 5.099080932009512e-08, - "loss": 0.1154, - "reward": 0.4821428805589676, - "reward_std": 0.16099776700139046, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 27.44449806213379, + "kl": 2.8359375, + "learning_rate": 2.549540466004756e-07, + "loss": 0.1814, + "reward": 0.5318080708384514, + "reward_std": 0.1326269581913948, + "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3995535895228386, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1822 }, { "clip_ratio": 0.0, - "completion_length": 1707.4375610351562, + "completion_length": 1545.0223999023438, "epoch": 0.5445448435516391, - "grad_norm": 11.907197952270508, - "kl": 0.7939453125, - "learning_rate": 5.0938667758546175e-08, - "loss": 0.1055, - "reward": 0.4447544813156128, - "reward_std": 0.1897578276693821, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 17.120256423950195, + "kl": 2.6953125, + "learning_rate": 2.546933387927309e-07, + "loss": 0.1817, + "reward": 0.5050223469734192, + "reward_std": 0.12700673565268517, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3822544813156128, + "rewards/tag_count_reward": 0.4425223544239998, "step": 1823 }, { "clip_ratio": 0.0, - "completion_length": 1697.33935546875, + "completion_length": 1569.5960388183594, "epoch": 0.5448435516391607, - "grad_norm": 5.706474781036377, - "kl": 0.666015625, - "learning_rate": 5.0886525175818717e-08, - "loss": 0.1123, - "reward": 0.4687500149011612, - "reward_std": 0.20179509744048119, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 11.614546775817871, + "kl": 3.50390625, + "learning_rate": 2.544326258790936e-07, + "loss": 0.2327, + "reward": 0.5323660895228386, + "reward_std": 0.14016331732273102, + "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3772321566939354, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1824 }, { "clip_ratio": 0.0, - "completion_length": 1725.555908203125, + "completion_length": 1629.2054138183594, "epoch": 0.5451422597266821, - "grad_norm": 31.42140769958496, - "kl": 0.92578125, - "learning_rate": 5.0834381628638746e-08, - "loss": 0.1115, - "reward": 0.5362723469734192, - "reward_std": 0.19312890991568565, - "rewards/accuracy_reward": 0.1517857147846371, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3844866305589676, + "grad_norm": 24.148929595947266, + "kl": 3.4921875, + "learning_rate": 2.541719081431937e-07, + "loss": 0.203, + "reward": 0.6060268133878708, + "reward_std": 0.1510606463998556, + "rewards/accuracy_reward": 0.1562500069849193, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1825 }, { "clip_ratio": 0.0, - "completion_length": 1759.2277526855469, + "completion_length": 1592.19873046875, "epoch": 0.5454409678142036, - "grad_norm": 17.639989852905273, - "kl": 0.8125, - "learning_rate": 5.078223717373333e-08, - "loss": 0.1, - "reward": 0.400669664144516, - "reward_std": 0.2163918875157833, - "rewards/accuracy_reward": 0.033482144586741924, + "grad_norm": 61.79855728149414, + "kl": 4.50390625, + "learning_rate": 2.539111858686667e-07, + "loss": 0.2826, + "reward": 0.4977678805589676, + "reward_std": 0.1696145497262478, + "rewards/accuracy_reward": 0.06696429057046771, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3671875149011612, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1826 }, { "clip_ratio": 0.0, - "completion_length": 1725.9933776855469, + "completion_length": 1584.0447082519531, "epoch": 0.545739675901725, - "grad_norm": 6.461996555328369, - "kl": 0.7578125, - "learning_rate": 5.073009186783056e-08, - "loss": 0.121, - "reward": 0.470424123108387, - "reward_std": 0.1946926712989807, - "rewards/accuracy_reward": 0.09375000349245965, + "grad_norm": 21.755510330200195, + "kl": 4.0, + "learning_rate": 2.536504593391528e-07, + "loss": 0.2586, + "reward": 0.541852705180645, + "reward_std": 0.13907089829444885, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376674123108387, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1827 }, { "clip_ratio": 0.0, - "completion_length": 1733.7902526855469, + "completion_length": 1642.5245971679688, "epoch": 0.5460383839892465, - "grad_norm": 4.954241752624512, - "kl": 0.7197265625, - "learning_rate": 5.0677945767659394e-08, - "loss": 0.1049, - "reward": 0.5357143059372902, - "reward_std": 0.21174169331789017, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 15.096441268920898, + "kl": 2.96484375, + "learning_rate": 2.5338972883829695e-07, + "loss": 0.2013, + "reward": 0.5959821492433548, + "reward_std": 0.1694699563086033, + "rewards/accuracy_reward": 0.14955357927829027, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3772321566939354, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1828 }, { "clip_ratio": 0.0, - "completion_length": 1707.5781860351562, + "completion_length": 1636.8058471679688, "epoch": 0.546337092076768, - "grad_norm": 7.382134914398193, - "kl": 0.615234375, - "learning_rate": 5.062579892994966e-08, - "loss": 0.1065, - "reward": 0.4190848395228386, - "reward_std": 0.21632909402251244, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 53.58095169067383, + "kl": 3.02734375, + "learning_rate": 2.531289946497483e-07, + "loss": 0.2164, + "reward": 0.4642857313156128, + "reward_std": 0.14818907342851162, + "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3766741305589676, + "rewards/tag_count_reward": 0.4330357387661934, "step": 1829 }, { "clip_ratio": 0.0, - "completion_length": 1762.0692749023438, + "completion_length": 1627.1942749023438, "epoch": 0.5466358001642895, - "grad_norm": 4.876481056213379, - "kl": 0.6689453125, - "learning_rate": 5.057365141143204e-08, - "loss": 0.109, - "reward": 0.514508955180645, - "reward_std": 0.212405264377594, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 16.86935043334961, + "kl": 3.25390625, + "learning_rate": 2.528682570571602e-07, + "loss": 0.2179, + "reward": 0.5954241380095482, + "reward_std": 0.15624183230102062, + "rewards/accuracy_reward": 0.15178571734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "rewards/tag_count_reward": 0.443638414144516, "step": 1830 }, { "clip_ratio": 0.0, - "completion_length": 1731.1897888183594, + "completion_length": 1615.5670471191406, "epoch": 0.5469345082518109, - "grad_norm": 6.85830545425415, - "kl": 0.65576171875, - "learning_rate": 5.0521503268837906e-08, - "loss": 0.109, - "reward": 0.4743303880095482, - "reward_std": 0.20959612727165222, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 6.735917091369629, + "kl": 3.109375, + "learning_rate": 2.5260751634418953e-07, + "loss": 0.2046, + "reward": 0.5625000223517418, + "reward_std": 0.15686860121786594, + "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3761160895228386, + "rewards/tag_count_reward": 0.444196455180645, "step": 1831 }, { "clip_ratio": 0.0, - "completion_length": 1763.2344360351562, + "completion_length": 1664.9241638183594, "epoch": 0.5472332163393324, - "grad_norm": 38.49789810180664, - "kl": 0.875, - "learning_rate": 5.0469354558899325e-08, - "loss": 0.0967, - "reward": 0.4626116305589676, - "reward_std": 0.22047000750899315, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 10.945684432983398, + "kl": 3.36328125, + "learning_rate": 2.523467727944966e-07, + "loss": 0.2219, + "reward": 0.5474330484867096, + "reward_std": 0.1896630898118019, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.373325914144516, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1832 }, { "clip_ratio": 0.0, - "completion_length": 1626.4465026855469, + "completion_length": 1601.0157165527344, "epoch": 0.5475319244268538, - "grad_norm": 9.97778606414795, - "kl": 0.7841796875, - "learning_rate": 5.0417205338348975e-08, - "loss": 0.0975, - "reward": 0.4503348395228386, - "reward_std": 0.18189863115549088, - "rewards/accuracy_reward": 0.06250000186264515, + "grad_norm": 89.59307098388672, + "kl": 5.30078125, + "learning_rate": 2.520860266917449e-07, + "loss": 0.2999, + "reward": 0.498325914144516, + "reward_std": 0.14390374720096588, + "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3878348469734192, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1833 }, { "clip_ratio": 0.0, - "completion_length": 1722.8840026855469, + "completion_length": 1593.9844360351562, "epoch": 0.5478306325143754, - "grad_norm": 10.538787841796875, - "kl": 0.630859375, - "learning_rate": 5.0365055663920116e-08, - "loss": 0.1182, - "reward": 0.4570312649011612, - "reward_std": 0.21223386004567146, - "rewards/accuracy_reward": 0.07142857671715319, + "grad_norm": 16.7200870513916, + "kl": 3.7734375, + "learning_rate": 2.518252783196006e-07, + "loss": 0.2471, + "reward": 0.5340401977300644, + "reward_std": 0.14667906612157822, + "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.385602705180645, + "rewards/tag_count_reward": 0.455915205180645, "step": 1834 }, { "clip_ratio": 0.0, - "completion_length": 1702.8683776855469, + "completion_length": 1592.2857666015625, "epoch": 0.5481293406018968, - "grad_norm": 60.46471405029297, - "kl": 1.0390625, - "learning_rate": 5.0312905592346487e-08, - "loss": 0.1343, - "reward": 0.4570312649011612, - "reward_std": 0.22876984253525734, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 8.380125999450684, + "kl": 3.8984375, + "learning_rate": 2.5156452796173245e-07, + "loss": 0.2639, + "reward": 0.5290178805589676, + "reward_std": 0.1842203363776207, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3922991305589676, + "rewards/tag_count_reward": 0.4464285969734192, "step": 1835 }, { "clip_ratio": 0.0, - "completion_length": 1679.3438720703125, + "completion_length": 1547.9688110351562, "epoch": 0.5484280486894183, - "grad_norm": 7.4055328369140625, - "kl": 0.7451171875, - "learning_rate": 5.0260755180362227e-08, - "loss": 0.1053, - "reward": 0.524553582072258, - "reward_std": 0.1896517351269722, - "rewards/accuracy_reward": 0.1450892947614193, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3794642984867096, + "grad_norm": 7.6431474685668945, + "kl": 2.98828125, + "learning_rate": 2.513037759018111e-07, + "loss": 0.1865, + "reward": 0.6104910969734192, + "reward_std": 0.14610681496560574, + "rewards/accuracy_reward": 0.15848214668221772, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.452008955180645, "step": 1836 }, { "clip_ratio": 0.0, - "completion_length": 1685.7366638183594, + "completion_length": 1620.4933776855469, "epoch": 0.5487267567769397, - "grad_norm": 5.033426284790039, - "kl": 0.712890625, - "learning_rate": 5.020860448470189e-08, - "loss": 0.111, - "reward": 0.494977705180645, - "reward_std": 0.21193302050232887, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 26.614700317382812, + "kl": 3.75, + "learning_rate": 2.5104302242350944e-07, + "loss": 0.2221, + "reward": 0.5563616305589676, + "reward_std": 0.14477082900702953, + "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1837 }, { "clip_ratio": 0.0, - "completion_length": 1745.6830749511719, + "completion_length": 1706.9486999511719, "epoch": 0.5490254648644612, - "grad_norm": 6.44417667388916, - "kl": 0.69140625, - "learning_rate": 5.015645356210032e-08, - "loss": 0.1032, - "reward": 0.4135044887661934, - "reward_std": 0.20188569277524948, - "rewards/accuracy_reward": 0.029017857741564512, + "grad_norm": 10.748540878295898, + "kl": 2.8984375, + "learning_rate": 2.507822678105016e-07, + "loss": 0.1694, + "reward": 0.473772332072258, + "reward_std": 0.11917665787041187, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3844866305589676, + "rewards/tag_count_reward": 0.4559151977300644, "step": 1838 }, { "clip_ratio": 0.0, - "completion_length": 1764.1250915527344, + "completion_length": 1714.2165832519531, "epoch": 0.5493241729519827, - "grad_norm": 6.5874128341674805, - "kl": 0.4951171875, - "learning_rate": 5.0104302469292616e-08, - "loss": 0.0901, - "reward": 0.494419664144516, - "reward_std": 0.23182430118322372, - "rewards/accuracy_reward": 0.10267857694998384, + "grad_norm": 26.863054275512695, + "kl": 1.998046875, + "learning_rate": 2.505215123464631e-07, + "loss": 0.1192, + "reward": 0.5412946715950966, + "reward_std": 0.13468840345740318, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3917410895228386, + "rewards/tag_count_reward": 0.4564732387661934, "step": 1839 }, { "clip_ratio": 0.0, - "completion_length": 1713.3929138183594, + "completion_length": 1635.7701721191406, "epoch": 0.5496228810395042, - "grad_norm": 6.513303756713867, - "kl": 0.6015625, - "learning_rate": 5.005215126301403e-08, - "loss": 0.1126, - "reward": 0.506138414144516, - "reward_std": 0.21327876672148705, - "rewards/accuracy_reward": 0.11383929289877415, + "grad_norm": 26.842126846313477, + "kl": 2.609375, + "learning_rate": 2.5026075631507017e-07, + "loss": 0.1794, + "reward": 0.568638414144516, + "reward_std": 0.1791628785431385, + "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392299123108387, + "rewards/tag_count_reward": 0.448102705180645, "step": 1840 }, { "clip_ratio": 0.0, - "completion_length": 1769.9532165527344, + "completion_length": 1669.4397888183594, "epoch": 0.5499215891270256, - "grad_norm": 9.911648750305176, - "kl": 0.9130859375, - "learning_rate": 5e-08, - "loss": 0.1244, - "reward": 0.4570312723517418, - "reward_std": 0.20429929345846176, - "rewards/accuracy_reward": 0.09151785937137902, + "grad_norm": 42.67668914794922, + "kl": 3.05859375, + "learning_rate": 2.5e-07, + "loss": 0.2283, + "reward": 0.5357143133878708, + "reward_std": 0.13251677341759205, + "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3655134066939354, + "rewards/tag_count_reward": 0.4375000149011612, "step": 1841 }, { "clip_ratio": 0.0, - "completion_length": 1641.3081359863281, + "completion_length": 1524.57373046875, "epoch": 0.5502202972145471, - "grad_norm": 87.81301879882812, - "kl": 0.962890625, - "learning_rate": 4.9947848736985966e-08, - "loss": 0.1216, - "reward": 0.5100446715950966, - "reward_std": 0.21619299426674843, - "rewards/accuracy_reward": 0.1049107201397419, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.4029018059372902, + "grad_norm": 58.52946090698242, + "kl": 2.26171875, + "learning_rate": 2.497392436849298e-07, + "loss": 0.1894, + "reward": 0.5608259066939354, + "reward_std": 0.16248693317174911, + "rewards/accuracy_reward": 0.10267857578583062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473395228386, "step": 1842 }, { "clip_ratio": 0.0, - "completion_length": 1697.7701721191406, + "completion_length": 1541.7589721679688, "epoch": 0.5505190053020685, - "grad_norm": 27.758787155151367, - "kl": 1.1181640625, - "learning_rate": 4.989569753070738e-08, - "loss": 0.1571, - "reward": 0.435825914144516, - "reward_std": 0.20184194669127464, - "rewards/accuracy_reward": 0.08482143236324191, + "grad_norm": 64.95293426513672, + "kl": 2.392578125, + "learning_rate": 2.494784876535369e-07, + "loss": 0.1963, + "reward": 0.5535714402794838, + "reward_std": 0.15209194645285606, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044813156128, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1843 }, { "clip_ratio": 0.0, - "completion_length": 1712.3929443359375, + "completion_length": 1655.0268249511719, "epoch": 0.5508177133895901, - "grad_norm": 6.678674697875977, - "kl": 0.68017578125, - "learning_rate": 4.9843546437899676e-08, - "loss": 0.1027, - "reward": 0.4520089402794838, - "reward_std": 0.19997362419962883, - "rewards/accuracy_reward": 0.06919643329456449, + "grad_norm": 40.915008544921875, + "kl": 2.65625, + "learning_rate": 2.4921773218949835e-07, + "loss": 0.1834, + "reward": 0.513950914144516, + "reward_std": 0.1428381036967039, + "rewards/accuracy_reward": 0.07366071664728224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125149011612, + "rewards/tag_count_reward": 0.440290205180645, "step": 1844 }, { "clip_ratio": 0.0, - "completion_length": 1683.5313415527344, + "completion_length": 1582.3259582519531, "epoch": 0.5511164214771115, - "grad_norm": 5.007708549499512, - "kl": 0.66748046875, - "learning_rate": 4.9791395515298114e-08, - "loss": 0.1069, - "reward": 0.459821455180645, - "reward_std": 0.20155712589621544, - "rewards/accuracy_reward": 0.06919643003493547, + "grad_norm": 25.71390151977539, + "kl": 2.1171875, + "learning_rate": 2.4895697757649054e-07, + "loss": 0.1485, + "reward": 0.5239955559372902, + "reward_std": 0.14202952571213245, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3906250149011612, + "rewards/tag_count_reward": 0.4547991305589676, "step": 1845 }, { "clip_ratio": 0.0, - "completion_length": 1772.4241638183594, + "completion_length": 1704.4844665527344, "epoch": 0.551415129564633, - "grad_norm": 4.851314544677734, - "kl": 0.611328125, - "learning_rate": 4.9739244819637775e-08, - "loss": 0.0996, - "reward": 0.451450914144516, - "reward_std": 0.19233689829707146, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 26.884929656982422, + "kl": 3.26953125, + "learning_rate": 2.4869622409818886e-07, + "loss": 0.189, + "reward": 0.5301339477300644, + "reward_std": 0.13444687239825726, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3688616156578064, + "rewards/tag_count_reward": 0.443080373108387, "step": 1846 }, { "clip_ratio": 0.0, - "completion_length": 1758.1585693359375, + "completion_length": 1716.9174499511719, "epoch": 0.5517138376521544, - "grad_norm": 4.743221282958984, - "kl": 0.6181640625, - "learning_rate": 4.9687094407653516e-08, - "loss": 0.097, - "reward": 0.5139509290456772, - "reward_std": 0.18188300356268883, - "rewards/accuracy_reward": 0.11607143515720963, + "grad_norm": 28.488710403442383, + "kl": 2.6875, + "learning_rate": 2.484354720382676e-07, + "loss": 0.1728, + "reward": 0.5747767984867096, + "reward_std": 0.14004367589950562, + "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3978794813156128, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1847 }, { "clip_ratio": 0.0, - "completion_length": 1717.4554443359375, + "completion_length": 1605.57373046875, "epoch": 0.552012545739676, - "grad_norm": 4.645486354827881, - "kl": 0.63134765625, - "learning_rate": 4.963494433607988e-08, - "loss": 0.1016, - "reward": 0.455357164144516, - "reward_std": 0.21575545147061348, - "rewards/accuracy_reward": 0.0602678619325161, + "grad_norm": 37.16767501831055, + "kl": 3.12109375, + "learning_rate": 2.481747216803994e-07, + "loss": 0.1583, + "reward": 0.5195312723517418, + "reward_std": 0.15428761951625347, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3950893059372902, + "rewards/tag_count_reward": 0.4481026977300644, "step": 1848 }, { "clip_ratio": 0.0, - "completion_length": 1756.9933776855469, + "completion_length": 1595.8996276855469, "epoch": 0.5523112538271974, - "grad_norm": 16.066486358642578, - "kl": 0.98828125, - "learning_rate": 4.958279466165102e-08, - "loss": 0.1361, - "reward": 0.4414062723517418, - "reward_std": 0.22569428011775017, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 25.365875244140625, + "kl": 4.1328125, + "learning_rate": 2.479139733082551e-07, + "loss": 0.2891, + "reward": 0.541294664144516, + "reward_std": 0.1736100148409605, + "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1849 }, { "clip_ratio": 0.0, - "completion_length": 1725.6719360351562, + "completion_length": 1669.8304443359375, "epoch": 0.5526099619147189, - "grad_norm": 6.107819080352783, - "kl": 0.70458984375, - "learning_rate": 4.953064544110068e-08, - "loss": 0.0978, - "reward": 0.4760044887661934, - "reward_std": 0.20945369824767113, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 87.41926574707031, + "kl": 5.58203125, + "learning_rate": 2.476532272055034e-07, + "loss": 0.3359, + "reward": 0.5161830708384514, + "reward_std": 0.17138975113630295, + "rewards/accuracy_reward": 0.08482143306173384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830484867096, + "rewards/tag_count_reward": 0.431361623108387, "step": 1850 }, { "clip_ratio": 0.0, - "completion_length": 1738.5982971191406, + "completion_length": 1650.7433776855469, "epoch": 0.5529086700022403, - "grad_norm": 10.283284187316895, - "kl": 0.859375, - "learning_rate": 4.94784967311621e-08, - "loss": 0.1102, - "reward": 0.450334832072258, - "reward_std": 0.19237303733825684, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 16.201539993286133, + "kl": 3.92578125, + "learning_rate": 2.473924836558105e-07, + "loss": 0.2465, + "reward": 0.5440848395228386, + "reward_std": 0.12630446441471577, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3565848395228386, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1851 }, { "clip_ratio": 0.0, - "completion_length": 1744.9777526855469, + "completion_length": 1714.3170776367188, "epoch": 0.5532073780897617, - "grad_norm": 14.462075233459473, - "kl": 0.8583984375, - "learning_rate": 4.942634858856797e-08, - "loss": 0.1136, - "reward": 0.4776785969734192, - "reward_std": 0.2143266312777996, - "rewards/accuracy_reward": 0.10267857369035482, + "grad_norm": 56.88924789428711, + "kl": 5.2265625, + "learning_rate": 2.471317429428398e-07, + "loss": 0.3168, + "reward": 0.5239955559372902, + "reward_std": 0.137503856793046, + "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000223517418, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1852 }, { "clip_ratio": 0.0, - "completion_length": 1743.1541137695312, + "completion_length": 1695.5313110351562, "epoch": 0.5535060861772833, - "grad_norm": 33.149009704589844, - "kl": 0.845703125, - "learning_rate": 4.937420107005034e-08, - "loss": 0.1085, - "reward": 0.4570312574505806, - "reward_std": 0.18596115335822105, - "rewards/accuracy_reward": 0.08928571734577417, + "grad_norm": 61.337310791015625, + "kl": 5.03125, + "learning_rate": 2.468710053502517e-07, + "loss": 0.2693, + "reward": 0.5234375223517418, + "reward_std": 0.14771644212305546, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455559372902, + "rewards/tag_count_reward": 0.4207589477300644, "step": 1853 }, { "clip_ratio": 0.0, - "completion_length": 1693.6674499511719, + "completion_length": 1600.4911499023438, "epoch": 0.5538047942648047, - "grad_norm": 18.77207374572754, - "kl": 0.70458984375, - "learning_rate": 4.932205423234061e-08, - "loss": 0.1044, - "reward": 0.568080373108387, - "reward_std": 0.246793944388628, - "rewards/accuracy_reward": 0.1897321492433548, + "grad_norm": 40.9768180847168, + "kl": 4.53515625, + "learning_rate": 2.46610271161703e-07, + "loss": 0.2821, + "reward": 0.6021205633878708, + "reward_std": 0.1776778493076563, + "rewards/accuracy_reward": 0.16517857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482313156128, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1854 }, { "clip_ratio": 0.0, - "completion_length": 1669.9889221191406, + "completion_length": 1585.5960388183594, "epoch": 0.5541035023523262, - "grad_norm": 6.0513916015625, - "kl": 0.6572265625, - "learning_rate": 4.926990813216944e-08, - "loss": 0.1144, - "reward": 0.5072544813156128, - "reward_std": 0.1938609704375267, - "rewards/accuracy_reward": 0.10491071827709675, + "grad_norm": 5.329873561859131, + "kl": 3.0234375, + "learning_rate": 2.463495406608472e-07, + "loss": 0.2083, + "reward": 0.594308078289032, + "reward_std": 0.16510101221501827, + "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4023437723517418, + "rewards/tag_count_reward": 0.4536830559372902, "step": 1855 }, { "clip_ratio": 0.0, - "completion_length": 1774.7522888183594, + "completion_length": 1721.15185546875, "epoch": 0.5544022104398476, - "grad_norm": 12.412491798400879, - "kl": 0.8896484375, - "learning_rate": 4.9217762826266665e-08, - "loss": 0.1218, - "reward": 0.3744419813156128, - "reward_std": 0.18628092482686043, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 13.018850326538086, + "kl": 4.27734375, + "learning_rate": 2.460888141313333e-07, + "loss": 0.2536, + "reward": 0.4369419813156128, + "reward_std": 0.14147493056952953, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812649011612, + "rewards/tag_count_reward": 0.4168526977300644, "step": 1856 }, { "clip_ratio": 0.0, - "completion_length": 1715.3103332519531, + "completion_length": 1619.6541137695312, "epoch": 0.5547009185273691, - "grad_norm": 8.971603393554688, - "kl": 0.51416015625, - "learning_rate": 4.9165618371361256e-08, - "loss": 0.0758, - "reward": 0.4581473395228386, - "reward_std": 0.2259516716003418, - "rewards/accuracy_reward": 0.06250000349245965, + "grad_norm": 22.766019821166992, + "kl": 2.8671875, + "learning_rate": 2.4582809185680626e-07, + "loss": 0.2017, + "reward": 0.4905134215950966, + "reward_std": 0.176256962120533, + "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3956473395228386, + "rewards/tag_count_reward": 0.443638414144516, "step": 1857 }, { "clip_ratio": 0.0, - "completion_length": 1697.4085693359375, + "completion_length": 1595.5157165527344, "epoch": 0.5549996266148906, - "grad_norm": 76.55606079101562, - "kl": 1.00390625, - "learning_rate": 4.911347482418129e-08, - "loss": 0.1472, - "reward": 0.5362723469734192, - "reward_std": 0.2125493623316288, - "rewards/accuracy_reward": 0.15178572107106447, + "grad_norm": 46.013431549072266, + "kl": 2.8125, + "learning_rate": 2.4556737412090643e-07, + "loss": 0.2089, + "reward": 0.6183035969734192, + "reward_std": 0.15249226242303848, + "rewards/accuracy_reward": 0.17187500605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.384486623108387, + "rewards/tag_count_reward": 0.4464285969734192, "step": 1858 }, { "clip_ratio": 0.0, - "completion_length": 1727.44873046875, + "completion_length": 1652.9420166015625, "epoch": 0.5552983347024121, - "grad_norm": 9.430054664611816, - "kl": 0.564453125, - "learning_rate": 4.906133224145383e-08, - "loss": 0.1011, - "reward": 0.474888414144516, - "reward_std": 0.2501452639698982, - "rewards/accuracy_reward": 0.10044642956927419, + "grad_norm": 94.20999908447266, + "kl": 5.4140625, + "learning_rate": 2.4530666120726915e-07, + "loss": 0.3182, + "reward": 0.5000000298023224, + "reward_std": 0.18773213773965836, + "rewards/accuracy_reward": 0.07812500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419813156128, + "rewards/tag_count_reward": 0.4218750149011612, "step": 1859 }, { "clip_ratio": 0.0, - "completion_length": 1638.9241638183594, + "completion_length": 1569.071533203125, "epoch": 0.5555970427899335, - "grad_norm": 6.181450366973877, - "kl": 0.63525390625, - "learning_rate": 4.900919067990489e-08, - "loss": 0.0986, - "reward": 0.4676339477300644, - "reward_std": 0.20346413925290108, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 9.277281761169434, + "kl": 2.81640625, + "learning_rate": 2.4504595339952445e-07, + "loss": 0.1925, + "reward": 0.5396205559372902, + "reward_std": 0.13220451585948467, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3939732313156128, + "rewards/tag_count_reward": 0.4547991305589676, "step": 1860 }, { "clip_ratio": 0.0, - "completion_length": 1684.8460693359375, + "completion_length": 1631.7701416015625, "epoch": 0.555895750877455, - "grad_norm": 7.165920257568359, - "kl": 0.51123046875, - "learning_rate": 4.895705019625936e-08, - "loss": 0.0947, - "reward": 0.4972098395228386, - "reward_std": 0.20923374220728874, - "rewards/accuracy_reward": 0.10937500605359674, + "grad_norm": 5.5020976066589355, + "kl": 2.76171875, + "learning_rate": 2.447852509812968e-07, + "loss": 0.1783, + "reward": 0.5809152126312256, + "reward_std": 0.1795587930828333, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3878348395228386, + "rewards/tag_count_reward": 0.4447544813156128, "step": 1861 }, { "clip_ratio": 0.0, - "completion_length": 1709.6072387695312, + "completion_length": 1575.9710388183594, "epoch": 0.5561944589649764, - "grad_norm": 10.364458084106445, - "kl": 0.8310546875, - "learning_rate": 4.8904910847241024e-08, - "loss": 0.1221, - "reward": 0.4782366305589676, - "reward_std": 0.20052922144532204, - "rewards/accuracy_reward": 0.10714285867288709, + "grad_norm": 36.74385452270508, + "kl": 2.74609375, + "learning_rate": 2.445245542362051e-07, + "loss": 0.1985, + "reward": 0.575892873108387, + "reward_std": 0.1431265715509653, + "rewards/accuracy_reward": 0.1272321513388306, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937649011612, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1862 }, { "clip_ratio": 0.0, - "completion_length": 1650.74560546875, + "completion_length": 1548.7478332519531, "epoch": 0.556493167052498, - "grad_norm": 10.781634330749512, - "kl": 0.73046875, - "learning_rate": 4.885277268957235e-08, - "loss": 0.13, - "reward": 0.3895089477300644, - "reward_std": 0.18696628138422966, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 54.87440872192383, + "kl": 2.44921875, + "learning_rate": 2.442638634478617e-07, + "loss": 0.1981, + "reward": 0.4603794887661934, + "reward_std": 0.11083609610795975, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.380580373108387, + "rewards/tag_count_reward": 0.451450914144516, "step": 1863 }, { "clip_ratio": 0.0, - "completion_length": 1815.1764221191406, + "completion_length": 1744.1027221679688, "epoch": 0.5567918751400194, - "grad_norm": 13.789371490478516, - "kl": 0.9130859375, - "learning_rate": 4.880063577997453e-08, - "loss": 0.1184, - "reward": 0.4341517984867096, - "reward_std": 0.21957556530833244, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 15.86228084564209, + "kl": 3.84375, + "learning_rate": 2.4400317889987266e-07, + "loss": 0.2302, + "reward": 0.4988839477300644, + "reward_std": 0.14138775318861008, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3448660895228386, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1864 }, { "clip_ratio": 0.0, - "completion_length": 1793.8148193359375, + "completion_length": 1692.10498046875, "epoch": 0.5570905832275409, - "grad_norm": 6.8747124671936035, - "kl": 0.6572265625, - "learning_rate": 4.8748500175167454e-08, - "loss": 0.0962, - "reward": 0.4888393208384514, - "reward_std": 0.17413954064249992, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 38.201080322265625, + "kl": 2.3125, + "learning_rate": 2.4374250087583726e-07, + "loss": 0.1552, + "reward": 0.5530134066939354, + "reward_std": 0.09702224098145962, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.377232164144516, + "rewards/tag_count_reward": 0.443638414144516, "step": 1865 }, { "clip_ratio": 0.0, - "completion_length": 1713.5848693847656, + "completion_length": 1616.5558776855469, "epoch": 0.5573892913150623, - "grad_norm": 4.055199146270752, - "kl": 0.625, - "learning_rate": 4.869636593186952e-08, - "loss": 0.115, - "reward": 0.4414062723517418, - "reward_std": 0.21355580538511276, - "rewards/accuracy_reward": 0.07589286216534674, + "grad_norm": 45.745460510253906, + "kl": 2.77734375, + "learning_rate": 2.434818296593476e-07, + "loss": 0.2003, + "reward": 0.524553582072258, + "reward_std": 0.1677049621939659, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.365513414144516, + "rewards/tag_count_reward": 0.435267873108387, "step": 1866 }, { "clip_ratio": 0.0, - "completion_length": 1672.4063110351562, + "completion_length": 1596.9509582519531, "epoch": 0.5576879994025838, - "grad_norm": 4.739964485168457, - "kl": 0.65625, - "learning_rate": 4.8644233106797686e-08, - "loss": 0.1041, - "reward": 0.5474330559372902, - "reward_std": 0.20445086807012558, - "rewards/accuracy_reward": 0.15625000558793545, + "grad_norm": 7.408532619476318, + "kl": 2.8515625, + "learning_rate": 2.4322116553398844e-07, + "loss": 0.1775, + "reward": 0.5965401828289032, + "reward_std": 0.12140622176229954, + "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830559372902, + "rewards/tag_count_reward": 0.4581473395228386, "step": 1867 }, { "clip_ratio": 0.0, - "completion_length": 1716.8750915527344, + "completion_length": 1650.04248046875, "epoch": 0.5579867074901053, - "grad_norm": 17.557382583618164, - "kl": 0.8662109375, - "learning_rate": 4.8592101756667335e-08, - "loss": 0.1315, - "reward": 0.4135044887661934, - "reward_std": 0.18122153356671333, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 30.674081802368164, + "kl": 3.0625, + "learning_rate": 2.429605087833367e-07, + "loss": 0.2089, + "reward": 0.4838169813156128, + "reward_std": 0.11079679429531097, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937649011612, + "rewards/tag_count_reward": 0.443638414144516, "step": 1868 }, { "clip_ratio": 0.0, - "completion_length": 1760.6786499023438, + "completion_length": 1647.3616943359375, "epoch": 0.5582854155776268, - "grad_norm": 5.950899600982666, - "kl": 0.7265625, - "learning_rate": 4.8539971938192305e-08, - "loss": 0.1173, - "reward": 0.456473246216774, - "reward_std": 0.18892794847488403, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 53.42111587524414, + "kl": 2.8125, + "learning_rate": 2.426998596909615e-07, + "loss": 0.2099, + "reward": 0.5401785895228386, + "reward_std": 0.1639634370803833, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.373883955180645, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1869 }, { "clip_ratio": 0.0, - "completion_length": 1722.65185546875, + "completion_length": 1653.4911499023438, "epoch": 0.5585841236651482, - "grad_norm": 6.106803894042969, - "kl": 0.6552734375, - "learning_rate": 4.84878437080847e-08, - "loss": 0.1007, - "reward": 0.4687500223517418, - "reward_std": 0.20283469185233116, - "rewards/accuracy_reward": 0.08035714854486287, + "grad_norm": 20.33791732788086, + "kl": 3.68359375, + "learning_rate": 2.424392185404235e-07, + "loss": 0.2237, + "reward": 0.511160746216774, + "reward_std": 0.16897070035338402, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.388392873108387, + "rewards/tag_count_reward": 0.4352678805589676, "step": 1870 }, { "clip_ratio": 0.0, - "completion_length": 1741.3906860351562, + "completion_length": 1695.9866943359375, "epoch": 0.5588828317526697, - "grad_norm": 5.037458419799805, - "kl": 0.76171875, - "learning_rate": 4.8435717123054925e-08, - "loss": 0.1167, - "reward": 0.405133955180645, - "reward_std": 0.2296564318239689, - "rewards/accuracy_reward": 0.04241071594879031, + "grad_norm": 30.807552337646484, + "kl": 3.861328125, + "learning_rate": 2.4217858561527464e-07, + "loss": 0.2145, + "reward": 0.4877232387661934, + "reward_std": 0.1683344915509224, + "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232238650322, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1871 }, { "clip_ratio": 0.0, - "completion_length": 1695.16748046875, + "completion_length": 1583.6897888183594, "epoch": 0.5591815398401911, - "grad_norm": 805.3338012695312, - "kl": 3.2373046875, - "learning_rate": 4.838359223981162e-08, - "loss": 0.2439, - "reward": 0.4352678805589676, - "reward_std": 0.1744992583990097, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 10.859689712524414, + "kl": 3.171875, + "learning_rate": 2.419179611990581e-07, + "loss": 0.2192, + "reward": 0.5167410895228386, + "reward_std": 0.12609050795435905, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3794643059372902, + "rewards/tag_count_reward": 0.447544664144516, "step": 1872 }, { "clip_ratio": 0.0, - "completion_length": 1677.3326721191406, + "completion_length": 1580.2701721191406, "epoch": 0.5594802479277127, - "grad_norm": 4.389386177062988, - "kl": 0.6748046875, - "learning_rate": 4.833146911506152e-08, - "loss": 0.0927, - "reward": 0.4765625074505806, - "reward_std": 0.20730862766504288, - "rewards/accuracy_reward": 0.1026785783469677, + "grad_norm": 31.51136589050293, + "kl": 3.72265625, + "learning_rate": 2.416573455753076e-07, + "loss": 0.2579, + "reward": 0.5703125223517418, + "reward_std": 0.15775125101208687, + "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839402794838, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1873 }, { "clip_ratio": 0.0, - "completion_length": 1683.9465026855469, + "completion_length": 1611.0447082519531, "epoch": 0.5597789560152341, - "grad_norm": 6.868362903594971, - "kl": 0.6953125, - "learning_rate": 4.82793478055095e-08, - "loss": 0.0946, - "reward": 0.5290178805589676, - "reward_std": 0.20834114402532578, - "rewards/accuracy_reward": 0.14285714738070965, + "grad_norm": 46.38452911376953, + "kl": 3.76953125, + "learning_rate": 2.4139673902754753e-07, + "loss": 0.2359, + "reward": 0.5797991454601288, + "reward_std": 0.11784730665385723, + "rewards/accuracy_reward": 0.12500000419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3861607313156128, + "rewards/tag_count_reward": 0.454799123108387, "step": 1874 }, { "clip_ratio": 0.0, - "completion_length": 1717.3371276855469, + "completion_length": 1594.43310546875, "epoch": 0.5600776641027556, - "grad_norm": 57.612648010253906, - "kl": 0.97412109375, - "learning_rate": 4.8227228367858414e-08, - "loss": 0.1292, - "reward": 0.4503348469734192, - "reward_std": 0.2068021595478058, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 10.534722328186035, + "kl": 2.8671875, + "learning_rate": 2.4113614183929205e-07, + "loss": 0.2045, + "reward": 0.5446428880095482, + "reward_std": 0.1554849036037922, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.387834832072258, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1875 }, { "clip_ratio": 0.0, - "completion_length": 1745.7388916015625, + "completion_length": 1729.7590026855469, "epoch": 0.560376372190277, - "grad_norm": 9.710267066955566, - "kl": 0.89990234375, - "learning_rate": 4.817511085880912e-08, - "loss": 0.1133, - "reward": 0.4575893059372902, - "reward_std": 0.19013431295752525, - "rewards/accuracy_reward": 0.09375000279396772, + "grad_norm": 50.84327697753906, + "kl": 5.37109375, + "learning_rate": 2.408755542940456e-07, + "loss": 0.3143, + "reward": 0.502232164144516, + "reward_std": 0.1537531465291977, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3638392984867096, + "rewards/tag_count_reward": 0.4129464402794838, "step": 1876 }, { "clip_ratio": 0.0, - "completion_length": 1666.88623046875, + "completion_length": 1640.15185546875, "epoch": 0.5606750802777986, - "grad_norm": 6.7546539306640625, - "kl": 0.7021484375, - "learning_rate": 4.8122995335060355e-08, - "loss": 0.1262, - "reward": 0.5100446715950966, - "reward_std": 0.1827758140861988, - "rewards/accuracy_reward": 0.1272321455180645, + "grad_norm": 15.175320625305176, + "kl": 4.046875, + "learning_rate": 2.4061497667530176e-07, + "loss": 0.2442, + "reward": 0.5636161044239998, + "reward_std": 0.11764203757047653, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3828125223517418, + "rewards/tag_count_reward": 0.4319196566939354, "step": 1877 }, { "clip_ratio": 0.0, - "completion_length": 1726.8616943359375, + "completion_length": 1653.0692443847656, "epoch": 0.56097378836532, - "grad_norm": 12.329505920410156, - "kl": 0.861328125, - "learning_rate": 4.807088185330869e-08, - "loss": 0.1062, - "reward": 0.4029018059372902, - "reward_std": 0.18208423256874084, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 27.569849014282227, + "kl": 3.8828125, + "learning_rate": 2.4035440926654344e-07, + "loss": 0.2283, + "reward": 0.4933035969734192, + "reward_std": 0.1219912339001894, + "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560268059372902, + "rewards/tag_count_reward": 0.439732164144516, "step": 1878 }, { "clip_ratio": 0.0, - "completion_length": 1728.0090026855469, + "completion_length": 1604.9353332519531, "epoch": 0.5612724964528415, - "grad_norm": 5.7874436378479, - "kl": 0.73486328125, - "learning_rate": 4.8018770470248506e-08, - "loss": 0.107, - "reward": 0.4369419887661934, - "reward_std": 0.19765213876962662, - "rewards/accuracy_reward": 0.06250000325962901, + "grad_norm": 27.032745361328125, + "kl": 2.412109375, + "learning_rate": 2.400938523512425e-07, + "loss": 0.1639, + "reward": 0.5452009066939354, + "reward_std": 0.1399172507226467, + "rewards/accuracy_reward": 0.08258929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419813156128, + "rewards/tag_count_reward": 0.4626116305589676, "step": 1879 }, { "clip_ratio": 0.0, - "completion_length": 1728.0915832519531, + "completion_length": 1647.69873046875, "epoch": 0.5615712045403629, - "grad_norm": 8.949148178100586, - "kl": 0.828125, - "learning_rate": 4.796666124257186e-08, - "loss": 0.1312, - "reward": 0.447544664144516, - "reward_std": 0.19229017570614815, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 22.053356170654297, + "kl": 3.4453125, + "learning_rate": 2.3983330621285935e-07, + "loss": 0.2359, + "reward": 0.4994419887661934, + "reward_std": 0.12786435894668102, + "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "rewards/tag_count_reward": 0.4369419887661934, "step": 1880 }, { "clip_ratio": 0.0, - "completion_length": 1667.5402221679688, + "completion_length": 1623.5670166015625, "epoch": 0.5618699126278844, - "grad_norm": 7.399872303009033, - "kl": 0.69921875, - "learning_rate": 4.7914554226968506e-08, - "loss": 0.1034, - "reward": 0.4972098469734192, - "reward_std": 0.22650115936994553, - "rewards/accuracy_reward": 0.10937500488944352, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.387834832072258, + "grad_norm": 22.567678451538086, + "kl": 3.5078125, + "learning_rate": 2.395727711348425e-07, + "loss": 0.2334, + "reward": 0.5680803805589676, + "reward_std": 0.1958514228463173, + "rewards/accuracy_reward": 0.12946429592557251, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1881 }, { "clip_ratio": 0.0, - "completion_length": 1787.4509582519531, + "completion_length": 1675.2858276367188, "epoch": 0.5621686207154059, - "grad_norm": 4.582240104675293, - "kl": 0.72314453125, - "learning_rate": 4.7862449480125746e-08, - "loss": 0.104, - "reward": 0.4492187649011612, - "reward_std": 0.23083720356225967, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 23.86686134338379, + "kl": 2.4609375, + "learning_rate": 2.3931224740062873e-07, + "loss": 0.1823, + "reward": 0.5379464477300644, + "reward_std": 0.1649578120559454, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3733259066939354, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1882 }, { "clip_ratio": 0.0, - "completion_length": 1748.2277526855469, + "completion_length": 1668.3036499023438, "epoch": 0.5624673288029274, - "grad_norm": 15.884343147277832, - "kl": 0.8857421875, - "learning_rate": 4.781034705872845e-08, - "loss": 0.1148, - "reward": 0.4843750074505806, - "reward_std": 0.2136448696255684, - "rewards/accuracy_reward": 0.1316964328289032, + "grad_norm": 77.09688568115234, + "kl": 5.91015625, + "learning_rate": 2.3905173529364224e-07, + "loss": 0.3374, + "reward": 0.5546875223517418, + "reward_std": 0.1380814090371132, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3526785895228386, + "rewards/tag_count_reward": 0.4185268059372902, "step": 1883 }, { "clip_ratio": 0.0, - "completion_length": 1742.3951721191406, + "completion_length": 1661.0491943359375, "epoch": 0.5627660368904488, - "grad_norm": 11.557215690612793, - "kl": 0.783203125, - "learning_rate": 4.775824701945895e-08, - "loss": 0.1242, - "reward": 0.4988839626312256, - "reward_std": 0.23161952197551727, - "rewards/accuracy_reward": 0.12723214738070965, + "grad_norm": 9.921865463256836, + "kl": 4.20703125, + "learning_rate": 2.3879123509729477e-07, + "loss": 0.2863, + "reward": 0.5535714402794838, + "reward_std": 0.16164697147905827, + "rewards/accuracy_reward": 0.11607143003493547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716518059372902, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1884 }, { "clip_ratio": 0.0, - "completion_length": 1713.3415832519531, + "completion_length": 1651.3081359863281, "epoch": 0.5630647449779703, - "grad_norm": 6.719419002532959, - "kl": 0.6923828125, - "learning_rate": 4.7706149418996985e-08, - "loss": 0.0851, - "reward": 0.3895089402794838, - "reward_std": 0.17061890847980976, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 46.691959381103516, + "kl": 4.38671875, + "learning_rate": 2.385307470949849e-07, + "loss": 0.2582, + "reward": 0.4765625149011612, + "reward_std": 0.12649006955325603, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482313156128, + "rewards/tag_count_reward": 0.4542410895228386, "step": 1885 }, { "clip_ratio": 0.0, - "completion_length": 1731.8951416015625, + "completion_length": 1651.2411499023438, "epoch": 0.5633634530654917, - "grad_norm": 10.091897010803223, - "kl": 0.89453125, - "learning_rate": 4.7654054314019606e-08, - "loss": 0.1437, - "reward": 0.5089285895228386, - "reward_std": 0.218811497092247, - "rewards/accuracy_reward": 0.14732143096625805, + "grad_norm": 47.898948669433594, + "kl": 5.17578125, + "learning_rate": 2.3827027157009805e-07, + "loss": 0.3364, + "reward": 0.5837053954601288, + "reward_std": 0.17154448479413986, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.361607164144516, + "rewards/tag_count_reward": 0.4296875223517418, "step": 1886 }, { "clip_ratio": 0.0, - "completion_length": 1663.2322082519531, + "completion_length": 1614.3906860351562, "epoch": 0.5636621611530133, - "grad_norm": 4.745563507080078, - "kl": 0.6923828125, - "learning_rate": 4.760196176120123e-08, - "loss": 0.1409, - "reward": 0.4704241305589676, - "reward_std": 0.22835688665509224, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 45.100799560546875, + "kl": 4.36328125, + "learning_rate": 2.3800980880600613e-07, + "loss": 0.2695, + "reward": 0.5602678954601288, + "reward_std": 0.1970061995089054, + "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.387834832072258, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1887 }, { "clip_ratio": 0.0, - "completion_length": 1743.9286499023438, + "completion_length": 1646.5179443359375, "epoch": 0.5639608692405347, - "grad_norm": 7.739265441894531, - "kl": 0.6396484375, - "learning_rate": 4.754987181721342e-08, - "loss": 0.1047, - "reward": 0.444196455180645, - "reward_std": 0.21791108697652817, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 13.88018798828125, + "kl": 2.837890625, + "learning_rate": 2.3774935908606707e-07, + "loss": 0.19, + "reward": 0.5206473469734192, + "reward_std": 0.16583536937832832, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.392857164144516, + "rewards/tag_count_reward": 0.4492187649011612, "step": 1888 }, { "clip_ratio": 0.0, - "completion_length": 1731.5022888183594, + "completion_length": 1702.0357971191406, "epoch": 0.5642595773280562, - "grad_norm": 35.1544303894043, - "kl": 0.8671875, - "learning_rate": 4.7497784538724916e-08, - "loss": 0.1044, - "reward": 0.483816996216774, - "reward_std": 0.19497498497366905, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 17.34386444091797, + "kl": 3.95703125, + "learning_rate": 2.3748892269362458e-07, + "loss": 0.2474, + "reward": 0.5546875223517418, + "reward_std": 0.12680058367550373, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3699776977300644, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1889 }, { "clip_ratio": 0.0, - "completion_length": 1670.9308776855469, + "completion_length": 1588.2590026855469, "epoch": 0.5645582854155776, - "grad_norm": 11.129517555236816, - "kl": 0.75, - "learning_rate": 4.744569998240161e-08, - "loss": 0.1258, - "reward": 0.5479911044239998, - "reward_std": 0.2120901569724083, - "rewards/accuracy_reward": 0.1696428619325161, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482387661934, + "grad_norm": 18.85053825378418, + "kl": 3.96484375, + "learning_rate": 2.3722849991200805e-07, + "loss": 0.2635, + "reward": 0.6021205559372902, + "reward_std": 0.12867296021431684, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4414062798023224, "step": 1890 }, { "clip_ratio": 0.0, - "completion_length": 1727.8192749023438, + "completion_length": 1646.6719360351562, "epoch": 0.5648569935030991, - "grad_norm": 12.941956520080566, - "kl": 0.79345703125, - "learning_rate": 4.7393618204906374e-08, - "loss": 0.1172, - "reward": 0.4352678805589676, - "reward_std": 0.20285263657569885, - "rewards/accuracy_reward": 0.06250000349245965, + "grad_norm": 25.355182647705078, + "kl": 3.392578125, + "learning_rate": 2.3696809102453187e-07, + "loss": 0.2294, + "reward": 0.4905134215950966, + "reward_std": 0.15294067189097404, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.4347098469734192, "step": 1891 }, { "clip_ratio": 0.0, - "completion_length": 1756.9041137695312, + "completion_length": 1713.1428833007812, "epoch": 0.5651557015906206, - "grad_norm": 7.050516605377197, - "kl": 0.70703125, - "learning_rate": 4.734153926289907e-08, - "loss": 0.1057, - "reward": 0.4882812798023224, - "reward_std": 0.18563613295555115, - "rewards/accuracy_reward": 0.12053571874275804, + "grad_norm": 30.410537719726562, + "kl": 3.423828125, + "learning_rate": 2.3670769631449535e-07, + "loss": 0.2261, + "reward": 0.5491071715950966, + "reward_std": 0.13489943742752075, + "rewards/accuracy_reward": 0.12276785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455484867096, + "rewards/tag_count_reward": 0.4263392984867096, "step": 1892 }, { "clip_ratio": 0.0, - "completion_length": 1622.94873046875, + "completion_length": 1600.0201721191406, "epoch": 0.5654544096781421, - "grad_norm": 36.84849548339844, - "kl": 0.8564453125, - "learning_rate": 4.728946321303649e-08, - "loss": 0.1109, - "reward": 0.4587053880095482, - "reward_std": 0.20084641128778458, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 15.669499397277832, + "kl": 3.30859375, + "learning_rate": 2.3644731606518245e-07, + "loss": 0.2067, + "reward": 0.5072544813156128, + "reward_std": 0.16281880252063274, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716518059372902, + "rewards/tag_count_reward": 0.4135044813156128, "step": 1893 }, { "clip_ratio": 0.0, - "completion_length": 1687.30810546875, + "completion_length": 1646.8036499023438, "epoch": 0.5657531177656635, - "grad_norm": 11.104896545410156, - "kl": 0.54638671875, - "learning_rate": 4.723739011197228e-08, - "loss": 0.1096, - "reward": 0.5273437723517418, - "reward_std": 0.2431574948132038, - "rewards/accuracy_reward": 0.13839286006987095, + "grad_norm": 22.78024673461914, + "kl": 2.205078125, + "learning_rate": 2.361869505598614e-07, + "loss": 0.1481, + "reward": 0.5887277126312256, + "reward_std": 0.17063764110207558, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3889509066939354, + "rewards/tag_count_reward": 0.459263414144516, "step": 1894 }, { "clip_ratio": 0.0, - "completion_length": 1649.2902526855469, + "completion_length": 1550.07373046875, "epoch": 0.5660518258531849, - "grad_norm": 6.963273525238037, - "kl": 0.7412109375, - "learning_rate": 4.718532001635686e-08, - "loss": 0.1186, - "reward": 0.4090401902794838, - "reward_std": 0.19945456087589264, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 29.20393943786621, + "kl": 2.9609375, + "learning_rate": 2.359266000817843e-07, + "loss": 0.2152, + "reward": 0.5111607313156128, + "reward_std": 0.14274494908750057, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.4464285895228386, "step": 1895 }, { "clip_ratio": 0.0, - "completion_length": 1782.2032165527344, + "completion_length": 1705.4308776855469, "epoch": 0.5663505339407064, - "grad_norm": 5.6313862800598145, - "kl": 0.6923828125, - "learning_rate": 4.713325298283739e-08, - "loss": 0.108, - "reward": 0.5245535895228386, - "reward_std": 0.17563550919294357, - "rewards/accuracy_reward": 0.15178572107106447, + "grad_norm": 32.4789924621582, + "kl": 3.3046875, + "learning_rate": 2.3566626491418696e-07, + "loss": 0.227, + "reward": 0.5725446790456772, + "reward_std": 0.1053739283233881, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.4296875149011612, "step": 1896 }, { "clip_ratio": 0.0, - "completion_length": 1708.6451721191406, + "completion_length": 1617.9420471191406, "epoch": 0.5666492420282279, - "grad_norm": 70.36869049072266, - "kl": 1.09814453125, - "learning_rate": 4.708118906805771e-08, - "loss": 0.1165, - "reward": 0.4146205559372902, - "reward_std": 0.1774156242609024, - "rewards/accuracy_reward": 0.03571428591385484, + "grad_norm": 63.8911247253418, + "kl": 2.65625, + "learning_rate": 2.3540594534028855e-07, + "loss": 0.2239, + "reward": 0.5050223395228386, + "reward_std": 0.14275668375194073, + "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3789062574505806, + "rewards/tag_count_reward": 0.4492187798023224, "step": 1897 }, { "clip_ratio": 0.0, - "completion_length": 1659.3817443847656, + "completion_length": 1601.8817443847656, "epoch": 0.5669479501157494, - "grad_norm": 9.725513458251953, - "kl": 0.69921875, - "learning_rate": 4.702912832865825e-08, - "loss": 0.1311, - "reward": 0.4838169738650322, - "reward_std": 0.23168861493468285, - "rewards/accuracy_reward": 0.10267857671715319, + "grad_norm": 10.655900955200195, + "kl": 3.31640625, + "learning_rate": 2.3514564164329127e-07, + "loss": 0.2153, + "reward": 0.5474330708384514, + "reward_std": 0.1622856967151165, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3811384066939354, + "rewards/tag_count_reward": 0.451450914144516, "step": 1898 }, { "clip_ratio": 0.0, - "completion_length": 1674.5982971191406, + "completion_length": 1598.7433776855469, "epoch": 0.5672466582032708, - "grad_norm": 79.80070495605469, - "kl": 1.287109375, - "learning_rate": 4.697707082127599e-08, - "loss": 0.1505, - "reward": 0.3917410969734192, - "reward_std": 0.21621098741889, - "rewards/accuracy_reward": 0.03571428754366934, + "grad_norm": 77.06262969970703, + "kl": 5.03125, + "learning_rate": 2.3488535410637997e-07, + "loss": 0.3045, + "reward": 0.4743303805589676, + "reward_std": 0.14830541796982288, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.4386160895228386, "step": 1899 }, { "clip_ratio": 0.0, - "completion_length": 1693.96435546875, + "completion_length": 1606.6607666015625, "epoch": 0.5675453662907923, - "grad_norm": 7.357827186584473, - "kl": 0.712890625, - "learning_rate": 4.6925016602544395e-08, - "loss": 0.1241, - "reward": 0.4843750223517418, - "reward_std": 0.27111461386084557, - "rewards/accuracy_reward": 0.10937500186264515, + "grad_norm": 63.15616989135742, + "kl": 4.56640625, + "learning_rate": 2.3462508301272197e-07, + "loss": 0.2786, + "reward": 0.5206473469734192, + "reward_std": 0.1900695003569126, + "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000149011612, + "rewards/tag_count_reward": 0.4447544887661934, "step": 1900 }, { "clip_ratio": 0.0, - "completion_length": 1746.7300109863281, + "completion_length": 1702.01123046875, "epoch": 0.5678440743783137, - "grad_norm": 15.426056861877441, - "kl": 0.8525390625, - "learning_rate": 4.6872965729093355e-08, - "loss": 0.1257, - "reward": 0.4034598469734192, - "reward_std": 0.19583312422037125, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 35.832733154296875, + "kl": 4.4140625, + "learning_rate": 2.3436482864546677e-07, + "loss": 0.2672, + "reward": 0.4827009215950966, + "reward_std": 0.14082352630794048, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3454241305589676, + "rewards/tag_count_reward": 0.4224330559372902, "step": 1901 }, { "clip_ratio": 0.0, - "completion_length": 1681.76123046875, + "completion_length": 1588.3259582519531, "epoch": 0.5681427824658353, - "grad_norm": 6.397401332855225, - "kl": 0.70458984375, - "learning_rate": 4.682091825754912e-08, - "loss": 0.1159, - "reward": 0.3995535895228386, - "reward_std": 0.20196758955717087, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 37.67543411254883, + "kl": 4.17578125, + "learning_rate": 2.3410459128774563e-07, + "loss": 0.2606, + "reward": 0.483258955180645, + "reward_std": 0.15673963725566864, + "rewards/accuracy_reward": 0.04017857229337096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3683035969734192, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1902 }, { "clip_ratio": 0.0, - "completion_length": 1694.8192749023438, + "completion_length": 1635.3550109863281, "epoch": 0.5684414905533567, - "grad_norm": 13.813337326049805, - "kl": 0.8349609375, - "learning_rate": 4.676887424453424e-08, - "loss": 0.1174, - "reward": 0.4296875298023224, - "reward_std": 0.1833440326154232, - "rewards/accuracy_reward": 0.06250000302679837, + "grad_norm": 12.764564514160156, + "kl": 3.9296875, + "learning_rate": 2.338443712226712e-07, + "loss": 0.2406, + "reward": 0.5061384215950966, + "reward_std": 0.12712224572896957, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3671875074505806, + "rewards/tag_count_reward": 0.448102705180645, "step": 1903 }, { "clip_ratio": 0.0, - "completion_length": 1695.0848999023438, + "completion_length": 1621.8728332519531, "epoch": 0.5687401986408782, - "grad_norm": 56.66366195678711, - "kl": 1.1162109375, - "learning_rate": 4.67168337466675e-08, - "loss": 0.1192, - "reward": 0.5206473469734192, - "reward_std": 0.2108164206147194, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 12.215208053588867, + "kl": 2.244140625, + "learning_rate": 2.335841687333375e-07, + "loss": 0.1545, + "reward": 0.5982143133878708, + "reward_std": 0.15142563544213772, + "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.388950914144516, + "rewards/tag_count_reward": 0.4575893133878708, "step": 1904 }, { "clip_ratio": 0.0, - "completion_length": 1711.0469665527344, + "completion_length": 1615.4554138183594, "epoch": 0.5690389067283996, - "grad_norm": 7.402495384216309, - "kl": 0.71484375, - "learning_rate": 4.666479682056386e-08, - "loss": 0.1363, - "reward": 0.4760044887661934, - "reward_std": 0.2560390867292881, - "rewards/accuracy_reward": 0.09598214854486287, + "grad_norm": 43.20653533935547, + "kl": 3.859375, + "learning_rate": 2.333239841028193e-07, + "loss": 0.2524, + "reward": 0.5457589477300644, + "reward_std": 0.15469874814152718, + "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.380022332072258, + "rewards/tag_count_reward": 0.447544664144516, "step": 1905 }, { "clip_ratio": 0.0, - "completion_length": 1751.9598999023438, + "completion_length": 1675.9911499023438, "epoch": 0.5693376148159212, - "grad_norm": 7.844456672668457, - "kl": 0.744140625, - "learning_rate": 4.6612763522834385e-08, - "loss": 0.1133, - "reward": 0.4748884215950966, - "reward_std": 0.2150653600692749, - "rewards/accuracy_reward": 0.09598214784637094, + "grad_norm": 41.28318405151367, + "kl": 2.546875, + "learning_rate": 2.3306381761417194e-07, + "loss": 0.1785, + "reward": 0.5424107313156128, + "reward_std": 0.1523641124367714, + "rewards/accuracy_reward": 0.09375000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3789062723517418, + "rewards/tag_count_reward": 0.4486607387661934, "step": 1906 }, { "clip_ratio": 0.0, - "completion_length": 1671.0826416015625, + "completion_length": 1614.1964721679688, "epoch": 0.5696363229034426, - "grad_norm": 5.5460944175720215, - "kl": 0.708984375, - "learning_rate": 4.656073391008621e-08, - "loss": 0.0987, - "reward": 0.4838169887661934, - "reward_std": 0.22163764759898186, - "rewards/accuracy_reward": 0.09375000325962901, + "grad_norm": 16.117908477783203, + "kl": 3.0390625, + "learning_rate": 2.3280366955043106e-07, + "loss": 0.2163, + "reward": 0.534040205180645, + "reward_std": 0.16804299876093864, + "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3900669887661934, + "rewards/tag_count_reward": 0.4425223395228386, "step": 1907 }, { "clip_ratio": 0.0, - "completion_length": 1732.7009887695312, + "completion_length": 1674.8527526855469, "epoch": 0.5699350309909641, - "grad_norm": 4.679348945617676, - "kl": 0.583984375, - "learning_rate": 4.650870803892247e-08, - "loss": 0.0812, - "reward": 0.5189732313156128, - "reward_std": 0.21300087124109268, - "rewards/accuracy_reward": 0.11607143376022577, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029018059372902, + "grad_norm": 6.39769172668457, + "kl": 2.421875, + "learning_rate": 2.3254354019461232e-07, + "loss": 0.1464, + "reward": 0.583147332072258, + "reward_std": 0.16184914484620094, + "rewards/accuracy_reward": 0.1250000020954758, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4581473395228386, "step": 1908 }, { "clip_ratio": 0.0, - "completion_length": 1714.94873046875, + "completion_length": 1655.1964721679688, "epoch": 0.5702337390784855, - "grad_norm": 20.82760238647461, - "kl": 0.8193359375, - "learning_rate": 4.64566859659422e-08, - "loss": 0.1257, - "reward": 0.4162946566939354, - "reward_std": 0.23389434814453125, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 18.632488250732422, + "kl": 2.69921875, + "learning_rate": 2.32283429829711e-07, + "loss": 0.1854, + "reward": 0.5111607387661934, + "reward_std": 0.18763187527656555, + "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.364955373108387, + "rewards/tag_count_reward": 0.444196455180645, "step": 1909 }, { "clip_ratio": 0.0, - "completion_length": 1660.6116638183594, + "completion_length": 1578.9063415527344, "epoch": 0.570532447166007, - "grad_norm": 4.456637382507324, - "kl": 0.595703125, - "learning_rate": 4.64046677477403e-08, - "loss": 0.0949, - "reward": 0.4977678805589676, - "reward_std": 0.20145569369196892, - "rewards/accuracy_reward": 0.10491071920841932, + "grad_norm": 22.70128631591797, + "kl": 2.953125, + "learning_rate": 2.320233387387015e-07, + "loss": 0.2137, + "reward": 0.565290205180645, + "reward_std": 0.16703174635767937, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3928571566939354, + "rewards/tag_count_reward": 0.4469866305589676, "step": 1910 }, { "clip_ratio": 0.0, - "completion_length": 1702.4286804199219, + "completion_length": 1616.5536804199219, "epoch": 0.5708311552535285, - "grad_norm": 39.49913024902344, - "kl": 0.8720703125, - "learning_rate": 4.6352653440907526e-08, - "loss": 0.1281, - "reward": 0.5178571492433548, - "reward_std": 0.21353187784552574, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 14.844437599182129, + "kl": 2.99609375, + "learning_rate": 2.3176326720453762e-07, + "loss": 0.2071, + "reward": 0.5915178954601288, + "reward_std": 0.14543218351900578, + "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000149011612, + "rewards/tag_count_reward": 0.444196455180645, "step": 1911 }, { "clip_ratio": 0.0, - "completion_length": 1729.8929138183594, + "completion_length": 1708.7031860351562, "epoch": 0.57112986334105, - "grad_norm": 13.537575721740723, - "kl": 0.39501953125, - "learning_rate": 4.630064310203032e-08, - "loss": 0.0862, - "reward": 0.431919664144516, - "reward_std": 0.198443453758955, - "rewards/accuracy_reward": 0.03571428777649999, + "grad_norm": 32.28428649902344, + "kl": 3.84765625, + "learning_rate": 2.3150321551015163e-07, + "loss": 0.2221, + "reward": 0.4966518133878708, + "reward_std": 0.1747417338192463, + "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3962053805589676, + "rewards/tag_count_reward": 0.4363839477300644, "step": 1912 }, { "clip_ratio": 0.0, - "completion_length": 1704.8661499023438, + "completion_length": 1600.2969055175781, "epoch": 0.5714285714285714, - "grad_norm": 5.670235633850098, - "kl": 0.716796875, - "learning_rate": 4.6248636787690854e-08, - "loss": 0.1207, - "reward": 0.4559151902794838, - "reward_std": 0.2545998990535736, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 12.04436206817627, + "kl": 2.56640625, + "learning_rate": 2.3124318393845428e-07, + "loss": 0.1553, + "reward": 0.5334821715950966, + "reward_std": 0.1719003077596426, + "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3800223395228386, + "rewards/tag_count_reward": 0.4620535895228386, "step": 1913 }, { "clip_ratio": 0.0, - "completion_length": 1690.4554443359375, + "completion_length": 1606.3951416015625, "epoch": 0.5717272795160929, - "grad_norm": 6.401205062866211, - "kl": 0.61962890625, - "learning_rate": 4.6196634554466876e-08, - "loss": 0.0872, - "reward": 0.5223214477300644, - "reward_std": 0.18458571657538414, - "rewards/accuracy_reward": 0.12723214668221772, + "grad_norm": 28.22039031982422, + "kl": 2.2744140625, + "learning_rate": 2.309831727723344e-07, + "loss": 0.1584, + "reward": 0.5825892984867096, + "reward_std": 0.16422081738710403, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3950892984867096, + "rewards/tag_count_reward": 0.455357164144516, "step": 1914 }, { "clip_ratio": 0.0, - "completion_length": 1731.2456359863281, + "completion_length": 1628.5781860351562, "epoch": 0.5720259876036143, - "grad_norm": 6.698228359222412, - "kl": 0.6201171875, - "learning_rate": 4.6144636458931745e-08, - "loss": 0.0972, - "reward": 0.5212053880095482, - "reward_std": 0.15973718836903572, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 14.920714378356934, + "kl": 3.66796875, + "learning_rate": 2.3072318229465872e-07, + "loss": 0.2328, + "reward": 0.5976562798023224, + "reward_std": 0.10308068804442883, + "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482313156128, + "rewards/tag_count_reward": 0.4503348395228386, "step": 1915 }, { "clip_ratio": 0.0, - "completion_length": 1693.9063110351562, + "completion_length": 1633.6697082519531, "epoch": 0.5723246956911359, - "grad_norm": 5.049406051635742, - "kl": 0.693359375, - "learning_rate": 4.609264255765428e-08, - "loss": 0.1186, - "reward": 0.4866071566939354, - "reward_std": 0.2055772989988327, - "rewards/accuracy_reward": 0.1116071455180645, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.372767873108387, + "grad_norm": 58.2711067199707, + "kl": 4.59375, + "learning_rate": 2.3046321278827138e-07, + "loss": 0.2715, + "reward": 0.5608259215950966, + "reward_std": 0.14429756440222263, + "rewards/accuracy_reward": 0.12276786286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1916 }, { "clip_ratio": 0.0, - "completion_length": 1715.2031860351562, + "completion_length": 1638.8728637695312, "epoch": 0.5726234037786573, - "grad_norm": 5.69533109664917, - "kl": 0.7001953125, - "learning_rate": 4.604065290719873e-08, - "loss": 0.1199, - "reward": 0.4827009215950966, - "reward_std": 0.2012873999774456, - "rewards/accuracy_reward": 0.1250000090803951, + "grad_norm": 92.38995361328125, + "kl": 5.22265625, + "learning_rate": 2.3020326453599368e-07, + "loss": 0.2991, + "reward": 0.5714286044239998, + "reward_std": 0.14556533098220825, + "rewards/accuracy_reward": 0.12946429406292737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357700914144516, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1917 }, { "clip_ratio": 0.0, - "completion_length": 1740.6407165527344, + "completion_length": 1660.5536499023438, "epoch": 0.5729221118661788, - "grad_norm": 7.5856781005859375, - "kl": 0.72265625, - "learning_rate": 4.598866756412478e-08, - "loss": 0.0968, - "reward": 0.380580373108387, - "reward_std": 0.21000828966498375, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 14.821087837219238, + "kl": 3.19140625, + "learning_rate": 2.2994333782062393e-07, + "loss": 0.2136, + "reward": 0.4676339477300644, + "reward_std": 0.11946978233754635, + "rewards/accuracy_reward": 0.017857144121080637, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560268059372902, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1918 }, { "clip_ratio": 0.0, - "completion_length": 1724.5491943359375, + "completion_length": 1639.9755249023438, "epoch": 0.5732208199537002, - "grad_norm": 7.391285419464111, - "kl": 0.638671875, - "learning_rate": 4.593668658498737e-08, - "loss": 0.1232, - "reward": 0.4486607313156128, - "reward_std": 0.22851744666695595, - "rewards/accuracy_reward": 0.08035714644938707, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3660714477300644, + "grad_norm": 20.257143020629883, + "kl": 3.9375, + "learning_rate": 2.2968343292493686e-07, + "loss": 0.2339, + "reward": 0.5379464477300644, + "reward_std": 0.17739084362983704, + "rewards/accuracy_reward": 0.09598214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1919 }, { "clip_ratio": 0.0, - "completion_length": 1731.243408203125, + "completion_length": 1625.2098693847656, "epoch": 0.5735195280412217, - "grad_norm": 6.028646945953369, - "kl": 0.619140625, - "learning_rate": 4.5884710026336707e-08, - "loss": 0.1072, - "reward": 0.4737723395228386, - "reward_std": 0.20736614987254143, - "rewards/accuracy_reward": 0.10937500116415322, + "grad_norm": 27.95660972595215, + "kl": 3.349609375, + "learning_rate": 2.294235501316835e-07, + "loss": 0.1923, + "reward": 0.5747768059372902, + "reward_std": 0.16299426183104515, + "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.364397332072258, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1920 }, { "clip_ratio": 0.0, - "completion_length": 1724.6116943359375, + "completion_length": 1618.4219665527344, "epoch": 0.5738182361287432, - "grad_norm": 5.752760887145996, - "kl": 0.7001953125, - "learning_rate": 4.5832737944718176e-08, - "loss": 0.1124, - "reward": 0.4347098469734192, - "reward_std": 0.19032524898648262, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 12.885087013244629, + "kl": 3.71875, + "learning_rate": 2.2916368972359088e-07, + "loss": 0.24, + "reward": 0.5106026977300644, + "reward_std": 0.14280694164335728, + "rewards/accuracy_reward": 0.06696429080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3677455484867096, + "rewards/tag_count_reward": 0.443638414144516, "step": 1921 }, { "clip_ratio": 0.0, - "completion_length": 1755.0357971191406, + "completion_length": 1690.2545471191406, "epoch": 0.5741169442162647, - "grad_norm": 12.94085693359375, - "kl": 0.60009765625, - "learning_rate": 4.578077039667234e-08, - "loss": 0.1072, - "reward": 0.4776785895228386, - "reward_std": 0.21205327659845352, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 56.94000244140625, + "kl": 4.5859375, + "learning_rate": 2.289038519833617e-07, + "loss": 0.261, + "reward": 0.5530134215950966, + "reward_std": 0.1930195726454258, + "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3571428656578064, + "rewards/tag_count_reward": 0.4257812723517418, "step": 1922 }, { "clip_ratio": 0.0, - "completion_length": 1690.4085693359375, + "completion_length": 1652.1607971191406, "epoch": 0.5744156523037861, - "grad_norm": 16.924654006958008, - "kl": 0.5947265625, - "learning_rate": 4.5728807438734784e-08, - "loss": 0.1171, - "reward": 0.596540205180645, - "reward_std": 0.18080133944749832, - "rewards/accuracy_reward": 0.21428572502918541, + "grad_norm": 19.354734420776367, + "kl": 4.14453125, + "learning_rate": 2.2864403719367392e-07, + "loss": 0.2532, + "reward": 0.6484375298023224, + "reward_std": 0.11191197670996189, + "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3822544813156128, + "rewards/tag_count_reward": 0.4341517984867096, "step": 1923 }, { "clip_ratio": 0.0, - "completion_length": 1751.4933776855469, + "completion_length": 1708.7232971191406, "epoch": 0.5747143603913076, - "grad_norm": 4.747684955596924, - "kl": 0.7236328125, - "learning_rate": 4.56768491274361e-08, - "loss": 0.1239, - "reward": 0.5044643133878708, - "reward_std": 0.23846549913287163, - "rewards/accuracy_reward": 0.1607142873108387, + "grad_norm": 36.38756561279297, + "kl": 3.953125, + "learning_rate": 2.283842456371805e-07, + "loss": 0.2562, + "reward": 0.5993303805589676, + "reward_std": 0.16229806654155254, + "rewards/accuracy_reward": 0.16294643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.4363839402794838, "step": 1924 }, { "clip_ratio": 0.0, - "completion_length": 1657.62060546875, + "completion_length": 1608.51123046875, "epoch": 0.575013068478829, - "grad_norm": 17.475526809692383, - "kl": 0.375, - "learning_rate": 4.562489551930187e-08, - "loss": 0.1078, - "reward": 0.5061384066939354, - "reward_std": 0.2134801745414734, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 38.94938659667969, + "kl": 2.984375, + "learning_rate": 2.2812447759650932e-07, + "loss": 0.2164, + "reward": 0.5825893133878708, + "reward_std": 0.1651261616498232, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3856026977300644, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1925 }, { "clip_ratio": 0.0, - "completion_length": 1661.602783203125, + "completion_length": 1581.3460388183594, "epoch": 0.5753117765663506, - "grad_norm": 11.018867492675781, - "kl": 0.516845703125, - "learning_rate": 4.55729466708525e-08, - "loss": 0.1173, - "reward": 0.4665178880095482, - "reward_std": 0.22052284330129623, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 12.798613548278809, + "kl": 2.189453125, + "learning_rate": 2.2786473335426247e-07, + "loss": 0.1596, + "reward": 0.5502232238650322, + "reward_std": 0.16101041063666344, + "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3883928805589676, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1926 }, { "clip_ratio": 0.0, - "completion_length": 1654.8415832519531, + "completion_length": 1584.3594665527344, "epoch": 0.575610484653872, - "grad_norm": 5.590102195739746, - "kl": 0.60546875, - "learning_rate": 4.5521002638603254e-08, - "loss": 0.1212, - "reward": 0.428013414144516, - "reward_std": 0.2322436086833477, - "rewards/accuracy_reward": 0.064732147147879, + "grad_norm": 21.631610870361328, + "kl": 2.765625, + "learning_rate": 2.2760501319301626e-07, + "loss": 0.1933, + "reward": 0.5195312723517418, + "reward_std": 0.18138942122459412, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812723517418, + "rewards/tag_count_reward": 0.443638414144516, "step": 1927 }, { "clip_ratio": 0.0, - "completion_length": 1731.6183471679688, + "completion_length": 1614.2835693359375, "epoch": 0.5759091927413935, - "grad_norm": 32.2193489074707, - "kl": 0.81591796875, - "learning_rate": 4.546906347906413e-08, - "loss": 0.0964, - "reward": 0.3978794813156128, - "reward_std": 0.21737778931856155, - "rewards/accuracy_reward": 0.029017859371379018, + "grad_norm": 27.47783660888672, + "kl": 2.181640625, + "learning_rate": 2.2734531739532067e-07, + "loss": 0.1576, + "reward": 0.4994419887661934, + "reward_std": 0.14626258052885532, + "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.368861623108387, + "rewards/tag_count_reward": 0.4637276977300644, "step": 1928 }, { "clip_ratio": 0.0, - "completion_length": 1721.18310546875, + "completion_length": 1647.8996276855469, "epoch": 0.5762079008289149, - "grad_norm": 5.143306732177734, - "kl": 0.64794921875, - "learning_rate": 4.541712924873989e-08, - "loss": 0.1148, - "reward": 0.4324776902794838, - "reward_std": 0.20339882001280785, - "rewards/accuracy_reward": 0.08482143003493547, + "grad_norm": 16.487390518188477, + "kl": 2.796875, + "learning_rate": 2.2708564624369944e-07, + "loss": 0.1675, + "reward": 0.5223214477300644, + "reward_std": 0.13632718473672867, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562723517418, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1929 }, { "clip_ratio": 0.0, - "completion_length": 1705.7389221191406, + "completion_length": 1624.2567749023438, "epoch": 0.5765066089164365, - "grad_norm": 11.752492904663086, - "kl": 0.572265625, - "learning_rate": 4.536520000412984e-08, - "loss": 0.1193, - "reward": 0.420200914144516, - "reward_std": 0.22067348286509514, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 15.081119537353516, + "kl": 2.59375, + "learning_rate": 2.2682600002064923e-07, + "loss": 0.1629, + "reward": 0.5150669813156128, + "reward_std": 0.15225842595100403, + "rewards/accuracy_reward": 0.0535714291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3777901977300644, + "rewards/tag_count_reward": 0.4614955633878708, "step": 1930 }, { "clip_ratio": 0.0, - "completion_length": 1766.4732971191406, + "completion_length": 1712.6429443359375, "epoch": 0.5768053170039579, - "grad_norm": 7.567726135253906, - "kl": 0.6181640625, - "learning_rate": 4.531327580172793e-08, - "loss": 0.1113, - "reward": 0.400111623108387, - "reward_std": 0.22156139835715294, - "rewards/accuracy_reward": 0.03794643119908869, + "grad_norm": 14.658039093017578, + "kl": 3.63671875, + "learning_rate": 2.2656637900863965e-07, + "loss": 0.2079, + "reward": 0.4760044813156128, + "reward_std": 0.16634920425713062, + "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3621651902794838, + "rewards/tag_count_reward": 0.435825914144516, "step": 1931 }, { "clip_ratio": 0.0, - "completion_length": 1771.2567749023438, + "completion_length": 1715.3527221679688, "epoch": 0.5771040250914794, - "grad_norm": 196.2230224609375, - "kl": 1.416015625, - "learning_rate": 4.5261356698022605e-08, - "loss": 0.1375, - "reward": 0.5139509215950966, - "reward_std": 0.2324894778430462, - "rewards/accuracy_reward": 0.1495535783469677, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3643973469734192, + "grad_norm": 17.052610397338867, + "kl": 3.146484375, + "learning_rate": 2.2630678349011303e-07, + "loss": 0.2064, + "reward": 0.6049107313156128, + "reward_std": 0.17495453171432018, + "rewards/accuracy_reward": 0.1607142984867096, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4441964477300644, "step": 1932 }, { "clip_ratio": 0.0, - "completion_length": 1680.8348999023438, + "completion_length": 1617.3996276855469, "epoch": 0.5774027331790008, - "grad_norm": 8.469754219055176, - "kl": 0.60986328125, - "learning_rate": 4.520944274949676e-08, - "loss": 0.0965, - "reward": 0.4704241380095482, - "reward_std": 0.18429949134588242, - "rewards/accuracy_reward": 0.09375000419095159, + "grad_norm": 36.0367546081543, + "kl": 3.17578125, + "learning_rate": 2.2604721374748378e-07, + "loss": 0.2334, + "reward": 0.5424107536673546, + "reward_std": 0.15594821237027645, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.376674123108387, + "rewards/tag_count_reward": 0.435267873108387, "step": 1933 }, { "clip_ratio": 0.0, - "completion_length": 1625.6027526855469, + "completion_length": 1568.4308776855469, "epoch": 0.5777014412665223, - "grad_norm": 13.205565452575684, - "kl": 0.39013671875, - "learning_rate": 4.515753401262767e-08, - "loss": 0.0797, - "reward": 0.4358259215950966, - "reward_std": 0.16540747322142124, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 8.084633827209473, + "kl": 2.474609375, + "learning_rate": 2.2578767006313834e-07, + "loss": 0.1614, + "reward": 0.5111607313156128, + "reward_std": 0.14678932167589664, + "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4112723395228386, + "rewards/tag_count_reward": 0.4620535969734192, "step": 1934 }, { "clip_ratio": 0.0, - "completion_length": 1640.05810546875, + "completion_length": 1562.6629943847656, "epoch": 0.5780001493540438, - "grad_norm": 9.666220664978027, - "kl": 0.5361328125, - "learning_rate": 4.510563054388695e-08, - "loss": 0.1133, - "reward": 0.4659598469734192, - "reward_std": 0.23231659829616547, - "rewards/accuracy_reward": 0.08482143492437899, + "grad_norm": 5.367437839508057, + "kl": 2.765625, + "learning_rate": 2.2552815271943477e-07, + "loss": 0.1906, + "reward": 0.5435268133878708, + "reward_std": 0.19687253050506115, + "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3811384066939354, + "rewards/tag_count_reward": 0.4564732387661934, "step": 1935 }, { "clip_ratio": 0.0, - "completion_length": 1701.0871276855469, + "completion_length": 1640.3348999023438, "epoch": 0.5782988574415653, - "grad_norm": 4.352108955383301, - "kl": 0.6220703125, - "learning_rate": 4.505373239974052e-08, - "loss": 0.1054, - "reward": 0.486607164144516, - "reward_std": 0.22334739193320274, - "rewards/accuracy_reward": 0.11160715017467737, + "grad_norm": 9.087560653686523, + "kl": 3.60546875, + "learning_rate": 2.2526866199870258e-07, + "loss": 0.2308, + "reward": 0.5457589626312256, + "reward_std": 0.14839855581521988, + "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3750000223517418, + "rewards/tag_count_reward": 0.4386160969734192, "step": 1936 }, { "clip_ratio": 0.0, - "completion_length": 1785.5179138183594, + "completion_length": 1720.5067749023438, "epoch": 0.5785975655290867, - "grad_norm": 12.514901161193848, - "kl": 0.677734375, - "learning_rate": 4.5001839636648446e-08, - "loss": 0.1029, - "reward": 0.3777901902794838, - "reward_std": 0.21102286875247955, - "rewards/accuracy_reward": 0.017857143422588706, + "grad_norm": 7.7881083488464355, + "kl": 4.20703125, + "learning_rate": 2.2500919818324224e-07, + "loss": 0.2535, + "reward": 0.4375000223517418, + "reward_std": 0.12629659287631512, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3599330559372902, + "rewards/tag_count_reward": 0.4308035895228386, "step": 1937 }, { "clip_ratio": 0.0, - "completion_length": 1627.7567749023438, + "completion_length": 1582.8996276855469, "epoch": 0.5788962736166081, - "grad_norm": 8.7125244140625, - "kl": 0.63671875, - "learning_rate": 4.494995231106496e-08, - "loss": 0.1183, - "reward": 0.482700914144516, - "reward_std": 0.1759800836443901, - "rewards/accuracy_reward": 0.09598215040750802, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187649011612, + "grad_norm": 22.43412208557129, + "kl": 3.591796875, + "learning_rate": 2.2474976155532484e-07, + "loss": 0.2247, + "reward": 0.5591518133878708, + "reward_std": 0.13764260709285736, + "rewards/accuracy_reward": 0.10714286402799189, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.452008955180645, "step": 1938 }, { "clip_ratio": 0.0, - "completion_length": 1647.1049499511719, + "completion_length": 1569.1675109863281, "epoch": 0.5791949817041296, - "grad_norm": 6.119147777557373, - "kl": 0.64404296875, - "learning_rate": 4.489807047943842e-08, - "loss": 0.1228, - "reward": 0.4575893133878708, - "reward_std": 0.21562981233000755, - "rewards/accuracy_reward": 0.08705357555299997, + "grad_norm": 27.67903709411621, + "kl": 3.599609375, + "learning_rate": 2.2449035239719212e-07, + "loss": 0.229, + "reward": 0.547991082072258, + "reward_std": 0.15824110619723797, + "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3705357313156128, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1939 }, { "clip_ratio": 0.0, - "completion_length": 1634.0134887695312, + "completion_length": 1570.15185546875, "epoch": 0.5794936897916511, - "grad_norm": 5.519073009490967, - "kl": 0.57666015625, - "learning_rate": 4.4846194198211155e-08, - "loss": 0.1156, - "reward": 0.4765625149011612, - "reward_std": 0.221936147660017, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 7.029219150543213, + "kl": 2.568359375, + "learning_rate": 2.2423097099105577e-07, + "loss": 0.1706, + "reward": 0.5373883992433548, + "reward_std": 0.15089070796966553, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3895089402794838, + "rewards/tag_count_reward": 0.4614955559372902, "step": 1940 }, { "clip_ratio": 0.0, - "completion_length": 1691.5581359863281, + "completion_length": 1610.3170166015625, "epoch": 0.5797923978791726, - "grad_norm": 15.306662559509277, - "kl": 0.7158203125, - "learning_rate": 4.479432352381947e-08, - "loss": 0.1149, - "reward": 0.4135044813156128, - "reward_std": 0.21380996704101562, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 14.871225357055664, + "kl": 2.96875, + "learning_rate": 2.2397161761909736e-07, + "loss": 0.2055, + "reward": 0.5133928805589676, + "reward_std": 0.15376004949212074, + "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.455357164144516, "step": 1941 }, { "clip_ratio": 0.0, - "completion_length": 1706.3416137695312, + "completion_length": 1643.8929443359375, "epoch": 0.580091105966694, - "grad_norm": 5.407007694244385, - "kl": 0.6328125, - "learning_rate": 4.4742458512693576e-08, - "loss": 0.1123, - "reward": 0.5005580708384514, - "reward_std": 0.23377948626875877, - "rewards/accuracy_reward": 0.12500000488944352, + "grad_norm": 54.981807708740234, + "kl": 4.47265625, + "learning_rate": 2.2371229256346787e-07, + "loss": 0.2707, + "reward": 0.5753348544239998, + "reward_std": 0.17521991953253746, + "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580559372902, + "rewards/tag_count_reward": 0.4391741305589676, "step": 1942 }, { "clip_ratio": 0.0, - "completion_length": 1713.1875915527344, + "completion_length": 1640.9509582519531, "epoch": 0.5803898140542155, - "grad_norm": 10.399560928344727, - "kl": 0.54931640625, - "learning_rate": 4.469059922125753e-08, - "loss": 0.0943, - "reward": 0.5039062723517418, - "reward_std": 0.2080843262374401, - "rewards/accuracy_reward": 0.12946429080329835, + "grad_norm": 15.395828247070312, + "kl": 3.701171875, + "learning_rate": 2.2345299610628765e-07, + "loss": 0.2389, + "reward": 0.5747768133878708, + "reward_std": 0.11841774545609951, + "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419887661934, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1943 }, { "clip_ratio": 0.0, - "completion_length": 1762.3884887695312, + "completion_length": 1674.3661499023438, "epoch": 0.5806885221417369, - "grad_norm": 7.478565216064453, - "kl": 0.6611328125, - "learning_rate": 4.463874570592916e-08, - "loss": 0.1099, - "reward": 0.3950893059372902, - "reward_std": 0.22344403713941574, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 44.160858154296875, + "kl": 4.046875, + "learning_rate": 2.231937285296458e-07, + "loss": 0.242, + "reward": 0.5033482313156128, + "reward_std": 0.17699915170669556, + "rewards/accuracy_reward": 0.0535714291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3638392984867096, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1944 }, { "clip_ratio": 0.0, - "completion_length": 1717.4866943359375, + "completion_length": 1661.4688110351562, "epoch": 0.5809872302292585, - "grad_norm": 23.05340576171875, - "kl": 0.3115234375, - "learning_rate": 4.458689802311999e-08, - "loss": 0.1041, - "reward": 0.4559151977300644, - "reward_std": 0.22889038175344467, + "grad_norm": 6.402981758117676, + "kl": 3.3203125, + "learning_rate": 2.2293449011559994e-07, + "loss": 0.2177, + "reward": 0.5306919887661934, + "reward_std": 0.13995127193629742, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3800223395228386, + "rewards/tag_count_reward": 0.4547991305589676, "step": 1945 }, { "clip_ratio": 0.0, - "completion_length": 1658.3884582519531, + "completion_length": 1575.7165832519531, "epoch": 0.5812859383167799, - "grad_norm": 17.522966384887695, - "kl": 0.51611328125, - "learning_rate": 4.453505622923523e-08, - "loss": 0.1014, - "reward": 0.5005580484867096, - "reward_std": 0.21134903654456139, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 34.00381851196289, + "kl": 3.58203125, + "learning_rate": 2.2267528114617617e-07, + "loss": 0.2141, + "reward": 0.5613839477300644, + "reward_std": 0.16338125616312027, + "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3911830559372902, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1946 }, { "clip_ratio": 0.0, - "completion_length": 1693.6183776855469, + "completion_length": 1647.66748046875, "epoch": 0.5815846464043014, - "grad_norm": 11.033979415893555, - "kl": 0.6513671875, - "learning_rate": 4.4483220380673685e-08, - "loss": 0.1224, - "reward": 0.5133928805589676, - "reward_std": 0.19733186438679695, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 12.491554260253906, + "kl": 2.810546875, + "learning_rate": 2.2241610190336844e-07, + "loss": 0.177, + "reward": 0.581473246216774, + "reward_std": 0.14199474826455116, + "rewards/accuracy_reward": 0.13169643748551607, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3861607313156128, + "rewards/tag_count_reward": 0.4497768059372902, "step": 1947 }, { "clip_ratio": 0.0, - "completion_length": 1578.5090026855469, + "completion_length": 1514.4264526367188, "epoch": 0.5818833544918228, - "grad_norm": 7.210418701171875, - "kl": 0.40673828125, - "learning_rate": 4.443139053382766e-08, - "loss": 0.0717, - "reward": 0.5546875074505806, - "reward_std": 0.21886930987238884, - "rewards/accuracy_reward": 0.1517857201397419, + "grad_norm": 77.83522033691406, + "kl": 1.572265625, + "learning_rate": 2.221569526691383e-07, + "loss": 0.1352, + "reward": 0.5887276977300644, + "reward_std": 0.14795255847275257, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4029018059372902, + "rewards/tag_count_reward": 0.459263414144516, "step": 1948 }, { "clip_ratio": 0.0, - "completion_length": 1634.2366333007812, + "completion_length": 1569.5603332519531, "epoch": 0.5821820625793444, - "grad_norm": 7.379580020904541, - "kl": 0.646484375, - "learning_rate": 4.437956674508294e-08, - "loss": 0.1328, - "reward": 0.4765625223517418, - "reward_std": 0.23281536996364594, - "rewards/accuracy_reward": 0.10491071827709675, + "grad_norm": 37.207584381103516, + "kl": 1.900390625, + "learning_rate": 2.218978337254147e-07, + "loss": 0.1545, + "reward": 0.5452008992433548, + "reward_std": 0.13090214040130377, + "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3716517984867096, + "rewards/tag_count_reward": 0.4514509066939354, "step": 1949 }, { "clip_ratio": 0.0, - "completion_length": 1644.7768859863281, + "completion_length": 1516.9576416015625, "epoch": 0.5824807706668658, - "grad_norm": 6.763062953948975, - "kl": 0.701171875, - "learning_rate": 4.432774907081875e-08, - "loss": 0.1103, - "reward": 0.5167410895228386, - "reward_std": 0.19582484662532806, - "rewards/accuracy_reward": 0.1540178693830967, + "grad_norm": 12.996857643127441, + "kl": 3.08203125, + "learning_rate": 2.2163874535409375e-07, + "loss": 0.1997, + "reward": 0.6088169887661934, + "reward_std": 0.1362476423382759, + "rewards/accuracy_reward": 0.15625000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232313156128, + "rewards/tag_count_reward": 0.4525669813156128, "step": 1950 }, { "clip_ratio": 0.0, - "completion_length": 1651.2701721191406, + "completion_length": 1547.5491638183594, "epoch": 0.5827794787543873, - "grad_norm": 11.214900970458984, - "kl": 0.3935546875, - "learning_rate": 4.427593756740763e-08, - "loss": 0.1041, - "reward": 0.5507812723517418, - "reward_std": 0.24338475987315178, - "rewards/accuracy_reward": 0.1718750074505806, + "grad_norm": 20.087648391723633, + "kl": 2.0, + "learning_rate": 2.2137968783703813e-07, + "loss": 0.1781, + "reward": 0.6328125149011612, + "reward_std": 0.20250336825847626, + "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3789062649011612, + "rewards/tag_count_reward": 0.4497768133878708, "step": 1951 }, { "clip_ratio": 0.0, - "completion_length": 1619.8326721191406, + "completion_length": 1592.8460388183594, "epoch": 0.5830781868419087, - "grad_norm": 16.870525360107422, - "kl": 0.32177734375, - "learning_rate": 4.42241322912154e-08, - "loss": 0.0752, - "reward": 0.5039062723517418, - "reward_std": 0.21411184966564178, - "rewards/accuracy_reward": 0.10937500232830644, + "grad_norm": 25.490694046020508, + "kl": 1.53173828125, + "learning_rate": 2.2112066145607703e-07, + "loss": 0.0938, + "reward": 0.5781250298023224, + "reward_std": 0.1565748080611229, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3945312723517418, + "rewards/tag_count_reward": 0.4620535895228386, "step": 1952 }, { "clip_ratio": 0.0, - "completion_length": 1695.3505249023438, + "completion_length": 1643.3661499023438, "epoch": 0.5833768949294302, - "grad_norm": 5.278802394866943, - "kl": 0.78515625, - "learning_rate": 4.417233329860117e-08, - "loss": 0.1177, - "reward": 0.4118303805589676, - "reward_std": 0.2322327047586441, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 14.498685836791992, + "kl": 2.8515625, + "learning_rate": 2.2086166649300583e-07, + "loss": 0.186, + "reward": 0.4782366156578064, + "reward_std": 0.17088854685425758, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982387661934, + "rewards/tag_count_reward": 0.4358259066939354, "step": 1953 }, { "clip_ratio": 0.0, - "completion_length": 1640.2791137695312, + "completion_length": 1580.732177734375, "epoch": 0.5836756030169516, - "grad_norm": 24.8361759185791, - "kl": 0.8359375, - "learning_rate": 4.4120540645917124e-08, - "loss": 0.1266, - "reward": 0.526227705180645, - "reward_std": 0.1916242130100727, - "rewards/accuracy_reward": 0.15625000349245965, + "grad_norm": 16.9406681060791, + "kl": 3.4453125, + "learning_rate": 2.2060270322958562e-07, + "loss": 0.2124, + "reward": 0.5987723395228386, + "reward_std": 0.11908787302672863, + "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3699776977300644, + "rewards/tag_count_reward": 0.4402901977300644, "step": 1954 }, { "clip_ratio": 0.0, - "completion_length": 1721.4554443359375, + "completion_length": 1684.3170776367188, "epoch": 0.5839743111044732, - "grad_norm": 4.184187412261963, - "kl": 0.6298828125, - "learning_rate": 4.4068754389508614e-08, - "loss": 0.1193, - "reward": 0.4296875149011612, - "reward_std": 0.24544057995080948, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 21.546798706054688, + "kl": 3.57421875, + "learning_rate": 2.2034377194754306e-07, + "loss": 0.2313, + "reward": 0.5357143208384514, + "reward_std": 0.14970465376973152, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3448660895228386, + "rewards/tag_count_reward": 0.4375000223517418, "step": 1955 }, { "clip_ratio": 0.0, - "completion_length": 1732.0380249023438, + "completion_length": 1680.4442749023438, "epoch": 0.5842730191919946, - "grad_norm": 23.249361038208008, - "kl": 0.900390625, - "learning_rate": 4.401697458571399e-08, - "loss": 0.1146, - "reward": 0.4151785895228386, - "reward_std": 0.1874581277370453, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 20.07666015625, + "kl": 3.5234375, + "learning_rate": 2.2008487292856991e-07, + "loss": 0.2285, + "reward": 0.5005580559372902, + "reward_std": 0.11579084023833275, + "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3593750074505806, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1956 }, { "clip_ratio": 0.0, - "completion_length": 1677.0916137695312, + "completion_length": 1652.5603637695312, "epoch": 0.5845717272795161, - "grad_norm": 25.937400817871094, - "kl": 0.8720703125, - "learning_rate": 4.396520129086462e-08, - "loss": 0.1231, - "reward": 0.541294664144516, - "reward_std": 0.24069958552718163, - "rewards/accuracy_reward": 0.1718750074505806, + "grad_norm": 39.14363479614258, + "kl": 3.98828125, + "learning_rate": 2.198260064543231e-07, + "loss": 0.2412, + "reward": 0.616629496216774, + "reward_std": 0.19253928028047085, + "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3694196566939354, + "rewards/tag_count_reward": 0.4313616305589676, "step": 1957 }, { "clip_ratio": 0.0, - "completion_length": 1657.7947387695312, + "completion_length": 1629.5915832519531, "epoch": 0.5848704353670375, - "grad_norm": 27.89297866821289, - "kl": 0.8427734375, - "learning_rate": 4.3913434561284785e-08, - "loss": 0.1378, - "reward": 0.4732142984867096, - "reward_std": 0.1902882345020771, - "rewards/accuracy_reward": 0.10044643469154835, + "grad_norm": 23.324373245239258, + "kl": 3.51171875, + "learning_rate": 2.1956717280642392e-07, + "loss": 0.2232, + "reward": 0.5680803805589676, + "reward_std": 0.17788539454340935, + "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.436383955180645, "step": 1958 }, { "clip_ratio": 0.0, - "completion_length": 1674.0090026855469, + "completion_length": 1646.5581359863281, "epoch": 0.5851691434545591, - "grad_norm": 8.09544563293457, - "kl": 0.6884765625, - "learning_rate": 4.386167445329158e-08, - "loss": 0.1319, - "reward": 0.4748884215950966, - "reward_std": 0.26384004950523376, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 17.148895263671875, + "kl": 3.0625, + "learning_rate": 2.193083722664579e-07, + "loss": 0.183, + "reward": 0.5558035969734192, + "reward_std": 0.168367438018322, + "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.365513414144516, + "rewards/tag_count_reward": 0.4486607313156128, "step": 1959 }, { "clip_ratio": 0.0, - "completion_length": 1670.88623046875, + "completion_length": 1651.4911804199219, "epoch": 0.5854678515420805, - "grad_norm": 6.0982537269592285, - "kl": 0.6865234375, - "learning_rate": 4.380992102319496e-08, - "loss": 0.109, - "reward": 0.4252232238650322, - "reward_std": 0.18753017857670784, + "grad_norm": 22.90903091430664, + "kl": 3.5703125, + "learning_rate": 2.1904960511597481e-07, + "loss": 0.2218, + "reward": 0.4972098469734192, + "reward_std": 0.11686134897172451, "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839402794838, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1960 }, { "clip_ratio": 0.0, - "completion_length": 1686.9777526855469, + "completion_length": 1638.32373046875, "epoch": 0.585766559629602, - "grad_norm": 6.714524269104004, - "kl": 0.626953125, - "learning_rate": 4.3758174327297584e-08, - "loss": 0.1187, - "reward": 0.4614955559372902, - "reward_std": 0.20175254344940186, - "rewards/accuracy_reward": 0.08705357275903225, + "grad_norm": 28.854951858520508, + "kl": 2.5390625, + "learning_rate": 2.187908716364879e-07, + "loss": 0.1865, + "reward": 0.5641741380095482, + "reward_std": 0.16427361592650414, + "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419887661934, + "rewards/tag_count_reward": 0.4458705633878708, "step": 1961 }, { "clip_ratio": 0.0, - "completion_length": 1749.4442443847656, + "completion_length": 1613.8192749023438, "epoch": 0.5860652677171234, - "grad_norm": 5.055563449859619, - "kl": 0.7470703125, - "learning_rate": 4.370643442189477e-08, - "loss": 0.1321, - "reward": 0.489955373108387, - "reward_std": 0.22650102525949478, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 30.84535026550293, + "kl": 2.236328125, + "learning_rate": 2.1853217210947382e-07, + "loss": 0.1756, + "reward": 0.5998884066939354, + "reward_std": 0.1821010746061802, + "rewards/accuracy_reward": 0.15401786658912897, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3582589402794838, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1962 }, { "clip_ratio": 0.0, - "completion_length": 1646.1495971679688, + "completion_length": 1622.3170471191406, "epoch": 0.5863639758046449, - "grad_norm": 4.393374443054199, - "kl": 0.6875, - "learning_rate": 4.365470136327445e-08, - "loss": 0.1084, - "reward": 0.4319196566939354, - "reward_std": 0.21131715551018715, - "rewards/accuracy_reward": 0.07142857369035482, + "grad_norm": 11.063496589660645, + "kl": 2.953125, + "learning_rate": 2.1827350681637225e-07, + "loss": 0.1999, + "reward": 0.5234375223517418, + "reward_std": 0.14028967916965485, + "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3604910895228386, + "rewards/tag_count_reward": 0.4430803805589676, "step": 1963 }, { "clip_ratio": 0.0, - "completion_length": 1677.6116638183594, + "completion_length": 1602.3282165527344, "epoch": 0.5866626838921664, - "grad_norm": 4.57485294342041, - "kl": 0.689453125, - "learning_rate": 4.360297520771716e-08, - "loss": 0.1114, - "reward": 0.3978794813156128, - "reward_std": 0.21825696155428886, - "rewards/accuracy_reward": 0.03571428661234677, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3599330559372902, + "grad_norm": 18.20419692993164, + "kl": 3.09765625, + "learning_rate": 2.1801487603858578e-07, + "loss": 0.1993, + "reward": 0.4810268059372902, + "reward_std": 0.14484633691608906, + "rewards/accuracy_reward": 0.03571428684517741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4453125223517418, "step": 1964 }, { "clip_ratio": 0.0, - "completion_length": 1685.0938110351562, + "completion_length": 1637.2813110351562, "epoch": 0.5869613919796879, - "grad_norm": 4.518319606781006, - "kl": 0.6787109375, - "learning_rate": 4.3551256011495865e-08, - "loss": 0.0831, - "reward": 0.4419643059372902, - "reward_std": 0.23325100913643837, - "rewards/accuracy_reward": 0.06919643189758062, + "grad_norm": 38.480751037597656, + "kl": 2.513671875, + "learning_rate": 2.177562800574793e-07, + "loss": 0.1714, + "reward": 0.5044642984867096, + "reward_std": 0.1829574517905712, + "rewards/accuracy_reward": 0.06250000139698386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3727678805589676, + "rewards/tag_count_reward": 0.4419643059372902, "step": 1965 }, { "clip_ratio": 0.0, - "completion_length": 1657.97998046875, + "completion_length": 1553.6027526855469, "epoch": 0.5872601000672093, - "grad_norm": 10.316750526428223, - "kl": 0.6748046875, - "learning_rate": 4.3499543830875975e-08, - "loss": 0.1262, - "reward": 0.5011160895228386, - "reward_std": 0.20159783586859703, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 46.01110076904297, + "kl": 1.93359375, + "learning_rate": 2.1749771915437987e-07, + "loss": 0.1441, + "reward": 0.5753348395228386, + "reward_std": 0.13641639985144138, + "rewards/accuracy_reward": 0.11160714970901608, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3783482238650322, + "rewards/tag_count_reward": 0.4637276977300644, "step": 1966 }, { "clip_ratio": 0.0, - "completion_length": 1613.8884887695312, + "completion_length": 1593.8818054199219, "epoch": 0.5875588081547308, - "grad_norm": 5.7705302238464355, - "kl": 0.59716796875, - "learning_rate": 4.3447838722115295e-08, - "loss": 0.1064, - "reward": 0.502790205180645, - "reward_std": 0.230324886739254, - "rewards/accuracy_reward": 0.12723215157166123, + "grad_norm": 25.398975372314453, + "kl": 2.013671875, + "learning_rate": 2.1723919361057646e-07, + "loss": 0.1484, + "reward": 0.5937500298023224, + "reward_std": 0.14600478298962116, + "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580484867096, + "rewards/tag_count_reward": 0.4642857313156128, "step": 1967 }, { "clip_ratio": 0.0, - "completion_length": 1709.5982666015625, + "completion_length": 1653.6094665527344, "epoch": 0.5878575162422522, - "grad_norm": 9.069467544555664, - "kl": 0.6259765625, - "learning_rate": 4.339614074146391e-08, - "loss": 0.1095, - "reward": 0.3889509066939354, - "reward_std": 0.19291609525680542, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 41.52937316894531, + "kl": 1.9814453125, + "learning_rate": 2.1698070370731953e-07, + "loss": 0.1432, + "reward": 0.4732143133878708, + "reward_std": 0.11274466384202242, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937649011612, + "rewards/tag_count_reward": 0.4575893059372902, "step": 1968 }, { "clip_ratio": 0.0, - "completion_length": 1754.1808776855469, + "completion_length": 1674.76123046875, "epoch": 0.5881562243297738, - "grad_norm": 5.217681884765625, - "kl": 0.8251953125, - "learning_rate": 4.334444994516414e-08, - "loss": 0.1242, - "reward": 0.4017857387661934, - "reward_std": 0.21614592149853706, - "rewards/accuracy_reward": 0.058035716181620955, + "grad_norm": 38.19734573364258, + "kl": 2.775390625, + "learning_rate": 2.167222497258207e-07, + "loss": 0.1936, + "reward": 0.5033482387661934, + "reward_std": 0.15683877281844616, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.431919664144516, "step": 1969 }, { "clip_ratio": 0.0, - "completion_length": 1717.82373046875, + "completion_length": 1682.72998046875, "epoch": 0.5884549324172952, - "grad_norm": 17.121957778930664, - "kl": 0.9248046875, - "learning_rate": 4.329276638945051e-08, - "loss": 0.1263, - "reward": 0.4268973469734192, - "reward_std": 0.17948058247566223, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 6.867143630981445, + "kl": 3.19921875, + "learning_rate": 2.1646383194725253e-07, + "loss": 0.1959, + "reward": 0.5128348395228386, + "reward_std": 0.10380838066339493, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.353236623108387, + "rewards/tag_count_reward": 0.4369419813156128, "step": 1970 }, { "clip_ratio": 0.0, - "completion_length": 1786.4041137695312, + "completion_length": 1713.0201416015625, "epoch": 0.5887536405048167, - "grad_norm": 4.491950035095215, - "kl": 0.6884765625, - "learning_rate": 4.324109013054968e-08, - "loss": 0.1009, - "reward": 0.4430803656578064, - "reward_std": 0.22021972388029099, - "rewards/accuracy_reward": 0.09375000395812094, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3493303656578064, + "grad_norm": 12.690998077392578, + "kl": 3.142578125, + "learning_rate": 2.162054506527484e-07, + "loss": 0.1879, + "reward": 0.5396205559372902, + "reward_std": 0.13667017593979836, + "rewards/accuracy_reward": 0.09375000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1971 }, { "clip_ratio": 0.0, - "completion_length": 1699.8840026855469, + "completion_length": 1629.6875610351562, "epoch": 0.5890523485923381, - "grad_norm": 11.003803253173828, - "kl": 0.58740234375, - "learning_rate": 4.318942122468034e-08, - "loss": 0.1341, - "reward": 0.5825893133878708, - "reward_std": 0.22798555344343185, - "rewards/accuracy_reward": 0.2165178619325161, + "grad_norm": 33.08745574951172, + "kl": 2.451171875, + "learning_rate": 2.159471061234017e-07, + "loss": 0.1754, + "reward": 0.7036830633878708, + "reward_std": 0.1653738860040903, + "rewards/accuracy_reward": 0.2522321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3660714477300644, + "rewards/tag_count_reward": 0.451450914144516, "step": 1972 }, { "clip_ratio": 0.0, - "completion_length": 1711.0447387695312, + "completion_length": 1708.5357971191406, "epoch": 0.5893510566798597, - "grad_norm": 3.694277763366699, - "kl": 0.640625, - "learning_rate": 4.3137759728053205e-08, - "loss": 0.1053, - "reward": 0.4547991305589676, - "reward_std": 0.22769004851579666, - "rewards/accuracy_reward": 0.09598214644938707, + "grad_norm": 46.91404724121094, + "kl": 4.734375, + "learning_rate": 2.15688798640266e-07, + "loss": 0.2664, + "reward": 0.5429687649011612, + "reward_std": 0.19622131809592247, + "rewards/accuracy_reward": 0.10714286146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169887661934, + "rewards/tag_count_reward": 0.435825914144516, "step": 1973 }, { "clip_ratio": 0.0, - "completion_length": 1611.2568054199219, + "completion_length": 1531.7723999023438, "epoch": 0.5896497647673811, - "grad_norm": 17.761503219604492, - "kl": 0.7900390625, - "learning_rate": 4.30861056968709e-08, - "loss": 0.1149, - "reward": 0.4497768133878708, - "reward_std": 0.22443095967173576, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 21.30575942993164, + "kl": 2.240234375, + "learning_rate": 2.154305284843545e-07, + "loss": 0.1571, + "reward": 0.5440848618745804, + "reward_std": 0.14968426525592804, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.364955373108387, + "rewards/tag_count_reward": 0.4525669813156128, "step": 1974 }, { "clip_ratio": 0.0, - "completion_length": 1656.5737609863281, + "completion_length": 1658.5067749023438, "epoch": 0.5899484728549026, - "grad_norm": 50.35771560668945, - "kl": 0.97119140625, - "learning_rate": 4.303445918732797e-08, - "loss": 0.144, - "reward": 0.5323660969734192, - "reward_std": 0.22705606743693352, - "rewards/accuracy_reward": 0.15848215413279831, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "grad_norm": 88.57762908935547, + "kl": 4.6328125, + "learning_rate": 2.1517229593663985e-07, + "loss": 0.2536, + "reward": 0.5976562649011612, + "reward_std": 0.16410492360591888, + "rewards/accuracy_reward": 0.14955357881262898, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4481026977300644, "step": 1975 }, { "clip_ratio": 0.0, - "completion_length": 1716.1250915527344, + "completion_length": 1658.4107971191406, "epoch": 0.590247180942424, - "grad_norm": 7.33389139175415, - "kl": 0.767578125, - "learning_rate": 4.298282025561075e-08, - "loss": 0.1247, - "reward": 0.481584832072258, - "reward_std": 0.17371326312422752, - "rewards/accuracy_reward": 0.12053572107106447, + "grad_norm": 166.81121826171875, + "kl": 7.125, + "learning_rate": 2.1491410127805377e-07, + "loss": 0.3638, + "reward": 0.5530134066939354, + "reward_std": 0.1174120381474495, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3610491305589676, + "rewards/tag_count_reward": 0.4347098395228386, "step": 1976 }, { "clip_ratio": 0.0, - "completion_length": 1647.0625915527344, + "completion_length": 1601.9063415527344, "epoch": 0.5905458890299455, - "grad_norm": 7.200290203094482, - "kl": 0.7783203125, - "learning_rate": 4.293118895789733e-08, - "loss": 0.133, - "reward": 0.4620535969734192, - "reward_std": 0.1999606415629387, - "rewards/accuracy_reward": 0.10491071920841932, + "grad_norm": 73.37601470947266, + "kl": 6.1953125, + "learning_rate": 2.146559447894866e-07, + "loss": 0.3785, + "reward": 0.5251116305589676, + "reward_std": 0.1434258446097374, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357142873108387, + "rewards/tag_count_reward": 0.4268973395228386, "step": 1977 }, { "clip_ratio": 0.0, - "completion_length": 1730.21435546875, + "completion_length": 1685.2076721191406, "epoch": 0.590844597117467, - "grad_norm": 5.659553527832031, - "kl": 0.783203125, - "learning_rate": 4.2879565350357505e-08, - "loss": 0.1376, - "reward": 0.4654018059372902, - "reward_std": 0.2055562436580658, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 10.409059524536133, + "kl": 4.07421875, + "learning_rate": 2.1439782675178752e-07, + "loss": 0.2518, + "reward": 0.5719866454601288, + "reward_std": 0.15977041609585285, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4291294813156128, "step": 1978 }, { "clip_ratio": 0.0, - "completion_length": 1680.0982971191406, + "completion_length": 1649.2969360351562, "epoch": 0.5911433052049885, - "grad_norm": 14.421250343322754, - "kl": 0.75732421875, - "learning_rate": 4.282794948915271e-08, - "loss": 0.1282, - "reward": 0.451450914144516, - "reward_std": 0.22412951290607452, - "rewards/accuracy_reward": 0.06473214784637094, + "grad_norm": 64.11497497558594, + "kl": 3.93359375, + "learning_rate": 2.1413974744576355e-07, + "loss": 0.2074, + "reward": 0.5228794887661934, + "reward_std": 0.1752008181065321, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3867187723517418, + "rewards/tag_count_reward": 0.4447544887661934, "step": 1979 }, { "clip_ratio": 0.0, - "completion_length": 1702.9375610351562, + "completion_length": 1621.4107666015625, "epoch": 0.5914420132925099, - "grad_norm": 10.170560836791992, - "kl": 0.791015625, - "learning_rate": 4.277634143043593e-08, - "loss": 0.1318, - "reward": 0.424107164144516, - "reward_std": 0.18761087208986282, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 15.70301342010498, + "kl": 3.025390625, + "learning_rate": 2.1388170715217966e-07, + "loss": 0.1899, + "reward": 0.5156250223517418, + "reward_std": 0.13348249904811382, + "rewards/accuracy_reward": 0.06250000419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.4531250223517418, "step": 1980 }, { "clip_ratio": 0.0, - "completion_length": 1694.2366943359375, + "completion_length": 1663.8170471191406, "epoch": 0.5917407213800313, - "grad_norm": 4.966330051422119, - "kl": 0.61083984375, - "learning_rate": 4.272474123035167e-08, - "loss": 0.0946, - "reward": 0.4955357387661934, - "reward_std": 0.19870809465646744, - "rewards/accuracy_reward": 0.11830358020961285, + "grad_norm": 10.870034217834473, + "kl": 2.86328125, + "learning_rate": 2.1362370615175834e-07, + "loss": 0.1746, + "reward": 0.5848214700818062, + "reward_std": 0.16083652526140213, + "rewards/accuracy_reward": 0.13392857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.377232164144516, + "rewards/tag_count_reward": 0.4508928805589676, "step": 1981 }, { "clip_ratio": 0.0, - "completion_length": 1695.1764221191406, + "completion_length": 1634.7411193847656, "epoch": 0.5920394294675528, - "grad_norm": 11.007241249084473, - "kl": 0.59033203125, - "learning_rate": 4.2673148945035905e-08, - "loss": 0.1049, - "reward": 0.4408482313156128, - "reward_std": 0.22406017035245895, - "rewards/accuracy_reward": 0.06696428917348385, + "grad_norm": 51.173702239990234, + "kl": 3.93359375, + "learning_rate": 2.1336574472517953e-07, + "loss": 0.2281, + "reward": 0.5117187723517418, + "reward_std": 0.1355637740343809, + "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3738839477300644, + "rewards/tag_count_reward": 0.451450914144516, "step": 1982 }, { "clip_ratio": 0.0, - "completion_length": 1712.8572387695312, + "completion_length": 1641.3371276855469, "epoch": 0.5923381375550743, - "grad_norm": 7.774020671844482, - "kl": 0.6865234375, - "learning_rate": 4.262156463061599e-08, - "loss": 0.1266, - "reward": 0.412388414144516, - "reward_std": 0.20057574287056923, - "rewards/accuracy_reward": 0.05803571501746774, + "grad_norm": 30.83989143371582, + "kl": 3.3125, + "learning_rate": 2.1310782315307995e-07, + "loss": 0.2157, + "reward": 0.5189732387661934, + "reward_std": 0.1435831133276224, + "rewards/accuracy_reward": 0.06696428754366934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526977300644, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1983 }, { "clip_ratio": 0.0, - "completion_length": 1753.4666137695312, + "completion_length": 1723.9732971191406, "epoch": 0.5926368456425958, - "grad_norm": 5.822809219360352, - "kl": 0.62890625, - "learning_rate": 4.2569988343210576e-08, - "loss": 0.109, - "reward": 0.3973214477300644, - "reward_std": 0.21568941324949265, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 5.076685905456543, + "kl": 3.005859375, + "learning_rate": 2.1284994171605288e-07, + "loss": 0.1816, + "reward": 0.4787946715950966, + "reward_std": 0.11630864068865776, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.372767873108387, + "rewards/tag_count_reward": 0.4497768133878708, "step": 1984 }, { "clip_ratio": 0.0, - "completion_length": 1795.21435546875, + "completion_length": 1708.4933776855469, "epoch": 0.5929355537301172, - "grad_norm": 11.931553840637207, - "kl": 0.41064453125, - "learning_rate": 4.251842013892964e-08, - "loss": 0.0918, - "reward": 0.5518973544239998, - "reward_std": 0.24127205461263657, - "rewards/accuracy_reward": 0.20312501303851604, + "grad_norm": 5.019188404083252, + "kl": 3.35546875, + "learning_rate": 2.125921006946482e-07, + "loss": 0.2124, + "reward": 0.647879496216774, + "reward_std": 0.19821508787572384, + "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3487723395228386, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1985 }, { "clip_ratio": 0.0, - "completion_length": 1625.3929443359375, + "completion_length": 1566.1920471191406, "epoch": 0.5932342618176387, - "grad_norm": 27.32845687866211, - "kl": 0.39208984375, - "learning_rate": 4.246686007387431e-08, - "loss": 0.1293, - "reward": 0.4302455559372902, - "reward_std": 0.20216409116983414, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 24.845048904418945, + "kl": 2.396484375, + "learning_rate": 2.1233430036937156e-07, + "loss": 0.211, + "reward": 0.5267857313156128, + "reward_std": 0.12628904543817043, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419813156128, + "rewards/tag_count_reward": 0.4598214477300644, "step": 1986 }, { "clip_ratio": 0.0, - "completion_length": 1748.8907165527344, + "completion_length": 1715.0915832519531, "epoch": 0.5935329699051601, - "grad_norm": 6.230075836181641, - "kl": 0.849609375, - "learning_rate": 4.2415308204136904e-08, - "loss": 0.1255, - "reward": 0.4107143059372902, - "reward_std": 0.22383471950888634, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 22.309974670410156, + "kl": 2.78515625, + "learning_rate": 2.120765410206845e-07, + "loss": 0.1725, + "reward": 0.5005580484867096, + "reward_std": 0.12635646015405655, + "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.352678582072258, + "rewards/tag_count_reward": 0.446986623108387, "step": 1987 }, { "clip_ratio": 0.0, - "completion_length": 1622.3058776855469, + "completion_length": 1579.6340026855469, "epoch": 0.5938316779926817, - "grad_norm": 7.642927646636963, - "kl": 0.69775390625, - "learning_rate": 4.236376458580077e-08, - "loss": 0.1006, - "reward": 0.481584832072258, - "reward_std": 0.18050462752580643, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 13.213762283325195, + "kl": 2.375, + "learning_rate": 2.1181882292900386e-07, + "loss": 0.1692, + "reward": 0.5524553805589676, + "reward_std": 0.16034020483493805, + "rewards/accuracy_reward": 0.10044643585570157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3833705484867096, + "rewards/tag_count_reward": 0.4520089477300644, "step": 1988 }, { "clip_ratio": 0.0, - "completion_length": 1658.1384582519531, + "completion_length": 1570.2277221679688, "epoch": 0.5941303860802031, - "grad_norm": 6.475239276885986, - "kl": 0.66015625, - "learning_rate": 4.2312229274940355e-08, - "loss": 0.1168, - "reward": 0.4810268059372902, - "reward_std": 0.1776122972369194, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 25.873594284057617, + "kl": 2.1796875, + "learning_rate": 2.1156114637470178e-07, + "loss": 0.1642, + "reward": 0.606584832072258, + "reward_std": 0.13057277165353298, + "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232313156128, + "rewards/tag_count_reward": 0.4614955559372902, "step": 1989 }, { "clip_ratio": 0.0, - "completion_length": 1701.4219360351562, + "completion_length": 1673.118408203125, "epoch": 0.5944290941677246, - "grad_norm": 20.676559448242188, - "kl": 0.955078125, - "learning_rate": 4.2260702327621e-08, - "loss": 0.1246, - "reward": 0.4436384215950966, - "reward_std": 0.22326533868908882, - "rewards/accuracy_reward": 0.08928571944124997, + "grad_norm": 20.207181930541992, + "kl": 2.33984375, + "learning_rate": 2.1130351163810502e-07, + "loss": 0.1407, + "reward": 0.584263414144516, + "reward_std": 0.18858056142926216, + "rewards/accuracy_reward": 0.12723215110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526977300644, + "rewards/tag_count_reward": 0.4570312723517418, "step": 1990 }, { "clip_ratio": 0.0, - "completion_length": 1707.5670776367188, + "completion_length": 1631.0357971191406, "epoch": 0.594727802255246, - "grad_norm": 32.89483642578125, - "kl": 0.9873046875, - "learning_rate": 4.220918379989897e-08, - "loss": 0.1333, - "reward": 0.380580373108387, - "reward_std": 0.17391939833760262, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 30.100889205932617, + "kl": 3.125, + "learning_rate": 2.1104591899949487e-07, + "loss": 0.2222, + "reward": 0.4575893133878708, + "reward_std": 0.10735885240137577, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3761160895228386, + "rewards/tag_count_reward": 0.4508928880095482, "step": 1991 }, { "clip_ratio": 0.0, - "completion_length": 1686.4308471679688, + "completion_length": 1589.4085693359375, "epoch": 0.5950265103427675, - "grad_norm": 7.876748085021973, - "kl": 0.6533203125, - "learning_rate": 4.21576737478214e-08, - "loss": 0.1196, - "reward": 0.4949776977300644, - "reward_std": 0.25235485285520554, - "rewards/accuracy_reward": 0.1316964328289032, + "grad_norm": 21.332429885864258, + "kl": 2.6484375, + "learning_rate": 2.1078836873910698e-07, + "loss": 0.1565, + "reward": 0.6004464477300644, + "reward_std": 0.1508875098079443, + "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812574505806, + "rewards/tag_count_reward": 0.4642857387661934, "step": 1992 }, { "clip_ratio": 0.0, - "completion_length": 1719.83935546875, + "completion_length": 1675.1719665527344, "epoch": 0.595325218430289, - "grad_norm": 33.949195861816406, - "kl": 0.91015625, - "learning_rate": 4.210617222742616e-08, - "loss": 0.1401, - "reward": 0.3738839477300644, - "reward_std": 0.21431024372577667, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 19.49724006652832, + "kl": 3.78515625, + "learning_rate": 2.105308611371308e-07, + "loss": 0.2449, + "reward": 0.474330373108387, + "reward_std": 0.15705344267189503, + "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3582589477300644, + "rewards/tag_count_reward": 0.4453125298023224, "step": 1993 }, { "clip_ratio": 0.0, - "completion_length": 1760.0848999023438, + "completion_length": 1657.3906860351562, "epoch": 0.5956239265178105, - "grad_norm": 7.427504062652588, - "kl": 0.787109375, - "learning_rate": 4.205467929474185e-08, - "loss": 0.1258, - "reward": 0.3643973395228386, - "reward_std": 0.21350497752428055, - "rewards/accuracy_reward": 0.017857143888249993, + "grad_norm": 32.924652099609375, + "kl": 3.5234375, + "learning_rate": 2.1027339647370926e-07, + "loss": 0.1961, + "reward": 0.4693080484867096, + "reward_std": 0.12829893082380295, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401977300644, + "rewards/tag_count_reward": 0.4492187649011612, "step": 1994 }, { "clip_ratio": 0.0, - "completion_length": 1767.0536804199219, + "completion_length": 1736.8237609863281, "epoch": 0.5959226346053319, - "grad_norm": 18.771976470947266, - "kl": 0.9658203125, - "learning_rate": 4.200319500578774e-08, - "loss": 0.1181, - "reward": 0.4224330559372902, - "reward_std": 0.2509072422981262, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 83.11372375488281, + "kl": 5.58203125, + "learning_rate": 2.100159750289387e-07, + "loss": 0.3079, + "reward": 0.5318080633878708, + "reward_std": 0.17447687312960625, + "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4380580559372902, "step": 1995 }, { "clip_ratio": 0.0, - "completion_length": 1742.3505249023438, + "completion_length": 1739.4911804199219, "epoch": 0.5962213426928534, - "grad_norm": 15.729413986206055, - "kl": 0.48876953125, - "learning_rate": 4.19517194165737e-08, - "loss": 0.108, - "reward": 0.465959832072258, - "reward_std": 0.2041163258254528, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 22.46120834350586, + "kl": 4.15234375, + "learning_rate": 2.097585970828685e-07, + "loss": 0.2448, + "reward": 0.5641741454601288, + "reward_std": 0.1256833542138338, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4436384215950966, "step": 1996 }, { "clip_ratio": 0.0, - "completion_length": 1701.7790832519531, + "completion_length": 1659.4554443359375, "epoch": 0.5965200507803748, - "grad_norm": 12.528977394104004, - "kl": 0.54541015625, - "learning_rate": 4.190025258310012e-08, - "loss": 0.1134, - "reward": 0.4587053880095482, - "reward_std": 0.2617247998714447, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 20.094161987304688, + "kl": 3.474609375, + "learning_rate": 2.095012629155006e-07, + "loss": 0.2151, + "reward": 0.5597098618745804, + "reward_std": 0.1981860287487507, + "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.4458705559372902, "step": 1997 }, { "clip_ratio": 0.0, - "completion_length": 1752.3393859863281, + "completion_length": 1695.29248046875, "epoch": 0.5968187588678964, - "grad_norm": 5.337209701538086, - "kl": 0.798828125, - "learning_rate": 4.184879456135786e-08, - "loss": 0.1269, - "reward": 0.4547991380095482, - "reward_std": 0.19533061981201172, - "rewards/accuracy_reward": 0.11383928963914514, + "grad_norm": 16.4444522857666, + "kl": 3.83203125, + "learning_rate": 2.092439728067893e-07, + "loss": 0.2343, + "reward": 0.5619419887661934, + "reward_std": 0.15152296982705593, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4302455559372902, "step": 1998 }, { "clip_ratio": 0.0, - "completion_length": 1704.4420166015625, + "completion_length": 1652.2702026367188, "epoch": 0.5971174669554178, - "grad_norm": 6.705286502838135, - "kl": 0.72412109375, - "learning_rate": 4.179734540732822e-08, - "loss": 0.1135, - "reward": 0.4001116305589676, - "reward_std": 0.19999108090996742, - "rewards/accuracy_reward": 0.033482145285233855, + "grad_norm": 6.168159008026123, + "kl": 3.74609375, + "learning_rate": 2.0898672703664112e-07, + "loss": 0.2429, + "reward": 0.4581473469734192, + "reward_std": 0.14216610044240952, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.4291294813156128, "step": 1999 }, { "clip_ratio": 0.0, - "completion_length": 1698.7166137695312, + "completion_length": 1673.3416137695312, "epoch": 0.5974161750429393, - "grad_norm": 19.309070587158203, - "kl": 0.9150390625, - "learning_rate": 4.174590517698283e-08, - "loss": 0.1381, - "reward": 0.473772332072258, - "reward_std": 0.20954426750540733, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 29.924734115600586, + "kl": 1.763671875, + "learning_rate": 2.0872952588491415e-07, + "loss": 0.1207, + "reward": 0.5764509290456772, + "reward_std": 0.1512145884335041, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3755580559372902, + "rewards/tag_count_reward": 0.4603794887661934, "step": 2000 }, { "clip_ratio": 0.0, - "completion_length": 1420.9107971191406, + "completion_length": 2007.0134582519531, "epoch": 0.5977148831304607, - "grad_norm": 1.105141282081604, - "kl": 0.255859375, - "learning_rate": 4.1694473926283624e-08, - "loss": 0.0393, - "reward": 0.3950893059372902, - "reward_std": 0.21245869249105453, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 19.01376724243164, + "kl": 1.60546875, + "learning_rate": 2.0847236963141812e-07, + "loss": 0.1022, + "reward": 0.2494419738650322, + "reward_std": 0.10328670870512724, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214477300644, + "rewards/tag_count_reward": 0.2405133992433548, "step": 2001 }, { "clip_ratio": 0.0, - "completion_length": 1412.4665832519531, + "completion_length": 1958.4598999023438, "epoch": 0.5980135912179823, - "grad_norm": 1.8007359504699707, - "kl": 0.286865234375, - "learning_rate": 4.1643051711182743e-08, - "loss": 0.0429, - "reward": 0.4743303880095482, - "reward_std": 0.21934324502944946, - "rewards/accuracy_reward": 0.1517857201397419, + "grad_norm": 22.44007110595703, + "kl": 1.5703125, + "learning_rate": 2.082152585559137e-07, + "loss": 0.1331, + "reward": 0.3777901977300644, + "reward_std": 0.10609771031886339, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.2527901940047741, "step": 2002 }, { "clip_ratio": 0.0, - "completion_length": 1439.4822082519531, + "completion_length": 2026.2835388183594, "epoch": 0.5983122993055037, - "grad_norm": 1.8960949182510376, - "kl": 0.268798828125, - "learning_rate": 4.159163858762254e-08, - "loss": 0.0271, - "reward": 0.5284598469734192, - "reward_std": 0.23807164654135704, - "rewards/accuracy_reward": 0.2098214365541935, + "grad_norm": 17.095685958862305, + "kl": 1.673828125, + "learning_rate": 2.079581929381127e-07, + "loss": 0.0902, + "reward": 0.3867187649011612, + "reward_std": 0.09345119819045067, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318638414144516, + "rewards/tag_count_reward": 0.2349330484867096, "step": 2003 }, { "clip_ratio": 0.0, - "completion_length": 1402.8147888183594, + "completion_length": 2010.7277526855469, "epoch": 0.5986110073930252, - "grad_norm": 1.844132423400879, - "kl": 0.2802734375, - "learning_rate": 4.154023461153545e-08, - "loss": 0.0123, - "reward": 0.3722098395228386, - "reward_std": 0.2111089564859867, - "rewards/accuracy_reward": 0.04017857415601611, + "grad_norm": 20.289731979370117, + "kl": 1.697265625, + "learning_rate": 2.0770117305767726e-07, + "loss": 0.0975, + "reward": 0.2449776865541935, + "reward_std": 0.07792892586439848, + "rewards/accuracy_reward": 0.0022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.2427455447614193, "step": 2004 }, { "clip_ratio": 0.0, - "completion_length": 1336.6161193847656, + "completion_length": 1989.1385192871094, "epoch": 0.5989097154805466, - "grad_norm": 1.4591615200042725, - "kl": 0.2890625, - "learning_rate": 4.148883983884394e-08, - "loss": 0.0581, - "reward": 0.4955357387661934, - "reward_std": 0.20487921312451363, - "rewards/accuracy_reward": 0.1785714365541935, + "grad_norm": 22.676464080810547, + "kl": 1.71875, + "learning_rate": 2.074441991942197e-07, + "loss": 0.1194, + "reward": 0.3219866193830967, + "reward_std": 0.1178138442337513, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.316964291036129, + "rewards/tag_count_reward": 0.2371651902794838, "step": 2005 }, { "clip_ratio": 0.0, - "completion_length": 1418.93310546875, + "completion_length": 2013.8550109863281, "epoch": 0.5992084235680681, - "grad_norm": 1.2844178676605225, - "kl": 0.2998046875, - "learning_rate": 4.143745432546052e-08, - "loss": -0.0038, - "reward": 0.4497768059372902, - "reward_std": 0.19247402995824814, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 29.393930435180664, + "kl": 1.734375, + "learning_rate": 2.0718727162730259e-07, + "loss": 0.0967, + "reward": 0.3476562798023224, + "reward_std": 0.08774643391370773, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.2360491156578064, "step": 2006 }, { "clip_ratio": 0.0, - "completion_length": 1472.7411499023438, + "completion_length": 2016.9019165039062, "epoch": 0.5995071316555896, - "grad_norm": 0.9667583703994751, - "kl": 0.30712890625, - "learning_rate": 4.1386078127287574e-08, - "loss": 0.0001, - "reward": 0.3599330484867096, - "reward_std": 0.21735002472996712, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 22.131059646606445, + "kl": 1.76171875, + "learning_rate": 2.0693039063643784e-07, + "loss": 0.0932, + "reward": 0.2444196529686451, + "reward_std": 0.07599498424679041, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.2399553693830967, "step": 2007 }, { "clip_ratio": 0.0, - "completion_length": 1453.1473999023438, + "completion_length": 1983.3639221191406, "epoch": 0.5998058397431111, - "grad_norm": 1.4520924091339111, - "kl": 0.31494140625, - "learning_rate": 4.1334711300217365e-08, - "loss": 0.0329, - "reward": 0.3191964402794838, - "reward_std": 0.17764576897025108, - "rewards/accuracy_reward": 0.026785716181620955, + "grad_norm": 27.39740753173828, + "kl": 1.734375, + "learning_rate": 2.0667355650108682e-07, + "loss": 0.1162, + "reward": 0.2572544738650322, + "reward_std": 0.07956382818520069, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107238650322, + "rewards/tag_count_reward": 0.2438616156578064, "step": 2008 }, { "clip_ratio": 0.0, - "completion_length": 1397.7255249023438, + "completion_length": 2018.8773193359375, "epoch": 0.6001045478306325, - "grad_norm": 2.038682699203491, - "kl": 0.28466796875, - "learning_rate": 4.1283353900131964e-08, - "loss": 0.0317, - "reward": 0.4559151977300644, - "reward_std": 0.24211791902780533, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 27.842668533325195, + "kl": 1.82421875, + "learning_rate": 2.064167695006598e-07, + "loss": 0.1015, + "reward": 0.2700892984867096, + "reward_std": 0.06435556150972843, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.2343750111758709, "step": 2009 }, { "clip_ratio": 0.0, - "completion_length": 1420.7210388183594, + "completion_length": 1995.9130249023438, "epoch": 0.600403255918154, - "grad_norm": 2.185366153717041, - "kl": 0.277099609375, - "learning_rate": 4.1232005982903206e-08, - "loss": 0.0475, - "reward": 0.4012276977300644, - "reward_std": 0.20776939019560814, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 30.926021575927734, + "kl": 1.82421875, + "learning_rate": 2.0616002991451602e-07, + "loss": 0.1094, + "reward": 0.2851562649011612, + "reward_std": 0.07530072797089815, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.247209832072258, "step": 2010 }, { "clip_ratio": 0.0, - "completion_length": 1400.8951721191406, + "completion_length": 1957.8884887695312, "epoch": 0.6007019640056754, - "grad_norm": 1.469215989112854, - "kl": 0.30810546875, - "learning_rate": 4.1180667604392584e-08, - "loss": 0.0231, - "reward": 0.361607164144516, - "reward_std": 0.2198587767779827, - "rewards/accuracy_reward": 0.046875002793967724, + "grad_norm": 35.56877899169922, + "kl": 1.734375, + "learning_rate": 2.0590333802196293e-07, + "loss": 0.128, + "reward": 0.279017873108387, + "reward_std": 0.09334464650601149, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.2566964402794838, "step": 2011 }, { "clip_ratio": 0.0, - "completion_length": 1423.26123046875, + "completion_length": 1993.5223999023438, "epoch": 0.601000672093197, - "grad_norm": 1.8092372417449951, - "kl": 0.32763671875, - "learning_rate": 4.1129338820451206e-08, - "loss": 0.0209, - "reward": 0.463169664144516, - "reward_std": 0.2287929505109787, - "rewards/accuracy_reward": 0.15178572200238705, + "grad_norm": 35.18021011352539, + "kl": 1.998046875, + "learning_rate": 2.0564669410225606e-07, + "loss": 0.0972, + "reward": 0.3526785895228386, + "reward_std": 0.060251260176301, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.2410714365541935, "step": 2012 }, { "clip_ratio": 0.0, - "completion_length": 1344.9554138183594, + "completion_length": 1958.9353637695312, "epoch": 0.6012993801807184, - "grad_norm": 2.337693214416504, - "kl": 0.267333984375, - "learning_rate": 4.107801968691979e-08, - "loss": 0.0486, - "reward": 0.4252232313156128, - "reward_std": 0.20602432638406754, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 38.1002311706543, + "kl": 1.8515625, + "learning_rate": 2.0539009843459895e-07, + "loss": 0.1373, + "reward": 0.301897332072258, + "reward_std": 0.10135246813297272, + "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3537946566939354, + "rewards/tag_count_reward": 0.2572544738650322, "step": 2013 }, { "clip_ratio": 0.0, - "completion_length": 1419.3058776855469, + "completion_length": 1948.46435546875, "epoch": 0.6015980882682399, - "grad_norm": 2.136259078979492, - "kl": 0.270751953125, - "learning_rate": 4.10267102596285e-08, - "loss": 0.0442, - "reward": 0.4207589477300644, - "reward_std": 0.25104551389813423, - "rewards/accuracy_reward": 0.08705357648432255, + "grad_norm": 41.63920211791992, + "kl": 1.94921875, + "learning_rate": 2.0513355129814252e-07, + "loss": 0.1188, + "reward": 0.3035714477300644, + "reward_std": 0.09025296848267317, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.2566964402794838, "step": 2014 }, { "clip_ratio": 0.0, - "completion_length": 1418.1719360351562, + "completion_length": 1944.9353637695312, "epoch": 0.6018967963557613, - "grad_norm": 2.2952959537506104, - "kl": 0.29248046875, - "learning_rate": 4.097541059439698e-08, - "loss": 0.027, - "reward": 0.4185267984867096, - "reward_std": 0.20555156096816063, - "rewards/accuracy_reward": 0.08705357881262898, + "grad_norm": 75.75386047363281, + "kl": 1.83984375, + "learning_rate": 2.048770529719849e-07, + "loss": 0.1295, + "reward": 0.313058041036129, + "reward_std": 0.10595975816249847, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.2617187574505806, "step": 2015 }, { "clip_ratio": 0.0, - "completion_length": 1364.6741943359375, + "completion_length": 1971.0625610351562, "epoch": 0.6021955044432828, - "grad_norm": 1.3045475482940674, - "kl": 0.25927734375, - "learning_rate": 4.092412074703421e-08, - "loss": 0.0173, - "reward": 0.4235491305589676, - "reward_std": 0.2140561304986477, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 88.37937927246094, + "kl": 1.90625, + "learning_rate": 2.0462060373517106e-07, + "loss": 0.1249, + "reward": 0.3041294738650322, + "reward_std": 0.08976395009085536, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.2550223246216774, "step": 2016 }, { "clip_ratio": 0.0, - "completion_length": 1431.1719360351562, + "completion_length": 1994.0827026367188, "epoch": 0.6024942125308043, - "grad_norm": 2.24501371383667, - "kl": 0.291259765625, - "learning_rate": 4.087284077333855e-08, - "loss": 0.0423, - "reward": 0.3543526902794838, - "reward_std": 0.19578063115477562, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 76.98424530029297, + "kl": 1.95703125, + "learning_rate": 2.0436420386669275e-07, + "loss": 0.1096, + "reward": 0.247209832072258, + "reward_std": 0.04999090917408466, + "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.247209832072258, "step": 2017 }, { "clip_ratio": 0.0, - "completion_length": 1421.9576416015625, + "completion_length": 1954.884033203125, "epoch": 0.6027929206183258, - "grad_norm": 2.2016568183898926, - "kl": 0.31005859375, - "learning_rate": 4.0821570729097574e-08, - "loss": 0.0284, - "reward": 0.400111623108387, - "reward_std": 0.20976438745856285, - "rewards/accuracy_reward": 0.07812500465661287, + "grad_norm": 185.7693634033203, + "kl": 1.51953125, + "learning_rate": 2.0410785364548785e-07, + "loss": 0.1068, + "reward": 0.290178582072258, + "reward_std": 0.08779922686517239, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866305589676, + "rewards/tag_count_reward": 0.2522321529686451, "step": 2018 }, { "clip_ratio": 0.0, - "completion_length": 1432.0201721191406, + "completion_length": 1966.5357971191406, "epoch": 0.6030916287058472, - "grad_norm": 1.9927716255187988, - "kl": 0.3037109375, - "learning_rate": 4.077031067008804e-08, - "loss": 0.0227, - "reward": 0.4179687649011612, - "reward_std": 0.19791356846690178, - "rewards/accuracy_reward": 0.10267857392318547, + "grad_norm": 93.97416687011719, + "kl": 1.88671875, + "learning_rate": 2.0385155335044023e-07, + "loss": 0.1024, + "reward": 0.3191964514553547, + "reward_std": 0.07120133191347122, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.2433035783469677, "step": 2019 }, { "clip_ratio": 0.0, - "completion_length": 1343.9353332519531, + "completion_length": 1930.4598999023438, "epoch": 0.6033903367933687, - "grad_norm": 1.30902898311615, - "kl": 0.314453125, - "learning_rate": 4.071906065207591e-08, - "loss": 0.0495, - "reward": 0.4698660895228386, - "reward_std": 0.18715155497193336, - "rewards/accuracy_reward": 0.1517857238650322, + "grad_norm": 138.41859436035156, + "kl": 1.78125, + "learning_rate": 2.0359530326037955e-07, + "loss": 0.1427, + "reward": 0.3638393133878708, + "reward_std": 0.08370322547852993, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.2522321492433548, "step": 2020 }, { "clip_ratio": 0.0, - "completion_length": 1369.4754943847656, + "completion_length": 1942.587158203125, "epoch": 0.6036890448808901, - "grad_norm": 3.779177665710449, - "kl": 0.271240234375, - "learning_rate": 4.066782073081616e-08, - "loss": 0.0365, - "reward": 0.5078125149011612, - "reward_std": 0.2093944288790226, - "rewards/accuracy_reward": 0.1562500037252903, + "grad_norm": 164.16201782226562, + "kl": 1.666015625, + "learning_rate": 2.033391036540808e-07, + "loss": 0.1273, + "reward": 0.352120541036129, + "reward_std": 0.09485107287764549, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625223517418, + "rewards/tag_count_reward": 0.2628348395228386, "step": 2021 }, { "clip_ratio": 0.0, - "completion_length": 1404.7857666015625, + "completion_length": 1941.7300109863281, "epoch": 0.6039877529684117, - "grad_norm": 1.953383445739746, - "kl": 0.318359375, - "learning_rate": 4.0616590962052796e-08, - "loss": 0.0346, - "reward": 0.3733259066939354, - "reward_std": 0.1916869580745697, - "rewards/accuracy_reward": 0.053571430733427405, + "grad_norm": 156.23495483398438, + "kl": 1.669921875, + "learning_rate": 2.03082954810264e-07, + "loss": 0.1224, + "reward": 0.286830373108387, + "reward_std": 0.06754599511623383, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.251116082072258, "step": 2022 }, { "clip_ratio": 0.0, - "completion_length": 1379.4732666015625, + "completion_length": 1953.6139221191406, "epoch": 0.6042864610559331, - "grad_norm": 2.281245708465576, - "kl": 0.30419921875, - "learning_rate": 4.056537140151879e-08, - "loss": 0.024, - "reward": 0.412946455180645, - "reward_std": 0.18307414278388023, - "rewards/accuracy_reward": 0.08482143259607255, + "grad_norm": 163.2364044189453, + "kl": 1.533203125, + "learning_rate": 2.0282685700759397e-07, + "loss": 0.1152, + "reward": 0.3297991268336773, + "reward_std": 0.06811636220663786, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.2583705447614193, "step": 2023 }, { "clip_ratio": 0.0, - "completion_length": 1384.3304138183594, + "completion_length": 1930.8281860351562, "epoch": 0.6045851691434545, - "grad_norm": 2.95780611038208, - "kl": 0.333984375, - "learning_rate": 4.051416210493602e-08, - "loss": 0.0396, - "reward": 0.3470982387661934, - "reward_std": 0.18092423677444458, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 130.30120849609375, + "kl": 1.75390625, + "learning_rate": 2.0257081052468007e-07, + "loss": 0.1284, + "reward": 0.2734375223517418, + "reward_std": 0.08655294217169285, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.2600446566939354, "step": 2024 }, { "clip_ratio": 0.0, - "completion_length": 1450.8728332519531, + "completion_length": 1946.7702026367188, "epoch": 0.604883877230976, - "grad_norm": 1.361657977104187, - "kl": 0.32177734375, - "learning_rate": 4.046296312801516e-08, - "loss": 0.0222, - "reward": 0.4285714477300644, - "reward_std": 0.21364851668477058, - "rewards/accuracy_reward": 0.11160714970901608, + "grad_norm": 132.37033081054688, + "kl": 1.76171875, + "learning_rate": 2.0231481564007579e-07, + "loss": 0.1151, + "reward": 0.3387276902794838, + "reward_std": 0.0722337681800127, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.2606026828289032, "step": 2025 }, { "clip_ratio": 0.0, - "completion_length": 1398.6964721679688, + "completion_length": 1922.9264526367188, "epoch": 0.6051825853184974, - "grad_norm": 1.8929544687271118, - "kl": 0.282958984375, - "learning_rate": 4.041177452645568e-08, - "loss": 0.0085, - "reward": 0.4056919813156128, - "reward_std": 0.17845412716269493, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 202.0115966796875, + "kl": 1.666015625, + "learning_rate": 2.020588726322784e-07, + "loss": 0.1389, + "reward": 0.3085937649011612, + "reward_std": 0.09983004443347454, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.2684151902794838, "step": 2026 }, { "clip_ratio": 0.0, - "completion_length": 1388.8371276855469, + "completion_length": 1940.4331359863281, "epoch": 0.605481293406019, - "grad_norm": 2.266953468322754, - "kl": 0.333984375, - "learning_rate": 4.0360596355945774e-08, - "loss": 0.0274, - "reward": 0.356584832072258, - "reward_std": 0.19049397110939026, - "rewards/accuracy_reward": 0.05357143119908869, + "grad_norm": 113.05741119384766, + "kl": 1.4765625, + "learning_rate": 2.0180298177972888e-07, + "loss": 0.1023, + "reward": 0.3074776902794838, + "reward_std": 0.08548478037118912, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.2606026828289032, "step": 2027 }, { "clip_ratio": 0.0, - "completion_length": 1402.2947082519531, + "completion_length": 1916.3973999023438, "epoch": 0.6057800014935404, - "grad_norm": 2.4730944633483887, - "kl": 0.294921875, - "learning_rate": 4.030942867216227e-08, - "loss": 0.0111, - "reward": 0.4369419738650322, - "reward_std": 0.18635417893528938, - "rewards/accuracy_reward": 0.09821428940631449, + "grad_norm": 172.66815185546875, + "kl": 1.373046875, + "learning_rate": 2.0154714336081135e-07, + "loss": 0.1127, + "reward": 0.3437500149011612, + "reward_std": 0.10583369247615337, + "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276828289032, + "rewards/tag_count_reward": 0.2656250223517418, "step": 2028 }, { "clip_ratio": 0.0, - "completion_length": 1383.6897888183594, + "completion_length": 1864.9576416015625, "epoch": 0.6060787095810619, - "grad_norm": 1.9883402585983276, - "kl": 0.2900390625, - "learning_rate": 4.025827153077059e-08, - "loss": 0.0128, - "reward": 0.4040178805589676, - "reward_std": 0.21125473827123642, - "rewards/accuracy_reward": 0.06696428661234677, + "grad_norm": 196.3884735107422, + "kl": 1.265625, + "learning_rate": 2.0129135765385297e-07, + "loss": 0.1341, + "reward": 0.3152901902794838, + "reward_std": 0.11865677125751972, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.2684151828289032, "step": 2029 }, { "clip_ratio": 0.0, - "completion_length": 1373.2813110351562, + "completion_length": 1896.5156860351562, "epoch": 0.6063774176685833, - "grad_norm": 1.6573885679244995, - "kl": 0.281005859375, - "learning_rate": 4.0207124987424685e-08, - "loss": 0.0429, - "reward": 0.4536830633878708, - "reward_std": 0.2325446493923664, - "rewards/accuracy_reward": 0.10267857694998384, + "grad_norm": 96.03591918945312, + "kl": 1.05078125, + "learning_rate": 2.010356249371234e-07, + "loss": 0.1336, + "reward": 0.3164062574505806, + "reward_std": 0.1234282087534666, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044738650322, + "rewards/tag_count_reward": 0.2695312574505806, "step": 2030 }, { "clip_ratio": 0.0, - "completion_length": 1361.1808776855469, + "completion_length": 1825.9197082519531, "epoch": 0.6066761257561049, - "grad_norm": 3.3545236587524414, - "kl": 0.267333984375, - "learning_rate": 4.0155989097766983e-08, - "loss": 0.0354, - "reward": 0.5301339477300644, - "reward_std": 0.2226709984242916, - "rewards/accuracy_reward": 0.16294643213041127, + "grad_norm": 81.84765625, + "kl": 0.9638671875, + "learning_rate": 2.0077994548883493e-07, + "loss": 0.1172, + "reward": 0.3895089402794838, + "reward_std": 0.15263395570218563, + "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3671875149011612, + "rewards/tag_count_reward": 0.2779017984867096, "step": 2031 }, { "clip_ratio": 0.0, - "completion_length": 1327.1340026855469, + "completion_length": 1894.4175109863281, "epoch": 0.6069748338436263, - "grad_norm": 2.3053252696990967, - "kl": 0.35205078125, - "learning_rate": 4.010486391742835e-08, - "loss": 0.0623, - "reward": 0.380580373108387, - "reward_std": 0.18132338300347328, - "rewards/accuracy_reward": 0.06919643143191934, + "grad_norm": 94.24299621582031, + "kl": 1.181640625, + "learning_rate": 2.0052431958714174e-07, + "loss": 0.1031, + "reward": 0.3069196566939354, + "reward_std": 0.0905121136456728, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.2645089402794838, "step": 2032 }, { "clip_ratio": 0.0, - "completion_length": 1444.9598693847656, + "completion_length": 1932.4978637695312, "epoch": 0.6072735419311478, - "grad_norm": 2.7830793857574463, - "kl": 0.3408203125, - "learning_rate": 4.0053749502027936e-08, - "loss": 0.0067, - "reward": 0.3671875149011612, - "reward_std": 0.18362025544047356, - "rewards/accuracy_reward": 0.06250000465661287, + "grad_norm": 100.99922943115234, + "kl": 1.212890625, + "learning_rate": 2.002687475101397e-07, + "loss": 0.1005, + "reward": 0.3052455484867096, + "reward_std": 0.07347106747329235, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.262834832072258, "step": 2033 }, { "clip_ratio": 0.0, - "completion_length": 1378.7969360351562, + "completion_length": 1916.0246276855469, "epoch": 0.6075722500186692, - "grad_norm": 2.5840373039245605, - "kl": 0.3515625, - "learning_rate": 4.000264590717324e-08, - "loss": 0.0187, - "reward": 0.3404017984867096, - "reward_std": 0.1949838250875473, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 82.82646942138672, + "kl": 1.1640625, + "learning_rate": 2.000132295358662e-07, + "loss": 0.0869, + "reward": 0.282366082072258, + "reward_std": 0.08921909797936678, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.271205373108387, "step": 2034 }, { "clip_ratio": 0.0, - "completion_length": 1340.5245971679688, + "completion_length": 1923.6563415527344, "epoch": 0.6078709581061907, - "grad_norm": 1.778052568435669, - "kl": 0.32666015625, - "learning_rate": 3.995155318845993e-08, - "loss": 0.0438, - "reward": 0.4882812798023224, - "reward_std": 0.18497229740023613, - "rewards/accuracy_reward": 0.15625000931322575, + "grad_norm": 87.85645294189453, + "kl": 1.103515625, + "learning_rate": 1.9975776594229965e-07, + "loss": 0.1112, + "reward": 0.3750000074505806, + "reward_std": 0.08628001902252436, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312723517418, + "rewards/tag_count_reward": 0.2611607238650322, "step": 2035 }, { "clip_ratio": 0.0, - "completion_length": 1405.2277221679688, + "completion_length": 1911.1675415039062, "epoch": 0.6081696661937122, - "grad_norm": 3.5241281986236572, - "kl": 0.32958984375, - "learning_rate": 3.9900471401471906e-08, - "loss": 0.0248, - "reward": 0.3487723395228386, - "reward_std": 0.17017562687397003, - "rewards/accuracy_reward": 0.03348214295692742, + "grad_norm": 87.55707550048828, + "kl": 1.091796875, + "learning_rate": 1.9950235700735953e-07, + "loss": 0.0917, + "reward": 0.2851562574505806, + "reward_std": 0.10026535391807556, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.2695312574505806, "step": 2036 }, { "clip_ratio": 0.0, - "completion_length": 1320.5625915527344, + "completion_length": 1900.3281860351562, "epoch": 0.6084683742812337, - "grad_norm": 1.9309957027435303, - "kl": 0.326416015625, - "learning_rate": 3.984940060178114e-08, - "loss": 0.0741, - "reward": 0.4073660895228386, - "reward_std": 0.1711849756538868, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 95.55062866210938, + "kl": 1.154296875, + "learning_rate": 1.992470030089057e-07, + "loss": 0.1061, + "reward": 0.3236607313156128, + "reward_std": 0.10007515829056501, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.2700892984867096, "step": 2037 }, { "clip_ratio": 0.0, - "completion_length": 1399.107177734375, + "completion_length": 1925.9665832519531, "epoch": 0.6087670823687551, - "grad_norm": 2.5783822536468506, - "kl": 0.3486328125, - "learning_rate": 3.979834084494765e-08, - "loss": 0.0422, - "reward": 0.4045758992433548, - "reward_std": 0.21709511056542397, - "rewards/accuracy_reward": 0.09151786426082253, + "grad_norm": 69.78916931152344, + "kl": 1.0185546875, + "learning_rate": 1.9899170422473825e-07, + "loss": 0.0856, + "reward": 0.322544664144516, + "reward_std": 0.10184823349118233, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.271205373108387, "step": 2038 }, { "clip_ratio": 0.0, - "completion_length": 1347.7723693847656, + "completion_length": 1872.0112609863281, "epoch": 0.6090657904562766, - "grad_norm": 2.194063663482666, - "kl": 0.30029296875, - "learning_rate": 3.974729218651945e-08, - "loss": 0.0226, - "reward": 0.4804687649011612, - "reward_std": 0.1942402347922325, - "rewards/accuracy_reward": 0.13169643399305642, + "grad_norm": 90.38148498535156, + "kl": 1.095703125, + "learning_rate": 1.9873646093259727e-07, + "loss": 0.099, + "reward": 0.4034598395228386, + "reward_std": 0.13785859383642673, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.348772332072258, + "rewards/tag_count_reward": 0.2717633992433548, "step": 2039 }, { "clip_ratio": 0.0, - "completion_length": 1343.7143859863281, + "completion_length": 1861.9465026855469, "epoch": 0.609364498543798, - "grad_norm": 2.3282127380371094, - "kl": 0.31689453125, - "learning_rate": 3.9696254682032486e-08, - "loss": 0.0302, - "reward": 0.4882812649011612, - "reward_std": 0.19412844255566597, - "rewards/accuracy_reward": 0.15625001047737896, + "grad_norm": 68.22972869873047, + "kl": 0.9384765625, + "learning_rate": 1.9848127341016244e-07, + "loss": 0.1023, + "reward": 0.4012277014553547, + "reward_std": 0.12528756260871887, + "rewards/accuracy_reward": 0.12276786682195961, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.2784598357975483, "step": 2040 }, { "clip_ratio": 0.0, - "completion_length": 1332.33935546875, + "completion_length": 1892.946533203125, "epoch": 0.6096632066313196, - "grad_norm": 3.162238597869873, - "kl": 0.3740234375, - "learning_rate": 3.9645228387010547e-08, - "loss": 0.0382, - "reward": 0.3309151977300644, - "reward_std": 0.18589255958795547, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 62.6482048034668, + "kl": 0.9833984375, + "learning_rate": 1.9822614193505275e-07, + "loss": 0.1073, + "reward": 0.2806919738650322, + "reward_std": 0.10459551215171814, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.2739955484867096, "step": 2041 }, { "clip_ratio": 0.0, - "completion_length": 1339.7969665527344, + "completion_length": 1854.4509887695312, "epoch": 0.609961914718841, - "grad_norm": 2.7973978519439697, - "kl": 0.32080078125, - "learning_rate": 3.9594213356965236e-08, - "loss": 0.0283, - "reward": 0.4280134066939354, - "reward_std": 0.21722497418522835, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 60.31330490112305, + "kl": 0.8564453125, + "learning_rate": 1.979710667848262e-07, + "loss": 0.1283, + "reward": 0.3314732238650322, + "reward_std": 0.12542915157973766, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.2845982238650322, "step": 2042 }, { "clip_ratio": 0.0, - "completion_length": 1329.9576721191406, + "completion_length": 1887.2567749023438, "epoch": 0.6102606228063625, - "grad_norm": 1.9211632013320923, - "kl": 0.3447265625, - "learning_rate": 3.954320964739593e-08, - "loss": 0.0208, - "reward": 0.407924123108387, - "reward_std": 0.20724401623010635, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 62.87117385864258, + "kl": 0.9365234375, + "learning_rate": 1.9771604823697966e-07, + "loss": 0.0959, + "reward": 0.297991082072258, + "reward_std": 0.1309618316590786, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.2756696566939354, "step": 2043 }, { "clip_ratio": 0.0, - "completion_length": 1490.9933471679688, + "completion_length": 1903.8572692871094, "epoch": 0.6105593308938839, - "grad_norm": 3.0726754665374756, - "kl": 0.3134765625, - "learning_rate": 3.9492217313789655e-08, - "loss": 0.0232, - "reward": 0.3978794738650322, - "reward_std": 0.16338566318154335, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 33.77011489868164, + "kl": 0.82421875, + "learning_rate": 1.9746108656894826e-07, + "loss": 0.0797, + "reward": 0.3521205484867096, + "reward_std": 0.10524630546569824, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.278459832072258, "step": 2044 }, { "clip_ratio": 0.0, - "completion_length": 1367.0245666503906, + "completion_length": 1872.1540832519531, "epoch": 0.6108580389814054, - "grad_norm": 1.871185302734375, - "kl": 0.309814453125, - "learning_rate": 3.944123641162105e-08, - "loss": 0.0249, - "reward": 0.4229910895228386, - "reward_std": 0.19422469846904278, - "rewards/accuracy_reward": 0.07142857648432255, + "grad_norm": 30.331754684448242, + "kl": 0.7509765625, + "learning_rate": 1.9720618205810527e-07, + "loss": 0.0974, + "reward": 0.3253348395228386, + "reward_std": 0.1536751687526703, + "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625149011612, + "rewards/tag_count_reward": 0.3007812649011612, "step": 2045 }, { "clip_ratio": 0.0, - "completion_length": 1401.0313110351562, + "completion_length": 1883.1942749023438, "epoch": 0.6111567470689269, - "grad_norm": 2.2430334091186523, - "kl": 0.3232421875, - "learning_rate": 3.939026699635237e-08, - "loss": 0.0485, - "reward": 0.3666294887661934, - "reward_std": 0.18055245280265808, - "rewards/accuracy_reward": 0.024553572293370962, + "grad_norm": 64.93643188476562, + "kl": 0.845703125, + "learning_rate": 1.9695133498176187e-07, + "loss": 0.1039, + "reward": 0.2952009066939354, + "reward_std": 0.14137830026447773, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.342075914144516, + "rewards/tag_count_reward": 0.2840401902794838, "step": 2046 }, { "clip_ratio": 0.0, - "completion_length": 1394.8505249023438, + "completion_length": 1883.3505554199219, "epoch": 0.6114554551564484, - "grad_norm": 2.579338550567627, - "kl": 0.347900390625, - "learning_rate": 3.933930912343334e-08, - "loss": 0.0371, - "reward": 0.4285714402794838, - "reward_std": 0.17913731187582016, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 27.461055755615234, + "kl": 0.7763671875, + "learning_rate": 1.966965456171667e-07, + "loss": 0.0923, + "reward": 0.3537946566939354, + "reward_std": 0.11342634446918964, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.282366082072258, "step": 2047 }, { "clip_ratio": 0.0, - "completion_length": 1362.3616943359375, + "completion_length": 1896.0513916015625, "epoch": 0.6117541632439698, - "grad_norm": 2.29754638671875, - "kl": 0.34130859375, - "learning_rate": 3.928836284830112e-08, - "loss": 0.0396, - "reward": 0.3872768133878708, - "reward_std": 0.18594546616077423, - "rewards/accuracy_reward": 0.0602678619325161, + "grad_norm": 26.486169815063477, + "kl": 0.7763671875, + "learning_rate": 1.964418142415056e-07, + "loss": 0.0729, + "reward": 0.3247767984867096, + "reward_std": 0.12322649359703064, + "rewards/accuracy_reward": 0.04241071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.2823660895228386, "step": 2048 }, { "clip_ratio": 0.0, - "completion_length": 1435.5871276855469, + "completion_length": 1924.8415832519531, "epoch": 0.6120528713314913, - "grad_norm": 2.6107258796691895, - "kl": 0.40673828125, - "learning_rate": 3.923742822638026e-08, - "loss": 0.0255, - "reward": 0.3515625149011612, - "reward_std": 0.17327842116355896, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 21.21870994567871, + "kl": 0.7314453125, + "learning_rate": 1.9618714113190128e-07, + "loss": 0.0743, + "reward": 0.321986623108387, + "reward_std": 0.09982621483504772, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.2840401902794838, "step": 2049 }, { "clip_ratio": 0.0, - "completion_length": 1426.5335388183594, + "completion_length": 1888.7366943359375, "epoch": 0.6123515794190127, - "grad_norm": 1.7397973537445068, - "kl": 0.33642578125, - "learning_rate": 3.918650531308265e-08, - "loss": 0.0097, - "reward": 0.4335937798023224, - "reward_std": 0.19213301688432693, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 31.675918579101562, + "kl": 0.724609375, + "learning_rate": 1.9593252656541325e-07, + "loss": 0.0982, + "reward": 0.3783482313156128, + "reward_std": 0.14071377366781235, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/tag_count_reward": 0.2957589402794838, "step": 2050 }, { "clip_ratio": 0.0, - "completion_length": 1379.37060546875, + "completion_length": 1837.0603332519531, "epoch": 0.6126502875065343, - "grad_norm": 2.3762035369873047, - "kl": 0.322265625, - "learning_rate": 3.913559416380743e-08, - "loss": 0.0381, - "reward": 0.416294664144516, - "reward_std": 0.20545155555009842, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 19.6038875579834, + "kl": 0.6416015625, + "learning_rate": 1.9567797081903714e-07, + "loss": 0.116, + "reward": 0.3805803805589676, + "reward_std": 0.16843822970986366, + "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.3069196566939354, "step": 2051 }, { "clip_ratio": 0.0, - "completion_length": 1349.3371276855469, + "completion_length": 1881.3482971191406, "epoch": 0.6129489955940557, - "grad_norm": 1.793558120727539, - "kl": 0.35693359375, - "learning_rate": 3.9084694833940925e-08, - "loss": 0.0568, - "reward": 0.444196455180645, - "reward_std": 0.1925005130469799, - "rewards/accuracy_reward": 0.12723214668221772, + "grad_norm": 21.44715118408203, + "kl": 0.6943359375, + "learning_rate": 1.9542347416970463e-07, + "loss": 0.1001, + "reward": 0.4095982313156128, + "reward_std": 0.14218728058040142, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.316964291036129, + "rewards/tag_count_reward": 0.2912946566939354, "step": 2052 }, { "clip_ratio": 0.0, - "completion_length": 1388.9866943359375, + "completion_length": 1919.774658203125, "epoch": 0.6132477036815772, - "grad_norm": 1.9176783561706543, - "kl": 0.30712890625, - "learning_rate": 3.903380737885662e-08, - "loss": 0.0381, - "reward": 0.3950893059372902, - "reward_std": 0.1851927451789379, - "rewards/accuracy_reward": 0.06473214668221772, + "grad_norm": 14.921602249145508, + "kl": 0.6875, + "learning_rate": 1.951690368942831e-07, + "loss": 0.0824, + "reward": 0.3292410895228386, + "reward_std": 0.12462631799280643, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.2890625149011612, "step": 2053 }, { "clip_ratio": 0.0, - "completion_length": 1412.79248046875, + "completion_length": 1904.2523193359375, "epoch": 0.6135464117690986, - "grad_norm": 1.3186395168304443, - "kl": 0.38525390625, - "learning_rate": 3.898293185391509e-08, - "loss": 0.0321, - "reward": 0.4285714402794838, - "reward_std": 0.18671483919024467, - "rewards/accuracy_reward": 0.12053571501746774, + "grad_norm": 8.613449096679688, + "kl": 0.6416015625, + "learning_rate": 1.9491465926957546e-07, + "loss": 0.0999, + "reward": 0.3945312723517418, + "reward_std": 0.11939899995923042, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.2873883992433548, "step": 2054 }, { "clip_ratio": 0.0, - "completion_length": 1433.4152221679688, + "completion_length": 1891.8705749511719, "epoch": 0.6138451198566202, - "grad_norm": 1.8014085292816162, - "kl": 0.3427734375, - "learning_rate": 3.8932068314463916e-08, - "loss": 0.0198, - "reward": 0.3197544813156128, - "reward_std": 0.1727081872522831, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 7.6890082359313965, + "kl": 0.630859375, + "learning_rate": 1.9466034157231959e-07, + "loss": 0.0801, + "reward": 0.3041294738650322, + "reward_std": 0.13820013962686062, + "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3108258992433548, + "rewards/tag_count_reward": 0.2929687649011612, "step": 2055 }, { "clip_ratio": 0.0, - "completion_length": 1455.7076416015625, + "completion_length": 1915.8058471679688, "epoch": 0.6141438279441416, - "grad_norm": 2.2924587726593018, - "kl": 0.34521484375, - "learning_rate": 3.8881216815837636e-08, - "loss": 0.0392, - "reward": 0.3632812798023224, - "reward_std": 0.19070767983794212, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 6.97948694229126, + "kl": 0.59765625, + "learning_rate": 1.9440608407918817e-07, + "loss": 0.0809, + "reward": 0.333705373108387, + "reward_std": 0.1400212198495865, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.286830373108387, "step": 2056 }, { "clip_ratio": 0.0, - "completion_length": 1348.4799499511719, + "completion_length": 1887.3661499023438, "epoch": 0.6144425360316631, - "grad_norm": 2.2504513263702393, - "kl": 0.33935546875, - "learning_rate": 3.883037741335771e-08, - "loss": 0.0498, - "reward": 0.4190848395228386, - "reward_std": 0.22089245915412903, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 9.253793716430664, + "kl": 0.5830078125, + "learning_rate": 1.9415188706678857e-07, + "loss": 0.1031, + "reward": 0.3275669813156128, + "reward_std": 0.17872971296310425, + "rewards/accuracy_reward": 0.03571428684517741, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.2918526828289032, "step": 2057 }, { "clip_ratio": 0.0, - "completion_length": 1386.6340026855469, + "completion_length": 1910.5202026367188, "epoch": 0.6147412441191845, - "grad_norm": 1.8121980428695679, - "kl": 0.3583984375, - "learning_rate": 3.877955016233242e-08, - "loss": 0.0457, - "reward": 0.4486607238650322, - "reward_std": 0.23565979674458504, - "rewards/accuracy_reward": 0.12500000279396772, + "grad_norm": 7.67112398147583, + "kl": 0.5751953125, + "learning_rate": 1.938977508116621e-07, + "loss": 0.0811, + "reward": 0.3822544813156128, + "reward_std": 0.16895470768213272, + "rewards/accuracy_reward": 0.09598214388825, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.286272332072258, "step": 2058 }, { "clip_ratio": 0.0, - "completion_length": 1316.0513916015625, + "completion_length": 1829.6652526855469, "epoch": 0.615039952206706, - "grad_norm": 2.117058277130127, - "kl": 0.27783203125, - "learning_rate": 3.872873511805683e-08, - "loss": 0.0426, - "reward": 0.5117187723517418, - "reward_std": 0.22229767963290215, - "rewards/accuracy_reward": 0.1450892947614193, + "grad_norm": 14.717704772949219, + "kl": 0.53515625, + "learning_rate": 1.9364367559028415e-07, + "loss": 0.1143, + "reward": 0.4101562723517418, + "reward_std": 0.15780089795589447, + "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.3164062649011612, "step": 2059 }, { "clip_ratio": 0.0, - "completion_length": 1410.6116943359375, + "completion_length": 1880.62060546875, "epoch": 0.6153386602942275, - "grad_norm": 2.069913148880005, - "kl": 0.282958984375, - "learning_rate": 3.867793233581272e-08, - "loss": 0.0377, - "reward": 0.3995535895228386, - "reward_std": 0.1686110757291317, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 15.343208312988281, + "kl": 0.51123046875, + "learning_rate": 1.933896616790636e-07, + "loss": 0.0817, + "reward": 0.3231026902794838, + "reward_std": 0.1713215596973896, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357142873108387, + "rewards/tag_count_reward": 0.3074776902794838, "step": 2060 }, { "clip_ratio": 0.0, - "completion_length": 1403.0134582519531, + "completion_length": 1866.5000915527344, "epoch": 0.615637368381749, - "grad_norm": 2.596275568008423, - "kl": 0.32373046875, - "learning_rate": 3.8627141870868555e-08, - "loss": 0.0207, - "reward": 0.337611623108387, - "reward_std": 0.1576022356748581, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 9.42824935913086, + "kl": 0.5029296875, + "learning_rate": 1.9313570935434275e-07, + "loss": 0.0881, + "reward": 0.3130580484867096, + "reward_std": 0.1517094001173973, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.3041294738650322, "step": 2061 }, { "clip_ratio": 0.0, - "completion_length": 1411.7701416015625, + "completion_length": 1908.1585388183594, "epoch": 0.6159360764692704, - "grad_norm": 2.8845319747924805, - "kl": 0.333984375, - "learning_rate": 3.857636377847937e-08, - "loss": 0.0255, - "reward": 0.474330373108387, - "reward_std": 0.23542051389813423, - "rewards/accuracy_reward": 0.1540178619325161, + "grad_norm": 8.376147270202637, + "kl": 0.5029296875, + "learning_rate": 1.9288181889239685e-07, + "loss": 0.0718, + "reward": 0.4107142984867096, + "reward_std": 0.1813274547457695, + "rewards/accuracy_reward": 0.1227678582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.2879464402794838, "step": 2062 }, { "clip_ratio": 0.0, - "completion_length": 1336.7300109863281, + "completion_length": 1828.9197082519531, "epoch": 0.6162347845567919, - "grad_norm": 2.666660785675049, - "kl": 0.31396484375, - "learning_rate": 3.852559811388675e-08, - "loss": 0.0439, - "reward": 0.4575893059372902, - "reward_std": 0.19315983355045319, - "rewards/accuracy_reward": 0.10937500838190317, + "grad_norm": 8.778471946716309, + "kl": 0.466796875, + "learning_rate": 1.9262799056943377e-07, + "loss": 0.1016, + "reward": 0.411830373108387, + "reward_std": 0.1879202388226986, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482142984867096, + "rewards/tag_count_reward": 0.3158482313156128, "step": 2063 }, { "clip_ratio": 0.0, - "completion_length": 1362.1250915527344, + "completion_length": 1781.7880249023438, "epoch": 0.6165334926443133, - "grad_norm": 1.9331989288330078, - "kl": 0.31201171875, - "learning_rate": 3.8474844932318784e-08, - "loss": 0.049, - "reward": 0.4006696566939354, - "reward_std": 0.21045466512441635, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 6.207269191741943, + "kl": 0.43359375, + "learning_rate": 1.9237422466159392e-07, + "loss": 0.1151, + "reward": 0.3616071566939354, + "reward_std": 0.1996200494468212, + "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3582589402794838, + "rewards/tag_count_reward": 0.3281250149011612, "step": 2064 }, { "clip_ratio": 0.0, - "completion_length": 1436.6295166015625, + "completion_length": 1896.0201416015625, "epoch": 0.6168322007318349, - "grad_norm": 4.969583034515381, - "kl": 0.33251953125, - "learning_rate": 3.842410428898995e-08, - "loss": 0.0568, - "reward": 0.4955357387661934, - "reward_std": 0.2118680477142334, - "rewards/accuracy_reward": 0.1517857201397419, + "grad_norm": 6.0737104415893555, + "kl": 0.49951171875, + "learning_rate": 1.9212052144494973e-07, + "loss": 0.0689, + "reward": 0.4174107313156128, + "reward_std": 0.18051912635564804, + "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.2968750149011612, "step": 2065 }, { "clip_ratio": 0.0, - "completion_length": 1423.0245971679688, + "completion_length": 1884.4107971191406, "epoch": 0.6171309088193563, - "grad_norm": 2.8911540508270264, - "kl": 0.38671875, - "learning_rate": 3.837337623910107e-08, - "loss": 0.0412, - "reward": 0.3253348395228386, - "reward_std": 0.16596726700663567, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 7.498432159423828, + "kl": 0.455078125, + "learning_rate": 1.9186688119550537e-07, + "loss": 0.0778, + "reward": 0.2924107164144516, + "reward_std": 0.16048693284392357, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.2879464328289032, "step": 2066 }, { "clip_ratio": 0.0, - "completion_length": 1344.5290832519531, + "completion_length": 1878.3862609863281, "epoch": 0.6174296169068777, - "grad_norm": 4.625120162963867, - "kl": 0.3427734375, - "learning_rate": 3.8322660837839314e-08, - "loss": 0.0403, - "reward": 0.368861623108387, - "reward_std": 0.18778765574097633, - "rewards/accuracy_reward": 0.03125000232830644, + "grad_norm": 6.3126983642578125, + "kl": 0.42333984375, + "learning_rate": 1.916133041891966e-07, + "loss": 0.0871, + "reward": 0.325892873108387, + "reward_std": 0.20067593455314636, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3376116156578064, + "rewards/tag_count_reward": 0.2946428656578064, "step": 2067 }, { "clip_ratio": 0.0, - "completion_length": 1419.0536499023438, + "completion_length": 1874.4353637695312, "epoch": 0.6177283249943992, - "grad_norm": 2.7703819274902344, - "kl": 0.38134765625, - "learning_rate": 3.8271958140378076e-08, - "loss": 0.0503, - "reward": 0.4408482313156128, - "reward_std": 0.1894945204257965, - "rewards/accuracy_reward": 0.129464291036129, + "grad_norm": 6.604072093963623, + "kl": 0.408203125, + "learning_rate": 1.9135979070189038e-07, + "loss": 0.0921, + "reward": 0.4179687574505806, + "reward_std": 0.16319535300135612, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.3041294738650322, "step": 2068 }, { "clip_ratio": 0.0, - "completion_length": 1313.2500610351562, + "completion_length": 1804.5826721191406, "epoch": 0.6180270330819206, - "grad_norm": 3.523756265640259, - "kl": 0.291259765625, - "learning_rate": 3.8221268201876904e-08, - "loss": 0.0704, - "reward": 0.5217634290456772, - "reward_std": 0.23114676028490067, - "rewards/accuracy_reward": 0.15625000558793545, + "grad_norm": 7.38966178894043, + "kl": 0.37939453125, + "learning_rate": 1.9110634100938453e-07, + "loss": 0.1164, + "reward": 0.428571455180645, + "reward_std": 0.22308167815208435, + "rewards/accuracy_reward": 0.12723214831203222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.365513414144516, + "rewards/tag_count_reward": 0.301339291036129, "step": 2069 }, { "clip_ratio": 0.0, - "completion_length": 1348.3438110351562, + "completion_length": 1885.0067749023438, "epoch": 0.6183257411694422, - "grad_norm": 2.9423341751098633, - "kl": 0.40771484375, - "learning_rate": 3.817059107748148e-08, - "loss": 0.0521, - "reward": 0.4185268059372902, - "reward_std": 0.22387558594346046, - "rewards/accuracy_reward": 0.10937500838190317, + "grad_norm": 2.2965540885925293, + "kl": 0.42919921875, + "learning_rate": 1.9085295538740742e-07, + "loss": 0.083, + "reward": 0.356584832072258, + "reward_std": 0.1893683262169361, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.2695312574505806, "step": 2070 }, { "clip_ratio": 0.0, - "completion_length": 1381.4710693359375, + "completion_length": 1804.2344665527344, "epoch": 0.6186244492569636, - "grad_norm": 3.642451047897339, - "kl": 0.3740234375, - "learning_rate": 3.811992682232357e-08, - "loss": 0.0619, - "reward": 0.4921875149011612, - "reward_std": 0.167843796312809, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 3.3730814456939697, + "kl": 0.35693359375, + "learning_rate": 1.9059963411161787e-07, + "loss": 0.1204, + "reward": 0.4531250223517418, + "reward_std": 0.17145071178674698, + "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.3058035746216774, "step": 2071 }, { "clip_ratio": 0.0, - "completion_length": 1355.8929138183594, + "completion_length": 1877.2790832519531, "epoch": 0.6189231573444851, - "grad_norm": 3.728930950164795, - "kl": 0.37939453125, - "learning_rate": 3.8069275491520904e-08, - "loss": 0.0561, - "reward": 0.3900669813156128, - "reward_std": 0.17123501375317574, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 2.264995574951172, + "kl": 0.37255859375, + "learning_rate": 1.9034637745760451e-07, + "loss": 0.0932, + "reward": 0.3292410895228386, + "reward_std": 0.1646197885274887, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.2912946566939354, "step": 2072 }, { "clip_ratio": 0.0, - "completion_length": 1451.3080749511719, + "completion_length": 1857.0067749023438, "epoch": 0.6192218654320065, - "grad_norm": 2.734208822250366, - "kl": 0.373046875, - "learning_rate": 3.801863714017717e-08, - "loss": 0.0558, - "reward": 0.3325892984867096, - "reward_std": 0.1926947720348835, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 3.5510640144348145, + "kl": 0.3671875, + "learning_rate": 1.9009318570088585e-07, + "loss": 0.0898, + "reward": 0.3046875149011612, + "reward_std": 0.185846209526062, + "rewards/accuracy_reward": 0.013392857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.2912946566939354, "step": 2073 }, { "clip_ratio": 0.0, - "completion_length": 1434.6139221191406, + "completion_length": 1866.1898498535156, "epoch": 0.619520573519528, - "grad_norm": 2.7554759979248047, - "kl": 0.35498046875, - "learning_rate": 3.796801182338191e-08, - "loss": 0.0379, - "reward": 0.3833705484867096, - "reward_std": 0.1706489622592926, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 1.8919751644134521, + "kl": 0.3564453125, + "learning_rate": 1.8984005911690954e-07, + "loss": 0.0727, + "reward": 0.3297991305589676, + "reward_std": 0.1632586196064949, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342633992433548, + "rewards/tag_count_reward": 0.2896205484867096, "step": 2074 }, { "clip_ratio": 0.0, - "completion_length": 1388.8750915527344, + "completion_length": 1790.6139221191406, "epoch": 0.6198192816070495, - "grad_norm": 2.7257416248321533, - "kl": 0.35107421875, - "learning_rate": 3.7917399596210536e-08, - "loss": 0.0391, - "reward": 0.377232164144516, - "reward_std": 0.2073918581008911, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 2.4560365676879883, + "kl": 0.34033203125, + "learning_rate": 1.8958699798105268e-07, + "loss": 0.1238, + "reward": 0.3242187649011612, + "reward_std": 0.21345408260822296, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857238650322, + "rewards/tag_count_reward": 0.2974330559372902, "step": 2075 }, { "clip_ratio": 0.0, - "completion_length": 1338.2322082519531, + "completion_length": 1823.35498046875, "epoch": 0.620117989694571, - "grad_norm": 1.7016698122024536, - "kl": 0.37255859375, - "learning_rate": 3.7866800513724185e-08, - "loss": 0.0372, - "reward": 0.5145089477300644, - "reward_std": 0.1902744136750698, - "rewards/accuracy_reward": 0.1830357201397419, + "grad_norm": 1.6656031608581543, + "kl": 0.34326171875, + "learning_rate": 1.8933400256862092e-07, + "loss": 0.0942, + "reward": 0.475446455180645, + "reward_std": 0.19583533704280853, + "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.3080357313156128, "step": 2076 }, { "clip_ratio": 0.0, - "completion_length": 1390.02685546875, + "completion_length": 1856.97998046875, "epoch": 0.6204166977820924, - "grad_norm": 3.064152240753174, - "kl": 0.36962890625, - "learning_rate": 3.781621463096968e-08, - "loss": 0.0805, - "reward": 0.4263393133878708, - "reward_std": 0.1954450160264969, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 2.081491470336914, + "kl": 0.34716796875, + "learning_rate": 1.890810731548484e-07, + "loss": 0.088, + "reward": 0.3733259066939354, + "reward_std": 0.1813489869236946, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.281808041036129, "step": 2077 }, { "clip_ratio": 0.0, - "completion_length": 1402.77685546875, + "completion_length": 1871.9598999023438, "epoch": 0.6207154058696139, - "grad_norm": 2.6899092197418213, - "kl": 0.39306640625, - "learning_rate": 3.7765642002979525e-08, - "loss": 0.0473, - "reward": 0.4068080484867096, - "reward_std": 0.20957007631659508, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 2.64741587638855, + "kl": 0.3427734375, + "learning_rate": 1.8882821001489764e-07, + "loss": 0.0889, + "reward": 0.3314732238650322, + "reward_std": 0.20153598487377167, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.2779017984867096, "step": 2078 }, { "clip_ratio": 0.0, - "completion_length": 1432.993408203125, + "completion_length": 1888.7567749023438, "epoch": 0.6210141139571353, - "grad_norm": 3.802117109298706, - "kl": 0.38916015625, - "learning_rate": 3.771508268477178e-08, - "loss": 0.0438, - "reward": 0.3705357313156128, - "reward_std": 0.20632772147655487, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 2.3889336585998535, + "kl": 0.3154296875, + "learning_rate": 1.8857541342385888e-07, + "loss": 0.0924, + "reward": 0.2968750149011612, + "reward_std": 0.20180265977978706, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.279017873108387, "step": 2079 }, { "clip_ratio": 0.0, - "completion_length": 1410.6786193847656, + "completion_length": 1889.4777526855469, "epoch": 0.6213128220446569, - "grad_norm": 3.288353443145752, - "kl": 0.39111328125, - "learning_rate": 3.7664536731350016e-08, - "loss": 0.041, - "reward": 0.3705357387661934, - "reward_std": 0.16407105512917042, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 1.319521427154541, + "kl": 0.3251953125, + "learning_rate": 1.883226836567501e-07, + "loss": 0.0774, + "reward": 0.329241082072258, + "reward_std": 0.19447574391961098, + "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.2801339402794838, "step": 2080 }, { "clip_ratio": 0.0, - "completion_length": 1438.58935546875, + "completion_length": 1810.5179443359375, "epoch": 0.6216115301321783, - "grad_norm": 3.9753923416137695, - "kl": 0.3515625, - "learning_rate": 3.7614004197703277e-08, - "loss": 0.0648, - "reward": 0.3744419813156128, - "reward_std": 0.20583751797676086, - "rewards/accuracy_reward": 0.044642857974395156, + "grad_norm": 1.205213189125061, + "kl": 0.296875, + "learning_rate": 1.8807002098851636e-07, + "loss": 0.1122, + "reward": 0.333705373108387, + "reward_std": 0.22254730015993118, + "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991305589676, + "rewards/tag_count_reward": 0.3002232238650322, "step": 2081 }, { "clip_ratio": 0.0, - "completion_length": 1412.154052734375, - "epoch": 0.6219102382196998, - "grad_norm": 3.127699136734009, - "kl": 0.37109375, - "learning_rate": 3.756348513880602e-08, - "loss": 0.0532, - "reward": 0.4760044813156128, - "reward_std": 0.15534258261322975, - "rewards/accuracy_reward": 0.1473214365541935, + "completion_length": 1881.915283203125, + "epoch": 0.6219102382196998, + "grad_norm": 0.7983621954917908, + "kl": 0.3154296875, + "learning_rate": 1.8781742569403008e-07, + "loss": 0.0988, + "reward": 0.423549123108387, + "reward_std": 0.1816374883055687, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.2717634066939354, "step": 2082 }, { "clip_ratio": 0.0, - "completion_length": 1374.63623046875, + "completion_length": 1816.7389526367188, "epoch": 0.6222089463072212, - "grad_norm": 2.0968074798583984, - "kl": 0.394287109375, - "learning_rate": 3.7512979609618026e-08, - "loss": 0.076, - "reward": 0.3789062649011612, - "reward_std": 0.2037981078028679, - "rewards/accuracy_reward": 0.06696428940631449, + "grad_norm": 0.8310419321060181, + "kl": 0.2890625, + "learning_rate": 1.875648980480901e-07, + "loss": 0.103, + "reward": 0.3878348395228386, + "reward_std": 0.21784145012497902, + "rewards/accuracy_reward": 0.07812500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.309709832072258, "step": 2083 }, { "clip_ratio": 0.0, - "completion_length": 1355.9822082519531, + "completion_length": 1836.32373046875, "epoch": 0.6225076543947428, - "grad_norm": 2.804266929626465, - "kl": 0.39013671875, - "learning_rate": 3.746248766508434e-08, - "loss": 0.0657, - "reward": 0.4587053805589676, - "reward_std": 0.19948383793234825, - "rewards/accuracy_reward": 0.1294642947614193, + "grad_norm": 1.8516379594802856, + "kl": 0.30078125, + "learning_rate": 1.873124383254217e-07, + "loss": 0.1052, + "reward": 0.408482164144516, + "reward_std": 0.1849219687283039, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.2991071566939354, "step": 2084 }, { "clip_ratio": 0.0, - "completion_length": 1350.2813110351562, + "completion_length": 1867.2322387695312, "epoch": 0.6228063624822642, - "grad_norm": 2.738132953643799, - "kl": 0.3974609375, - "learning_rate": 3.7412009360135285e-08, - "loss": 0.045, - "reward": 0.4799107313156128, - "reward_std": 0.19161950051784515, - "rewards/accuracy_reward": 0.14285715413279831, + "grad_norm": 1.6333746910095215, + "kl": 0.3291015625, + "learning_rate": 1.8706004680067644e-07, + "loss": 0.1025, + "reward": 0.4190848395228386, + "reward_std": 0.17525043338537216, + "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.2918526977300644, "step": 2085 }, { "clip_ratio": 0.0, - "completion_length": 1408.9978332519531, + "completion_length": 1808.4019165039062, "epoch": 0.6231050705697857, - "grad_norm": 3.546863555908203, - "kl": 0.3603515625, - "learning_rate": 3.736154474968629e-08, - "loss": 0.0972, - "reward": 0.411830373108387, - "reward_std": 0.17414865642786026, - "rewards/accuracy_reward": 0.0758928582072258, + "grad_norm": 1.0347784757614136, + "kl": 0.294921875, + "learning_rate": 1.8680772374843145e-07, + "loss": 0.1268, + "reward": 0.3632812649011612, + "reward_std": 0.203450296074152, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.2985491305589676, "step": 2086 }, { "clip_ratio": 0.0, - "completion_length": 1427.3482971191406, + "completion_length": 1822.8304443359375, "epoch": 0.6234037786573071, - "grad_norm": 3.890117883682251, - "kl": 0.359375, - "learning_rate": 3.7311093888637904e-08, - "loss": 0.0516, - "reward": 0.431919664144516, - "reward_std": 0.217327568680048, - "rewards/accuracy_reward": 0.09151786123402417, + "grad_norm": 1.7334294319152832, + "kl": 0.296875, + "learning_rate": 1.865554694431895e-07, + "loss": 0.1078, + "reward": 0.3800223395228386, + "reward_std": 0.20891457423567772, + "rewards/accuracy_reward": 0.06696429057046771, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.3130580484867096, "step": 2087 }, { "clip_ratio": 0.0, - "completion_length": 1383.0201416015625, + "completion_length": 1807.9822387695312, "epoch": 0.6237024867448286, - "grad_norm": 3.120703935623169, - "kl": 0.3466796875, - "learning_rate": 3.726065683187571e-08, - "loss": 0.0739, - "reward": 0.4140625223517418, - "reward_std": 0.18963712081313133, - "rewards/accuracy_reward": 0.0691964328289032, + "grad_norm": 1.623219609260559, + "kl": 0.3056640625, + "learning_rate": 1.8630328415937854e-07, + "loss": 0.1221, + "reward": 0.3900669813156128, + "reward_std": 0.18368515372276306, + "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.344866082072258, + "rewards/tag_count_reward": 0.3164062649011612, "step": 2088 }, { "clip_ratio": 0.0, - "completion_length": 1410.9621276855469, + "completion_length": 1761.2322387695312, "epoch": 0.6240011948323501, - "grad_norm": 5.067451477050781, - "kl": 0.357177734375, - "learning_rate": 3.72102336342703e-08, - "loss": 0.0637, - "reward": 0.4023437723517418, - "reward_std": 0.19610876217484474, - "rewards/accuracy_reward": 0.06250000419095159, + "grad_norm": 1.5474106073379517, + "kl": 0.279052734375, + "learning_rate": 1.8605116817135149e-07, + "loss": 0.1032, + "reward": 0.3939732313156128, + "reward_std": 0.1731840819120407, + "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.3359375223517418, "step": 2089 }, { "clip_ratio": 0.0, - "completion_length": 1431.7813415527344, + "completion_length": 1857.1072387695312, "epoch": 0.6242999029198716, - "grad_norm": 4.6005330085754395, - "kl": 0.3681640625, - "learning_rate": 3.715982435067717e-08, - "loss": 0.0591, - "reward": 0.3772321566939354, - "reward_std": 0.19777248799800873, - "rewards/accuracy_reward": 0.055803574388846755, + "grad_norm": 1.4291529655456543, + "kl": 0.3056640625, + "learning_rate": 1.8579912175338586e-07, + "loss": 0.1094, + "reward": 0.3638392984867096, + "reward_std": 0.17072179168462753, + "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.3147321566939354, "step": 2090 }, { "clip_ratio": 0.0, - "completion_length": 1393.8795166015625, + "completion_length": 1780.3728637695312, "epoch": 0.624598611007393, - "grad_norm": 5.019958972930908, - "kl": 0.39892578125, - "learning_rate": 3.7109429035936656e-08, - "loss": 0.0557, - "reward": 0.364955373108387, - "reward_std": 0.14963438734412193, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 1.7529492378234863, + "kl": 0.28759765625, + "learning_rate": 1.8554714517968328e-07, + "loss": 0.1381, + "reward": 0.3967634066939354, + "reward_std": 0.1704740896821022, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.3342634066939354, "step": 2091 }, { "clip_ratio": 0.0, - "completion_length": 1445.3326416015625, + "completion_length": 1817.4576416015625, "epoch": 0.6248973190949145, - "grad_norm": 4.4218831062316895, - "kl": 0.36279296875, - "learning_rate": 3.705904774487396e-08, - "loss": 0.0455, - "reward": 0.4391741305589676, - "reward_std": 0.2083924189209938, - "rewards/accuracy_reward": 0.1026785746216774, + "grad_norm": 1.4974085092544556, + "kl": 0.3212890625, + "learning_rate": 1.8529523872436977e-07, + "loss": 0.1111, + "reward": 0.4229910895228386, + "reward_std": 0.1995687335729599, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.3158482238650322, "step": 2092 }, { "clip_ratio": 0.0, - "completion_length": 1396.7522888183594, + "completion_length": 1762.3728332519531, "epoch": 0.6251960271824359, - "grad_norm": 4.699893474578857, - "kl": 0.33447265625, - "learning_rate": 3.700868053229896e-08, - "loss": 0.0742, - "reward": 0.4190848395228386, - "reward_std": 0.1786315254867077, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 1.1088848114013672, + "kl": 0.30078125, + "learning_rate": 1.850434026614948e-07, + "loss": 0.1091, + "reward": 0.3906250149011612, + "reward_std": 0.15429268591105938, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169887661934, + "rewards/tag_count_reward": 0.3325893059372902, "step": 2093 }, { "clip_ratio": 0.0, - "completion_length": 1456.7098999023438, + "completion_length": 1827.7947387695312, "epoch": 0.6254947352699575, - "grad_norm": 5.603359699249268, - "kl": 0.40234375, - "learning_rate": 3.695832745300626e-08, - "loss": 0.0438, - "reward": 0.4453125074505806, - "reward_std": 0.20706118270754814, - "rewards/accuracy_reward": 0.11383928754366934, + "grad_norm": 1.5202471017837524, + "kl": 0.31103515625, + "learning_rate": 1.8479163726503127e-07, + "loss": 0.1084, + "reward": 0.4196428805589676, + "reward_std": 0.18249313905835152, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.3281250149011612, "step": 2094 }, { "clip_ratio": 0.0, - "completion_length": 1369.1027221679688, + "completion_length": 1721.10498046875, "epoch": 0.6257934433574789, - "grad_norm": 4.811675548553467, - "kl": 0.322265625, - "learning_rate": 3.690798856177505e-08, - "loss": 0.0626, - "reward": 0.3833705559372902, - "reward_std": 0.17823020368814468, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 1.0438306331634521, + "kl": 0.27880859375, + "learning_rate": 1.8453994280887524e-07, + "loss": 0.1678, + "reward": 0.3750000149011612, + "reward_std": 0.16945624724030495, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3565848395228386, + "rewards/tag_count_reward": 0.3526785895228386, "step": 2095 }, { "clip_ratio": 0.0, - "completion_length": 1456.2344665527344, + "completion_length": 1826.57373046875, "epoch": 0.6260921514450004, - "grad_norm": 5.506725311279297, - "kl": 0.4248046875, - "learning_rate": 3.685766391336915e-08, - "loss": 0.0639, - "reward": 0.3766741156578064, - "reward_std": 0.20943843200802803, - "rewards/accuracy_reward": 0.07366071920841932, + "grad_norm": 0.7396443486213684, + "kl": 0.3291015625, + "learning_rate": 1.8428831956684576e-07, + "loss": 0.1198, + "reward": 0.400111623108387, + "reward_std": 0.17511969432234764, + "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.3197544813156128, "step": 2096 }, { "clip_ratio": 0.0, - "completion_length": 1415.7835693359375, + "completion_length": 1833.0045471191406, "epoch": 0.6263908595325218, - "grad_norm": 4.397119522094727, - "kl": 0.384765625, - "learning_rate": 3.680735356253682e-08, - "loss": 0.0386, - "reward": 0.4910714477300644, - "reward_std": 0.19855409488081932, - "rewards/accuracy_reward": 0.16517857927829027, + "grad_norm": 1.3237217664718628, + "kl": 0.35302734375, + "learning_rate": 1.840367678126841e-07, + "loss": 0.1208, + "reward": 0.475446455180645, + "reward_std": 0.14890214428305626, + "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928805589676, + "rewards/tag_count_reward": 0.3169643059372902, "step": 2097 }, { "clip_ratio": 0.0, - "completion_length": 1440.2188110351562, + "completion_length": 1817.6027526855469, "epoch": 0.6266895676200434, - "grad_norm": 4.639224052429199, - "kl": 0.44384765625, - "learning_rate": 3.675705756401078e-08, - "loss": 0.0587, - "reward": 0.344308041036129, - "reward_std": 0.17438779026269913, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 0.9102877974510193, + "kl": 0.330810546875, + "learning_rate": 1.8378528782005393e-07, + "loss": 0.1034, + "reward": 0.3666294813156128, + "reward_std": 0.16900014504790306, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.301897332072258, + "rewards/tag_count_reward": 0.3242187723517418, "step": 2098 }, { "clip_ratio": 0.0, - "completion_length": 1369.0469055175781, + "completion_length": 1729.2433471679688, "epoch": 0.6269882757075648, - "grad_norm": 6.0742292404174805, - "kl": 0.40966796875, - "learning_rate": 3.670677597250818e-08, - "loss": 0.0685, - "reward": 0.404017873108387, - "reward_std": 0.22312000021338463, - "rewards/accuracy_reward": 0.06250000465661287, + "grad_norm": 0.8752507567405701, + "kl": 0.31982421875, + "learning_rate": 1.8353387986254092e-07, + "loss": 0.1534, + "reward": 0.3984375149011612, + "reward_std": 0.19472495838999748, + "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.3426339477300644, "step": 2099 }, { "clip_ratio": 0.0, - "completion_length": 1426.6138916015625, + "completion_length": 1773.6741943359375, "epoch": 0.6272869837950863, - "grad_norm": 6.500479698181152, - "kl": 0.39453125, - "learning_rate": 3.665650884273045e-08, - "loss": 0.0383, - "reward": 0.4302455484867096, - "reward_std": 0.1861368790268898, - "rewards/accuracy_reward": 0.09151786006987095, + "grad_norm": 1.2365474700927734, + "kl": 0.30224609375, + "learning_rate": 1.8328254421365226e-07, + "loss": 0.1339, + "reward": 0.4525669887661934, + "reward_std": 0.17674745246767998, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.356584832072258, "step": 2100 }, { "clip_ratio": 0.0, - "completion_length": 1446.2835693359375, + "completion_length": 1798.4643859863281, "epoch": 0.6275856918826077, - "grad_norm": 5.894349098205566, - "kl": 0.3857421875, - "learning_rate": 3.66062562293633e-08, - "loss": 0.0405, - "reward": 0.3950893059372902, - "reward_std": 0.18128804117441177, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 1.1902239322662354, + "kl": 0.33642578125, + "learning_rate": 1.830312811468165e-07, + "loss": 0.1414, + "reward": 0.404017873108387, + "reward_std": 0.15046238899230957, + "rewards/accuracy_reward": 0.05580357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.3482143059372902, "step": 2101 }, { "clip_ratio": 0.0, - "completion_length": 1422.5514221191406, + "completion_length": 1749.0670471191406, "epoch": 0.6278843999701292, - "grad_norm": 5.327123641967773, - "kl": 0.38330078125, - "learning_rate": 3.655601818707662e-08, - "loss": 0.0666, - "reward": 0.4202009066939354, - "reward_std": 0.2010842263698578, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 0.9776829481124878, + "kl": 0.30517578125, + "learning_rate": 1.827800909353831e-07, + "loss": 0.1478, + "reward": 0.4570312723517418, + "reward_std": 0.19961456209421158, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830559372902, + "rewards/tag_count_reward": 0.361049123108387, "step": 2102 }, { "clip_ratio": 0.0, - "completion_length": 1389.5803833007812, + "completion_length": 1696.3683776855469, "epoch": 0.6281831080576507, - "grad_norm": 4.376345157623291, - "kl": 0.359375, - "learning_rate": 3.6505794770524514e-08, - "loss": 0.0572, - "reward": 0.3816964477300644, - "reward_std": 0.16352875344455242, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 0.7158123850822449, + "kl": 0.3125, + "learning_rate": 1.8252897385262258e-07, + "loss": 0.1189, + "reward": 0.4123884215950966, + "reward_std": 0.1254323273897171, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250223517418, + "rewards/tag_count_reward": 0.3588169887661934, "step": 2103 }, { "clip_ratio": 0.0, - "completion_length": 1474.5156860351562, + "completion_length": 1788.5447082519531, "epoch": 0.6284818161451722, - "grad_norm": 3.870590925216675, - "kl": 0.37109375, - "learning_rate": 3.6455586034345134e-08, - "loss": 0.0609, - "reward": 0.3738839402794838, - "reward_std": 0.16733966767787933, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 0.7134464979171753, + "kl": 0.32666015625, + "learning_rate": 1.8227793017172566e-07, + "loss": 0.1378, + "reward": 0.4073660969734192, + "reward_std": 0.1486248429864645, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.3537946566939354, "step": 2104 }, { "clip_ratio": 0.0, - "completion_length": 1430.1719360351562, + "completion_length": 1743.8081359863281, "epoch": 0.6287805242326936, - "grad_norm": 5.809938907623291, - "kl": 0.3525390625, - "learning_rate": 3.640539203316063e-08, - "loss": 0.0454, - "reward": 0.473772332072258, - "reward_std": 0.1996864639222622, - "rewards/accuracy_reward": 0.12053572130389512, + "grad_norm": 0.6041935086250305, + "kl": 0.30126953125, + "learning_rate": 1.8202696016580316e-07, + "loss": 0.1267, + "reward": 0.5050223469734192, + "reward_std": 0.18869824893772602, + "rewards/accuracy_reward": 0.13169643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3532366305589676, + "rewards/tag_count_reward": 0.373325914144516, "step": 2105 }, { "clip_ratio": 0.0, - "completion_length": 1439.7813415527344, + "completion_length": 1734.3572387695312, "epoch": 0.6290792323202151, - "grad_norm": 5.905862808227539, - "kl": 0.388671875, - "learning_rate": 3.63552128215772e-08, - "loss": 0.0633, - "reward": 0.436383955180645, - "reward_std": 0.19392670318484306, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 0.5518330931663513, + "kl": 0.3251953125, + "learning_rate": 1.81776064107886e-07, + "loss": 0.1403, + "reward": 0.4921875149011612, + "reward_std": 0.17287111282348633, + "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.3627232387661934, "step": 2106 }, { "clip_ratio": 0.0, - "completion_length": 1426.1339721679688, + "completion_length": 1681.6139221191406, "epoch": 0.6293779404077365, - "grad_norm": 6.272287368774414, - "kl": 0.376953125, - "learning_rate": 3.630504845418488e-08, - "loss": 0.0917, - "reward": 0.4681919887661934, - "reward_std": 0.1889251470565796, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 0.5544466376304626, + "kl": 0.273681640625, + "learning_rate": 1.815252422709244e-07, + "loss": 0.1501, + "reward": 0.5301339477300644, + "reward_std": 0.1689399555325508, + "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.3872767984867096, "step": 2107 }, { "clip_ratio": 0.0, - "completion_length": 1442.77685546875, + "completion_length": 1680.368408203125, "epoch": 0.6296766484952581, - "grad_norm": 7.936668872833252, - "kl": 0.38623046875, - "learning_rate": 3.625489898555759e-08, - "loss": 0.0787, - "reward": 0.428571455180645, - "reward_std": 0.18310129642486572, - "rewards/accuracy_reward": 0.0959821455180645, + "grad_norm": 0.8801741600036621, + "kl": 0.279541015625, + "learning_rate": 1.8127449492778796e-07, + "loss": 0.1538, + "reward": 0.4994419887661934, + "reward_std": 0.16287441924214363, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893133878708, + "rewards/tag_count_reward": 0.3833705484867096, "step": 2108 }, { "clip_ratio": 0.0, - "completion_length": 1423.4598999023438, + "completion_length": 1713.9889221191406, "epoch": 0.6299753565827795, - "grad_norm": 5.268916606903076, - "kl": 0.39404296875, - "learning_rate": 3.6204764470253035e-08, - "loss": 0.0626, - "reward": 0.4531250149011612, - "reward_std": 0.19209931790828705, - "rewards/accuracy_reward": 0.11607143469154835, + "grad_norm": 0.5388920307159424, + "kl": 0.291259765625, + "learning_rate": 1.8102382235126518e-07, + "loss": 0.1479, + "reward": 0.5122768133878708, + "reward_std": 0.16867785528302193, + "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.3783482313156128, "step": 2109 }, { "clip_ratio": 0.0, - "completion_length": 1452.1831359863281, + "completion_length": 1750.76123046875, "epoch": 0.6302740646703009, - "grad_norm": 8.50667667388916, - "kl": 0.39013671875, - "learning_rate": 3.6154644962812664e-08, - "loss": 0.0701, - "reward": 0.3437500149011612, - "reward_std": 0.185895886272192, - "rewards/accuracy_reward": 0.022321430034935474, + "grad_norm": 0.6623998284339905, + "kl": 0.3154296875, + "learning_rate": 1.8077322481406332e-07, + "loss": 0.1265, + "reward": 0.3995535895228386, + "reward_std": 0.1532442942261696, + "rewards/accuracy_reward": 0.026785714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285746216774, + "rewards/tag_count_reward": 0.372767873108387, "step": 2110 }, { "clip_ratio": 0.0, - "completion_length": 1423.9732971191406, + "completion_length": 1654.4822082519531, "epoch": 0.6305727727578224, - "grad_norm": 5.7852463722229, - "kl": 0.35009765625, - "learning_rate": 3.610454051776159e-08, - "loss": 0.0689, - "reward": 0.3950893059372902, - "reward_std": 0.1883166842162609, - "rewards/accuracy_reward": 0.060267859837040305, + "grad_norm": 0.4257897734642029, + "kl": 0.26318359375, + "learning_rate": 1.8052270258880795e-07, + "loss": 0.1225, + "reward": 0.4771205633878708, + "reward_std": 0.17281510308384895, + "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4012276902794838, "step": 2111 }, { "clip_ratio": 0.0, - "completion_length": 1464.4397888183594, + "completion_length": 1777.5804138183594, "epoch": 0.6308714808453438, - "grad_norm": 5.997575759887695, - "kl": 0.40185546875, - "learning_rate": 3.605445118960851e-08, - "loss": 0.0594, - "reward": 0.364955373108387, - "reward_std": 0.1559458151459694, + "grad_norm": 0.725608229637146, + "kl": 0.34912109375, + "learning_rate": 1.8027225594804257e-07, + "loss": 0.1419, + "reward": 0.3995535895228386, + "reward_std": 0.13435711525380611, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.3593750074505806, "step": 2112 }, { "clip_ratio": 0.0, - "completion_length": 1401.63623046875, + "completion_length": 1617.80810546875, "epoch": 0.6311701889328654, - "grad_norm": 5.52393913269043, - "kl": 0.3720703125, - "learning_rate": 3.600437703284575e-08, - "loss": 0.0721, - "reward": 0.4486607313156128, - "reward_std": 0.2110895998775959, - "rewards/accuracy_reward": 0.10937500651925802, + "grad_norm": 0.4285762906074524, + "kl": 0.27099609375, + "learning_rate": 1.8002188516422873e-07, + "loss": 0.1523, + "reward": 0.4994419887661934, + "reward_std": 0.149072527885437, + "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.3989955633878708, "step": 2113 }, { "clip_ratio": 0.0, - "completion_length": 1483.6272888183594, + "completion_length": 1762.5804443359375, "epoch": 0.6314688970203868, - "grad_norm": 3.6957249641418457, - "kl": 0.3740234375, - "learning_rate": 3.5954318101949045e-08, - "loss": 0.0639, - "reward": 0.4330357313156128, - "reward_std": 0.19995862245559692, - "rewards/accuracy_reward": 0.10714286100119352, + "grad_norm": 0.6978349685668945, + "kl": 0.3232421875, + "learning_rate": 1.7977159050974522e-07, + "loss": 0.1341, + "reward": 0.4559151902794838, + "reward_std": 0.1532731056213379, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.3710937649011612, "step": 2114 }, { "clip_ratio": 0.0, - "completion_length": 1422.7322082519531, + "completion_length": 1732.7389221191406, "epoch": 0.6317676051079083, - "grad_norm": 7.633825302124023, - "kl": 0.392578125, - "learning_rate": 3.5904274451377634e-08, - "loss": 0.0501, - "reward": 0.4458705633878708, - "reward_std": 0.24786751717329025, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 0.38722729682922363, + "kl": 0.31640625, + "learning_rate": 1.7952137225688816e-07, + "loss": 0.0963, + "reward": 0.526785746216774, + "reward_std": 0.23611834272742271, + "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.3794643059372902, "step": 2115 }, { "clip_ratio": 0.0, - "completion_length": 1384.7389221191406, + "completion_length": 1719.0960693359375, "epoch": 0.6320663131954297, - "grad_norm": 10.45849323272705, - "kl": 0.42724609375, - "learning_rate": 3.5854246135574066e-08, - "loss": 0.07, - "reward": 0.424107164144516, - "reward_std": 0.18778224289417267, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 0.4838818907737732, + "kl": 0.3466796875, + "learning_rate": 1.7927123067787033e-07, + "loss": 0.1491, + "reward": 0.5000000298023224, + "reward_std": 0.18269427865743637, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.372767873108387, "step": 2116 }, { "clip_ratio": 0.0, - "completion_length": 1488.3683471679688, + "completion_length": 1824.3259887695312, "epoch": 0.6323650212829512, - "grad_norm": 5.826597690582275, - "kl": 0.375, - "learning_rate": 3.5804233208964284e-08, - "loss": 0.0629, - "reward": 0.4162946715950966, - "reward_std": 0.1790376491844654, - "rewards/accuracy_reward": 0.08705357322469354, + "grad_norm": 0.4241102337837219, + "kl": 0.3349609375, + "learning_rate": 1.7902116604482143e-07, + "loss": 0.1143, + "reward": 0.4587053805589676, + "reward_std": 0.15574504621326923, + "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.3694196566939354, "step": 2117 }, { "clip_ratio": 0.0, - "completion_length": 1495.2746276855469, + "completion_length": 1785.5759582519531, "epoch": 0.6326637293704727, - "grad_norm": 5.585185527801514, - "kl": 0.41796875, - "learning_rate": 3.5754235725957445e-08, - "loss": 0.0399, - "reward": 0.3225446492433548, - "reward_std": 0.16779286786913872, + "grad_norm": 0.4569185972213745, + "kl": 0.32763671875, + "learning_rate": 1.7877117862978722e-07, + "loss": 0.1126, + "reward": 0.3816964477300644, + "reward_std": 0.14352201856672764, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.3683035895228386, "step": 2118 }, { "clip_ratio": 0.0, - "completion_length": 1448.6763916015625, + "completion_length": 1776.9532165527344, "epoch": 0.6329624374579942, - "grad_norm": 9.146811485290527, - "kl": 0.39111328125, - "learning_rate": 3.570425374094588e-08, - "loss": 0.0488, - "reward": 0.3342634066939354, - "reward_std": 0.14260813407599926, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 0.39548131823539734, + "kl": 0.32763671875, + "learning_rate": 1.785212687047294e-07, + "loss": 0.1134, + "reward": 0.4056919813156128, + "reward_std": 0.17617937549948692, + "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.3722098395228386, "step": 2119 }, { "clip_ratio": 0.0, - "completion_length": 1490.7679138183594, + "completion_length": 1696.8014221191406, "epoch": 0.6332611455455156, - "grad_norm": 7.134737014770508, - "kl": 0.369140625, - "learning_rate": 3.565428730830513e-08, - "loss": 0.0546, - "reward": 0.3588169813156128, - "reward_std": 0.1501086950302124, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 0.34707358479499817, + "kl": 0.2841796875, + "learning_rate": 1.7827143654152565e-07, + "loss": 0.1235, + "reward": 0.427455373108387, + "reward_std": 0.13176509737968445, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.3984375223517418, "step": 2120 }, { "clip_ratio": 0.0, - "completion_length": 1487.368408203125, + "completion_length": 1735.0915832519531, "epoch": 0.6335598536330371, - "grad_norm": 7.8896002769470215, - "kl": 0.3515625, - "learning_rate": 3.560433648239375e-08, - "loss": 0.0803, - "reward": 0.3934151902794838, - "reward_std": 0.20536361634731293, - "rewards/accuracy_reward": 0.06473214738070965, + "grad_norm": 0.38854965567588806, + "kl": 0.266845703125, + "learning_rate": 1.7802168241196876e-07, + "loss": 0.1088, + "reward": 0.4905134066939354, + "reward_std": 0.1862284429371357, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.3989955559372902, "step": 2121 }, { "clip_ratio": 0.0, - "completion_length": 1388.9197082519531, + "completion_length": 1694.2925109863281, "epoch": 0.6338585617205585, - "grad_norm": 9.894698143005371, - "kl": 0.43896484375, - "learning_rate": 3.5554401317553354e-08, - "loss": 0.0814, - "reward": 0.356584832072258, - "reward_std": 0.15422919392585754, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 1371.439208984375, + "kl": 4.068359375, + "learning_rate": 1.7777200658776678e-07, + "loss": 0.2675, + "reward": 0.4419643059372902, + "reward_std": 0.16847360506653786, + "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062574505806, + "rewards/tag_count_reward": 0.3772321566939354, "step": 2122 }, { "clip_ratio": 0.0, - "completion_length": 1433.13623046875, + "completion_length": 1685.0179138183594, "epoch": 0.6341572698080801, - "grad_norm": 6.894671440124512, - "kl": 0.365234375, - "learning_rate": 3.550448186810849e-08, - "loss": 0.0752, - "reward": 0.4765625223517418, - "reward_std": 0.2308347187936306, - "rewards/accuracy_reward": 0.13616071734577417, + "grad_norm": 0.5019664764404297, + "kl": 0.290283203125, + "learning_rate": 1.7752240934054247e-07, + "loss": 0.1359, + "reward": 0.5440848395228386, + "reward_std": 0.19859091006219387, + "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.3967634066939354, "step": 2123 }, { "clip_ratio": 0.0, - "completion_length": 1405.8973999023438, + "completion_length": 1720.4509582519531, "epoch": 0.6344559778956015, - "grad_norm": 10.270512580871582, - "kl": 0.44287109375, - "learning_rate": 3.5454578188366654e-08, - "loss": 0.0744, - "reward": 0.4369419813156128, - "reward_std": 0.15187473595142365, - "rewards/accuracy_reward": 0.11607143515720963, + "grad_norm": 0.6113325953483582, + "kl": 0.31396484375, + "learning_rate": 1.772728909418333e-07, + "loss": 0.128, + "reward": 0.5106026977300644, + "reward_std": 0.1420088279992342, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.392299123108387, "step": 2124 }, { "clip_ratio": 0.0, - "completion_length": 1449.8706359863281, + "completion_length": 1743.2612609863281, "epoch": 0.634754685983123, - "grad_norm": 5.693600177764893, - "kl": 0.37109375, - "learning_rate": 3.540469033261815e-08, - "loss": 0.0132, - "reward": 0.4107142984867096, - "reward_std": 0.13257848285138607, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 0.622826874256134, + "kl": 0.2978515625, + "learning_rate": 1.7702345166309075e-07, + "loss": 0.1133, + "reward": 0.464285746216774, + "reward_std": 0.126892926171422, + "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.3861607387661934, "step": 2125 }, { "clip_ratio": 0.0, - "completion_length": 1425.83935546875, + "completion_length": 1780.6786804199219, "epoch": 0.6350533940706444, - "grad_norm": 9.414541244506836, - "kl": 0.396484375, - "learning_rate": 3.535481835513605e-08, - "loss": 0.0555, - "reward": 0.4252232313156128, - "reward_std": 0.1887710727751255, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 0.9348044991493225, + "kl": 0.3115234375, + "learning_rate": 1.7677409177568025e-07, + "loss": 0.1073, + "reward": 0.4492187798023224, + "reward_std": 0.16591059044003487, + "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3448660969734192, + "rewards/tag_count_reward": 0.3822544813156128, "step": 2126 }, { "clip_ratio": 0.0, - "completion_length": 1443.3974304199219, + "completion_length": 1764.5402221679688, "epoch": 0.635352102158166, - "grad_norm": 8.699043273925781, - "kl": 0.37109375, - "learning_rate": 3.530496231017621e-08, - "loss": 0.0656, - "reward": 0.4034598395228386, - "reward_std": 0.1891399398446083, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 0.40611472725868225, + "kl": 0.30126953125, + "learning_rate": 1.7652481155088106e-07, + "loss": 0.1094, + "reward": 0.467075914144516, + "reward_std": 0.1750699169933796, + "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.388950914144516, "step": 2127 }, { "clip_ratio": 0.0, - "completion_length": 1522.7054138183594, + "completion_length": 1822.5782165527344, "epoch": 0.6356508102456874, - "grad_norm": 7.4040207862854, - "kl": 0.40869140625, - "learning_rate": 3.5255122251977114e-08, - "loss": 0.0447, - "reward": 0.4419643133878708, - "reward_std": 0.18500856682658195, - "rewards/accuracy_reward": 0.12946429196745157, + "grad_norm": 1.2373063564300537, + "kl": 0.35888671875, + "learning_rate": 1.7627561125988557e-07, + "loss": 0.0972, + "reward": 0.513950914144516, + "reward_std": 0.16292742639780045, + "rewards/accuracy_reward": 0.1361607185099274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.3777901902794838, "step": 2128 }, { "clip_ratio": 0.0, - "completion_length": 1426.2522888183594, + "completion_length": 1713.8438415527344, "epoch": 0.6359495183332089, - "grad_norm": 10.762430191040039, - "kl": 0.37744140625, - "learning_rate": 3.520529823475985e-08, - "loss": 0.0672, - "reward": 0.4196428656578064, - "reward_std": 0.18383062817156315, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 0.4926605224609375, + "kl": 0.265869140625, + "learning_rate": 1.7602649117379925e-07, + "loss": 0.1156, + "reward": 0.5033482387661934, + "reward_std": 0.13990241661667824, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.330357164144516, + "rewards/tag_count_reward": 0.4051339402794838, "step": 2129 }, { "clip_ratio": 0.0, - "completion_length": 1472.2835693359375, + "completion_length": 1745.6139221191406, "epoch": 0.6362482264207303, - "grad_norm": 8.643102645874023, - "kl": 0.41162109375, - "learning_rate": 3.515549031272806e-08, - "loss": 0.0801, - "reward": 0.3292410895228386, - "reward_std": 0.1828749217092991, - "rewards/accuracy_reward": 0.020089286379516125, + "grad_norm": 0.4288490414619446, + "kl": 0.292236328125, + "learning_rate": 1.7577745156364033e-07, + "loss": 0.0938, + "reward": 0.4302455484867096, + "reward_std": 0.178131565451622, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.3989955559372902, "step": 2130 }, { "clip_ratio": 0.0, - "completion_length": 1388.6451416015625, + "completion_length": 1725.0849304199219, "epoch": 0.6365469345082518, - "grad_norm": 6.486000061035156, - "kl": 0.3603515625, - "learning_rate": 3.510569854006791e-08, - "loss": 0.0531, - "reward": 0.4375000149011612, - "reward_std": 0.1911369003355503, - "rewards/accuracy_reward": 0.09375000279396772, + "grad_norm": 0.41701826453208923, + "kl": 0.2763671875, + "learning_rate": 1.7552849270033954e-07, + "loss": 0.0956, + "reward": 0.4799107238650322, + "reward_std": 0.16452564299106598, + "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.3973214477300644, "step": 2131 }, { "clip_ratio": 0.0, - "completion_length": 1473.5536499023438, + "completion_length": 1793.4420471191406, "epoch": 0.6368456425957733, - "grad_norm": 9.462711334228516, - "kl": 0.3828125, - "learning_rate": 3.505592297094794e-08, - "loss": 0.0654, - "reward": 0.491071455180645, - "reward_std": 0.2513725012540817, - "rewards/accuracy_reward": 0.15401786426082253, + "grad_norm": 0.46655797958374023, + "kl": 0.296630859375, + "learning_rate": 1.752796148547397e-07, + "loss": 0.0929, + "reward": 0.5563616454601288, + "reward_std": 0.2164175733923912, + "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.3956473469734192, "step": 2132 }, { "clip_ratio": 0.0, - "completion_length": 1310.1361999511719, + "completion_length": 1730.8773193359375, "epoch": 0.6371443506832948, - "grad_norm": 10.193946838378906, - "kl": 0.3828125, - "learning_rate": 3.500616365951909e-08, - "loss": 0.0775, - "reward": 0.4737723395228386, - "reward_std": 0.1731637604534626, - "rewards/accuracy_reward": 0.13169643771834671, + "grad_norm": 0.4649956226348877, + "kl": 0.30029296875, + "learning_rate": 1.7503081829759545e-07, + "loss": 0.0957, + "reward": 0.5290178880095482, + "reward_std": 0.16916153579950333, + "rewards/accuracy_reward": 0.14062500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.342075914144516, + "rewards/tag_count_reward": 0.388392873108387, "step": 2133 }, { "clip_ratio": 0.0, - "completion_length": 1439.2054138183594, + "completion_length": 1745.0022888183594, "epoch": 0.6374430587708162, - "grad_norm": 10.87529468536377, - "kl": 0.39208984375, - "learning_rate": 3.4956420659914635e-08, - "loss": 0.0852, - "reward": 0.4620535969734192, - "reward_std": 0.2164171077311039, - "rewards/accuracy_reward": 0.12500000488944352, + "grad_norm": 8.007674217224121, + "kl": 0.343505859375, + "learning_rate": 1.7478210329957315e-07, + "loss": 0.1052, + "reward": 0.5200893059372902, + "reward_std": 0.17915472760796547, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.4040178656578064, "step": 2134 }, { "clip_ratio": 0.0, - "completion_length": 1397.4777526855469, + "completion_length": 1755.0625915527344, "epoch": 0.6377417668583377, - "grad_norm": 7.608232021331787, - "kl": 0.31298828125, - "learning_rate": 3.490669402625007e-08, - "loss": 0.0419, - "reward": 0.4369419813156128, - "reward_std": 0.1938200667500496, - "rewards/accuracy_reward": 0.09151786286383867, + "grad_norm": 0.5245476961135864, + "kl": 0.2451171875, + "learning_rate": 1.7453347013125035e-07, + "loss": 0.0868, + "reward": 0.5273437723517418, + "reward_std": 0.20810982212424278, + "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3454241156578064, + "rewards/tag_count_reward": 0.4023437723517418, "step": 2135 }, { "clip_ratio": 0.0, - "completion_length": 1392.024658203125, + "completion_length": 1714.6697082519531, "epoch": 0.6380404749458591, - "grad_norm": 10.12797737121582, - "kl": 0.376953125, - "learning_rate": 3.485698381262311e-08, - "loss": 0.0728, - "reward": 0.4536830559372902, - "reward_std": 0.19097505509853363, - "rewards/accuracy_reward": 0.11383929336443543, + "grad_norm": 0.5301141142845154, + "kl": 0.22265625, + "learning_rate": 1.7428491906311555e-07, + "loss": 0.08, + "reward": 0.554129496216774, + "reward_std": 0.18122979626059532, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2136 }, { "clip_ratio": 0.0, - "completion_length": 1418.8795166015625, + "completion_length": 1786.2969360351562, "epoch": 0.6383391830333807, - "grad_norm": 9.3640775680542, - "kl": 0.419921875, - "learning_rate": 3.480729007311357e-08, - "loss": 0.0632, - "reward": 0.4023437649011612, - "reward_std": 0.17107393592596054, - "rewards/accuracy_reward": 0.09375000186264515, + "grad_norm": 0.3667783737182617, + "kl": 0.290771484375, + "learning_rate": 1.7403645036556785e-07, + "loss": 0.0903, + "reward": 0.491071455180645, + "reward_std": 0.15836292132735252, + "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937649011612, + "rewards/tag_count_reward": 0.397321455180645, "step": 2137 }, { "clip_ratio": 0.0, - "completion_length": 1358.9330749511719, + "completion_length": 1705.6540832519531, "epoch": 0.6386378911209021, - "grad_norm": 8.931136131286621, - "kl": 0.369873046875, - "learning_rate": 3.4757612861783405e-08, - "loss": 0.0561, - "reward": 0.4815848469734192, - "reward_std": 0.21705275774002075, - "rewards/accuracy_reward": 0.13839286053553224, + "grad_norm": 0.3673921823501587, + "kl": 0.260498046875, + "learning_rate": 1.73788064308917e-07, + "loss": 0.0957, + "reward": 0.5613839700818062, + "reward_std": 0.2142045982182026, + "rewards/accuracy_reward": 0.15848214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.4029018133878708, "step": 2138 }, { "clip_ratio": 0.0, - "completion_length": 1389.2344360351562, + "completion_length": 1711.5067749023438, "epoch": 0.6389365992084236, - "grad_norm": 10.940924644470215, - "kl": 0.33056640625, - "learning_rate": 3.4707952232676516e-08, - "loss": 0.0681, - "reward": 0.415736623108387, - "reward_std": 0.18142638728022575, - "rewards/accuracy_reward": 0.05803571594879031, + "grad_norm": 0.45877182483673096, + "kl": 0.23193359375, + "learning_rate": 1.7353976116338258e-07, + "loss": 0.105, + "reward": 0.4771205559372902, + "reward_std": 0.1556875165551901, + "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3577009066939354, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2139 }, { "clip_ratio": 0.0, - "completion_length": 1400.7589721679688, + "completion_length": 1701.8482666015625, "epoch": 0.639235307295945, - "grad_norm": 11.297125816345215, - "kl": 0.37841796875, - "learning_rate": 3.465830823981883e-08, - "loss": 0.0767, - "reward": 0.4073660895228386, - "reward_std": 0.17442082986235619, - "rewards/accuracy_reward": 0.07142857182770967, + "grad_norm": 0.4578523635864258, + "kl": 0.236328125, + "learning_rate": 1.7329154119909413e-07, + "loss": 0.0999, + "reward": 0.4921875223517418, + "reward_std": 0.11706082336604595, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2140 }, { "clip_ratio": 0.0, - "completion_length": 1347.0045471191406, + "completion_length": 1719.2478332519531, "epoch": 0.6395340153834665, - "grad_norm": 13.225234031677246, - "kl": 0.34326171875, - "learning_rate": 3.460868093721811e-08, - "loss": 0.0983, - "reward": 0.4676339477300644, - "reward_std": 0.17161382362246513, - "rewards/accuracy_reward": 0.11830357578583062, + "grad_norm": 0.453637957572937, + "kl": 0.264892578125, + "learning_rate": 1.7304340468609058e-07, + "loss": 0.0868, + "reward": 0.521205373108387, + "reward_std": 0.1501566246151924, + "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3493303656578064, + "rewards/tag_count_reward": 0.4073660895228386, "step": 2141 }, { "clip_ratio": 0.0, - "completion_length": 1407.9442443847656, + "completion_length": 1718.2813110351562, "epoch": 0.639832723470988, - "grad_norm": 14.030302047729492, - "kl": 0.3994140625, - "learning_rate": 3.455907037886404e-08, - "loss": 0.0781, - "reward": 0.4972098544239998, - "reward_std": 0.1751926802098751, - "rewards/accuracy_reward": 0.1696428705472499, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "grad_norm": 0.4856603443622589, + "kl": 0.242919921875, + "learning_rate": 1.727953518943202e-07, + "loss": 0.1056, + "reward": 0.5814732387661934, + "reward_std": 0.1565337087959051, + "rewards/accuracy_reward": 0.16741072502918541, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4140625223517418, "step": 2142 }, { "clip_ratio": 0.0, - "completion_length": 1452.0782165527344, + "completion_length": 1794.7656860351562, "epoch": 0.6401314315585095, - "grad_norm": 11.751813888549805, - "kl": 0.41064453125, - "learning_rate": 3.4509476618728015e-08, - "loss": 0.0709, - "reward": 0.3543526902794838, - "reward_std": 0.20746931433677673, - "rewards/accuracy_reward": 0.03125000046566129, + "grad_norm": 0.9509595036506653, + "kl": 0.264892578125, + "learning_rate": 1.7254738309364008e-07, + "loss": 0.0839, + "reward": 0.444196455180645, + "reward_std": 0.17128043435513973, + "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026828289032, + "rewards/tag_count_reward": 0.4062500223517418, "step": 2143 }, { "clip_ratio": 0.0, - "completion_length": 1358.9665832519531, + "completion_length": 1666.4509887695312, "epoch": 0.6404301396460309, - "grad_norm": 16.03166961669922, - "kl": 0.34619140625, - "learning_rate": 3.4459899710763196e-08, - "loss": 0.0896, - "reward": 0.464285746216774, - "reward_std": 0.17406337335705757, - "rewards/accuracy_reward": 0.1004464365541935, + "grad_norm": 0.4770304560661316, + "kl": 0.206298828125, + "learning_rate": 1.7229949855381598e-07, + "loss": 0.1051, + "reward": 0.5368303954601288, + "reward_std": 0.12840250879526138, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3638393059372902, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2144 }, { "clip_ratio": 0.0, - "completion_length": 1452.1228332519531, + "completion_length": 1731.5692749023438, "epoch": 0.6407288477335524, - "grad_norm": 12.41916561126709, - "kl": 0.337890625, - "learning_rate": 3.44103397089044e-08, - "loss": 0.0808, - "reward": 0.3309151977300644, - "reward_std": 0.1793864667415619, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 0.4486764669418335, + "kl": 0.223876953125, + "learning_rate": 1.72051698544522e-07, + "loss": 0.0949, + "reward": 0.4497768133878708, + "reward_std": 0.15864156931638718, + "rewards/accuracy_reward": 0.03125000046566129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901977300644, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2145 }, { "clip_ratio": 0.0, - "completion_length": 1439.3058776855469, + "completion_length": 1743.7433776855469, "epoch": 0.6410275558210738, - "grad_norm": 12.08767318725586, - "kl": 0.359375, - "learning_rate": 3.4360796667068046e-08, - "loss": 0.0832, - "reward": 0.4029018133878708, - "reward_std": 0.1679161749780178, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 0.41825202107429504, + "kl": 0.213134765625, + "learning_rate": 1.7180398333534025e-07, + "loss": 0.0813, + "reward": 0.466517873108387, + "reward_std": 0.11457138508558273, + "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625223517418, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2146 }, { "clip_ratio": 0.0, - "completion_length": 1382.6138916015625, + "completion_length": 1713.2991638183594, "epoch": 0.6413262639085954, - "grad_norm": 7.969903469085693, - "kl": 0.39892578125, - "learning_rate": 3.4311270639152125e-08, - "loss": 0.0377, - "reward": 0.4140625149011612, - "reward_std": 0.15697452798485756, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 0.31682437658309937, + "kl": 0.243408203125, + "learning_rate": 1.715563531957606e-07, + "loss": 0.068, + "reward": 0.5245535969734192, + "reward_std": 0.14568622782826424, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482238650322, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2147 }, { "clip_ratio": 0.0, - "completion_length": 1430.1965026855469, + "completion_length": 1757.3192749023438, "epoch": 0.6416249719961168, - "grad_norm": 15.424529075622559, - "kl": 0.37060546875, - "learning_rate": 3.426176167903606e-08, - "loss": 0.075, - "reward": 0.3928571566939354, - "reward_std": 0.17054926604032516, - "rewards/accuracy_reward": 0.06250000186264515, + "grad_norm": 1.4602437019348145, + "kl": 0.24853515625, + "learning_rate": 1.7130880839518032e-07, + "loss": 0.0658, + "reward": 0.4871652126312256, + "reward_std": 0.14345146343111992, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571492433548, + "rewards/tag_count_reward": 0.4157366305589676, "step": 2148 }, { "clip_ratio": 0.0, - "completion_length": 1453.7076416015625, + "completion_length": 1757.8550109863281, "epoch": 0.6419236800836383, - "grad_norm": 14.47196102142334, - "kl": 0.40283203125, - "learning_rate": 3.42122698405808e-08, - "loss": 0.0576, - "reward": 0.4135044887661934, - "reward_std": 0.18757230788469315, - "rewards/accuracy_reward": 0.08928572107106447, + "grad_norm": 0.45338594913482666, + "kl": 0.2587890625, + "learning_rate": 1.7106134920290398e-07, + "loss": 0.0911, + "reward": 0.5228794813156128, + "reward_std": 0.19778629764914513, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187574505806, + "rewards/tag_count_reward": 0.4023437649011612, "step": 2149 }, { "clip_ratio": 0.0, - "completion_length": 1495.9509582519531, + "completion_length": 1822.2947387695312, "epoch": 0.6422223881711597, - "grad_norm": 13.389369010925293, - "kl": 0.3525390625, - "learning_rate": 3.4162795177628577e-08, - "loss": 0.0622, - "reward": 0.4001116305589676, - "reward_std": 0.1602749191224575, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 0.6706060767173767, + "kl": 0.259033203125, + "learning_rate": 1.708139758881429e-07, + "loss": 0.0899, + "reward": 0.4854910895228386, + "reward_std": 0.15445445477962494, + "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.3939732238650322, "step": 2150 }, { "clip_ratio": 0.0, - "completion_length": 1450.1406860351562, + "completion_length": 1787.3259887695312, "epoch": 0.6425210962586813, - "grad_norm": 16.146121978759766, - "kl": 0.4140625, - "learning_rate": 3.411333774400299e-08, - "loss": 0.0697, - "reward": 0.3320312649011612, - "reward_std": 0.18875395134091377, - "rewards/accuracy_reward": 0.022321429569274187, + "grad_norm": 0.483112633228302, + "kl": 0.28759765625, + "learning_rate": 1.7056668872001496e-07, + "loss": 0.0995, + "reward": 0.423549123108387, + "reward_std": 0.17098619788885117, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "rewards/tag_count_reward": 0.392299123108387, "step": 2151 }, { "clip_ratio": 0.0, - "completion_length": 1479.3840026855469, + "completion_length": 1807.5469665527344, "epoch": 0.6428198043462027, - "grad_norm": 12.998468399047852, - "kl": 0.3896484375, - "learning_rate": 3.406389759350888e-08, - "loss": 0.0622, - "reward": 0.3783482238650322, - "reward_std": 0.18559667468070984, - "rewards/accuracy_reward": 0.05580357532016933, + "grad_norm": 0.4360291659832001, + "kl": 0.256591796875, + "learning_rate": 1.7031948796754443e-07, + "loss": 0.0561, + "reward": 0.4960937723517418, + "reward_std": 0.17814015597105026, + "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2152 }, { "clip_ratio": 0.0, - "completion_length": 1428.1630249023438, + "completion_length": 1734.9933776855469, "epoch": 0.6431185124337241, - "grad_norm": 12.029916763305664, - "kl": 0.32666015625, - "learning_rate": 3.401447477993229e-08, - "loss": 0.0737, - "reward": 0.434151791036129, - "reward_std": 0.1645366158336401, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 0.5238285660743713, + "kl": 0.2197265625, + "learning_rate": 1.7007237389966145e-07, + "loss": 0.0648, + "reward": 0.4977678880095482, + "reward_std": 0.134385802783072, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3582589402794838, + "rewards/tag_count_reward": 0.4218750149011612, "step": 2153 }, { "clip_ratio": 0.0, - "completion_length": 1520.4666137695312, + "completion_length": 1830.7210693359375, "epoch": 0.6434172205212456, - "grad_norm": 9.65036678314209, - "kl": 0.3447265625, - "learning_rate": 3.3965069357040374e-08, - "loss": 0.0426, - "reward": 0.405133955180645, - "reward_std": 0.18650568649172783, - "rewards/accuracy_reward": 0.06696428777649999, + "grad_norm": 0.758245587348938, + "kl": 0.29296875, + "learning_rate": 1.6982534678520188e-07, + "loss": 0.0838, + "reward": 0.450334832072258, + "reward_std": 0.1505792960524559, + "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.392299123108387, "step": 2154 }, { "clip_ratio": 0.0, - "completion_length": 1441.1228332519531, + "completion_length": 1716.5915832519531, "epoch": 0.643715928608767, - "grad_norm": 13.9950590133667, - "kl": 0.29443359375, - "learning_rate": 3.3915681378581414e-08, - "loss": 0.0744, - "reward": 0.4587053656578064, - "reward_std": 0.223468367010355, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 0.620316743850708, + "kl": 0.20068359375, + "learning_rate": 1.6957840689290706e-07, + "loss": 0.0693, + "reward": 0.538504496216774, + "reward_std": 0.19045366533100605, + "rewards/accuracy_reward": 0.10937500325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2155 }, { "clip_ratio": 0.0, - "completion_length": 1474.4085693359375, - "epoch": 0.6440146366962886, - "grad_norm": 15.77125072479248, - "kl": 0.35595703125, - "learning_rate": 3.386631089828468e-08, - "loss": 0.0579, - "reward": 0.4709821566939354, - "reward_std": 0.19790473952889442, - "rewards/accuracy_reward": 0.1272321492433548, + "completion_length": 1808.3081359863281, + "epoch": 0.6440146366962886, + "grad_norm": 1.0322672128677368, + "kl": 0.240478515625, + "learning_rate": 1.693315544914234e-07, + "loss": 0.0666, + "reward": 0.5440848618745804, + "reward_std": 0.19268004596233368, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.407924123108387, "step": 2156 }, { "clip_ratio": 0.0, - "completion_length": 1409.4755249023438, + "completion_length": 1758.8482666015625, "epoch": 0.64431334478381, - "grad_norm": 16.158586502075195, - "kl": 0.37841796875, - "learning_rate": 3.3816957969860415e-08, - "loss": 0.0873, - "reward": 0.3649553656578064, - "reward_std": 0.142506692558527, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 1.4129730463027954, + "kl": 0.257080078125, + "learning_rate": 1.6908478984930208e-07, + "loss": 0.0938, + "reward": 0.4654018059372902, + "reward_std": 0.15272756479680538, + "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4051339402794838, "step": 2157 }, { "clip_ratio": 0.0, - "completion_length": 1422.2813415527344, + "completion_length": 1771.3505249023438, "epoch": 0.6446120528713315, - "grad_norm": 18.779422760009766, - "kl": 0.4052734375, - "learning_rate": 3.376762264699977e-08, - "loss": 0.0881, - "reward": 0.3722098395228386, - "reward_std": 0.1711370050907135, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 2.950673818588257, + "kl": 0.29931640625, + "learning_rate": 1.6883811323499887e-07, + "loss": 0.0948, + "reward": 0.4486607387661934, + "reward_std": 0.14786637760698795, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062574505806, + "rewards/tag_count_reward": 0.392857164144516, "step": 2158 }, { "clip_ratio": 0.0, - "completion_length": 1407.2902221679688, + "completion_length": 1727.5603332519531, "epoch": 0.6449107609588529, - "grad_norm": 17.51749038696289, - "kl": 0.35791015625, - "learning_rate": 3.371830498337475e-08, - "loss": 0.0587, - "reward": 0.4704241380095482, - "reward_std": 0.15807343646883965, - "rewards/accuracy_reward": 0.1294642947614193, + "grad_norm": 3.739187479019165, + "kl": 0.223876953125, + "learning_rate": 1.6859152491687372e-07, + "loss": 0.088, + "reward": 0.5541294887661934, + "reward_std": 0.1490048784762621, + "rewards/accuracy_reward": 0.13169643888249993, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2159 }, { "clip_ratio": 0.0, - "completion_length": 1411.9978332519531, + "completion_length": 1792.5134582519531, "epoch": 0.6452094690463744, - "grad_norm": 13.311241149902344, - "kl": 0.36181640625, - "learning_rate": 3.366900503263812e-08, - "loss": 0.0622, - "reward": 0.392299123108387, - "reward_std": 0.18309513852000237, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 4.240115642547607, + "kl": 0.2724609375, + "learning_rate": 1.683450251631906e-07, + "loss": 0.0861, + "reward": 0.4927455633878708, + "reward_std": 0.16888438537716866, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2160 }, { "clip_ratio": 0.0, - "completion_length": 1420.6161499023438, + "completion_length": 1769.6451721191406, "epoch": 0.6455081771338959, - "grad_norm": 13.979496002197266, - "kl": 0.294189453125, - "learning_rate": 3.3619722848423415e-08, - "loss": 0.0622, - "reward": 0.4391741305589676, - "reward_std": 0.16713030077517033, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 5.114086151123047, + "kl": 0.35498046875, + "learning_rate": 1.6809861424211707e-07, + "loss": 0.0774, + "reward": 0.5083705633878708, + "reward_std": 0.12216413952410221, + "rewards/accuracy_reward": 0.10267857951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.4056919887661934, "step": 2161 }, { "clip_ratio": 0.0, - "completion_length": 1415.2500610351562, + "completion_length": 1795.5089721679688, "epoch": 0.6458068852214174, - "grad_norm": 19.469018936157227, - "kl": 0.2919921875, - "learning_rate": 3.35704584843448e-08, - "loss": 0.0614, - "reward": 0.445312537252903, - "reward_std": 0.15339215099811554, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 7.29588508605957, + "kl": 0.376953125, + "learning_rate": 1.67852292421724e-07, + "loss": 0.0681, + "reward": 0.4960937649011612, + "reward_std": 0.127341253682971, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232313156128, + "rewards/tag_count_reward": 0.411272332072258, "step": 2162 }, { "clip_ratio": 0.0, - "completion_length": 1436.1407165527344, + "completion_length": 1829.1808776855469, "epoch": 0.6461055933089388, - "grad_norm": 14.949745178222656, - "kl": 0.4375, - "learning_rate": 3.352121199399709e-08, - "loss": 0.0712, - "reward": 0.3498883992433548, - "reward_std": 0.1489091645926237, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 6.904472351074219, + "kl": 0.4560546875, + "learning_rate": 1.6760605996998545e-07, + "loss": 0.0937, + "reward": 0.431919664144516, + "reward_std": 0.14569793827831745, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3007812649011612, + "rewards/tag_count_reward": 0.3850446566939354, "step": 2163 }, { "clip_ratio": 0.0, - "completion_length": 1418.9330749511719, + "completion_length": 1758.8282165527344, "epoch": 0.6464043013964603, - "grad_norm": 18.79974937438965, - "kl": 0.380859375, - "learning_rate": 3.347198343095564e-08, - "loss": 0.0731, - "reward": 0.3722098395228386, - "reward_std": 0.15283194929361343, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 6.7539591789245605, + "kl": 0.41064453125, + "learning_rate": 1.673599171547782e-07, + "loss": 0.0877, + "reward": 0.4620535969734192, + "reward_std": 0.13200645707547665, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.4107143059372902, "step": 2164 }, { "clip_ratio": 0.0, - "completion_length": 1396.1719055175781, + "completion_length": 1789.8505249023438, "epoch": 0.6467030094839817, - "grad_norm": 15.351831436157227, - "kl": 0.323486328125, - "learning_rate": 3.342277284877629e-08, - "loss": 0.0763, - "reward": 0.3895089328289032, - "reward_std": 0.18909045681357384, - "rewards/accuracy_reward": 0.03348214388824999, + "grad_norm": 9.889727592468262, + "kl": 0.4228515625, + "learning_rate": 1.6711386424388144e-07, + "loss": 0.0708, + "reward": 0.4542410895228386, + "reward_std": 0.15225978195667267, + "rewards/accuracy_reward": 0.04241071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.411830373108387, "step": 2165 }, { "clip_ratio": 0.0, - "completion_length": 1436.1853332519531, + "completion_length": 1794.8906860351562, "epoch": 0.6470017175715033, - "grad_norm": 15.781403541564941, - "kl": 0.35498046875, - "learning_rate": 3.337358030099534e-08, - "loss": 0.064, - "reward": 0.407924123108387, - "reward_std": 0.17477922514081, - "rewards/accuracy_reward": 0.06696428940631449, + "grad_norm": 7.766774654388428, + "kl": 0.609375, + "learning_rate": 1.668679015049767e-07, + "loss": 0.0808, + "reward": 0.4843750223517418, + "reward_std": 0.17038054764270782, + "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.340959832072258, + "rewards/tag_count_reward": 0.408482164144516, "step": 2166 }, { "clip_ratio": 0.0, - "completion_length": 1418.3683776855469, + "completion_length": 1784.8036499023438, "epoch": 0.6473004256590247, - "grad_norm": 18.154970169067383, - "kl": 0.35791015625, - "learning_rate": 3.3324405841129466e-08, - "loss": 0.0672, - "reward": 0.511160746216774, - "reward_std": 0.18416129052639008, - "rewards/accuracy_reward": 0.17187500232830644, + "grad_norm": 10.321952819824219, + "kl": 0.55810546875, + "learning_rate": 1.6662202920564734e-07, + "loss": 0.089, + "reward": 0.6183036118745804, + "reward_std": 0.17240991070866585, + "rewards/accuracy_reward": 0.19419643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.424107164144516, "step": 2167 }, { "clip_ratio": 0.0, - "completion_length": 1421.7411499023438, + "completion_length": 1759.4866943359375, "epoch": 0.6475991337465462, - "grad_norm": 15.132081985473633, - "kl": 0.3359375, - "learning_rate": 3.327524952267565e-08, - "loss": 0.0654, - "reward": 0.4196428805589676, - "reward_std": 0.1509198471903801, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 11.509559631347656, + "kl": 0.677734375, + "learning_rate": 1.6637624761337825e-07, + "loss": 0.1015, + "reward": 0.5228794887661934, + "reward_std": 0.1483767181634903, + "rewards/accuracy_reward": 0.09151786426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.431361623108387, "step": 2168 }, { "clip_ratio": 0.0, - "completion_length": 1374.63623046875, + "completion_length": 1733.0536499023438, "epoch": 0.6478978418340676, - "grad_norm": 18.723203659057617, - "kl": 0.3505859375, - "learning_rate": 3.322611139911114e-08, - "loss": 0.0644, - "reward": 0.3984375223517418, - "reward_std": 0.1681508980691433, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 14.783576011657715, + "kl": 0.896484375, + "learning_rate": 1.661305569955557e-07, + "loss": 0.1343, + "reward": 0.467075914144516, + "reward_std": 0.13916569203138351, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.344866082072258, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2169 }, { "clip_ratio": 0.0, - "completion_length": 1419.6808471679688, + "completion_length": 1822.3438415527344, "epoch": 0.6481965499215891, - "grad_norm": 17.7283992767334, - "kl": 0.34814453125, - "learning_rate": 3.317699152389342e-08, - "loss": 0.0426, - "reward": 0.400669664144516, - "reward_std": 0.17209583148360252, - "rewards/accuracy_reward": 0.06026785867288709, + "grad_norm": 18.20441246032715, + "kl": 0.884765625, + "learning_rate": 1.658849576194671e-07, + "loss": 0.0967, + "reward": 0.4670759215950966, + "reward_std": 0.1291603073477745, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4068080484867096, "step": 2170 }, { "clip_ratio": 0.0, - "completion_length": 1414.6764221191406, + "completion_length": 1747.0715026855469, "epoch": 0.6484952580091106, - "grad_norm": 19.739301681518555, - "kl": 0.37841796875, - "learning_rate": 3.312788995046009e-08, - "loss": 0.0757, - "reward": 0.4207589477300644, - "reward_std": 0.17486289143562317, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 16.15131187438965, + "kl": 0.67724609375, + "learning_rate": 1.6563944975230047e-07, + "loss": 0.0829, + "reward": 0.5368303805589676, + "reward_std": 0.15096243284642696, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.431919664144516, "step": 2171 }, { "clip_ratio": 0.0, - "completion_length": 1418.3728332519531, + "completion_length": 1802.1831359863281, "epoch": 0.6487939660966321, - "grad_norm": 17.217445373535156, - "kl": 0.3603515625, - "learning_rate": 3.307880673222884e-08, - "loss": 0.0446, - "reward": 0.3733259066939354, - "reward_std": 0.13782239146530628, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 13.471420288085938, + "kl": 1.05859375, + "learning_rate": 1.653940336611442e-07, + "loss": 0.1081, + "reward": 0.4486607313156128, + "reward_std": 0.11494387127459049, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.4107143059372902, "step": 2172 }, { "clip_ratio": 0.0, - "completion_length": 1366.2589721679688, + "completion_length": 1733.8995971679688, "epoch": 0.6490926741841535, - "grad_norm": 23.09713363647461, - "kl": 0.3427734375, - "learning_rate": 3.3029741922597424e-08, - "loss": 0.0651, - "reward": 0.3822544813156128, - "reward_std": 0.17076737061142921, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 16.804391860961914, + "kl": 0.904296875, + "learning_rate": 1.651487096129871e-07, + "loss": 0.1198, + "reward": 0.4575893059372902, + "reward_std": 0.14100381731987, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3577009066939354, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2173 }, { "clip_ratio": 0.0, - "completion_length": 1427.904052734375, + "completion_length": 1820.7500915527344, "epoch": 0.649391382271675, - "grad_norm": 17.96119499206543, - "kl": 0.3623046875, - "learning_rate": 3.2980695574943526e-08, - "loss": 0.0759, - "reward": 0.3510044813156128, - "reward_std": 0.15662049129605293, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 220.9008026123047, + "kl": 2.4931640625, + "learning_rate": 1.6490347787471765e-07, + "loss": 0.1621, + "reward": 0.4313616305589676, + "reward_std": 0.1305142119526863, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.3934151977300644, "step": 2174 }, { "clip_ratio": 0.0, - "completion_length": 1452.5647888183594, + "completion_length": 1836.2077026367188, "epoch": 0.6496900903591964, - "grad_norm": 21.396587371826172, - "kl": 0.36376953125, - "learning_rate": 3.293166774262478e-08, - "loss": 0.0824, - "reward": 0.426897332072258, - "reward_std": 0.23925255611538887, - "rewards/accuracy_reward": 0.09598214784637094, + "grad_norm": 152.51202392578125, + "kl": 1.8408203125, + "learning_rate": 1.646583387131239e-07, + "loss": 0.1248, + "reward": 0.5016741305589676, + "reward_std": 0.19546056166291237, + "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4101562649011612, "step": 2175 }, { "clip_ratio": 0.0, - "completion_length": 1446.7433471679688, + "completion_length": 1843.8817443847656, "epoch": 0.649988798446718, - "grad_norm": 19.378042221069336, - "kl": 0.41650390625, - "learning_rate": 3.288265847897863e-08, - "loss": 0.0691, - "reward": 0.3565848395228386, - "reward_std": 0.1642337590456009, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 54.85565948486328, + "kl": 0.6376953125, + "learning_rate": 1.6441329239489314e-07, + "loss": 0.0802, + "reward": 0.4654018133878708, + "reward_std": 0.173141997307539, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.4118303805589676, "step": 2176 }, { "clip_ratio": 0.0, - "completion_length": 1452.3393859863281, + "completion_length": 1842.1898193359375, "epoch": 0.6502875065342394, - "grad_norm": 21.812599182128906, - "kl": 0.34033203125, - "learning_rate": 3.28336678373224e-08, - "loss": 0.0571, - "reward": 0.3599330559372902, - "reward_std": 0.16392797976732254, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 44.84663772583008, + "kl": 0.6259765625, + "learning_rate": 1.64168339186612e-07, + "loss": 0.0892, + "reward": 0.4464285895228386, + "reward_std": 0.1605691984295845, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3487723395228386, + "rewards/tag_count_reward": 0.415178582072258, "step": 2177 }, { "clip_ratio": 0.0, - "completion_length": 1465.8683776855469, + "completion_length": 1856.4398193359375, "epoch": 0.6505862146217609, - "grad_norm": 19.946901321411133, - "kl": 0.36962890625, - "learning_rate": 3.278469587095307e-08, - "loss": 0.083, - "reward": 0.384486623108387, - "reward_std": 0.1665470004081726, - "rewards/accuracy_reward": 0.053571430733427405, + "grad_norm": 45.75059127807617, + "kl": 0.830078125, + "learning_rate": 1.6392347935476537e-07, + "loss": 0.082, + "reward": 0.4793526902794838, + "reward_std": 0.16352969780564308, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151828289032, + "rewards/tag_count_reward": 0.4034598395228386, "step": 2178 }, { "clip_ratio": 0.0, - "completion_length": 1402.5670471191406, + "completion_length": 1811.52685546875, "epoch": 0.6508849227092823, - "grad_norm": 21.628948211669922, - "kl": 0.36474609375, - "learning_rate": 3.273574263314735e-08, - "loss": 0.0686, - "reward": 0.4229910895228386, - "reward_std": 0.22065863385796547, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 37.970272064208984, + "kl": 0.955078125, + "learning_rate": 1.6367871316573672e-07, + "loss": 0.0993, + "reward": 0.5256696566939354, + "reward_std": 0.192394794896245, + "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2179 }, { "clip_ratio": 0.0, - "completion_length": 1412.544677734375, + "completion_length": 1829.3817749023438, "epoch": 0.6511836307968039, - "grad_norm": 19.72955894470215, - "kl": 0.34716796875, - "learning_rate": 3.268680817716157e-08, - "loss": 0.0711, - "reward": 0.3599330484867096, - "reward_std": 0.13523980602622032, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 27.142709732055664, + "kl": 1.58203125, + "learning_rate": 1.6343404088580786e-07, + "loss": 0.1375, + "reward": 0.4330357238650322, + "reward_std": 0.13291746377944946, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.3883928656578064, "step": 2180 }, { "clip_ratio": 0.0, - "completion_length": 1492.4442443847656, + "completion_length": 1915.9353332519531, "epoch": 0.6514823388843253, - "grad_norm": 18.621654510498047, - "kl": 0.34228515625, - "learning_rate": 3.2637892556231624e-08, - "loss": 0.0487, - "reward": 0.3861607238650322, - "reward_std": 0.18245650455355644, - "rewards/accuracy_reward": 0.06250000488944352, + "grad_norm": 94.56024169921875, + "kl": 3.009765625, + "learning_rate": 1.631894627811581e-07, + "loss": 0.1664, + "reward": 0.4564732313156128, + "reward_std": 0.15921980701386929, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.3939732313156128, "step": 2181 }, { "clip_ratio": 0.0, - "completion_length": 1389.65185546875, + "completion_length": 1791.46435546875, "epoch": 0.6517810469718468, - "grad_norm": 18.69159507751465, - "kl": 0.28564453125, - "learning_rate": 3.2588995823572904e-08, - "loss": 0.0545, - "reward": 0.5111607313156128, - "reward_std": 0.20856374502182007, - "rewards/accuracy_reward": 0.14508929383009672, + "grad_norm": 18.58778953552246, + "kl": 0.962890625, + "learning_rate": 1.629449791178645e-07, + "loss": 0.0827, + "reward": 0.5535714626312256, + "reward_std": 0.16586069762706757, + "rewards/accuracy_reward": 0.1272321459837258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3660714402794838, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2182 }, { "clip_ratio": 0.0, - "completion_length": 1373.6317749023438, + "completion_length": 1784.7165832519531, "epoch": 0.6520797550593682, - "grad_norm": 22.33964729309082, - "kl": 0.3486328125, - "learning_rate": 3.254011803238025e-08, - "loss": 0.0426, - "reward": 0.4090401828289032, - "reward_std": 0.1738818623125553, - "rewards/accuracy_reward": 0.0669642873108387, + "grad_norm": 16.2475528717041, + "kl": 1.94140625, + "learning_rate": 1.6270059016190126e-07, + "loss": 0.1528, + "reward": 0.4771205633878708, + "reward_std": 0.13889878802001476, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3420758992433548, + "rewards/tag_count_reward": 0.4034598395228386, "step": 2183 }, { "clip_ratio": 0.0, - "completion_length": 1432.8080749511719, + "completion_length": 1828.3996276855469, "epoch": 0.6523784631468897, - "grad_norm": 18.456096649169922, - "kl": 0.34521484375, - "learning_rate": 3.249125923582794e-08, - "loss": 0.0695, - "reward": 0.3632812723517418, - "reward_std": 0.18820993974804878, - "rewards/accuracy_reward": 0.05580357578583062, + "grad_norm": 432.28411865234375, + "kl": 6.80078125, + "learning_rate": 1.6245629617913968e-07, + "loss": 0.2558, + "reward": 0.4760044887661934, + "reward_std": 0.16295186430215836, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2184 }, { "clip_ratio": 0.0, - "completion_length": 1480.6027221679688, + "completion_length": 1882.7835693359375, "epoch": 0.6526771712344112, - "grad_norm": 21.764354705810547, - "kl": 0.380859375, - "learning_rate": 3.2442419487069546e-08, - "loss": 0.073, - "reward": 0.3783482313156128, - "reward_std": 0.1976274624466896, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 35.409759521484375, + "kl": 3.23046875, + "learning_rate": 1.6221209743534773e-07, + "loss": 0.1713, + "reward": 0.4380580633878708, + "reward_std": 0.14730950631201267, + "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.3822544887661934, "step": 2185 }, { "clip_ratio": 0.0, - "completion_length": 1343.7255249023438, + "completion_length": 1782.0648193359375, "epoch": 0.6529758793219327, - "grad_norm": 22.469257354736328, - "kl": 0.33349609375, - "learning_rate": 3.23935988392379e-08, - "loss": 0.0844, - "reward": 0.4068080559372902, - "reward_std": 0.15733392909169197, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 18.855337142944336, + "kl": 1.251953125, + "learning_rate": 1.619679941961895e-07, + "loss": 0.1013, + "reward": 0.5217634066939354, + "reward_std": 0.13952495530247688, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2186 }, { "clip_ratio": 0.0, - "completion_length": 1451.43310546875, + "completion_length": 1852.3393859863281, "epoch": 0.6532745874094541, - "grad_norm": 18.175987243652344, - "kl": 0.29541015625, - "learning_rate": 3.2344797345445124e-08, - "loss": 0.0589, - "reward": 0.4190848469734192, - "reward_std": 0.16514546982944012, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 13.165804862976074, + "kl": 1.74609375, + "learning_rate": 1.6172398672722562e-07, + "loss": 0.1097, + "reward": 0.4966518208384514, + "reward_std": 0.14323166385293007, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4051339477300644, "step": 2187 }, { "clip_ratio": 0.0, - "completion_length": 1401.4889221191406, + "completion_length": 1771.9777526855469, "epoch": 0.6535732954969756, - "grad_norm": 18.038593292236328, - "kl": 0.33251953125, - "learning_rate": 3.229601505878244e-08, - "loss": 0.0769, - "reward": 0.4213169887661934, - "reward_std": 0.18845491483807564, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 31.907453536987305, + "kl": 1.2939453125, + "learning_rate": 1.614800752939122e-07, + "loss": 0.1357, + "reward": 0.4960937798023224, + "reward_std": 0.16885284520685673, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2188 }, { "clip_ratio": 0.0, - "completion_length": 1399.8415832519531, + "completion_length": 1804.4063415527344, "epoch": 0.653872003584497, - "grad_norm": 15.764102935791016, - "kl": 0.29541015625, - "learning_rate": 3.2247252032320195e-08, - "loss": 0.0833, - "reward": 0.4051339402794838, - "reward_std": 0.15935077145695686, - "rewards/accuracy_reward": 0.06250000302679837, + "grad_norm": 30.073810577392578, + "kl": 2.70703125, + "learning_rate": 1.6123626016160096e-07, + "loss": 0.1913, + "reward": 0.4854910969734192, + "reward_std": 0.16061488725245, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2189 }, { "clip_ratio": 0.0, - "completion_length": 1389.5982666015625, + "completion_length": 1810.6250610351562, "epoch": 0.6541707116720186, - "grad_norm": 18.71897315979004, - "kl": 0.30224609375, - "learning_rate": 3.219850831910776e-08, - "loss": 0.074, - "reward": 0.3872767984867096, - "reward_std": 0.17491626553237438, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 14.673182487487793, + "kl": 2.46875, + "learning_rate": 1.6099254159553883e-07, + "loss": 0.1488, + "reward": 0.475446455180645, + "reward_std": 0.18951418064534664, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2190 }, { "clip_ratio": 0.0, - "completion_length": 1520.4397583007812, + "completion_length": 1879.9822387695312, "epoch": 0.65446941975954, - "grad_norm": 18.250198364257812, - "kl": 0.33056640625, - "learning_rate": 3.214978397217357e-08, - "loss": 0.0374, - "reward": 0.3320312649011612, - "reward_std": 0.16129238903522491, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 21.033632278442383, + "kl": 1.779296875, + "learning_rate": 1.6074891986086783e-07, + "loss": 0.1174, + "reward": 0.4414062649011612, + "reward_std": 0.15120753087103367, + "rewards/accuracy_reward": 0.02901785750873387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.4123884066939354, "step": 2191 }, { "clip_ratio": 0.0, - "completion_length": 1492.5156860351562, + "completion_length": 1866.6362609863281, "epoch": 0.6547681278470615, - "grad_norm": 18.0384578704834, - "kl": 0.31884765625, - "learning_rate": 3.2101079044524895e-08, - "loss": 0.064, - "reward": 0.3632812723517418, - "reward_std": 0.1808977760374546, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 19.368297576904297, + "kl": 1.576171875, + "learning_rate": 1.6050539522262448e-07, + "loss": 0.1043, + "reward": 0.4737723469734192, + "reward_std": 0.1611400619149208, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.4068080559372902, "step": 2192 }, { "clip_ratio": 0.0, - "completion_length": 1409.3036193847656, + "completion_length": 1802.8862609863281, "epoch": 0.6550668359345829, - "grad_norm": 21.193899154663086, - "kl": 0.311767578125, - "learning_rate": 3.205239358914793e-08, - "loss": 0.084, - "reward": 0.4486607313156128, - "reward_std": 0.21843058988451958, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 16.645069122314453, + "kl": 2.197265625, + "learning_rate": 1.6026196794573965e-07, + "loss": 0.1657, + "reward": 0.513950914144516, + "reward_std": 0.18210257589817047, + "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.4068080484867096, "step": 2193 }, { "clip_ratio": 0.0, - "completion_length": 1437.0782165527344, + "completion_length": 1836.3996276855469, "epoch": 0.6553655440221045, - "grad_norm": 17.397480010986328, - "kl": 0.3193359375, - "learning_rate": 3.2003727659007696e-08, - "loss": 0.0543, - "reward": 0.5245536044239998, - "reward_std": 0.17944806069135666, - "rewards/accuracy_reward": 0.1986607201397419, + "grad_norm": 56.880218505859375, + "kl": 3.7578125, + "learning_rate": 1.6001863829503847e-07, + "loss": 0.2208, + "reward": 0.6110491305589676, + "reward_std": 0.18799839727580547, + "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928805589676, + "rewards/tag_count_reward": 0.4056919813156128, "step": 2194 }, { "clip_ratio": 0.0, - "completion_length": 1479.0491638183594, + "completion_length": 1817.149658203125, "epoch": 0.6556642521096259, - "grad_norm": 23.948627471923828, - "kl": 0.33251953125, - "learning_rate": 3.195508130704795e-08, - "loss": 0.0914, - "reward": 0.4174107387661934, - "reward_std": 0.20809148997068405, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 38.62685012817383, + "kl": 3.294921875, + "learning_rate": 1.5977540653523975e-07, + "loss": 0.1887, + "reward": 0.502232164144516, + "reward_std": 0.19252017885446548, + "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4107142984867096, "step": 2195 }, { "clip_ratio": 0.0, - "completion_length": 1420.7634887695312, + "completion_length": 1862.2500610351562, "epoch": 0.6559629601971473, - "grad_norm": 19.55733871459961, - "kl": 0.33349609375, - "learning_rate": 3.190645458619114e-08, - "loss": 0.0787, - "reward": 0.3632812649011612, - "reward_std": 0.14993217960000038, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 21.49437141418457, + "kl": 1.771484375, + "learning_rate": 1.5953227293095568e-07, + "loss": 0.128, + "reward": 0.4581473395228386, + "reward_std": 0.12906410917639732, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.409040205180645, "step": 2196 }, { "clip_ratio": 0.0, - "completion_length": 1458.3348999023438, + "completion_length": 1861.0625915527344, "epoch": 0.6562616682846688, - "grad_norm": 20.747447967529297, - "kl": 0.31591796875, - "learning_rate": 3.185784754933838e-08, - "loss": 0.0607, - "reward": 0.3967634215950966, - "reward_std": 0.19743676483631134, - "rewards/accuracy_reward": 0.06919643213041127, + "grad_norm": 22.62534523010254, + "kl": 1.90234375, + "learning_rate": 1.592892377466919e-07, + "loss": 0.1182, + "reward": 0.487165205180645, + "reward_std": 0.16745640523731709, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2197 }, { "clip_ratio": 0.0, - "completion_length": 1417.3795166015625, + "completion_length": 1847.6563110351562, "epoch": 0.6565603763721902, - "grad_norm": 21.7032527923584, - "kl": 0.3203125, - "learning_rate": 3.1809260249369366e-08, - "loss": 0.0698, - "reward": 0.3777901902794838, - "reward_std": 0.18162843585014343, - "rewards/accuracy_reward": 0.05580357578583062, + "grad_norm": 10.633485794067383, + "kl": 2.142578125, + "learning_rate": 1.5904630124684685e-07, + "loss": 0.1384, + "reward": 0.474330373108387, + "reward_std": 0.134651405736804, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.411830373108387, "step": 2198 }, { "clip_ratio": 0.0, - "completion_length": 1406.7232666015625, + "completion_length": 1810.618408203125, "epoch": 0.6568590844597118, - "grad_norm": 28.33058738708496, - "kl": 0.3427734375, - "learning_rate": 3.176069273914233e-08, - "loss": 0.0576, - "reward": 0.3883928805589676, - "reward_std": 0.2315540872514248, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 45.121280670166016, + "kl": 3.83203125, + "learning_rate": 1.5880346369571163e-07, + "loss": 0.2187, + "reward": 0.492187537252903, + "reward_std": 0.23960452899336815, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325892984867096, + "rewards/tag_count_reward": 0.416294664144516, "step": 2199 }, { "clip_ratio": 0.0, - "completion_length": 1393.9665832519531, + "completion_length": 1823.0379943847656, "epoch": 0.6571577925472332, - "grad_norm": 22.544540405273438, - "kl": 0.35595703125, - "learning_rate": 3.1712145071493925e-08, - "loss": 0.0876, - "reward": 0.3582589477300644, - "reward_std": 0.2012617066502571, - "rewards/accuracy_reward": 0.03571428777649999, + "grad_norm": 41.59630584716797, + "kl": 3.8046875, + "learning_rate": 1.585607253574696e-07, + "loss": 0.2071, + "reward": 0.4291294813156128, + "reward_std": 0.1582975797355175, + "rewards/accuracy_reward": 0.031250001629814506, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.3978794813156128, "step": 2200 }, { "clip_ratio": 0.0, - "completion_length": 1465.4018249511719, + "completion_length": 1858.71435546875, "epoch": 0.6574565006347547, - "grad_norm": 23.918914794921875, - "kl": 0.3271484375, - "learning_rate": 3.1663617299239295e-08, - "loss": 0.0648, - "reward": 0.3945312723517418, - "reward_std": 0.15514741465449333, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 50.36886978149414, + "kl": 1.4921875, + "learning_rate": 1.583180864961965e-07, + "loss": 0.107, + "reward": 0.4787946566939354, + "reward_std": 0.11614236608147621, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4051339477300644, "step": 2201 }, { "clip_ratio": 0.0, - "completion_length": 1336.7969360351562, + "completion_length": 1714.7344665527344, "epoch": 0.6577552087222761, - "grad_norm": 19.427392959594727, - "kl": 0.30078125, - "learning_rate": 3.1615109475171885e-08, - "loss": 0.0708, - "reward": 0.4375000223517418, - "reward_std": 0.1874083336442709, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 30.95758819580078, + "kl": 1.0771484375, + "learning_rate": 1.5807554737585943e-07, + "loss": 0.0973, + "reward": 0.5145089402794838, + "reward_std": 0.1429465338587761, + "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821566939354, + "rewards/tag_count_reward": 0.4207589402794838, "step": 2202 }, { "clip_ratio": 0.0, - "completion_length": 1393.1317443847656, + "completion_length": 1822.4241943359375, "epoch": 0.6580539168097976, - "grad_norm": 21.909805297851562, - "kl": 0.34130859375, - "learning_rate": 3.1566621652063455e-08, - "loss": 0.0344, - "reward": 0.387834832072258, - "reward_std": 0.19422614201903343, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 49.198951721191406, + "kl": 1.419921875, + "learning_rate": 1.578331082603173e-07, + "loss": 0.1048, + "reward": 0.4787946715950966, + "reward_std": 0.16841862350702286, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2203 }, { "clip_ratio": 0.0, - "completion_length": 1327.930908203125, + "completion_length": 1734.1161804199219, "epoch": 0.658352624897319, - "grad_norm": 27.924270629882812, - "kl": 0.361328125, - "learning_rate": 3.151815388266399e-08, - "loss": 0.1011, - "reward": 0.4308035969734192, - "reward_std": 0.17126930877566338, - "rewards/accuracy_reward": 0.10491071734577417, + "grad_norm": 16.387285232543945, + "kl": 2.59375, + "learning_rate": 1.5759076941331995e-07, + "loss": 0.1882, + "reward": 0.5318080559372902, + "reward_std": 0.14454777166247368, + "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2204 }, { "clip_ratio": 0.0, - "completion_length": 1463.2567443847656, + "completion_length": 1853.4443054199219, "epoch": 0.6586513329848406, - "grad_norm": 23.63907814025879, - "kl": 0.359375, - "learning_rate": 3.1469706219701694e-08, - "loss": 0.0516, - "reward": 0.328683041036129, - "reward_std": 0.18123772740364075, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 44.269378662109375, + "kl": 4.7109375, + "learning_rate": 1.5734853109850848e-07, + "loss": 0.2651, + "reward": 0.4190848469734192, + "reward_std": 0.1389799825847149, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.3989955559372902, "step": 2205 }, { "clip_ratio": 0.0, - "completion_length": 1456.6719360351562, + "completion_length": 1844.524658203125, "epoch": 0.658950041072362, - "grad_norm": 21.089414596557617, - "kl": 0.3017578125, - "learning_rate": 3.1421278715882875e-08, - "loss": 0.0444, - "reward": 0.3945312649011612, - "reward_std": 0.18235691264271736, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 14.374629974365234, + "kl": 2.439453125, + "learning_rate": 1.5710639357941437e-07, + "loss": 0.1494, + "reward": 0.5055803805589676, + "reward_std": 0.15176177397370338, + "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2206 }, { "clip_ratio": 0.0, - "completion_length": 1454.9777221679688, + "completion_length": 1849.821533203125, "epoch": 0.6592487491598835, - "grad_norm": 24.67913246154785, - "kl": 0.36962890625, - "learning_rate": 3.137287142389189e-08, - "loss": 0.0704, - "reward": 0.3426339477300644, - "reward_std": 0.18898973241448402, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 17.543710708618164, + "kl": 2.673828125, + "learning_rate": 1.5686435711945946e-07, + "loss": 0.1578, + "reward": 0.452008955180645, + "reward_std": 0.16376896388828754, + "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.4095982387661934, "step": 2207 }, { "clip_ratio": 0.0, - "completion_length": 1356.169677734375, + "completion_length": 1736.1585693359375, "epoch": 0.6595474572474049, - "grad_norm": 18.38365364074707, - "kl": 0.276123046875, - "learning_rate": 3.132448439639115e-08, - "loss": 0.0637, - "reward": 0.4118303880095482, - "reward_std": 0.18621904030442238, - "rewards/accuracy_reward": 0.05580357299186289, + "grad_norm": 14.489166259765625, + "kl": 1.99609375, + "learning_rate": 1.5662242198195574e-07, + "loss": 0.1368, + "reward": 0.494977705180645, + "reward_std": 0.1917777769267559, + "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2208 }, { "clip_ratio": 0.0, - "completion_length": 1386.8750610351562, + "completion_length": 1787.99560546875, "epoch": 0.6598461653349265, - "grad_norm": 27.699369430541992, - "kl": 0.361328125, - "learning_rate": 3.1276117686020986e-08, - "loss": 0.0835, - "reward": 0.4358259066939354, - "reward_std": 0.2040140964090824, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 18.134517669677734, + "kl": 2.3671875, + "learning_rate": 1.5638058843010492e-07, + "loss": 0.1563, + "reward": 0.5089285969734192, + "reward_std": 0.13993413373827934, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.408482164144516, "step": 2209 }, { "clip_ratio": 0.0, - "completion_length": 1404.3728637695312, + "completion_length": 1825.7166137695312, "epoch": 0.6601448734224479, - "grad_norm": 25.582778930664062, - "kl": 0.3505859375, - "learning_rate": 3.1227771345399644e-08, - "loss": 0.0479, - "reward": 0.4296875223517418, - "reward_std": 0.18032335489988327, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 16.48717498779297, + "kl": 2.630859375, + "learning_rate": 1.5613885672699821e-07, + "loss": 0.1683, + "reward": 0.550781287252903, + "reward_std": 0.1636018007993698, + "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2210 }, { "clip_ratio": 0.0, - "completion_length": 1436.3951721191406, + "completion_length": 1848.7880554199219, "epoch": 0.6604435815099694, - "grad_norm": 29.366485595703125, - "kl": 0.30029296875, - "learning_rate": 3.117944542712318e-08, - "loss": 0.0651, - "reward": 0.415178582072258, - "reward_std": 0.1655748002231121, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 10.177712440490723, + "kl": 2.810546875, + "learning_rate": 1.5589722713561592e-07, + "loss": 0.1602, + "reward": 0.4815848469734192, + "reward_std": 0.15495957620441914, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3571428656578064, + "rewards/tag_count_reward": 0.4146205484867096, "step": 2211 }, { "clip_ratio": 0.0, - "completion_length": 1462.9286193847656, + "completion_length": 1819.0893859863281, "epoch": 0.6607422895974908, - "grad_norm": 27.549291610717773, - "kl": 0.264892578125, - "learning_rate": 3.113113998376551e-08, - "loss": 0.0452, - "reward": 0.4760044887661934, - "reward_std": 0.16562404111027718, - "rewards/accuracy_reward": 0.12053571874275804, + "grad_norm": 84.11796569824219, + "kl": 3.57421875, + "learning_rate": 1.5565569991882753e-07, + "loss": 0.1919, + "reward": 0.5379464477300644, + "reward_std": 0.1255164686590433, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3554687723517418, + "rewards/tag_count_reward": 0.4174107313156128, "step": 2212 }, { "clip_ratio": 0.0, - "completion_length": 1304.404052734375, + "completion_length": 1733.2076416015625, "epoch": 0.6610409976850123, - "grad_norm": 24.198762893676758, - "kl": 0.3193359375, - "learning_rate": 3.1082855067878174e-08, - "loss": 0.0814, - "reward": 0.5418527200818062, - "reward_std": 0.17079072073101997, - "rewards/accuracy_reward": 0.18303572619333863, + "grad_norm": 13.702842712402344, + "kl": 2.60546875, + "learning_rate": 1.554142753393909e-07, + "loss": 0.1806, + "reward": 0.6021205559372902, + "reward_std": 0.14941392093896866, + "rewards/accuracy_reward": 0.19196429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169738650322, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2213 }, { "clip_ratio": 0.0, - "completion_length": 1425.6741638183594, + "completion_length": 1824.5849304199219, "epoch": 0.6613397057725338, - "grad_norm": 21.231021881103516, - "kl": 0.328125, - "learning_rate": 3.103459073199045e-08, - "loss": 0.043, - "reward": 0.400111623108387, - "reward_std": 0.1647486500442028, - "rewards/accuracy_reward": 0.06250000302679837, + "grad_norm": 48.1385383605957, + "kl": 2.044921875, + "learning_rate": 1.5517295365995225e-07, + "loss": 0.129, + "reward": 0.5212053805589676, + "reward_std": 0.163785882294178, + "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.416294664144516, "step": 2214 }, { "clip_ratio": 0.0, - "completion_length": 1430.29248046875, + "completion_length": 1876.3326721191406, "epoch": 0.6616384138600553, - "grad_norm": 25.167118072509766, - "kl": 0.404296875, - "learning_rate": 3.09863470286092e-08, - "loss": 0.0544, - "reward": 0.4564732313156128, - "reward_std": 0.17078553698956966, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 13.208414077758789, + "kl": 3.453125, + "learning_rate": 1.54931735143046e-07, + "loss": 0.1959, + "reward": 0.5708705559372902, + "reward_std": 0.16594872437417507, + "rewards/accuracy_reward": 0.17633929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297991082072258, + "rewards/tag_count_reward": 0.3945312649011612, "step": 2215 }, { "clip_ratio": 0.0, - "completion_length": 1430.6451721191406, + "completion_length": 1877.8014221191406, "epoch": 0.6619371219475767, - "grad_norm": 24.41437530517578, - "kl": 0.345703125, - "learning_rate": 3.0938124010218846e-08, - "loss": 0.0503, - "reward": 0.4921875149011612, - "reward_std": 0.21191227436065674, - "rewards/accuracy_reward": 0.16517857951112092, + "grad_norm": 12.01906681060791, + "kl": 3.1953125, + "learning_rate": 1.5469062005109423e-07, + "loss": 0.1928, + "reward": 0.561941996216774, + "reward_std": 0.16843882016837597, + "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4056919813156128, "step": 2216 }, { "clip_ratio": 0.0, - "completion_length": 1402.7366638183594, + "completion_length": 1776.3393859863281, "epoch": 0.6622358300350982, - "grad_norm": 25.796844482421875, - "kl": 0.3173828125, - "learning_rate": 3.0889921729281304e-08, - "loss": 0.0685, - "reward": 0.4118303805589676, - "reward_std": 0.212571170181036, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 34.699825286865234, + "kl": 1.857421875, + "learning_rate": 1.5444960864640651e-07, + "loss": 0.1194, + "reward": 0.4799107313156128, + "reward_std": 0.19737237319350243, + "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2217 }, { "clip_ratio": 0.0, - "completion_length": 1443.9063110351562, + "completion_length": 1822.2813415527344, "epoch": 0.6625345381226196, - "grad_norm": 29.77545738220215, - "kl": 0.34814453125, - "learning_rate": 3.084174023823592e-08, - "loss": 0.0648, - "reward": 0.416294664144516, - "reward_std": 0.1874612532556057, - "rewards/accuracy_reward": 0.08705357555299997, + "grad_norm": 13.85798168182373, + "kl": 2.771484375, + "learning_rate": 1.542087011911796e-07, + "loss": 0.1774, + "reward": 0.5044643059372902, + "reward_std": 0.15126293525099754, + "rewards/accuracy_reward": 0.09598214877769351, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2218 }, { "clip_ratio": 0.0, - "completion_length": 1406.6027526855469, + "completion_length": 1830.1987609863281, "epoch": 0.6628332462101412, - "grad_norm": 27.9174747467041, - "kl": 0.32763671875, - "learning_rate": 3.079357958949946e-08, - "loss": 0.0748, - "reward": 0.415178582072258, - "reward_std": 0.19360121339559555, - "rewards/accuracy_reward": 0.06919643026776612, + "grad_norm": 31.499422073364258, + "kl": 3.76171875, + "learning_rate": 1.539678979474973e-07, + "loss": 0.2186, + "reward": 0.5016741454601288, + "reward_std": 0.17718639224767685, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345982164144516, + "rewards/tag_count_reward": 0.4146205633878708, "step": 2219 }, { "clip_ratio": 0.0, - "completion_length": 1447.3750915527344, + "completion_length": 1852.2590026855469, "epoch": 0.6631319542976626, - "grad_norm": 24.499357223510742, - "kl": 0.316650390625, - "learning_rate": 3.074543983546597e-08, - "loss": 0.0662, - "reward": 0.4202009066939354, - "reward_std": 0.19388573616743088, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 16.087862014770508, + "kl": 2.578125, + "learning_rate": 1.5372719917732986e-07, + "loss": 0.1572, + "reward": 0.5016741305589676, + "reward_std": 0.16891697607934475, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401902794838, + "rewards/tag_count_reward": 0.4012276977300644, "step": 2220 }, { "clip_ratio": 0.0, - "completion_length": 1416.38623046875, + "completion_length": 1833.6719665527344, "epoch": 0.6634306623851841, - "grad_norm": 22.684717178344727, - "kl": 0.31298828125, - "learning_rate": 3.0697321028506795e-08, - "loss": 0.0474, - "reward": 0.4693080559372902, - "reward_std": 0.1789991706609726, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 37.167659759521484, + "kl": 2.279296875, + "learning_rate": 1.5348660514253397e-07, + "loss": 0.1585, + "reward": 0.548549123108387, + "reward_std": 0.15856679901480675, + "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080559372902, + "rewards/tag_count_reward": 0.412388414144516, "step": 2221 }, { "clip_ratio": 0.0, - "completion_length": 1425.57373046875, + "completion_length": 1885.4643859863281, "epoch": 0.6637293704727055, - "grad_norm": 23.7296199798584, - "kl": 0.35595703125, - "learning_rate": 3.0649223220970455e-08, - "loss": 0.065, - "reward": 0.3710937649011612, - "reward_std": 0.17016143910586834, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 53.50740051269531, + "kl": 4.953125, + "learning_rate": 1.5324611610485227e-07, + "loss": 0.2605, + "reward": 0.4481026902794838, + "reward_std": 0.1638985201716423, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.3789062723517418, "step": 2222 }, { "clip_ratio": 0.0, - "completion_length": 1414.3750610351562, + "completion_length": 1815.0156860351562, "epoch": 0.664028078560227, - "grad_norm": 22.523841857910156, - "kl": 0.3447265625, - "learning_rate": 3.060114646518269e-08, - "loss": 0.0685, - "reward": 0.4609375223517418, - "reward_std": 0.1618118453770876, - "rewards/accuracy_reward": 0.13392857951112092, + "grad_norm": 53.27512741088867, + "kl": 2.48046875, + "learning_rate": 1.5300573232591347e-07, + "loss": 0.1716, + "reward": 0.5558036044239998, + "reward_std": 0.13264968432486057, + "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089402794838, + "rewards/tag_count_reward": 0.4107143133878708, "step": 2223 }, { "clip_ratio": 0.0, - "completion_length": 1372.0871276855469, + "completion_length": 1805.5134582519531, "epoch": 0.6643267866477485, - "grad_norm": 29.12217903137207, - "kl": 0.38330078125, - "learning_rate": 3.055309081344628e-08, - "loss": 0.0731, - "reward": 0.4302455633878708, - "reward_std": 0.1415562741458416, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 14.959280967712402, + "kl": 4.41796875, + "learning_rate": 1.527654540672314e-07, + "loss": 0.2547, + "reward": 0.5117187723517418, + "reward_std": 0.14256246387958527, + "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186383992433548, + "rewards/tag_count_reward": 0.3867187723517418, "step": 2224 }, { "clip_ratio": 0.0, - "completion_length": 1435.63623046875, + "completion_length": 1835.7969665527344, "epoch": 0.66462549473527, - "grad_norm": 26.986305236816406, - "kl": 0.38671875, - "learning_rate": 3.0505056318041045e-08, - "loss": 0.0799, - "reward": 0.3928571566939354, - "reward_std": 0.15439508110284805, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 23.359363555908203, + "kl": 3.00390625, + "learning_rate": 1.5252528159020524e-07, + "loss": 0.1958, + "reward": 0.4905134215950966, + "reward_std": 0.1470353864133358, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4012276977300644, "step": 2225 }, { "clip_ratio": 0.0, - "completion_length": 1540.5603332519531, + "completion_length": 1880.8907165527344, "epoch": 0.6649242028227914, - "grad_norm": 26.531675338745117, - "kl": 0.3095703125, - "learning_rate": 3.045704303122384e-08, - "loss": 0.0692, - "reward": 0.3722098395228386, - "reward_std": 0.22661402076482773, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 7.5873332023620605, + "kl": 3.29296875, + "learning_rate": 1.5228521515611919e-07, + "loss": 0.181, + "reward": 0.4905134215950966, + "reward_std": 0.2158062420785427, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2226 }, { "clip_ratio": 0.0, - "completion_length": 1387.8639221191406, + "completion_length": 1733.2165832519531, "epoch": 0.6652229109103129, - "grad_norm": 28.666303634643555, - "kl": 0.35009765625, - "learning_rate": 3.04090510052284e-08, - "loss": 0.0875, - "reward": 0.4308035895228386, - "reward_std": 0.16737782582640648, - "rewards/accuracy_reward": 0.10044643469154835, + "grad_norm": 29.001022338867188, + "kl": 2.75390625, + "learning_rate": 1.52045255026142e-07, + "loss": 0.1869, + "reward": 0.5083705559372902, + "reward_std": 0.13273309543728828, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.330357164144516, + "rewards/tag_count_reward": 0.4168526902794838, "step": 2227 }, { "clip_ratio": 0.0, - "completion_length": 1444.4308471679688, + "completion_length": 1825.2300109863281, "epoch": 0.6655216189978344, - "grad_norm": 30.391691207885742, - "kl": 0.32275390625, - "learning_rate": 3.036108029226535e-08, - "loss": 0.0686, - "reward": 0.4168526977300644, - "reward_std": 0.2088300846517086, - "rewards/accuracy_reward": 0.07142857578583062, + "grad_norm": 5.926249980926514, + "kl": 2.294921875, + "learning_rate": 1.5180540146132675e-07, + "loss": 0.1414, + "reward": 0.5424107313156128, + "reward_std": 0.19186085276305676, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2228 }, { "clip_ratio": 0.0, - "completion_length": 1442.7835693359375, + "completion_length": 1860.5871276855469, "epoch": 0.6658203270853559, - "grad_norm": 24.05158042907715, - "kl": 0.38134765625, - "learning_rate": 3.0313130944522115e-08, - "loss": 0.0497, - "reward": 0.388950914144516, - "reward_std": 0.16891321167349815, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 27.400875091552734, + "kl": 4.40625, + "learning_rate": 1.5156565472261057e-07, + "loss": 0.2476, + "reward": 0.4877232387661934, + "reward_std": 0.12675922363996506, + "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3063616156578064, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2229 }, { "clip_ratio": 0.0, - "completion_length": 1444.9978332519531, + "completion_length": 1854.2300109863281, "epoch": 0.6661190351728773, - "grad_norm": 28.179195404052734, - "kl": 0.32080078125, - "learning_rate": 3.0265203014162896e-08, - "loss": 0.0732, - "reward": 0.3794643059372902, - "reward_std": 0.17811303585767746, - "rewards/accuracy_reward": 0.029017858672887087, + "grad_norm": 12.79924201965332, + "kl": 3.3359375, + "learning_rate": 1.5132601507081448e-07, + "loss": 0.2002, + "reward": 0.4609375223517418, + "reward_std": 0.1683448702096939, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3504464477300644, + "rewards/tag_count_reward": 0.416294664144516, "step": 2230 }, { "clip_ratio": 0.0, - "completion_length": 1382.6719360351562, + "completion_length": 1782.5090026855469, "epoch": 0.6664177432603988, - "grad_norm": 23.059375762939453, - "kl": 0.32568359375, - "learning_rate": 3.0217296553328574e-08, - "loss": 0.0624, - "reward": 0.4257812723517418, - "reward_std": 0.1815426480025053, - "rewards/accuracy_reward": 0.08258929010480642, + "grad_norm": 5.868471145629883, + "kl": 3.009765625, + "learning_rate": 1.5108648276664289e-07, + "loss": 0.1713, + "reward": 0.4972098395228386, + "reward_std": 0.15552951395511627, + "rewards/accuracy_reward": 0.09598215017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.4012276977300644, "step": 2231 }, { "clip_ratio": 0.0, - "completion_length": 1359.0379943847656, + "completion_length": 1770.5179443359375, "epoch": 0.6667164513479202, - "grad_norm": 26.071226119995117, - "kl": 0.32177734375, - "learning_rate": 3.0169411614136685e-08, - "loss": 0.0778, - "reward": 0.483258955180645, - "reward_std": 0.19201983511447906, - "rewards/accuracy_reward": 0.1383928619325161, + "grad_norm": 54.690853118896484, + "kl": 1.42578125, + "learning_rate": 1.5084705807068344e-07, + "loss": 0.0856, + "reward": 0.6155133992433548, + "reward_std": 0.1778074074536562, + "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.344866082072258, + "rewards/tag_count_reward": 0.434709832072258, "step": 2232 }, { "clip_ratio": 0.0, - "completion_length": 1428.5826416015625, + "completion_length": 1815.8594970703125, "epoch": 0.6670151594354418, - "grad_norm": 30.700687408447266, - "kl": 0.37158203125, - "learning_rate": 3.0121548248681374e-08, - "loss": 0.0863, - "reward": 0.3253348395228386, - "reward_std": 0.16376731172204018, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 105.45365142822266, + "kl": 1.4423828125, + "learning_rate": 1.5060774124340686e-07, + "loss": 0.1208, + "reward": 0.4229910969734192, + "reward_std": 0.12866713665425777, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419738650322, + "rewards/tag_count_reward": 0.4029018059372902, "step": 2233 }, { "clip_ratio": 0.0, - "completion_length": 1486.727783203125, + "completion_length": 1914.2835693359375, "epoch": 0.6673138675229632, - "grad_norm": 23.687421798706055, - "kl": 0.35693359375, - "learning_rate": 3.007370650903325e-08, - "loss": 0.0472, - "reward": 0.3152901977300644, - "reward_std": 0.1645796187222004, + "grad_norm": 47.46696853637695, + "kl": 1.8515625, + "learning_rate": 1.5036853254516627e-07, + "loss": 0.1218, + "reward": 0.4056919813156128, + "reward_std": 0.13845831714570522, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.396763414144516, "step": 2234 }, { "clip_ratio": 0.0, - "completion_length": 1397.5715026855469, + "completion_length": 1779.1920471191406, "epoch": 0.6676125756104847, - "grad_norm": 28.496912002563477, - "kl": 0.31982421875, - "learning_rate": 3.002588644723947e-08, - "loss": 0.0732, - "reward": 0.4162946566939354, - "reward_std": 0.16135310009121895, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 25.844369888305664, + "kl": 2.416015625, + "learning_rate": 1.5012943223619735e-07, + "loss": 0.1574, + "reward": 0.5128348544239998, + "reward_std": 0.15217616595327854, + "rewards/accuracy_reward": 0.09151786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375223517418, + "rewards/tag_count_reward": 0.4213169738650322, "step": 2235 }, { "clip_ratio": 0.0, - "completion_length": 1473.1607666015625, + "completion_length": 1844.99560546875, "epoch": 0.6679112836980061, - "grad_norm": 27.289867401123047, - "kl": 0.28564453125, - "learning_rate": 2.997808811532355e-08, - "loss": 0.0458, - "reward": 0.372209832072258, - "reward_std": 0.14720938354730606, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 88.18907928466797, + "kl": 5.453125, + "learning_rate": 1.4989044057661777e-07, + "loss": 0.2741, + "reward": 0.4765625223517418, + "reward_std": 0.1489609144628048, + "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2236 }, { "clip_ratio": 0.0, - "completion_length": 1423.0223999023438, + "completion_length": 1832.6384887695312, "epoch": 0.6682099917855276, - "grad_norm": 25.593822479248047, - "kl": 0.284423828125, - "learning_rate": 2.993031156528542e-08, - "loss": 0.0526, - "reward": 0.3738839477300644, - "reward_std": 0.19598009809851646, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 65.8905258178711, + "kl": 4.990234375, + "learning_rate": 1.4965155782642707e-07, + "loss": 0.2609, + "reward": 0.498883955180645, + "reward_std": 0.19453697465360165, + "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2237 }, { "clip_ratio": 0.0, - "completion_length": 1425.8170166015625, + "completion_length": 1846.1563110351562, "epoch": 0.6685086998730491, - "grad_norm": 27.412065505981445, - "kl": 0.31201171875, - "learning_rate": 2.988255684910125e-08, - "loss": 0.0822, - "reward": 0.427455373108387, - "reward_std": 0.15618594363331795, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 94.49633026123047, + "kl": 7.3203125, + "learning_rate": 1.4941278424550625e-07, + "loss": 0.3648, + "reward": 0.5117187798023224, + "reward_std": 0.14260359480977058, + "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482164144516, + "rewards/tag_count_reward": 0.388950914144516, "step": 2238 }, { "clip_ratio": 0.0, - "completion_length": 1387.2857666015625, + "completion_length": 1800.9130249023438, "epoch": 0.6688074079605706, - "grad_norm": 31.107940673828125, - "kl": 0.33251953125, - "learning_rate": 2.9834824018723503e-08, - "loss": 0.0866, - "reward": 0.3934151977300644, - "reward_std": 0.17855535820126534, - "rewards/accuracy_reward": 0.060267861699685454, + "grad_norm": 17.300281524658203, + "kl": 3.78515625, + "learning_rate": 1.4917412009361752e-07, + "loss": 0.2243, + "reward": 0.4737723395228386, + "reward_std": 0.1392871793359518, + "rewards/accuracy_reward": 0.07142857811413705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.4023437649011612, "step": 2239 }, { "clip_ratio": 0.0, - "completion_length": 1455.810302734375, + "completion_length": 1837.7567749023438, "epoch": 0.669106116048092, - "grad_norm": 26.544822692871094, - "kl": 0.35595703125, - "learning_rate": 2.9787113126080838e-08, - "loss": 0.0645, - "reward": 0.3911830633878708, - "reward_std": 0.18599475547671318, - "rewards/accuracy_reward": 0.0669642873108387, + "grad_norm": 36.67346954345703, + "kl": 2.173828125, + "learning_rate": 1.4893556563040418e-07, + "loss": 0.1368, + "reward": 0.4899553880095482, + "reward_std": 0.1452622301876545, + "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2240 }, { "clip_ratio": 0.0, - "completion_length": 1434.7768249511719, + "completion_length": 1834.8728637695312, "epoch": 0.6694048241356134, - "grad_norm": 23.840845108032227, - "kl": 0.31005859375, - "learning_rate": 2.9739424223078014e-08, - "loss": 0.0587, - "reward": 0.3621651902794838, - "reward_std": 0.23241586983203888, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 41.0516357421875, + "kl": 2.21875, + "learning_rate": 1.4869712111539007e-07, + "loss": 0.1517, + "reward": 0.4659598395228386, + "reward_std": 0.20707625150680542, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.412388414144516, "step": 2241 }, { "clip_ratio": 0.0, - "completion_length": 1405.1406860351562, + "completion_length": 1811.5983276367188, "epoch": 0.669703532223135, - "grad_norm": 30.705228805541992, - "kl": 0.2998046875, - "learning_rate": 2.969175736159588e-08, - "loss": 0.0979, - "reward": 0.3950893059372902, - "reward_std": 0.20134291425347328, - "rewards/accuracy_reward": 0.07366071688011289, + "grad_norm": 33.667640686035156, + "kl": 2.177734375, + "learning_rate": 1.4845878680797942e-07, + "loss": 0.16, + "reward": 0.4866071566939354, + "reward_std": 0.18808109126985073, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.3950893059372902, "step": 2242 }, { "clip_ratio": 0.0, - "completion_length": 1364.2478332519531, + "completion_length": 1738.4286499023438, "epoch": 0.6700022403106564, - "grad_norm": 31.504981994628906, - "kl": 0.274169921875, - "learning_rate": 2.964411259349131e-08, - "loss": 0.0507, - "reward": 0.5731027200818062, - "reward_std": 0.1950862854719162, - "rewards/accuracy_reward": 0.22321429662406445, + "grad_norm": 16.0098876953125, + "kl": 2.5703125, + "learning_rate": 1.4822056296745656e-07, + "loss": 0.1775, + "reward": 0.667410746216774, + "reward_std": 0.1542766820639372, + "rewards/accuracy_reward": 0.2455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3498884066939354, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2243 }, { "clip_ratio": 0.0, - "completion_length": 1424.9844665527344, + "completion_length": 1805.7657165527344, "epoch": 0.6703009483981779, - "grad_norm": 28.90024185180664, - "kl": 0.2861328125, - "learning_rate": 2.959648997059716e-08, - "loss": 0.0717, - "reward": 0.3895089477300644, - "reward_std": 0.180948618799448, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 29.059484481811523, + "kl": 4.48046875, + "learning_rate": 1.479824498529858e-07, + "loss": 0.2575, + "reward": 0.4838169738650322, + "reward_std": 0.1428676601499319, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.419084832072258, "step": 2244 }, { "clip_ratio": 0.0, - "completion_length": 1345.8929138183594, + "completion_length": 1771.5804443359375, "epoch": 0.6705996564856993, - "grad_norm": 21.29640769958496, - "kl": 0.2998046875, - "learning_rate": 2.954888954472216e-08, - "loss": 0.0567, - "reward": 0.4860491380095482, - "reward_std": 0.15399930998682976, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 10.385119438171387, + "kl": 3.74609375, + "learning_rate": 1.477444477236108e-07, + "loss": 0.2244, + "reward": 0.5803571715950966, + "reward_std": 0.1499339733272791, + "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4218750298023224, "step": 2245 }, { "clip_ratio": 0.0, - "completion_length": 1487.0491943359375, + "completion_length": 1836.9219970703125, "epoch": 0.6708983645732208, - "grad_norm": 24.220874786376953, - "kl": 0.33056640625, - "learning_rate": 2.9501311367650906e-08, - "loss": 0.0704, - "reward": 0.4335937649011612, - "reward_std": 0.16249243915081024, - "rewards/accuracy_reward": 0.12053571827709675, + "grad_norm": 8.37000846862793, + "kl": 4.10546875, + "learning_rate": 1.4750655683825454e-07, + "loss": 0.2264, + "reward": 0.5256696715950966, + "reward_std": 0.13385805487632751, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313058041036129, + "rewards/tag_count_reward": 0.3962053805589676, "step": 2246 }, { "clip_ratio": 0.0, - "completion_length": 1479.9107666015625, + "completion_length": 1826.3371276855469, "epoch": 0.6711970726607422, - "grad_norm": 27.67979621887207, - "kl": 0.31396484375, - "learning_rate": 2.9453755491143794e-08, - "loss": 0.0655, - "reward": 0.4520089477300644, - "reward_std": 0.16592667996883392, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 30.40422821044922, + "kl": 2.45703125, + "learning_rate": 1.4726877745571898e-07, + "loss": 0.1438, + "reward": 0.5546875298023224, + "reward_std": 0.13541391491889954, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4229910969734192, "step": 2247 }, { "clip_ratio": 0.0, - "completion_length": 1463.0447082519531, + "completion_length": 1827.3572387695312, "epoch": 0.6714957807482638, - "grad_norm": 29.290306091308594, - "kl": 0.34619140625, - "learning_rate": 2.9406221966936972e-08, - "loss": 0.0617, - "reward": 0.3281250149011612, - "reward_std": 0.1612277254462242, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 22.97214126586914, + "kl": 3.10546875, + "learning_rate": 1.4703110983468486e-07, + "loss": 0.1934, + "reward": 0.435267873108387, + "reward_std": 0.1530088633298874, + "rewards/accuracy_reward": 0.017857143888249993, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4174107387661934, "step": 2248 }, { "clip_ratio": 0.0, - "completion_length": 1372.9665832519531, + "completion_length": 1756.3817749023438, "epoch": 0.6717944888357852, - "grad_norm": 22.341718673706055, - "kl": 0.300537109375, - "learning_rate": 2.9358710846742235e-08, - "loss": 0.0471, - "reward": 0.3722098395228386, - "reward_std": 0.16662070527672768, - "rewards/accuracy_reward": 0.03348214295692742, + "grad_norm": 8.939291000366211, + "kl": 4.15234375, + "learning_rate": 1.4679355423371117e-07, + "loss": 0.2441, + "reward": 0.4441964402794838, + "reward_std": 0.14485556446015835, + "rewards/accuracy_reward": 0.03571428591385484, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2249 }, { "clip_ratio": 0.0, - "completion_length": 1418.3527526855469, + "completion_length": 1857.5313415527344, "epoch": 0.6720931969233067, - "grad_norm": 27.146390914916992, - "kl": 0.27734375, - "learning_rate": 2.9311222182247025e-08, - "loss": 0.057, - "reward": 0.4029017984867096, - "reward_std": 0.17501037567853928, - "rewards/accuracy_reward": 0.055803574388846755, + "grad_norm": 15.77121353149414, + "kl": 3.6328125, + "learning_rate": 1.4655611091123511e-07, + "loss": 0.1884, + "reward": 0.4709821715950966, + "reward_std": 0.14920120686292648, + "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982313156128, + "rewards/tag_count_reward": 0.4151785969734192, "step": 2250 }, { "clip_ratio": 0.0, - "completion_length": 1425.3728332519531, + "completion_length": 1831.8438110351562, "epoch": 0.6723919050108281, - "grad_norm": 24.194255828857422, - "kl": 0.3056640625, - "learning_rate": 2.9263756025114373e-08, - "loss": 0.0407, - "reward": 0.4386160895228386, - "reward_std": 0.1962682344019413, - "rewards/accuracy_reward": 0.10267857555299997, + "grad_norm": 33.26713180541992, + "kl": 2.212890625, + "learning_rate": 1.4631878012557186e-07, + "loss": 0.1527, + "reward": 0.5284598395228386, + "reward_std": 0.18534547463059425, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.412388414144516, "step": 2251 }, { "clip_ratio": 0.0, - "completion_length": 1396.7054138183594, + "completion_length": 1808.8505554199219, "epoch": 0.6726906130983497, - "grad_norm": 26.5141658782959, - "kl": 0.32470703125, - "learning_rate": 2.92163124269828e-08, - "loss": 0.0645, - "reward": 0.392299123108387, - "reward_std": 0.20668940618634224, - "rewards/accuracy_reward": 0.06473214365541935, + "grad_norm": 28.90888786315918, + "kl": 2.154296875, + "learning_rate": 1.4608156213491398e-07, + "loss": 0.1386, + "reward": 0.4843750298023224, + "reward_std": 0.1605367213487625, + "rewards/accuracy_reward": 0.06696428591385484, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4174107238650322, "step": 2252 }, { "clip_ratio": 0.0, - "completion_length": 1463.4531860351562, + "completion_length": 1803.6875610351562, "epoch": 0.6729893211858711, - "grad_norm": 28.722715377807617, - "kl": 0.3310546875, - "learning_rate": 2.9168891439466282e-08, - "loss": 0.0873, - "reward": 0.4224330559372902, - "reward_std": 0.1831551343202591, - "rewards/accuracy_reward": 0.09375000232830644, + "grad_norm": 7.708353519439697, + "kl": 3.3203125, + "learning_rate": 1.458444571973314e-07, + "loss": 0.181, + "reward": 0.5206473544239998, + "reward_std": 0.18021929264068604, + "rewards/accuracy_reward": 0.11383929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4068080559372902, "step": 2253 }, { "clip_ratio": 0.0, - "completion_length": 1429.71435546875, + "completion_length": 1780.5022888183594, "epoch": 0.6732880292733926, - "grad_norm": 27.8275089263916, - "kl": 0.30615234375, - "learning_rate": 2.9121493114154195e-08, - "loss": 0.0701, - "reward": 0.3621651902794838, - "reward_std": 0.16633693128824234, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 15.127179145812988, + "kl": 3.34765625, + "learning_rate": 1.4560746557077096e-07, + "loss": 0.2062, + "reward": 0.4475446715950966, + "reward_std": 0.13529558666050434, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3420759066939354, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2254 }, { "clip_ratio": 0.0, - "completion_length": 1474.9866943359375, + "completion_length": 1814.1451721191406, "epoch": 0.673586737360914, - "grad_norm": 28.829872131347656, - "kl": 0.32470703125, - "learning_rate": 2.9074117502611296e-08, - "loss": 0.0629, - "reward": 0.3945312723517418, - "reward_std": 0.19852902367711067, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 37.160343170166016, + "kl": 4.55859375, + "learning_rate": 1.4537058751305648e-07, + "loss": 0.2656, + "reward": 0.4447544813156128, + "reward_std": 0.19166382774710655, + "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.3911830633878708, "step": 2255 }, { "clip_ratio": 0.0, - "completion_length": 1470.6964721679688, + "completion_length": 1860.4041137695312, "epoch": 0.6738854454484355, - "grad_norm": 26.61707305908203, - "kl": 0.3212890625, - "learning_rate": 2.9026764656377607e-08, - "loss": 0.0594, - "reward": 0.381138414144516, - "reward_std": 0.15919284150004387, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 11.697124481201172, + "kl": 3.73828125, + "learning_rate": 1.4513382328188804e-07, + "loss": 0.1929, + "reward": 0.466517873108387, + "reward_std": 0.13220770843327045, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991305589676, + "rewards/tag_count_reward": 0.3973214477300644, "step": 2256 }, { "clip_ratio": 0.0, - "completion_length": 1453.4822082519531, + "completion_length": 1845.3862609863281, "epoch": 0.674184153535957, - "grad_norm": 23.725513458251953, - "kl": 0.31201171875, - "learning_rate": 2.8979434626968358e-08, - "loss": 0.0522, - "reward": 0.3722098395228386, - "reward_std": 0.1504089441150427, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 19.31157875061035, + "kl": 3.38671875, + "learning_rate": 1.448971731348418e-07, + "loss": 0.1885, + "reward": 0.4553571566939354, + "reward_std": 0.13384700566530228, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.3861607313156128, "step": 2257 }, { "clip_ratio": 0.0, - "completion_length": 1395.0603332519531, + "completion_length": 1885.2478637695312, "epoch": 0.6744828616234785, - "grad_norm": 28.482776641845703, - "kl": 0.369140625, - "learning_rate": 2.8932127465874002e-08, - "loss": 0.0474, - "reward": 0.4012276977300644, - "reward_std": 0.1586824581027031, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 7.32114839553833, + "kl": 4.31640625, + "learning_rate": 1.4466063732937e-07, + "loss": 0.2278, + "reward": 0.4815848469734192, + "reward_std": 0.12960257567465305, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.320870541036129, + "rewards/tag_count_reward": 0.3989955559372902, "step": 2258 }, { "clip_ratio": 0.0, - "completion_length": 1477.9375610351562, + "completion_length": 1809.1742248535156, "epoch": 0.6747815697109999, - "grad_norm": 24.494707107543945, - "kl": 0.2958984375, - "learning_rate": 2.8884843224560107e-08, - "loss": 0.0609, - "reward": 0.4001116305589676, - "reward_std": 0.15868601575493813, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 39.921119689941406, + "kl": 3.09375, + "learning_rate": 1.4442421612280055e-07, + "loss": 0.2065, + "reward": 0.5016741380095482, + "reward_std": 0.13272346556186676, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4213169887661934, "step": 2259 }, { "clip_ratio": 0.0, - "completion_length": 1360.1607360839844, + "completion_length": 1777.9554443359375, "epoch": 0.6750802777985214, - "grad_norm": 28.929363250732422, - "kl": 0.34765625, - "learning_rate": 2.883758195446725e-08, - "loss": 0.0487, - "reward": 0.3962053656578064, - "reward_std": 0.1885504126548767, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 26.055233001708984, + "kl": 4.2109375, + "learning_rate": 1.4418790977233624e-07, + "loss": 0.2353, + "reward": 0.5039062798023224, + "reward_std": 0.1640316154807806, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.423549123108387, "step": 2260 }, { "clip_ratio": 0.0, - "completion_length": 1380.99560546875, + "completion_length": 1790.3237609863281, "epoch": 0.6753789858860428, - "grad_norm": 25.756166458129883, - "kl": 0.30859375, - "learning_rate": 2.879034370701111e-08, - "loss": 0.0694, - "reward": 0.3867187649011612, - "reward_std": 0.17788690701127052, - "rewards/accuracy_reward": 0.06919643026776612, + "grad_norm": 19.48446273803711, + "kl": 4.158203125, + "learning_rate": 1.4395171853505555e-07, + "loss": 0.2264, + "reward": 0.5161830633878708, + "reward_std": 0.16254903934895992, + "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.415736623108387, "step": 2261 }, { "clip_ratio": 0.0, - "completion_length": 1436.0223693847656, + "completion_length": 1827.57373046875, "epoch": 0.6756776939735644, - "grad_norm": 31.77899742126465, - "kl": 0.3359375, - "learning_rate": 2.8743128533582233e-08, - "loss": 0.0813, - "reward": 0.3621651977300644, - "reward_std": 0.17570829018950462, + "grad_norm": 33.797950744628906, + "kl": 2.318359375, + "learning_rate": 1.4371564266791117e-07, + "loss": 0.1421, + "reward": 0.447544664144516, + "reward_std": 0.1427716352045536, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2262 }, { "clip_ratio": 0.0, - "completion_length": 1369.1897888183594, + "completion_length": 1750.4822387695312, "epoch": 0.6759764020610858, - "grad_norm": 26.105714797973633, - "kl": 0.3193359375, - "learning_rate": 2.8695936485546113e-08, - "loss": 0.0569, - "reward": 0.4291294813156128, - "reward_std": 0.18720606714487076, - "rewards/accuracy_reward": 0.09151785937137902, + "grad_norm": 26.659408569335938, + "kl": 2.78515625, + "learning_rate": 1.4347968242773056e-07, + "loss": 0.1764, + "reward": 0.5206473469734192, + "reward_std": 0.14315833523869514, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.420200914144516, "step": 2263 }, { "clip_ratio": 0.0, - "completion_length": 1506.7991943359375, + "completion_length": 1860.2991943359375, "epoch": 0.6762751101486073, - "grad_norm": 20.615997314453125, - "kl": 0.322265625, - "learning_rate": 2.8648767614243087e-08, - "loss": 0.0365, - "reward": 0.3638393133878708, - "reward_std": 0.2252562753856182, - "rewards/accuracy_reward": 0.04464286006987095, + "grad_norm": 11.19881820678711, + "kl": 3.24609375, + "learning_rate": 1.4324383807121544e-07, + "loss": 0.1666, + "reward": 0.494977705180645, + "reward_std": 0.23616641014814377, + "rewards/accuracy_reward": 0.08705357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.407924123108387, "step": 2264 }, { "clip_ratio": 0.0, - "completion_length": 1458.3705749511719, + "completion_length": 1866.2835998535156, "epoch": 0.6765738182361287, - "grad_norm": 29.187232971191406, - "kl": 0.35302734375, - "learning_rate": 2.8601621970988237e-08, - "loss": 0.0443, - "reward": 0.412388414144516, - "reward_std": 0.23616067692637444, - "rewards/accuracy_reward": 0.09151786286383867, + "grad_norm": 7.676377773284912, + "kl": 3.51953125, + "learning_rate": 1.430081098549412e-07, + "loss": 0.1931, + "reward": 0.5217634215950966, + "reward_std": 0.20982329919934273, + "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.412388414144516, "step": 2265 }, { "clip_ratio": 0.0, - "completion_length": 1362.5513916015625, + "completion_length": 1811.1697082519531, "epoch": 0.6768725263236502, - "grad_norm": 23.194671630859375, - "kl": 0.3193359375, - "learning_rate": 2.8554499607071414e-08, - "loss": 0.0571, - "reward": 0.3995535969734192, - "reward_std": 0.1837044321000576, - "rewards/accuracy_reward": 0.0758928582072258, + "grad_norm": 14.971346855163574, + "kl": 4.38671875, + "learning_rate": 1.4277249803535707e-07, + "loss": 0.2459, + "reward": 0.4815848469734192, + "reward_std": 0.1541061159223318, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4079241305589676, "step": 2266 }, { "clip_ratio": 0.0, - "completion_length": 1373.1384582519531, + "completion_length": 1812.6697387695312, "epoch": 0.6771712344111717, - "grad_norm": 28.910980224609375, - "kl": 0.332275390625, - "learning_rate": 2.8507400573757156e-08, - "loss": 0.0655, - "reward": 0.4910714402794838, - "reward_std": 0.20643015205860138, - "rewards/accuracy_reward": 0.1540178656578064, + "grad_norm": 6.971721649169922, + "kl": 3.5771484375, + "learning_rate": 1.425370028687858e-07, + "loss": 0.2008, + "reward": 0.5747768208384514, + "reward_std": 0.17602366767823696, + "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.416294664144516, "step": 2267 }, { "clip_ratio": 0.0, - "completion_length": 1441.654052734375, + "completion_length": 1819.4532165527344, "epoch": 0.6774699424986932, - "grad_norm": 25.915889739990234, - "kl": 0.31884765625, - "learning_rate": 2.846032492228455e-08, - "loss": 0.08, - "reward": 0.4045759066939354, - "reward_std": 0.191526148468256, - "rewards/accuracy_reward": 0.07366071688011289, + "grad_norm": 30.680177688598633, + "kl": 2.6015625, + "learning_rate": 1.4230162461142275e-07, + "loss": 0.1582, + "reward": 0.5106026977300644, + "reward_std": 0.20192551985383034, + "rewards/accuracy_reward": 0.10714285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.4034598395228386, "step": 2268 }, { "clip_ratio": 0.0, - "completion_length": 1458.3192443847656, + "completion_length": 1863.8259887695312, "epoch": 0.6777686505862146, - "grad_norm": 26.903793334960938, - "kl": 0.32763671875, - "learning_rate": 2.8413272703867314e-08, - "loss": 0.064, - "reward": 0.4107143059372902, - "reward_std": 0.18164760619401932, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 46.24980163574219, + "kl": 2.072265625, + "learning_rate": 1.4206636351933656e-07, + "loss": 0.1306, + "reward": 0.532924123108387, + "reward_std": 0.18215982615947723, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2269 }, { "clip_ratio": 0.0, - "completion_length": 1409.1563110351562, + "completion_length": 1848.7255249023438, "epoch": 0.6780673586737361, - "grad_norm": 24.700651168823242, - "kl": 0.341796875, - "learning_rate": 2.836624396969367e-08, - "loss": 0.0479, - "reward": 0.3627232313156128, - "reward_std": 0.1576467528939247, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 6.81874942779541, + "kl": 3.19921875, + "learning_rate": 1.4183121984846835e-07, + "loss": 0.1832, + "reward": 0.4458705559372902, + "reward_std": 0.13625740073621273, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.3922991156578064, "step": 2270 }, { "clip_ratio": 0.0, - "completion_length": 1482.0223999023438, + "completion_length": 1848.6652221679688, "epoch": 0.6783660667612575, - "grad_norm": 28.825721740722656, - "kl": 0.30615234375, - "learning_rate": 2.831923877092623e-08, - "loss": 0.053, - "reward": 0.3973214477300644, - "reward_std": 0.1779797300696373, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 15.339544296264648, + "kl": 3.609375, + "learning_rate": 1.4159619385463116e-07, + "loss": 0.1898, + "reward": 0.4681919887661934, + "reward_std": 0.15377994626760483, + "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.4101562649011612, "step": 2271 }, { "clip_ratio": 0.0, - "completion_length": 1438.7121276855469, + "completion_length": 1849.9442749023438, "epoch": 0.6786647748487791, - "grad_norm": 20.12704849243164, - "kl": 0.30712890625, - "learning_rate": 2.8272257158702084e-08, - "loss": 0.0372, - "reward": 0.3900669738650322, - "reward_std": 0.19877024739980698, - "rewards/accuracy_reward": 0.05803571827709675, + "grad_norm": 11.581968307495117, + "kl": 3.19140625, + "learning_rate": 1.413612857935104e-07, + "loss": 0.1798, + "reward": 0.4726562649011612, + "reward_std": 0.17398705147206783, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4123884066939354, "step": 2272 }, { "clip_ratio": 0.0, - "completion_length": 1464.6072082519531, + "completion_length": 1862.85498046875, "epoch": 0.6789634829363005, - "grad_norm": 25.372539520263672, - "kl": 0.31884765625, - "learning_rate": 2.8225299184132584e-08, - "loss": 0.0565, - "reward": 0.404017873108387, - "reward_std": 0.1595601923763752, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 11.202001571655273, + "kl": 3.9453125, + "learning_rate": 1.411264959206629e-07, + "loss": 0.2144, + "reward": 0.5150669813156128, + "reward_std": 0.15135764330625534, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4123884066939354, "step": 2273 }, { "clip_ratio": 0.0, - "completion_length": 1397.7879943847656, + "completion_length": 1838.4040832519531, "epoch": 0.679262191023822, - "grad_norm": 24.093904495239258, - "kl": 0.3623046875, - "learning_rate": 2.817836489830342e-08, - "loss": 0.0613, - "reward": 0.4218750223517418, - "reward_std": 0.2107440084218979, - "rewards/accuracy_reward": 0.12500000488944352, + "grad_norm": 27.210845947265625, + "kl": 5.03125, + "learning_rate": 1.4089182449151708e-07, + "loss": 0.2816, + "reward": 0.522321455180645, + "reward_std": 0.19746014848351479, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2968750149011612, + "rewards/tag_count_reward": 0.3883928805589676, "step": 2274 }, { "clip_ratio": 0.0, - "completion_length": 1437.0156860351562, + "completion_length": 1814.555908203125, "epoch": 0.6795608991113434, - "grad_norm": 28.721723556518555, - "kl": 0.31396484375, - "learning_rate": 2.813145435227452e-08, - "loss": 0.0665, - "reward": 0.3470982313156128, - "reward_std": 0.1683214008808136, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 37.84942626953125, + "kl": 2.220703125, + "learning_rate": 1.406572717613726e-07, + "loss": 0.1441, + "reward": 0.4291294738650322, + "reward_std": 0.11951805092394352, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2275 }, { "clip_ratio": 0.0, - "completion_length": 1406.6986999511719, + "completion_length": 1832.3840026855469, "epoch": 0.679859607198865, - "grad_norm": 23.415176391601562, - "kl": 0.32763671875, - "learning_rate": 2.808456759707991e-08, - "loss": 0.0667, - "reward": 0.3549107238650322, - "reward_std": 0.20064927637577057, - "rewards/accuracy_reward": 0.033482144586741924, + "grad_norm": 38.643157958984375, + "kl": 2.189453125, + "learning_rate": 1.4042283798539957e-07, + "loss": 0.1389, + "reward": 0.4720982313156128, + "reward_std": 0.1818946897983551, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4185268133878708, "step": 2276 }, { "clip_ratio": 0.0, - "completion_length": 1385.4688110351562, + "completion_length": 1752.3170166015625, "epoch": 0.6801583152863864, - "grad_norm": 28.294218063354492, - "kl": 0.3515625, - "learning_rate": 2.8037704683727815e-08, - "loss": 0.0802, - "reward": 0.466517873108387, - "reward_std": 0.18287217617034912, - "rewards/accuracy_reward": 0.14508928963914514, + "grad_norm": 8.059804916381836, + "kl": 3.087890625, + "learning_rate": 1.4018852341863907e-07, + "loss": 0.1831, + "reward": 0.5708705559372902, + "reward_std": 0.16612395457923412, + "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.416852705180645, "step": 2277 }, { "clip_ratio": 0.0, - "completion_length": 1385.9375610351562, + "completion_length": 1842.8572387695312, "epoch": 0.6804570233739079, - "grad_norm": 26.831846237182617, - "kl": 0.3330078125, - "learning_rate": 2.799086566320051e-08, - "loss": 0.0541, - "reward": 0.3777901977300644, - "reward_std": 0.1664332039654255, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 37.96380615234375, + "kl": 5.40625, + "learning_rate": 1.3995432831600254e-07, + "loss": 0.2854, + "reward": 0.4737723469734192, + "reward_std": 0.1620611995458603, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.400111623108387, "step": 2278 }, { "clip_ratio": 0.0, - "completion_length": 1348.7366638183594, + "completion_length": 1797.13623046875, "epoch": 0.6807557314614293, - "grad_norm": 23.260488510131836, - "kl": 0.32275390625, - "learning_rate": 2.794405058645421e-08, - "loss": 0.0624, - "reward": 0.353236623108387, - "reward_std": 0.17542896419763565, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 26.19983673095703, + "kl": 4.96875, + "learning_rate": 1.3972025293227106e-07, + "loss": 0.2602, + "reward": 0.4335937649011612, + "reward_std": 0.1328039299696684, + "rewards/accuracy_reward": 0.022321430267766118, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/tag_count_reward": 0.411272332072258, "step": 2279 }, { "clip_ratio": 0.0, - "completion_length": 1371.6250610351562, + "completion_length": 1767.1340026855469, "epoch": 0.6810544395489508, - "grad_norm": 26.15044593811035, - "kl": 0.31298828125, - "learning_rate": 2.7897259504419163e-08, - "loss": 0.0598, - "reward": 0.5050223395228386, - "reward_std": 0.19024837389588356, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 7.110368251800537, + "kl": 3.9140625, + "learning_rate": 1.394862975220958e-07, + "loss": 0.2332, + "reward": 0.602678582072258, + "reward_std": 0.1831081472337246, + "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080484867096, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2280 }, { "clip_ratio": 0.0, - "completion_length": 1415.99560546875, + "completion_length": 1810.5023193359375, "epoch": 0.6813531476364723, - "grad_norm": 21.22757911682129, - "kl": 0.2900390625, - "learning_rate": 2.7850492467999487e-08, - "loss": 0.0572, - "reward": 0.4006696492433548, - "reward_std": 0.15529765188694, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 45.5844841003418, + "kl": 2.296875, + "learning_rate": 1.3925246233999743e-07, + "loss": 0.1474, + "reward": 0.5133928805589676, + "reward_std": 0.13518951833248138, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446492433548, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2281 }, { "clip_ratio": 0.0, - "completion_length": 1382.87060546875, + "completion_length": 1740.7634582519531, "epoch": 0.6816518557239938, - "grad_norm": 25.167301177978516, - "kl": 0.32568359375, - "learning_rate": 2.7803749528073107e-08, - "loss": 0.0732, - "reward": 0.3699776977300644, - "reward_std": 0.17960704863071442, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 27.854764938354492, + "kl": 2.15625, + "learning_rate": 1.3901874764036553e-07, + "loss": 0.1383, + "reward": 0.4743303805589676, + "reward_std": 0.16959016397595406, + "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4207589402794838, "step": 2282 }, { "clip_ratio": 0.0, - "completion_length": 1366.9375610351562, + "completion_length": 1789.8147888183594, "epoch": 0.6819505638115152, - "grad_norm": 21.00299835205078, - "kl": 0.260498046875, - "learning_rate": 2.7757030735491772e-08, - "loss": 0.047, - "reward": 0.526227705180645, - "reward_std": 0.18834864348173141, - "rewards/accuracy_reward": 0.17187500605359674, + "grad_norm": 43.78980255126953, + "kl": 2.43359375, + "learning_rate": 1.3878515367745886e-07, + "loss": 0.1512, + "reward": 0.5987723469734192, + "reward_std": 0.161507036536932, + "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526902794838, + "rewards/tag_count_reward": 0.4157366305589676, "step": 2283 }, { "clip_ratio": 0.0, - "completion_length": 1393.8304443359375, + "completion_length": 1756.2455749511719, "epoch": 0.6822492718990366, - "grad_norm": 29.866718292236328, - "kl": 0.2822265625, - "learning_rate": 2.771033614108097e-08, - "loss": 0.0738, - "reward": 0.4190848395228386, - "reward_std": 0.20183081924915314, - "rewards/accuracy_reward": 0.066964291036129, + "grad_norm": 20.92259407043457, + "kl": 2.025390625, + "learning_rate": 1.3855168070540484e-07, + "loss": 0.1311, + "reward": 0.5558035895228386, + "reward_std": 0.1605326235294342, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3521205559372902, + "rewards/tag_count_reward": 0.4441964477300644, "step": 2284 }, { "clip_ratio": 0.0, - "completion_length": 1457.5960388183594, + "completion_length": 1913.6050109863281, "epoch": 0.6825479799865581, - "grad_norm": 20.399829864501953, - "kl": 0.37841796875, - "learning_rate": 2.7663665795639813e-08, - "loss": 0.0506, - "reward": 0.3599330559372902, - "reward_std": 0.20276802778244019, - "rewards/accuracy_reward": 0.06250000442378223, + "grad_norm": 46.264522552490234, + "kl": 5.3515625, + "learning_rate": 1.3831832897819905e-07, + "loss": 0.2391, + "reward": 0.474888414144516, + "reward_std": 0.18064400181174278, + "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.3900669738650322, "step": 2285 }, { "clip_ratio": 0.0, - "completion_length": 1401.8236999511719, + "completion_length": 1831.5960693359375, "epoch": 0.6828466880740796, - "grad_norm": 23.14652442932129, - "kl": 0.24853515625, - "learning_rate": 2.7617019749941085e-08, - "loss": 0.058, - "reward": 0.404017873108387, - "reward_std": 0.15322944708168507, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 37.7057991027832, + "kl": 5.0625, + "learning_rate": 1.3808509874970541e-07, + "loss": 0.2598, + "reward": 0.5150669887661934, + "reward_std": 0.17999398335814476, + "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3593750149011612, + "rewards/tag_count_reward": 0.4168526902794838, "step": 2286 }, { "clip_ratio": 0.0, - "completion_length": 1427.5380249023438, + "completion_length": 1844.3326721191406, "epoch": 0.6831453961616011, - "grad_norm": 24.243099212646484, - "kl": 0.3203125, - "learning_rate": 2.757039805473108e-08, - "loss": 0.0658, - "reward": 0.3320312574505806, - "reward_std": 0.1885770969092846, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 35.68173599243164, + "kl": 5.46484375, + "learning_rate": 1.378519902736554e-07, + "loss": 0.2739, + "reward": 0.4447544813156128, + "reward_std": 0.17052270472049713, + "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2287 }, { "clip_ratio": 0.0, - "completion_length": 1400.5491638183594, + "completion_length": 1825.5938110351562, "epoch": 0.6834441042491225, - "grad_norm": 28.620756149291992, - "kl": 0.35009765625, - "learning_rate": 2.7523800760729665e-08, - "loss": 0.0911, - "reward": 0.3962053805589676, - "reward_std": 0.17063992470502853, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 18.42423439025879, + "kl": 3.40625, + "learning_rate": 1.3761900380364834e-07, + "loss": 0.2059, + "reward": 0.4910714402794838, + "reward_std": 0.12804099172353745, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.4062500149011612, "step": 2288 }, { "clip_ratio": 0.0, - "completion_length": 1430.4754943847656, + "completion_length": 1823.43310546875, "epoch": 0.683742812336644, - "grad_norm": 26.456607818603516, - "kl": 0.31640625, - "learning_rate": 2.747722791863013e-08, - "loss": 0.0862, - "reward": 0.4464285895228386, - "reward_std": 0.20762256160378456, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 27.275354385375977, + "kl": 2.533203125, + "learning_rate": 1.3738613959315066e-07, + "loss": 0.169, + "reward": 0.584821455180645, + "reward_std": 0.2129845693707466, + "rewards/accuracy_reward": 0.16964286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2289 }, { "clip_ratio": 0.0, - "completion_length": 1391.3995971679688, - "epoch": 0.6840415204241654, - "grad_norm": 26.773900985717773, - "kl": 0.3212890625, - "learning_rate": 2.743067957909913e-08, - "loss": 0.05, - "reward": 0.4877232238650322, - "reward_std": 0.1355978585779667, - "rewards/accuracy_reward": 0.14508929220028222, + "completion_length": 1778.1518859863281, + "epoch": 0.6840415204241654, + "grad_norm": 31.560060501098633, + "kl": 3.09375, + "learning_rate": 1.3715339789549564e-07, + "loss": 0.2062, + "reward": 0.5714285895228386, + "reward_std": 0.12910557724535465, + "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2290 }, { "clip_ratio": 0.0, - "completion_length": 1384.7545471191406, + "completion_length": 1761.7679443359375, "epoch": 0.684340228511687, - "grad_norm": 22.37617301940918, - "kl": 0.3330078125, - "learning_rate": 2.738415579277672e-08, - "loss": 0.0672, - "reward": 0.4453125149011612, - "reward_std": 0.18840213678777218, - "rewards/accuracy_reward": 0.11830357555299997, + "grad_norm": 11.656278610229492, + "kl": 4.11328125, + "learning_rate": 1.369207789638836e-07, + "loss": 0.2398, + "reward": 0.526227705180645, + "reward_std": 0.16148805804550648, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2291 }, { "clip_ratio": 0.0, - "completion_length": 1418.0692749023438, + "completion_length": 1857.7032165527344, "epoch": 0.6846389365992084, - "grad_norm": 18.642513275146484, - "kl": 0.3173828125, - "learning_rate": 2.7337656610276233e-08, - "loss": 0.0514, - "reward": 0.3476562574505806, - "reward_std": 0.17895102500915527, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 19.717029571533203, + "kl": 5.1328125, + "learning_rate": 1.3668828305138116e-07, + "loss": 0.2705, + "reward": 0.4469866380095482, + "reward_std": 0.18407762795686722, + "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026828289032, + "rewards/tag_count_reward": 0.3956473469734192, "step": 2292 }, { "clip_ratio": 0.0, - "completion_length": 1411.1384582519531, + "completion_length": 1793.5871276855469, "epoch": 0.6849376446867299, - "grad_norm": 27.92796516418457, - "kl": 0.35400390625, - "learning_rate": 2.729118208218419e-08, - "loss": 0.0801, - "reward": 0.4073660895228386, - "reward_std": 0.17762621492147446, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 11.887115478515625, + "kl": 3.88671875, + "learning_rate": 1.3645591041092094e-07, + "loss": 0.2364, + "reward": 0.4815848544239998, + "reward_std": 0.13165586814284325, + "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3180803805589676, + "rewards/tag_count_reward": 0.3967634066939354, "step": 2293 }, { "clip_ratio": 0.0, - "completion_length": 1458.7723693847656, + "completion_length": 1830.1228332519531, "epoch": 0.6852363527742513, - "grad_norm": 26.994400024414062, - "kl": 0.32666015625, - "learning_rate": 2.7244732259060333e-08, - "loss": 0.0564, - "reward": 0.3264509066939354, - "reward_std": 0.16832182928919792, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 33.36900329589844, + "kl": 2.2734375, + "learning_rate": 1.3622366129530166e-07, + "loss": 0.1358, + "reward": 0.4430803805589676, + "reward_std": 0.13829679787158966, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4207589402794838, "step": 2294 }, { "clip_ratio": 0.0, - "completion_length": 1378.3728332519531, + "completion_length": 1777.3237609863281, "epoch": 0.6855350608617728, - "grad_norm": 24.022968292236328, - "kl": 0.30810546875, - "learning_rate": 2.7198307191437547e-08, - "loss": 0.0633, - "reward": 0.4414062649011612, - "reward_std": 0.19894244521856308, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 19.76296043395996, + "kl": 2.87109375, + "learning_rate": 1.3599153595718773e-07, + "loss": 0.1849, + "reward": 0.513392873108387, + "reward_std": 0.16689422726631165, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2295 }, { "clip_ratio": 0.0, - "completion_length": 1461.3348999023438, + "completion_length": 1800.6250610351562, "epoch": 0.6858337689492943, - "grad_norm": 22.847442626953125, - "kl": 0.32373046875, - "learning_rate": 2.715190692982171e-08, - "loss": 0.0908, - "reward": 0.4620536044239998, - "reward_std": 0.17905309051275253, - "rewards/accuracy_reward": 0.1428571492433548, + "grad_norm": 29.906021118164062, + "kl": 2.75, + "learning_rate": 1.3575953464910855e-07, + "loss": 0.1808, + "reward": 0.5786830633878708, + "reward_std": 0.14733159355819225, + "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2296 }, { "clip_ratio": 0.0, - "completion_length": 1450.9621276855469, + "completion_length": 1847.2679443359375, "epoch": 0.6861324770368158, - "grad_norm": 25.155942916870117, - "kl": 0.27587890625, - "learning_rate": 2.7105531524691777e-08, - "loss": 0.0691, - "reward": 0.4983259215950966, - "reward_std": 0.17267910204827785, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 13.44467830657959, + "kl": 4.1875, + "learning_rate": 1.3552765762345889e-07, + "loss": 0.2318, + "reward": 0.594308078289032, + "reward_std": 0.16371478140354156, + "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.415736623108387, "step": 2297 }, { "clip_ratio": 0.0, - "completion_length": 1407.4911499023438, + "completion_length": 1806.0425109863281, "epoch": 0.6864311851243372, - "grad_norm": 23.470861434936523, - "kl": 0.3193359375, - "learning_rate": 2.7059181026499666e-08, - "loss": 0.0719, - "reward": 0.3465401902794838, - "reward_std": 0.16240552067756653, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 11.341994285583496, + "kl": 4.6328125, + "learning_rate": 1.3529590513249834e-07, + "loss": 0.272, + "reward": 0.4492187798023224, + "reward_std": 0.14527273178100586, + "rewards/accuracy_reward": 0.046875003492459655, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.4023437649011612, "step": 2298 }, { "clip_ratio": 0.0, - "completion_length": 1421.2009582519531, + "completion_length": 1856.8505249023438, "epoch": 0.6867298932118587, - "grad_norm": 26.94002914428711, - "kl": 0.30419921875, - "learning_rate": 2.701285548567014e-08, - "loss": 0.0663, - "reward": 0.4375000223517418, - "reward_std": 0.15361602045595646, - "rewards/accuracy_reward": 0.09151786006987095, + "grad_norm": 14.80528736114502, + "kl": 2.75, + "learning_rate": 1.350642774283507e-07, + "loss": 0.1592, + "reward": 0.5139509215950966, + "reward_std": 0.14822277054190636, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821566939354, + "rewards/tag_count_reward": 0.4179687723517418, "step": 2299 }, { "clip_ratio": 0.0, - "completion_length": 1404.1072082519531, + "completion_length": 1802.4420471191406, "epoch": 0.6870286012993801, - "grad_norm": 24.74565887451172, - "kl": 0.31640625, - "learning_rate": 2.6966554952600883e-08, - "loss": 0.0607, - "reward": 0.3627232238650322, - "reward_std": 0.1825161799788475, - "rewards/accuracy_reward": 0.026785715483129025, + "grad_norm": 14.312652587890625, + "kl": 2.90625, + "learning_rate": 1.3483277476300442e-07, + "loss": 0.165, + "reward": 0.4469866305589676, + "reward_std": 0.15317968651652336, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2300 }, { "clip_ratio": 0.0, - "completion_length": 1397.1361999511719, + "completion_length": 1791.2188110351562, "epoch": 0.6873273093869017, - "grad_norm": 25.243789672851562, - "kl": 0.30712890625, - "learning_rate": 2.692027947766229e-08, - "loss": 0.0918, - "reward": 0.3303571566939354, - "reward_std": 0.16210122406482697, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 17.865530014038086, + "kl": 4.396484375, + "learning_rate": 1.3460139738831144e-07, + "loss": 0.2524, + "reward": 0.4520089402794838, + "reward_std": 0.1305291187018156, + "rewards/accuracy_reward": 0.03571428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310267873108387, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2301 }, { "clip_ratio": 0.0, - "completion_length": 1431.8371276855469, + "completion_length": 1826.2322387695312, "epoch": 0.6876260174744231, - "grad_norm": 21.1333065032959, - "kl": 0.28662109375, - "learning_rate": 2.6874029111197578e-08, - "loss": 0.052, - "reward": 0.4174107313156128, - "reward_std": 0.1859288588166237, - "rewards/accuracy_reward": 0.09375000419095159, + "grad_norm": 7.019198417663574, + "kl": 3.2734375, + "learning_rate": 1.3437014555598788e-07, + "loss": 0.1863, + "reward": 0.5039062723517418, + "reward_std": 0.14002435468137264, + "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2302 }, { "clip_ratio": 0.0, - "completion_length": 1412.1518249511719, + "completion_length": 1783.5268859863281, "epoch": 0.6879247255619446, - "grad_norm": 23.68489646911621, - "kl": 0.287109375, - "learning_rate": 2.6827803903522617e-08, - "loss": 0.0623, - "reward": 0.3671875149011612, - "reward_std": 0.17002400383353233, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 481.5315246582031, + "kl": 7.37890625, + "learning_rate": 1.341390195176131e-07, + "loss": 0.3723, + "reward": 0.4782366305589676, + "reward_std": 0.14395453035831451, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4246651902794838, "step": 2303 }, { "clip_ratio": 0.0, - "completion_length": 1376.2590026855469, + "completion_length": 1833.6875915527344, "epoch": 0.688223433649466, - "grad_norm": 21.846982955932617, - "kl": 0.28515625, - "learning_rate": 2.6781603904925876e-08, - "loss": 0.0637, - "reward": 0.3805803805589676, - "reward_std": 0.15363709814846516, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 10.538017272949219, + "kl": 3.578125, + "learning_rate": 1.339080195246294e-07, + "loss": 0.211, + "reward": 0.4614955559372902, + "reward_std": 0.13816064968705177, + "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4079241156578064, "step": 2304 }, { "clip_ratio": 0.0, - "completion_length": 1480.9219360351562, + "completion_length": 1841.6540832519531, "epoch": 0.6885221417369876, - "grad_norm": 29.329614639282227, - "kl": 0.31005859375, - "learning_rate": 2.673542916566844e-08, - "loss": 0.059, - "reward": 0.4771205484867096, - "reward_std": 0.21232419833540916, - "rewards/accuracy_reward": 0.14285715389996767, + "grad_norm": 4.950185775756836, + "kl": 3.4921875, + "learning_rate": 1.336771458283422e-07, + "loss": 0.1959, + "reward": 0.5585937723517418, + "reward_std": 0.1743004210293293, + "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2305 }, { "clip_ratio": 0.0, - "completion_length": 1384.6831359863281, + "completion_length": 1802.4197387695312, "epoch": 0.688820849824509, - "grad_norm": 24.99567413330078, - "kl": 0.3251953125, - "learning_rate": 2.6689279735983915e-08, - "loss": 0.0766, - "reward": 0.444754496216774, - "reward_std": 0.14582355692982674, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 13.231029510498047, + "kl": 4.8359375, + "learning_rate": 1.3344639867991957e-07, + "loss": 0.2769, + "reward": 0.5234375149011612, + "reward_std": 0.13038799911737442, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4073660969734192, "step": 2306 }, { "clip_ratio": 0.0, - "completion_length": 1467.4777221679688, + "completion_length": 1861.2120971679688, "epoch": 0.6891195579120305, - "grad_norm": 23.8519229888916, - "kl": 0.333984375, - "learning_rate": 2.664315566607832e-08, - "loss": 0.0716, - "reward": 0.3437500149011612, - "reward_std": 0.20085272192955017, - "rewards/accuracy_reward": 0.03571428754366934, + "grad_norm": 6.581903457641602, + "kl": 3.7265625, + "learning_rate": 1.3321577833039162e-07, + "loss": 0.1871, + "reward": 0.443080373108387, + "reward_std": 0.14962735958397388, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.4073660895228386, "step": 2307 }, { "clip_ratio": 0.0, - "completion_length": 1391.80810546875, + "completion_length": 1757.2009887695312, "epoch": 0.6894182659995519, - "grad_norm": 27.77754783630371, - "kl": 0.32666015625, - "learning_rate": 2.6597057006130148e-08, - "loss": 0.0616, - "reward": 0.3738839477300644, - "reward_std": 0.17621182650327682, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 21.00966453552246, + "kl": 2.244140625, + "learning_rate": 1.3298528503065073e-07, + "loss": 0.1488, + "reward": 0.483816996216774, + "reward_std": 0.17069623619318008, + "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4168526902794838, "step": 2308 }, { "clip_ratio": 0.0, - "completion_length": 1418.513427734375, + "completion_length": 1835.5558471679688, "epoch": 0.6897169740870734, - "grad_norm": 22.56853485107422, - "kl": 0.29638671875, - "learning_rate": 2.6550983806290234e-08, - "loss": 0.0632, - "reward": 0.3660714477300644, - "reward_std": 0.1717811767011881, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 18.678773880004883, + "kl": 2.80078125, + "learning_rate": 1.3275491903145118e-07, + "loss": 0.1761, + "reward": 0.4793526977300644, + "reward_std": 0.17067987471818924, + "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.407924123108387, "step": 2309 }, { "clip_ratio": 0.0, - "completion_length": 1416.9130249023438, + "completion_length": 1798.5782165527344, "epoch": 0.6900156821745949, - "grad_norm": 27.73126220703125, - "kl": 0.259033203125, - "learning_rate": 2.650493611668167e-08, - "loss": 0.0696, - "reward": 0.4174107387661934, - "reward_std": 0.17660359665751457, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 28.260032653808594, + "kl": 2.494140625, + "learning_rate": 1.3252468058340835e-07, + "loss": 0.1622, + "reward": 0.4849330633878708, + "reward_std": 0.14634467475116253, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3504464402794838, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2310 }, { "clip_ratio": 0.0, - "completion_length": 1432.7835693359375, + "completion_length": 1818.6406860351562, "epoch": 0.6903143902621164, - "grad_norm": 21.588407516479492, - "kl": 0.341552734375, - "learning_rate": 2.6458913987399844e-08, - "loss": 0.0612, - "reward": 0.385044664144516, - "reward_std": 0.1933174878358841, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 7.030909061431885, + "kl": 3.251953125, + "learning_rate": 1.3229456993699923e-07, + "loss": 0.1873, + "reward": 0.4799107313156128, + "reward_std": 0.17785080149769783, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.424107164144516, "step": 2311 }, { "clip_ratio": 0.0, - "completion_length": 1423.2255249023438, + "completion_length": 1782.4531860351562, "epoch": 0.6906130983496378, - "grad_norm": 22.838939666748047, - "kl": 0.2958984375, - "learning_rate": 2.6412917468512352e-08, - "loss": 0.0725, - "reward": 0.387834832072258, - "reward_std": 0.19357070699334145, - "rewards/accuracy_reward": 0.06919642956927419, + "grad_norm": 28.04505157470703, + "kl": 5.3203125, + "learning_rate": 1.3206458734256177e-07, + "loss": 0.2907, + "reward": 0.5022321566939354, + "reward_std": 0.1640472076833248, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186383992433548, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2312 }, { "clip_ratio": 0.0, - "completion_length": 1458.0045166015625, + "completion_length": 1790.5134582519531, "epoch": 0.6909118064371593, - "grad_norm": 23.284225463867188, - "kl": 0.28662109375, - "learning_rate": 2.636694661005885e-08, - "loss": 0.0686, - "reward": 0.4375000149011612, - "reward_std": 0.18165702745318413, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 29.145143508911133, + "kl": 4.6796875, + "learning_rate": 1.3183473305029425e-07, + "loss": 0.2585, + "reward": 0.5658482313156128, + "reward_std": 0.17875580117106438, + "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.427455373108387, "step": 2313 }, { "clip_ratio": 0.0, - "completion_length": 1363.9554138183594, + "completion_length": 1778.4487609863281, "epoch": 0.6912105145246807, - "grad_norm": 23.375680923461914, - "kl": 0.283935546875, - "learning_rate": 2.6321001462051164e-08, - "loss": 0.0605, - "reward": 0.3627232387661934, - "reward_std": 0.16582268476486206, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 16.276432037353516, + "kl": 4.5390625, + "learning_rate": 1.316050073102558e-07, + "loss": 0.2463, + "reward": 0.446986623108387, + "reward_std": 0.14768668450415134, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.420200914144516, "step": 2314 }, { "clip_ratio": 0.0, - "completion_length": 1380.2656860351562, + "completion_length": 1798.8438110351562, "epoch": 0.6915092226122023, - "grad_norm": 19.888492584228516, - "kl": 0.30224609375, - "learning_rate": 2.6275082074473077e-08, - "loss": 0.065, - "reward": 0.4486607387661934, - "reward_std": 0.18961424753069878, - "rewards/accuracy_reward": 0.14062500558793545, + "grad_norm": 21.490264892578125, + "kl": 3.7265625, + "learning_rate": 1.3137541037236537e-07, + "loss": 0.2121, + "reward": 0.5725446790456772, + "reward_std": 0.18477138318121433, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357238650322, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2315 }, { "clip_ratio": 0.0, - "completion_length": 1403.1920471191406, + "completion_length": 1774.8683776855469, "epoch": 0.6918079306997237, - "grad_norm": 30.432315826416016, - "kl": 0.3291015625, - "learning_rate": 2.6229188497280407e-08, - "loss": 0.0836, - "reward": 0.3934151977300644, - "reward_std": 0.1897834725677967, - "rewards/accuracy_reward": 0.06696429033763707, + "grad_norm": 26.730506896972656, + "kl": 2.568359375, + "learning_rate": 1.3114594248640203e-07, + "loss": 0.1421, + "reward": 0.4815848395228386, + "reward_std": 0.13136619143188, + "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4190848469734192, "step": 2316 }, { "clip_ratio": 0.0, - "completion_length": 1377.7277221679688, + "completion_length": 1815.0693054199219, "epoch": 0.6921066387872452, - "grad_norm": 23.836376190185547, - "kl": 0.275146484375, - "learning_rate": 2.6183320780400875e-08, - "loss": 0.0415, - "reward": 0.3917410969734192, - "reward_std": 0.18133998289704323, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 12.669639587402344, + "kl": 3.296875, + "learning_rate": 1.3091660390200438e-07, + "loss": 0.1937, + "reward": 0.4715401977300644, + "reward_std": 0.15540923923254013, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2317 }, { "clip_ratio": 0.0, - "completion_length": 1354.0848693847656, + "completion_length": 1768.4933776855469, "epoch": 0.6924053468747666, - "grad_norm": 21.24176025390625, - "kl": 0.2724609375, - "learning_rate": 2.613747897373403e-08, - "loss": 0.055, - "reward": 0.4369419813156128, - "reward_std": 0.19334747269749641, - "rewards/accuracy_reward": 0.09375000465661287, + "grad_norm": 18.05756950378418, + "kl": 3.4921875, + "learning_rate": 1.3068739486867015e-07, + "loss": 0.2105, + "reward": 0.5139509066939354, + "reward_std": 0.17640399374067783, + "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2318 }, { "clip_ratio": 0.0, - "completion_length": 1450.7054443359375, + "completion_length": 1861.7857971191406, "epoch": 0.6927040549622882, - "grad_norm": 19.603240966796875, - "kl": 0.310546875, - "learning_rate": 2.6091663127151287e-08, - "loss": 0.0471, - "reward": 0.3359375149011612, - "reward_std": 0.20318053662776947, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 10.468100547790527, + "kl": 3.953125, + "learning_rate": 1.3045831563575643e-07, + "loss": 0.2219, + "reward": 0.4481026902794838, + "reward_std": 0.1887516789138317, + "rewards/accuracy_reward": 0.046875001629814506, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.4012276977300644, "step": 2319 }, { "clip_ratio": 0.0, - "completion_length": 1381.9442138671875, + "completion_length": 1775.1741943359375, "epoch": 0.6930027630498096, - "grad_norm": 17.277013778686523, - "kl": 0.2578125, - "learning_rate": 2.6045873290495805e-08, - "loss": 0.0323, - "reward": 0.4135044887661934, - "reward_std": 0.18076413124799728, - "rewards/accuracy_reward": 0.06919643236324191, + "grad_norm": 8.991233825683594, + "kl": 3.546875, + "learning_rate": 1.3022936645247904e-07, + "loss": 0.1983, + "reward": 0.4933035969734192, + "reward_std": 0.15109683386981487, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080484867096, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2320 }, { "clip_ratio": 0.0, - "completion_length": 1430.5558471679688, + "completion_length": 1898.3817749023438, "epoch": 0.6933014711373311, - "grad_norm": 24.578222274780273, - "kl": 0.33349609375, - "learning_rate": 2.6000109513582413e-08, - "loss": 0.0623, - "reward": 0.321986623108387, - "reward_std": 0.1621534563601017, + "grad_norm": 13.724143028259277, + "kl": 4.4765625, + "learning_rate": 1.3000054756791206e-07, + "loss": 0.2284, + "reward": 0.3917410895228386, + "reward_std": 0.12733750976622105, "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937649011612, + "rewards/tag_count_reward": 0.3783482313156128, "step": 2321 }, { "clip_ratio": 0.0, - "completion_length": 1390.3505249023438, + "completion_length": 1755.1853637695312, "epoch": 0.6936001792248525, - "grad_norm": 32.21237564086914, - "kl": 0.276611328125, - "learning_rate": 2.5954371846197625e-08, - "loss": 0.0926, - "reward": 0.4257812723517418, - "reward_std": 0.18469398468732834, - "rewards/accuracy_reward": 0.0669642873108387, + "grad_norm": 62.894630432128906, + "kl": 1.0888671875, + "learning_rate": 1.2977185923098813e-07, + "loss": 0.1181, + "reward": 0.513950914144516, + "reward_std": 0.15728253684937954, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169813156128, + "rewards/tag_count_reward": 0.4425223395228386, "step": 2322 }, { "clip_ratio": 0.0, - "completion_length": 1367.6340026855469, + "completion_length": 1800.9666137695312, "epoch": 0.693898887312374, - "grad_norm": 19.463382720947266, - "kl": 0.3193359375, - "learning_rate": 2.5908660338099565e-08, - "loss": 0.0609, - "reward": 0.4732143059372902, - "reward_std": 0.21077986434102058, - "rewards/accuracy_reward": 0.14508929289877415, + "grad_norm": 70.9227066040039, + "kl": 2.111328125, + "learning_rate": 1.295433016904978e-07, + "loss": 0.1389, + "reward": 0.5703125298023224, + "reward_std": 0.19661297276616096, + "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.416294664144516, "step": 2323 }, { "clip_ratio": 0.0, - "completion_length": 1357.6406860351562, + "completion_length": 1807.2568054199219, "epoch": 0.6941975953998955, - "grad_norm": 19.252201080322266, - "kl": 0.30908203125, - "learning_rate": 2.586297503901783e-08, - "loss": 0.0558, - "reward": 0.422991082072258, - "reward_std": 0.17209182679653168, - "rewards/accuracy_reward": 0.09151785867288709, + "grad_norm": 102.58361053466797, + "kl": 1.33984375, + "learning_rate": 1.2931487519508914e-07, + "loss": 0.1137, + "reward": 0.5290178880095482, + "reward_std": 0.15740134939551353, + "rewards/accuracy_reward": 0.10267857951112092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4263393133878708, "step": 2324 }, { "clip_ratio": 0.0, - "completion_length": 1384.4844055175781, + "completion_length": 1830.3572387695312, "epoch": 0.694496303487417, - "grad_norm": 24.727617263793945, - "kl": 0.3271484375, - "learning_rate": 2.581731599865358e-08, - "loss": 0.045, - "reward": 0.3409598395228386, - "reward_std": 0.15947521105408669, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 82.88196563720703, + "kl": 1.4921875, + "learning_rate": 1.290865799932679e-07, + "loss": 0.1174, + "reward": 0.4375000223517418, + "reward_std": 0.11289165914058685, + "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2325 }, { "clip_ratio": 0.0, - "completion_length": 1371.7500610351562, + "completion_length": 1713.27685546875, "epoch": 0.6947950115749384, - "grad_norm": 25.054006576538086, - "kl": 0.24609375, - "learning_rate": 2.5771683266679372e-08, - "loss": 0.0609, - "reward": 0.3950892984867096, - "reward_std": 0.18603291362524033, + "grad_norm": 53.8298225402832, + "kl": 1.361328125, + "learning_rate": 1.2885841633339685e-07, + "loss": 0.1173, + "reward": 0.4743303656578064, + "reward_std": 0.13796906918287277, "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357142873108387, + "rewards/tag_count_reward": 0.436383955180645, "step": 2326 }, { "clip_ratio": 0.0, - "completion_length": 1460.5090026855469, + "completion_length": 1865.8036804199219, "epoch": 0.6950937196624598, - "grad_norm": 23.590669631958008, - "kl": 0.27392578125, - "learning_rate": 2.5726076892739122e-08, - "loss": 0.0604, - "reward": 0.432477705180645, - "reward_std": 0.21837865933775902, - "rewards/accuracy_reward": 0.10267857764847577, + "grad_norm": 23.767873764038086, + "kl": 2.875, + "learning_rate": 1.286303844636956e-07, + "loss": 0.1706, + "reward": 0.5474330708384514, + "reward_std": 0.17863717675209045, + "rewards/accuracy_reward": 0.12723214854486287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991305589676, + "rewards/tag_count_reward": 0.4202009215950966, "step": 2327 }, { "clip_ratio": 0.0, - "completion_length": 1439.0201721191406, + "completion_length": 1854.2634887695312, "epoch": 0.6953924277499813, - "grad_norm": 23.036455154418945, - "kl": 0.30126953125, - "learning_rate": 2.5680496926448137e-08, - "loss": 0.0421, - "reward": 0.369419664144516, - "reward_std": 0.15909731201827526, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 10.475944519042969, + "kl": 4.3671875, + "learning_rate": 1.284024846322407e-07, + "loss": 0.2326, + "reward": 0.4866071715950966, + "reward_std": 0.17675423994660378, + "rewards/accuracy_reward": 0.08482143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4017857313156128, "step": 2328 }, { "clip_ratio": 0.0, - "completion_length": 1405.8482666015625, + "completion_length": 1768.71435546875, "epoch": 0.6956911358375027, - "grad_norm": 25.368305206298828, - "kl": 0.260009765625, - "learning_rate": 2.5634943417392908e-08, - "loss": 0.0585, - "reward": 0.435825914144516, - "reward_std": 0.17196445912122726, - "rewards/accuracy_reward": 0.0892857201397419, + "grad_norm": 8.587560653686523, + "kl": 3.3125, + "learning_rate": 1.2817471708696453e-07, + "loss": 0.2055, + "reward": 0.5161830484867096, + "reward_std": 0.1239225510507822, + "rewards/accuracy_reward": 0.08035714528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401977300644, + "rewards/tag_count_reward": 0.4358259066939354, "step": 2329 }, { "clip_ratio": 0.0, - "completion_length": 1350.2701416015625, + "completion_length": 1771.8616943359375, "epoch": 0.6959898439250243, - "grad_norm": 23.25001335144043, - "kl": 0.26416015625, - "learning_rate": 2.558941641513121e-08, - "loss": 0.0726, - "reward": 0.428013414144516, - "reward_std": 0.1641475185751915, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 22.29239845275879, + "kl": 4.25390625, + "learning_rate": 1.2794708207565604e-07, + "loss": 0.2302, + "reward": 0.525669664144516, + "reward_std": 0.1373024843633175, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2330 }, { "clip_ratio": 0.0, - "completion_length": 1399.4554138183594, + "completion_length": 1827.7657165527344, "epoch": 0.6962885520125457, - "grad_norm": 19.54181671142578, - "kl": 0.2958984375, - "learning_rate": 2.5543915969191976e-08, - "loss": 0.0285, - "reward": 0.3599330559372902, - "reward_std": 0.20088354498147964, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 42.8067626953125, + "kl": 5.1796875, + "learning_rate": 1.2771957984595988e-07, + "loss": 0.2681, + "reward": 0.4687500149011612, + "reward_std": 0.1681443639099598, + "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.415178582072258, "step": 2331 }, { "clip_ratio": 0.0, - "completion_length": 1472.1697082519531, + "completion_length": 1825.7188415527344, "epoch": 0.6965872601000672, - "grad_norm": 24.52573013305664, - "kl": 0.265625, - "learning_rate": 2.549844212907519e-08, - "loss": 0.0697, - "reward": 0.4425223395228386, - "reward_std": 0.19157304242253304, - "rewards/accuracy_reward": 0.10714285937137902, + "grad_norm": 6.760704517364502, + "kl": 3.619140625, + "learning_rate": 1.2749221064537596e-07, + "loss": 0.2056, + "reward": 0.545758955180645, + "reward_std": 0.16525132581591606, + "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2332 }, { "clip_ratio": 0.0, - "completion_length": 1377.6094360351562, + "completion_length": 1772.9866638183594, "epoch": 0.6968859681875886, - "grad_norm": 22.684070587158203, - "kl": 0.2900390625, - "learning_rate": 2.545299494425196e-08, - "loss": 0.0617, - "reward": 0.3337053805589676, - "reward_std": 0.16018924489617348, + "grad_norm": 9.520299911499023, + "kl": 3.8125, + "learning_rate": 1.272649747212598e-07, + "loss": 0.2141, + "reward": 0.4257812723517418, + "reward_std": 0.12550578080117702, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4168526902794838, "step": 2333 }, { "clip_ratio": 0.0, - "completion_length": 1360.5312805175781, + "completion_length": 1789.69873046875, "epoch": 0.6971846762751102, - "grad_norm": 18.736183166503906, - "kl": 0.268798828125, - "learning_rate": 2.540757446416439e-08, - "loss": 0.0626, - "reward": 0.4224330633878708, - "reward_std": 0.23876908794045448, - "rewards/accuracy_reward": 0.06919643329456449, + "grad_norm": 8.589119911193848, + "kl": 2.61328125, + "learning_rate": 1.2703787232082195e-07, + "loss": 0.145, + "reward": 0.5005580559372902, + "reward_std": 0.20239262282848358, + "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3532366156578064, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2334 }, { "clip_ratio": 0.0, - "completion_length": 1404.2299499511719, + "completion_length": 1840.8281860351562, "epoch": 0.6974833843626316, - "grad_norm": 23.011856079101562, - "kl": 0.29052734375, - "learning_rate": 2.5362180738225457e-08, - "loss": 0.0528, - "reward": 0.4017857387661934, - "reward_std": 0.19347967952489853, - "rewards/accuracy_reward": 0.08035714528523386, + "grad_norm": 24.76255226135254, + "kl": 2.8203125, + "learning_rate": 1.268109036911273e-07, + "loss": 0.1586, + "reward": 0.4966518133878708, + "reward_std": 0.1398679930716753, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2335 }, { "clip_ratio": 0.0, - "completion_length": 1429.2165832519531, + "completion_length": 1840.8996276855469, "epoch": 0.6977820924501531, - "grad_norm": 20.497346878051758, - "kl": 0.263671875, - "learning_rate": 2.5316813815819126e-08, - "loss": 0.0375, - "reward": 0.4363839402794838, - "reward_std": 0.19915423542261124, - "rewards/accuracy_reward": 0.10491072130389512, + "grad_norm": 7.063710689544678, + "kl": 3.484375, + "learning_rate": 1.2658406907909562e-07, + "loss": 0.1791, + "reward": 0.5457589402794838, + "reward_std": 0.20056127384305, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2336 }, { "clip_ratio": 0.0, - "completion_length": 1367.2187805175781, + "completion_length": 1778.9375610351562, "epoch": 0.6980808005376745, - "grad_norm": 25.83759307861328, - "kl": 0.30419921875, - "learning_rate": 2.5271473746300166e-08, - "loss": 0.0633, - "reward": 0.3286830484867096, - "reward_std": 0.17458219081163406, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 14.450241088867188, + "kl": 2.77734375, + "learning_rate": 1.263573687315008e-07, + "loss": 0.171, + "reward": 0.4732143133878708, + "reward_std": 0.171335494145751, + "rewards/accuracy_reward": 0.05133928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2337 }, { "clip_ratio": 0.0, - "completion_length": 1427.35498046875, + "completion_length": 1853.5469665527344, "epoch": 0.698379508625196, - "grad_norm": 20.564208984375, - "kl": 0.303466796875, - "learning_rate": 2.5226160578994094e-08, - "loss": 0.0547, - "reward": 0.372209832072258, - "reward_std": 0.1658390834927559, + "grad_norm": 9.449134826660156, + "kl": 4.2890625, + "learning_rate": 1.2613080289497046e-07, + "loss": 0.2429, + "reward": 0.4760044813156128, + "reward_std": 0.13882718980312347, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.4068080484867096, "step": 2338 }, { "clip_ratio": 0.0, - "completion_length": 1377.9397888183594, + "completion_length": 1795.0447387695312, "epoch": 0.6986782167127175, - "grad_norm": 23.905710220336914, - "kl": 0.28466796875, - "learning_rate": 2.5180874363197214e-08, - "loss": 0.073, - "reward": 0.3772321566939354, - "reward_std": 0.15711326897144318, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 21.11614990234375, + "kl": 3.19921875, + "learning_rate": 1.2590437181598608e-07, + "loss": 0.2091, + "reward": 0.4737723469734192, + "reward_std": 0.11964461021125317, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2339 }, { "clip_ratio": 0.0, - "completion_length": 1389.9509582519531, + "completion_length": 1799.0269165039062, "epoch": 0.698976924800239, - "grad_norm": 24.97711944580078, - "kl": 0.29296875, - "learning_rate": 2.5135615148176503e-08, - "loss": 0.0813, - "reward": 0.3604910895228386, - "reward_std": 0.16769619658589363, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 12.637781143188477, + "kl": 4.44140625, + "learning_rate": 1.2567807574088252e-07, + "loss": 0.2518, + "reward": 0.4849330633878708, + "reward_std": 0.15998396836221218, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2340 }, { "clip_ratio": 0.0, - "completion_length": 1401.5982971191406, + "completion_length": 1773.1808471679688, "epoch": 0.6992756328877604, - "grad_norm": 24.35291862487793, - "kl": 0.31298828125, - "learning_rate": 2.5090382983169533e-08, - "loss": 0.0715, - "reward": 0.3950892984867096, - "reward_std": 0.1562977470457554, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 9.389719009399414, + "kl": 4.046875, + "learning_rate": 1.2545191491584766e-07, + "loss": 0.252, + "reward": 0.533482164144516, + "reward_std": 0.15666637755930424, + "rewards/accuracy_reward": 0.10937500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.424107164144516, "step": 2341 }, { "clip_ratio": 0.0, - "completion_length": 1374.7545166015625, + "completion_length": 1781.618408203125, "epoch": 0.6995743409752819, - "grad_norm": 25.897764205932617, - "kl": 0.2861328125, - "learning_rate": 2.5045177917384487e-08, - "loss": 0.0769, - "reward": 0.3348214402794838, - "reward_std": 0.16618949174880981, - "rewards/accuracy_reward": 0.0066964291036129, + "grad_norm": 7.314539432525635, + "kl": 4.3671875, + "learning_rate": 1.2522588958692242e-07, + "loss": 0.2524, + "reward": 0.4402901977300644, + "reward_std": 0.14623912423849106, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2342 }, { "clip_ratio": 0.0, - "completion_length": 1427.5625610351562, + "completion_length": 1811.1228637695312, "epoch": 0.6998730490628033, - "grad_norm": 22.827646255493164, - "kl": 0.263671875, - "learning_rate": 2.500000000000001e-08, - "loss": 0.0565, - "reward": 0.4614955633878708, - "reward_std": 0.22066709026694298, - "rewards/accuracy_reward": 0.12276786286383867, + "grad_norm": 11.186851501464844, + "kl": 3.24609375, + "learning_rate": 1.2500000000000005e-07, + "loss": 0.1967, + "reward": 0.5636160895228386, + "reward_std": 0.18568257987499237, + "rewards/accuracy_reward": 0.14062501024454832, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2343 }, { "clip_ratio": 0.0, - "completion_length": 1435.1317443847656, + "completion_length": 1783.1116638183594, "epoch": 0.7001717571503249, - "grad_norm": 24.158687591552734, - "kl": 0.266845703125, - "learning_rate": 2.495484928016527e-08, - "loss": 0.0893, - "reward": 0.4162946715950966, - "reward_std": 0.1814393289387226, - "rewards/accuracy_reward": 0.08482143096625805, + "grad_norm": 6.269689083099365, + "kl": 3.109375, + "learning_rate": 1.2477424640082634e-07, + "loss": 0.1864, + "reward": 0.5385044887661934, + "reward_std": 0.13482287526130676, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4358259215950966, "step": 2344 }, { "clip_ratio": 0.0, - "completion_length": 1410.7746276855469, + "completion_length": 1822.4822387695312, "epoch": 0.7004704652378463, - "grad_norm": 20.324682235717773, - "kl": 0.266845703125, - "learning_rate": 2.4909725806999842e-08, - "loss": 0.0431, - "reward": 0.4732143208384514, - "reward_std": 0.22953487187623978, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 6.870690822601318, + "kl": 3.74609375, + "learning_rate": 1.245486290349992e-07, + "loss": 0.2093, + "reward": 0.525111623108387, + "reward_std": 0.170251727104187, + "rewards/accuracy_reward": 0.11383929150179029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2345 }, { "clip_ratio": 0.0, - "completion_length": 1395.7545471191406, + "completion_length": 1788.5357971191406, "epoch": 0.7007691733253678, - "grad_norm": 23.249967575073242, - "kl": 0.259521484375, - "learning_rate": 2.4864629629593613e-08, - "loss": 0.0472, - "reward": 0.3928571566939354, - "reward_std": 0.1564093828201294, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 38.80241012573242, + "kl": 2.546875, + "learning_rate": 1.2432314814796808e-07, + "loss": 0.1907, + "reward": 0.4804687723517418, + "reward_std": 0.12757463939487934, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.4224330633878708, "step": 2346 }, { "clip_ratio": 0.0, - "completion_length": 1443.4866638183594, + "completion_length": 1858.212158203125, "epoch": 0.7010678814128892, - "grad_norm": 23.719383239746094, - "kl": 0.3447265625, - "learning_rate": 2.481956079700681e-08, - "loss": 0.0649, - "reward": 0.333147332072258, - "reward_std": 0.1907803751528263, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 9.211620330810547, + "kl": 3.40625, + "learning_rate": 1.2409780398503405e-07, + "loss": 0.2041, + "reward": 0.4570312723517418, + "reward_std": 0.1657471749931574, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.4168526902794838, "step": 2347 }, { "clip_ratio": 0.0, - "completion_length": 1439.6741943359375, + "completion_length": 1850.9911804199219, "epoch": 0.7013665895004108, - "grad_norm": 23.567033767700195, - "kl": 0.36181640625, - "learning_rate": 2.477451935826993e-08, - "loss": 0.068, - "reward": 0.3699776977300644, - "reward_std": 0.16485638171434402, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 13.792752265930176, + "kl": 3.6015625, + "learning_rate": 1.2387259679134964e-07, + "loss": 0.2128, + "reward": 0.4849330559372902, + "reward_std": 0.1466073002666235, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419738650322, + "rewards/tag_count_reward": 0.415736623108387, "step": 2348 }, { "clip_ratio": 0.0, - "completion_length": 1368.97998046875, + "completion_length": 1745.2746276855469, "epoch": 0.7016652975879322, - "grad_norm": 25.643295288085938, - "kl": 0.31689453125, - "learning_rate": 2.4729505362383612e-08, - "loss": 0.0797, - "reward": 0.3950892984867096, - "reward_std": 0.1820814162492752, - "rewards/accuracy_reward": 0.06250000116415322, + "grad_norm": 21.6173038482666, + "kl": 4.50390625, + "learning_rate": 1.2364752681191806e-07, + "loss": 0.2609, + "reward": 0.4977678880095482, + "reward_std": 0.1821570172905922, + "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.332589291036129, + "rewards/tag_count_reward": 0.4196428805589676, "step": 2349 }, { "clip_ratio": 0.0, - "completion_length": 1403.6741638183594, + "completion_length": 1843.9018859863281, "epoch": 0.7019640056754537, - "grad_norm": 25.59372901916504, - "kl": 0.31640625, - "learning_rate": 2.4684518858318683e-08, - "loss": 0.0771, - "reward": 0.4391741305589676, - "reward_std": 0.19174769893288612, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 40.01692199707031, + "kl": 5.53125, + "learning_rate": 1.234225942915934e-07, + "loss": 0.2985, + "reward": 0.4899553656578064, + "reward_std": 0.13741761445999146, + "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4073660895228386, "step": 2350 }, { "clip_ratio": 0.0, - "completion_length": 1424.8103332519531, + "completion_length": 1818.7590026855469, "epoch": 0.7022627137629751, - "grad_norm": 22.680200576782227, - "kl": 0.265380859375, - "learning_rate": 2.4639559895016066e-08, - "loss": 0.0673, - "reward": 0.3577009066939354, - "reward_std": 0.19708474352955818, + "grad_norm": 12.28449821472168, + "kl": 3.76953125, + "learning_rate": 1.2319779947508032e-07, + "loss": 0.1921, + "reward": 0.4375000223517418, + "reward_std": 0.14590265788137913, "rewards/accuracy_reward": 0.020089286845177412, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4174107313156128, "step": 2351 }, { "clip_ratio": 0.0, - "completion_length": 1344.80810546875, + "completion_length": 1716.74560546875, "epoch": 0.7025614218504966, - "grad_norm": 25.16532325744629, - "kl": 0.2685546875, - "learning_rate": 2.459462852138668e-08, - "loss": 0.0865, - "reward": 0.5636160969734192, - "reward_std": 0.20583615452051163, - "rewards/accuracy_reward": 0.20089287124574184, + "grad_norm": 14.940701484680176, + "kl": 3.046875, + "learning_rate": 1.229731426069334e-07, + "loss": 0.1876, + "reward": 0.635044664144516, + "reward_std": 0.20123385079205036, + "rewards/accuracy_reward": 0.2075893022119999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3627232313156128, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2352 }, { "clip_ratio": 0.0, - "completion_length": 1370.6406860351562, + "completion_length": 1765.2031860351562, "epoch": 0.702860129938018, - "grad_norm": 21.478084564208984, - "kl": 0.29150390625, - "learning_rate": 2.4549724786311487e-08, - "loss": 0.0591, - "reward": 0.4425223395228386, - "reward_std": 0.2269110418856144, - "rewards/accuracy_reward": 0.10714286239817739, + "grad_norm": 31.647991180419922, + "kl": 2.45703125, + "learning_rate": 1.2274862393155743e-07, + "loss": 0.1607, + "reward": 0.5463169813156128, + "reward_std": 0.19926150515675545, + "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.423549123108387, "step": 2353 }, { "clip_ratio": 0.0, - "completion_length": 1389.497802734375, + "completion_length": 1788.8617248535156, "epoch": 0.7031588380255396, - "grad_norm": 23.49627685546875, - "kl": 0.2978515625, - "learning_rate": 2.450484873864131e-08, - "loss": 0.0645, - "reward": 0.353236623108387, - "reward_std": 0.18055781163275242, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 16.964784622192383, + "kl": 3.078125, + "learning_rate": 1.2252424369320655e-07, + "loss": 0.1724, + "reward": 0.439732164144516, + "reward_std": 0.14505527541041374, + "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4151785969734192, "step": 2354 }, { "clip_ratio": 0.0, - "completion_length": 1369.1719360351562, + "completion_length": 1790.1898193359375, "epoch": 0.703457546113061, - "grad_norm": 19.345478057861328, - "kl": 0.291015625, - "learning_rate": 2.4460000427196913e-08, - "loss": 0.0655, - "reward": 0.398995541036129, - "reward_std": 0.18704243749380112, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 28.85265350341797, + "kl": 2.701171875, + "learning_rate": 1.2230000213598456e-07, + "loss": 0.1771, + "reward": 0.489955373108387, + "reward_std": 0.1441718377172947, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2355 }, { "clip_ratio": 0.0, - "completion_length": 1402.8929138183594, + "completion_length": 1792.4398193359375, "epoch": 0.7037562542005825, - "grad_norm": 22.519763946533203, - "kl": 0.34912109375, - "learning_rate": 2.4415179900768878e-08, - "loss": 0.0858, - "reward": 0.3297991156578064, - "reward_std": 0.15692423656582832, - "rewards/accuracy_reward": 0.020089285913854837, + "grad_norm": 15.449318885803223, + "kl": 4.00390625, + "learning_rate": 1.2207589950384438e-07, + "loss": 0.2407, + "reward": 0.4347098469734192, + "reward_std": 0.14693696051836014, + "rewards/accuracy_reward": 0.040178574388846755, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.3945312723517418, "step": 2356 }, { "clip_ratio": 0.0, - "completion_length": 1381.5000915527344, + "completion_length": 1771.0603332519531, "epoch": 0.7040549622881039, - "grad_norm": 24.67153549194336, - "kl": 0.299072265625, - "learning_rate": 2.4370387208117514e-08, - "loss": 0.0542, - "reward": 0.491071455180645, - "reward_std": 0.18956921994686127, - "rewards/accuracy_reward": 0.1361607201397419, + "grad_norm": 9.857447624206543, + "kl": 3.6171875, + "learning_rate": 1.2185193604058757e-07, + "loss": 0.2159, + "reward": 0.5725446715950966, + "reward_std": 0.18209994956851006, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2357 }, { "clip_ratio": 0.0, - "completion_length": 1379.2098693847656, + "completion_length": 1773.6072082519531, "epoch": 0.7043536703756255, - "grad_norm": 21.630271911621094, - "kl": 0.35107421875, - "learning_rate": 2.4325622397972896e-08, - "loss": 0.0527, - "reward": 0.4347098395228386, - "reward_std": 0.1548932120203972, - "rewards/accuracy_reward": 0.12723215157166123, + "grad_norm": 27.088909149169922, + "kl": 5.984375, + "learning_rate": 1.2162811198986447e-07, + "loss": 0.3171, + "reward": 0.5424107387661934, + "reward_std": 0.13507209345698357, + "rewards/accuracy_reward": 0.13392858067527413, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2358 }, { "clip_ratio": 0.0, - "completion_length": 1419.9799499511719, + "completion_length": 1838.29248046875, "epoch": 0.7046523784631469, - "grad_norm": 18.406936645507812, - "kl": 0.3193359375, - "learning_rate": 2.4280885519034765e-08, - "loss": 0.0697, - "reward": 0.3627232313156128, - "reward_std": 0.19328664243221283, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 20.271066665649414, + "kl": 4.06640625, + "learning_rate": 1.2140442759517382e-07, + "loss": 0.2283, + "reward": 0.4665178805589676, + "reward_std": 0.18487808108329773, + "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.4107143059372902, "step": 2359 }, { "clip_ratio": 0.0, - "completion_length": 1398.1630249023438, + "completion_length": 1765.7166137695312, "epoch": 0.7049510865506684, - "grad_norm": 22.82428550720215, - "kl": 0.29833984375, - "learning_rate": 2.423617661997243e-08, - "loss": 0.0828, - "reward": 0.4497767984867096, - "reward_std": 0.2165549136698246, - "rewards/accuracy_reward": 0.1160714365541935, + "grad_norm": 11.207869529724121, + "kl": 3.201171875, + "learning_rate": 1.2118088309986217e-07, + "loss": 0.1879, + "reward": 0.5602678805589676, + "reward_std": 0.19498751126229763, + "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4218750149011612, "step": 2360 }, { "clip_ratio": 0.0, - "completion_length": 1463.0491943359375, + "completion_length": 1848.2523498535156, "epoch": 0.7052497946381898, - "grad_norm": 21.27891731262207, - "kl": 0.30810546875, - "learning_rate": 2.4191495749424805e-08, - "loss": 0.0447, - "reward": 0.3281250149011612, - "reward_std": 0.16583548858761787, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 14.72805404663086, + "kl": 3.98828125, + "learning_rate": 1.2095747874712404e-07, + "loss": 0.2264, + "reward": 0.427455373108387, + "reward_std": 0.14801684580743313, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310267873108387, + "rewards/tag_count_reward": 0.4051339477300644, "step": 2361 }, { "clip_ratio": 0.0, - "completion_length": 1409.8125610351562, + "completion_length": 1822.5335693359375, "epoch": 0.7055485027257113, - "grad_norm": 22.2251033782959, - "kl": 0.3359375, - "learning_rate": 2.414684295600032e-08, - "loss": 0.06, - "reward": 0.3900669738650322, - "reward_std": 0.15599830821156502, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 15.283440589904785, + "kl": 4.26953125, + "learning_rate": 1.207342147800016e-07, + "loss": 0.2296, + "reward": 0.4921875298023224, + "reward_std": 0.16096188127994537, + "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4029018059372902, "step": 2362 }, { "clip_ratio": 0.0, - "completion_length": 1421.3683776855469, + "completion_length": 1808.3549499511719, "epoch": 0.7058472108132328, - "grad_norm": 21.96902847290039, - "kl": 0.3134765625, - "learning_rate": 2.4102218288276797e-08, - "loss": 0.0817, - "reward": 0.3766741305589676, - "reward_std": 0.2044924534857273, - "rewards/accuracy_reward": 0.05133928777649999, + "grad_norm": 20.200292587280273, + "kl": 3.5390625, + "learning_rate": 1.20511091441384e-07, + "loss": 0.2144, + "reward": 0.478794664144516, + "reward_std": 0.1604021191596985, + "rewards/accuracy_reward": 0.06473214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2363 }, { "clip_ratio": 0.0, - "completion_length": 1472.7902526855469, + "completion_length": 1813.8393859863281, "epoch": 0.7061459189007543, - "grad_norm": 17.149307250976562, - "kl": 0.29052734375, - "learning_rate": 2.405762179480152e-08, - "loss": 0.0409, - "reward": 0.3588169887661934, - "reward_std": 0.1592687852680683, - "rewards/accuracy_reward": 0.0491071455180645, + "grad_norm": 5.426363945007324, + "kl": 3.39453125, + "learning_rate": 1.202881089740076e-07, + "loss": 0.2029, + "reward": 0.4626116380095482, + "reward_std": 0.13159577175974846, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2364 }, { "clip_ratio": 0.0, - "completion_length": 1440.40185546875, + "completion_length": 1821.5960693359375, "epoch": 0.7064446269882757, - "grad_norm": 23.324249267578125, - "kl": 0.2939453125, - "learning_rate": 2.4013053524091126e-08, - "loss": 0.0646, - "reward": 0.3867187723517418, - "reward_std": 0.21229878440499306, - "rewards/accuracy_reward": 0.07589286239817739, + "grad_norm": 5.461429119110107, + "kl": 4.16015625, + "learning_rate": 1.2006526762045563e-07, + "loss": 0.2407, + "reward": 0.510044664144516, + "reward_std": 0.18972396105527878, + "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3108259066939354, + "rewards/tag_count_reward": 0.4095982387661934, "step": 2365 }, { "clip_ratio": 0.0, - "completion_length": 1423.5090026855469, + "completion_length": 1805.7567749023438, "epoch": 0.7067433350757972, - "grad_norm": 27.468181610107422, - "kl": 0.273681640625, - "learning_rate": 2.396851352463148e-08, - "loss": 0.0864, - "reward": 0.4804687649011612, - "reward_std": 0.15669093281030655, - "rewards/accuracy_reward": 0.14955357578583062, + "grad_norm": 9.928297996520996, + "kl": 3.50390625, + "learning_rate": 1.198425676231574e-07, + "loss": 0.2105, + "reward": 0.5792410969734192, + "reward_std": 0.12962576746940613, + "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2366 }, { "clip_ratio": 0.0, - "completion_length": 1452.79248046875, + "completion_length": 1824.7746276855469, "epoch": 0.7070420431633186, - "grad_norm": 22.461578369140625, - "kl": 0.26708984375, - "learning_rate": 2.3924001844877778e-08, - "loss": 0.0578, - "reward": 0.469866082072258, - "reward_std": 0.18099137023091316, - "rewards/accuracy_reward": 0.13839286309666932, + "grad_norm": 8.336638450622559, + "kl": 3.8828125, + "learning_rate": 1.196200092243889e-07, + "loss": 0.2125, + "reward": 0.5887276902794838, + "reward_std": 0.12562943622469902, + "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2367 }, { "clip_ratio": 0.0, - "completion_length": 1432.2188720703125, + "completion_length": 1798.9107971191406, "epoch": 0.7073407512508402, - "grad_norm": 22.193368911743164, - "kl": 0.279296875, - "learning_rate": 2.3879518533254328e-08, - "loss": 0.0682, - "reward": 0.401227705180645, - "reward_std": 0.1588432565331459, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 6.459567070007324, + "kl": 3.52734375, + "learning_rate": 1.1939759266627164e-07, + "loss": 0.2055, + "reward": 0.5212053805589676, + "reward_std": 0.11128736473619938, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2368 }, { "clip_ratio": 0.0, - "completion_length": 1352.6027526855469, + "completion_length": 1745.8371276855469, "epoch": 0.7076394593383616, - "grad_norm": 23.396760940551758, - "kl": 0.30859375, - "learning_rate": 2.3835063638154634e-08, - "loss": 0.0562, - "reward": 0.4185267984867096, - "reward_std": 0.16641060262918472, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 15.765811920166016, + "kl": 3.9765625, + "learning_rate": 1.1917531819077318e-07, + "loss": 0.2424, + "reward": 0.5195312798023224, + "reward_std": 0.1305846907198429, + "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4257812649011612, "step": 2369 }, { "clip_ratio": 0.0, - "completion_length": 1476.6094360351562, + "completion_length": 1858.0157165527344, "epoch": 0.707938167425883, - "grad_norm": 20.752687454223633, - "kl": 0.28466796875, - "learning_rate": 2.3790637207941283e-08, - "loss": 0.0518, - "reward": 0.3554687649011612, - "reward_std": 0.17978684604167938, - "rewards/accuracy_reward": 0.037946430034935474, + "grad_norm": 6.259654998779297, + "kl": 3.83984375, + "learning_rate": 1.1895318603970641e-07, + "loss": 0.2117, + "reward": 0.4737723395228386, + "reward_std": 0.16261932626366615, + "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223246216774, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2370 }, { "clip_ratio": 0.0, - "completion_length": 1386.9375610351562, + "completion_length": 1739.5090026855469, "epoch": 0.7082368755134045, - "grad_norm": 22.7547664642334, - "kl": 0.274658203125, - "learning_rate": 2.374623929094583e-08, - "loss": 0.0702, - "reward": 0.4179687723517418, - "reward_std": 0.2056364305317402, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 9.80095100402832, + "kl": 4.296875, + "learning_rate": 1.1873119645472915e-07, + "loss": 0.2508, + "reward": 0.5418527126312256, + "reward_std": 0.18165267072618008, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2371 }, { "clip_ratio": 0.0, - "completion_length": 1399.7589721679688, + "completion_length": 1716.1741943359375, "epoch": 0.7085355836009259, - "grad_norm": 19.28399658203125, - "kl": 0.280517578125, - "learning_rate": 2.370186993546889e-08, - "loss": 0.0371, - "reward": 0.467075914144516, - "reward_std": 0.20676765218377113, - "rewards/accuracy_reward": 0.11830357951112092, + "grad_norm": 18.86343002319336, + "kl": 1.43701171875, + "learning_rate": 1.1850934967734444e-07, + "loss": 0.096, + "reward": 0.584263414144516, + "reward_std": 0.1726251933723688, + "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3487723395228386, + "rewards/tag_count_reward": 0.443638414144516, "step": 2372 }, { "clip_ratio": 0.0, - "completion_length": 1469.0692749023438, + "completion_length": 1842.18310546875, "epoch": 0.7088342916884475, - "grad_norm": 22.684823989868164, - "kl": 0.266357421875, - "learning_rate": 2.3657529189779983e-08, - "loss": 0.0525, - "reward": 0.3755580633878708, - "reward_std": 0.1467813029885292, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 34.17057418823242, + "kl": 2.24609375, + "learning_rate": 1.1828764594889992e-07, + "loss": 0.1379, + "reward": 0.4715401977300644, + "reward_std": 0.11822165921330452, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4291294887661934, "step": 2373 }, { "clip_ratio": 0.0, - "completion_length": 1397.0335388183594, + "completion_length": 1841.8795471191406, "epoch": 0.7091329997759689, - "grad_norm": 20.003999710083008, - "kl": 0.25830078125, - "learning_rate": 2.3613217102117462e-08, - "loss": 0.045, - "reward": 0.4363839477300644, - "reward_std": 0.2226431518793106, - "rewards/accuracy_reward": 0.09598214947618544, + "grad_norm": 19.992822647094727, + "kl": 2.49609375, + "learning_rate": 1.180660855105873e-07, + "loss": 0.1518, + "reward": 0.525669664144516, + "reward_std": 0.16457823105156422, + "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.420758955180645, "step": 2374 }, { "clip_ratio": 0.0, - "completion_length": 1377.5201721191406, + "completion_length": 1770.5134887695312, "epoch": 0.7094317078634904, - "grad_norm": 24.8966007232666, - "kl": 0.31396484375, - "learning_rate": 2.3568933720688543e-08, - "loss": 0.0973, - "reward": 0.4335937723517418, - "reward_std": 0.19918614998459816, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 13.566108703613281, + "kl": 3.408203125, + "learning_rate": 1.1784466860344272e-07, + "loss": 0.2166, + "reward": 0.5351562723517418, + "reward_std": 0.16907967627048492, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4257812649011612, "step": 2375 }, { "clip_ratio": 0.0, - "completion_length": 1395.8840026855469, + "completion_length": 1739.58935546875, "epoch": 0.7097304159510118, - "grad_norm": 23.963274002075195, - "kl": 0.30810546875, - "learning_rate": 2.3524679093669235e-08, - "loss": 0.0693, - "reward": 0.4029017984867096, - "reward_std": 0.18983511999249458, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 14.752470016479492, + "kl": 4.41015625, + "learning_rate": 1.1762339546834618e-07, + "loss": 0.2627, + "reward": 0.510044664144516, + "reward_std": 0.1607834417372942, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2376 }, { "clip_ratio": 0.0, - "completion_length": 1441.4442443847656, + "completion_length": 1826.0826721191406, "epoch": 0.7100291240385334, - "grad_norm": 23.1390323638916, - "kl": 0.27197265625, - "learning_rate": 2.3480453269204198e-08, - "loss": 0.0772, - "reward": 0.4296875149011612, - "reward_std": 0.2024025321006775, - "rewards/accuracy_reward": 0.10267857555299997, + "grad_norm": 16.78652000427246, + "kl": 4.71875, + "learning_rate": 1.1740226634602099e-07, + "loss": 0.2527, + "reward": 0.5569196715950966, + "reward_std": 0.18307537958025932, + "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089402794838, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2377 }, { "clip_ratio": 0.0, - "completion_length": 1442.51123046875, + "completion_length": 1876.04248046875, "epoch": 0.7103278321260548, - "grad_norm": 20.213878631591797, - "kl": 0.2763671875, - "learning_rate": 2.3436256295406808e-08, - "loss": 0.0424, - "reward": 0.3783482313156128, - "reward_std": 0.18458179384469986, - "rewards/accuracy_reward": 0.06026786146685481, + "grad_norm": 22.39483070373535, + "kl": 4.63671875, + "learning_rate": 1.1718128147703405e-07, + "loss": 0.249, + "reward": 0.4726562798023224, + "reward_std": 0.16766344383358955, + "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.407924123108387, "step": 2378 }, { "clip_ratio": 0.0, - "completion_length": 1496.1228332519531, + "completion_length": 1876.2054443359375, "epoch": 0.7106265402135763, - "grad_norm": 21.3836669921875, - "kl": 0.30126953125, - "learning_rate": 2.3392088220359063e-08, - "loss": 0.0468, - "reward": 0.4246651977300644, - "reward_std": 0.18220502883195877, - "rewards/accuracy_reward": 0.1272321455180645, + "grad_norm": 7.543182849884033, + "kl": 4.08984375, + "learning_rate": 1.1696044110179532e-07, + "loss": 0.1985, + "reward": 0.5496651902794838, + "reward_std": 0.1329356748610735, + "rewards/accuracy_reward": 0.13839286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2379 }, { "clip_ratio": 0.0, - "completion_length": 1390.1004943847656, + "completion_length": 1768.4107971191406, "epoch": 0.7109252483010977, - "grad_norm": 23.103717803955078, - "kl": 0.256591796875, - "learning_rate": 2.3347949092111453e-08, - "loss": 0.0889, - "reward": 0.426897332072258, - "reward_std": 0.19106629863381386, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 30.028804779052734, + "kl": 2.25390625, + "learning_rate": 1.1673974546055726e-07, + "loss": 0.1646, + "reward": 0.5518973469734192, + "reward_std": 0.16922985017299652, + "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3487723395228386, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2380 }, { "clip_ratio": 0.0, - "completion_length": 1380.7746276855469, + "completion_length": 1764.2411499023438, "epoch": 0.7112239563886192, - "grad_norm": 18.833585739135742, - "kl": 0.262451171875, - "learning_rate": 2.3303838958683076e-08, - "loss": 0.0531, - "reward": 0.3716517984867096, - "reward_std": 0.19802891835570335, - "rewards/accuracy_reward": 0.03125000209547579, + "grad_norm": 49.80156707763672, + "kl": 2.158203125, + "learning_rate": 1.1651919479341538e-07, + "loss": 0.1407, + "reward": 0.4570312723517418, + "reward_std": 0.1568269580602646, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2381 }, { "clip_ratio": 0.0, - "completion_length": 1367.90185546875, + "completion_length": 1767.1697082519531, "epoch": 0.7115226644761407, - "grad_norm": 24.408397674560547, - "kl": 0.298828125, - "learning_rate": 2.325975786806138e-08, - "loss": 0.0884, - "reward": 0.4146205484867096, - "reward_std": 0.15226201340556145, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 53.72437286376953, + "kl": 1.927734375, + "learning_rate": 1.162987893403069e-07, + "loss": 0.1483, + "reward": 0.5061384215950966, + "reward_std": 0.12915064580738544, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2382 }, { "clip_ratio": 0.0, - "completion_length": 1451.7322082519531, + "completion_length": 1859.665283203125, "epoch": 0.7118213725636622, - "grad_norm": 24.053606033325195, - "kl": 0.32373046875, - "learning_rate": 2.3215705868202302e-08, - "loss": 0.0671, - "reward": 0.3281250149011612, - "reward_std": 0.17666487395763397, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 32.8027229309082, + "kl": 1.97265625, + "learning_rate": 1.1607852934101151e-07, + "loss": 0.1272, + "reward": 0.4581473469734192, + "reward_std": 0.17100156471133232, + "rewards/accuracy_reward": 0.044642859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000074505806, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2383 }, { "clip_ratio": 0.0, - "completion_length": 1419.9844360351562, + "completion_length": 1818.8148193359375, "epoch": 0.7121200806511836, - "grad_norm": 17.823698043823242, - "kl": 0.275634765625, - "learning_rate": 2.3171683007030114e-08, - "loss": 0.0596, - "reward": 0.3325892984867096, - "reward_std": 0.15765470638871193, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 10.564373970031738, + "kl": 2.111328125, + "learning_rate": 1.1585841503515056e-07, + "loss": 0.1206, + "reward": 0.4486607387661934, + "reward_std": 0.15155724622309208, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.424107164144516, "step": 2384 }, { "clip_ratio": 0.0, - "completion_length": 1416.9330749511719, + "completion_length": 1833.9755554199219, "epoch": 0.7124187887387051, - "grad_norm": 19.508310317993164, - "kl": 0.2607421875, - "learning_rate": 2.3127689332437337e-08, - "loss": 0.0708, - "reward": 0.3945312798023224, - "reward_std": 0.19613809511065483, - "rewards/accuracy_reward": 0.06696428707800806, + "grad_norm": 17.677459716796875, + "kl": 3.2421875, + "learning_rate": 1.1563844666218669e-07, + "loss": 0.2065, + "reward": 0.4771205633878708, + "reward_std": 0.15114394389092922, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2385 }, { "clip_ratio": 0.0, - "completion_length": 1449.5915832519531, + "completion_length": 1840.0915832519531, "epoch": 0.7127174968262265, - "grad_norm": 18.37896728515625, - "kl": 0.285400390625, - "learning_rate": 2.30837248922848e-08, - "loss": 0.0602, - "reward": 0.3554687649011612, - "reward_std": 0.1556084230542183, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 9.652724266052246, + "kl": 4.9140625, + "learning_rate": 1.15418624461424e-07, + "loss": 0.267, + "reward": 0.4492187649011612, + "reward_std": 0.12138241529464722, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.4090401977300644, "step": 2386 }, { "clip_ratio": 0.0, - "completion_length": 1403.2255249023438, + "completion_length": 1754.5960693359375, "epoch": 0.7130162049137481, - "grad_norm": 24.689390182495117, - "kl": 0.28759765625, - "learning_rate": 2.3039789734401522e-08, - "loss": 0.0798, - "reward": 0.3750000149011612, - "reward_std": 0.1911601535975933, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 4.1428117752075195, + "kl": 3.05859375, + "learning_rate": 1.1519894867200761e-07, + "loss": 0.1848, + "reward": 0.495535746216774, + "reward_std": 0.1754977498203516, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.4263393133878708, "step": 2387 }, { "clip_ratio": 0.0, - "completion_length": 1431.7634582519531, + "completion_length": 1798.7233276367188, "epoch": 0.7133149130012695, - "grad_norm": 23.086023330688477, - "kl": 0.27587890625, - "learning_rate": 2.2995883906584606e-08, - "loss": 0.0644, - "reward": 0.4838169887661934, - "reward_std": 0.17892219126224518, - "rewards/accuracy_reward": 0.15848215157166123, + "grad_norm": 7.098261833190918, + "kl": 4.046875, + "learning_rate": 1.1497941953292303e-07, + "loss": 0.2403, + "reward": 0.577008955180645, + "reward_std": 0.1458814423531294, + "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.411830373108387, "step": 2388 }, { "clip_ratio": 0.0, - "completion_length": 1402.0000915527344, + "completion_length": 1720.1273193359375, "epoch": 0.713613621088791, - "grad_norm": 27.737632751464844, - "kl": 0.27197265625, - "learning_rate": 2.2952007456599316e-08, - "loss": 0.077, - "reward": 0.4453125149011612, - "reward_std": 0.1635904498398304, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 20.7607421875, + "kl": 2.662109375, + "learning_rate": 1.1476003728299657e-07, + "loss": 0.1844, + "reward": 0.5641741380095482, + "reward_std": 0.11848766170442104, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4391741305589676, "step": 2389 }, { "clip_ratio": 0.0, - "completion_length": 1390.15185546875, + "completion_length": 1749.0804138183594, "epoch": 0.7139123291763124, - "grad_norm": 21.292499542236328, - "kl": 0.25634765625, - "learning_rate": 2.2908160432178934e-08, - "loss": 0.0681, - "reward": 0.3906250149011612, - "reward_std": 0.16530713066458702, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 12.546521186828613, + "kl": 3.52734375, + "learning_rate": 1.1454080216089468e-07, + "loss": 0.1856, + "reward": 0.4933035969734192, + "reward_std": 0.15864093601703644, + "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500074505806, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2390 }, { "clip_ratio": 0.0, - "completion_length": 1432.9040832519531, + "completion_length": 1778.618408203125, "epoch": 0.714211037263834, - "grad_norm": 17.929832458496094, - "kl": 0.24853515625, - "learning_rate": 2.2864342881024706e-08, - "loss": 0.0637, - "reward": 0.4386160895228386, - "reward_std": 0.20113664492964745, - "rewards/accuracy_reward": 0.09821429150179029, + "grad_norm": 8.720478057861328, + "kl": 2.60546875, + "learning_rate": 1.1432171440512353e-07, + "loss": 0.1462, + "reward": 0.5697544887661934, + "reward_std": 0.19435515999794006, + "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2391 }, { "clip_ratio": 0.0, - "completion_length": 1462.6183471679688, + "completion_length": 1806.2679138183594, "epoch": 0.7145097453513554, - "grad_norm": 24.125314712524414, - "kl": 0.236083984375, - "learning_rate": 2.282055485080584e-08, - "loss": 0.044, - "reward": 0.4531250149011612, - "reward_std": 0.16507969796657562, - "rewards/accuracy_reward": 0.0915178582072258, + "grad_norm": 15.638729095458984, + "kl": 2.8671875, + "learning_rate": 1.141027742540292e-07, + "loss": 0.1617, + "reward": 0.5368303805589676, + "reward_std": 0.1441308967769146, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3616071492433548, + "rewards/tag_count_reward": 0.4319196566939354, "step": 2392 }, { "clip_ratio": 0.0, - "completion_length": 1416.4978637695312, + "completion_length": 1792.6898193359375, "epoch": 0.7148084534388769, - "grad_norm": 17.23619842529297, - "kl": 0.26123046875, - "learning_rate": 2.2776796389159448e-08, - "loss": 0.0797, - "reward": 0.4101562649011612, - "reward_std": 0.18480717577040195, - "rewards/accuracy_reward": 0.08482143259607255, + "grad_norm": 5.050838470458984, + "kl": 3.33203125, + "learning_rate": 1.1388398194579724e-07, + "loss": 0.1964, + "reward": 0.5239955559372902, + "reward_std": 0.15415244549512863, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2393 }, { "clip_ratio": 0.0, - "completion_length": 1414.8125610351562, + "completion_length": 1803.5090026855469, "epoch": 0.7151071615263983, - "grad_norm": 20.62865447998047, - "kl": 0.30810546875, - "learning_rate": 2.27330675436904e-08, - "loss": 0.075, - "reward": 0.3554687723517418, - "reward_std": 0.18603655323386192, - "rewards/accuracy_reward": 0.03571428661234677, + "grad_norm": 43.18879318237305, + "kl": 5.16015625, + "learning_rate": 1.1366533771845199e-07, + "loss": 0.2595, + "reward": 0.4732143133878708, + "reward_std": 0.16946572810411453, + "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.419642873108387, "step": 2394 }, { "clip_ratio": 0.0, - "completion_length": 1453.2902526855469, + "completion_length": 1807.2165832519531, "epoch": 0.7154058696139198, - "grad_norm": 24.01993179321289, - "kl": 0.277099609375, - "learning_rate": 2.268936836197144e-08, - "loss": 0.0824, - "reward": 0.361049123108387, - "reward_std": 0.17798389866948128, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 24.81668472290039, + "kl": 4.69921875, + "learning_rate": 1.134468418098572e-07, + "loss": 0.2759, + "reward": 0.471540205180645, + "reward_std": 0.1316543035209179, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2395 }, { "clip_ratio": 0.0, - "completion_length": 1398.2344360351562, + "completion_length": 1740.2344055175781, "epoch": 0.7157045777014412, - "grad_norm": 23.358028411865234, - "kl": 0.294921875, - "learning_rate": 2.2645698891542947e-08, - "loss": 0.0664, - "reward": 0.4369419813156128, - "reward_std": 0.19328084215521812, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 7.656452178955078, + "kl": 3.0390625, + "learning_rate": 1.1322849445771473e-07, + "loss": 0.1786, + "reward": 0.547991082072258, + "reward_std": 0.1985822692513466, + "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.4430803805589676, "step": 2396 }, { "clip_ratio": 0.0, - "completion_length": 1313.5715026855469, + "completion_length": 1681.0915832519531, "epoch": 0.7160032857889628, - "grad_norm": 29.02049446105957, - "kl": 0.260986328125, - "learning_rate": 2.260205917991306e-08, - "loss": 0.1066, - "reward": 0.424107164144516, - "reward_std": 0.17037393152713776, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 6.560885429382324, + "kl": 2.810546875, + "learning_rate": 1.1301029589956528e-07, + "loss": 0.2041, + "reward": 0.5128348395228386, + "reward_std": 0.15756717883050442, + "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482142984867096, + "rewards/tag_count_reward": 0.428013414144516, "step": 2397 }, { "clip_ratio": 0.0, - "completion_length": 1376.6496276855469, + "completion_length": 1753.3215026855469, "epoch": 0.7163019938764842, - "grad_norm": 21.631393432617188, - "kl": 0.26416015625, - "learning_rate": 2.255844927455751e-08, - "loss": 0.073, - "reward": 0.4670759066939354, - "reward_std": 0.21000506356358528, - "rewards/accuracy_reward": 0.13169643515720963, + "grad_norm": 4.95013427734375, + "kl": 3.54296875, + "learning_rate": 1.1279224637278755e-07, + "loss": 0.2162, + "reward": 0.5479910895228386, + "reward_std": 0.19631147757172585, + "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2398 }, { "clip_ratio": 0.0, - "completion_length": 1451.7991943359375, + "completion_length": 1786.7433776855469, "epoch": 0.7166007019640057, - "grad_norm": 23.086688995361328, - "kl": 0.27490234375, - "learning_rate": 2.2514869222919568e-08, - "loss": 0.0889, - "reward": 0.3878348395228386, - "reward_std": 0.177908131852746, - "rewards/accuracy_reward": 0.06473214481957257, + "grad_norm": 31.503252029418945, + "kl": 2.458984375, + "learning_rate": 1.1257434611459785e-07, + "loss": 0.1537, + "reward": 0.4810268059372902, + "reward_std": 0.13181341998279095, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4140625223517418, "step": 2399 }, { "clip_ratio": 0.0, - "completion_length": 1434.7656860351562, + "completion_length": 1801.4331359863281, "epoch": 0.7168994100515271, - "grad_norm": 25.37452507019043, - "kl": 0.244384765625, - "learning_rate": 2.247131907241009e-08, - "loss": 0.0815, - "reward": 0.5128348469734192, - "reward_std": 0.18598167598247528, - "rewards/accuracy_reward": 0.16294643888249993, + "grad_norm": 18.471357345581055, + "kl": 2.89453125, + "learning_rate": 1.1235659536205045e-07, + "loss": 0.1763, + "reward": 0.5864955633878708, + "reward_std": 0.13726701773703098, + "rewards/accuracy_reward": 0.16071428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349888414144516, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2400 }, { "clip_ratio": 0.0, - "completion_length": 1475.1339721679688, + "completion_length": 1806.0290832519531, "epoch": 0.7171981181390487, - "grad_norm": 23.998384475708008, - "kl": 0.28955078125, - "learning_rate": 2.2427798870407373e-08, - "loss": 0.0894, - "reward": 0.4090401977300644, - "reward_std": 0.1849764548242092, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 32.24931335449219, + "kl": 2.55859375, + "learning_rate": 1.1213899435203686e-07, + "loss": 0.1674, + "reward": 0.5574776977300644, + "reward_std": 0.1549007184803486, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.428013414144516, "step": 2401 }, { "clip_ratio": 0.0, - "completion_length": 1375.7478332519531, + "completion_length": 1795.8706359863281, "epoch": 0.7174968262265701, - "grad_norm": 24.933513641357422, - "kl": 0.2822265625, - "learning_rate": 2.2384308664257097e-08, - "loss": 0.0902, - "reward": 0.349330373108387, - "reward_std": 0.18496200069785118, - "rewards/accuracy_reward": 0.02901785890571773, + "grad_norm": 22.35542869567871, + "kl": 4.96484375, + "learning_rate": 1.1192154332128547e-07, + "loss": 0.295, + "reward": 0.4486607313156128, + "reward_std": 0.1741066500544548, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4107142984867096, "step": 2402 }, { "clip_ratio": 0.0, - "completion_length": 1435.4197082519531, + "completion_length": 1759.7858276367188, "epoch": 0.7177955343140916, - "grad_norm": 21.6085262298584, - "kl": 0.3056640625, - "learning_rate": 2.234084850127237e-08, - "loss": 0.0581, - "reward": 0.4308035969734192, - "reward_std": 0.2001265063881874, + "grad_norm": 25.364810943603516, + "kl": 5.0625, + "learning_rate": 1.1170424250636185e-07, + "loss": 0.3034, + "reward": 0.5228794738650322, + "reward_std": 0.16371148265898228, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2403 }, { "clip_ratio": 0.0, - "completion_length": 1421.2232666015625, + "completion_length": 1781.01123046875, "epoch": 0.718094242401613, - "grad_norm": 21.330224990844727, - "kl": 0.2734375, - "learning_rate": 2.22974184287336e-08, - "loss": 0.0745, - "reward": 0.360491082072258, - "reward_std": 0.20358651503920555, - "rewards/accuracy_reward": 0.029017859138548374, + "grad_norm": 16.583526611328125, + "kl": 3.765625, + "learning_rate": 1.11487092143668e-07, + "loss": 0.2104, + "reward": 0.4737723469734192, + "reward_std": 0.1625739485025406, + "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2404 }, { "clip_ratio": 0.0, - "completion_length": 1460.7009582519531, + "completion_length": 1821.7322082519531, "epoch": 0.7183929504891345, - "grad_norm": 23.770122528076172, - "kl": 0.2578125, - "learning_rate": 2.2254018493888415e-08, - "loss": 0.0651, - "reward": 0.3828125149011612, - "reward_std": 0.1796078458428383, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 11.287320137023926, + "kl": 3.5234375, + "learning_rate": 1.1127009246944208e-07, + "loss": 0.1933, + "reward": 0.4899553880095482, + "reward_std": 0.15909849293529987, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.420758955180645, "step": 2405 }, { "clip_ratio": 0.0, - "completion_length": 1394.5536804199219, + "completion_length": 1808.1295776367188, "epoch": 0.718691658576656, - "grad_norm": 23.413291931152344, - "kl": 0.328125, - "learning_rate": 2.2210648743951714e-08, - "loss": 0.0783, - "reward": 0.459821455180645, - "reward_std": 0.17562567815184593, - "rewards/accuracy_reward": 0.13839286379516125, + "grad_norm": 24.36695098876953, + "kl": 3.41015625, + "learning_rate": 1.1105324371975857e-07, + "loss": 0.2203, + "reward": 0.5535714477300644, + "reward_std": 0.16505314968526363, + "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2406 }, { "clip_ratio": 0.0, - "completion_length": 1431.8616638183594, + "completion_length": 1797.4018859863281, "epoch": 0.7189903666641775, - "grad_norm": 19.707653045654297, - "kl": 0.275390625, - "learning_rate": 2.216730922610553e-08, - "loss": 0.0694, - "reward": 0.3593750149011612, - "reward_std": 0.18698057904839516, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 28.975238800048828, + "kl": 2.451171875, + "learning_rate": 1.1083654613052764e-07, + "loss": 0.1542, + "reward": 0.4547991380095482, + "reward_std": 0.16203102096915245, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.4257812574505806, "step": 2407 }, { "clip_ratio": 0.0, - "completion_length": 1360.7366638183594, + "completion_length": 1739.74560546875, "epoch": 0.7192890747516989, - "grad_norm": 18.379741668701172, - "kl": 0.257568359375, - "learning_rate": 2.2123999987499015e-08, - "loss": 0.049, - "reward": 0.4017857387661934, - "reward_std": 0.19650868512690067, + "grad_norm": 560.644287109375, + "kl": 8.359375, + "learning_rate": 1.1061999993749508e-07, + "loss": 0.4097, + "reward": 0.4843750074505806, + "reward_std": 0.16469849087297916, "rewards/accuracy_reward": 0.06026786123402417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3415178805589676, + "rewards/tag_count_reward": 0.424107164144516, "step": 2408 }, { "clip_ratio": 0.0, - "completion_length": 1481.1027526855469, + "completion_length": 1860.1764221191406, "epoch": 0.7195877828392204, - "grad_norm": 17.521642684936523, - "kl": 0.267578125, - "learning_rate": 2.2080721075248383e-08, - "loss": 0.0484, - "reward": 0.3482142984867096, - "reward_std": 0.17644337937235832, - "rewards/accuracy_reward": 0.03125000232830644, + "grad_norm": 20.137100219726562, + "kl": 3.4921875, + "learning_rate": 1.1040360537624191e-07, + "loss": 0.1835, + "reward": 0.4352678805589676, + "reward_std": 0.12332583963871002, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.4062500223517418, "step": 2409 }, { "clip_ratio": 0.0, - "completion_length": 1417.7947387695312, + "completion_length": 1806.5313415527344, "epoch": 0.7198864909267418, - "grad_norm": 17.580415725708008, - "kl": 0.2744140625, - "learning_rate": 2.2037472536436824e-08, - "loss": 0.0554, - "reward": 0.3677455484867096, - "reward_std": 0.17621343955397606, - "rewards/accuracy_reward": 0.05357143026776612, + "grad_norm": 40.23284912109375, + "kl": 3.1640625, + "learning_rate": 1.1018736268218412e-07, + "loss": 0.2126, + "reward": 0.463727705180645, + "reward_std": 0.1408079881221056, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2410 }, { "clip_ratio": 0.0, - "completion_length": 1442.0335388183594, + "completion_length": 1762.6496276855469, "epoch": 0.7201851990142634, - "grad_norm": 23.438268661499023, - "kl": 0.29541015625, - "learning_rate": 2.199425441811452e-08, - "loss": 0.0907, - "reward": 0.3744419738650322, - "reward_std": 0.18287241086363792, - "rewards/accuracy_reward": 0.03571428591385484, + "grad_norm": 14.169441223144531, + "kl": 2.71875, + "learning_rate": 1.099712720905726e-07, + "loss": 0.1609, + "reward": 0.4726562723517418, + "reward_std": 0.1601056009531021, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276828289032, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2411 }, { "clip_ratio": 0.0, - "completion_length": 1430.27685546875, + "completion_length": 1785.0000915527344, "epoch": 0.7204839071017848, - "grad_norm": 23.471187591552734, - "kl": 0.31201171875, - "learning_rate": 2.195106676729857e-08, - "loss": 0.0708, - "reward": 0.412946455180645, - "reward_std": 0.1552618071436882, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 9.486129760742188, + "kl": 3.71875, + "learning_rate": 1.0975533383649285e-07, + "loss": 0.2316, + "reward": 0.5396205633878708, + "reward_std": 0.14515913277864456, + "rewards/accuracy_reward": 0.12946429150179029, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714402794838, + "rewards/tag_count_reward": 0.4101562649011612, "step": 2412 }, { "clip_ratio": 0.0, - "completion_length": 1447.9754943847656, + "completion_length": 1804.4509582519531, "epoch": 0.7207826151893062, - "grad_norm": 21.1092472076416, - "kl": 0.24365234375, - "learning_rate": 2.190790963097287e-08, - "loss": 0.074, - "reward": 0.381138414144516, - "reward_std": 0.19821033254265785, - "rewards/accuracy_reward": 0.035714287078008056, + "grad_norm": 5.653343677520752, + "kl": 3.306640625, + "learning_rate": 1.0953954815486436e-07, + "loss": 0.1931, + "reward": 0.474330373108387, + "reward_std": 0.1691528856754303, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.427455373108387, "step": 2413 }, { "clip_ratio": 0.0, - "completion_length": 1419.3080749511719, + "completion_length": 1719.2255249023438, "epoch": 0.7210813232768277, - "grad_norm": 24.444091796875, - "kl": 0.227783203125, - "learning_rate": 2.1864783056088187e-08, - "loss": 0.089, - "reward": 0.4062500149011612, - "reward_std": 0.20311935059726238, - "rewards/accuracy_reward": 0.05357143213041127, + "grad_norm": 17.845239639282227, + "kl": 2.4765625, + "learning_rate": 1.0932391528044094e-07, + "loss": 0.1753, + "reward": 0.5262277126312256, + "reward_std": 0.1883224993944168, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3526785895228386, + "rewards/tag_count_reward": 0.4414062723517418, "step": 2414 }, { "clip_ratio": 0.0, - "completion_length": 1373.8460693359375, + "completion_length": 1775.5982971191406, "epoch": 0.7213800313643491, - "grad_norm": 18.322324752807617, - "kl": 0.273681640625, - "learning_rate": 2.1821687089561973e-08, - "loss": 0.0894, - "reward": 0.4648437723517418, - "reward_std": 0.20485062152147293, - "rewards/accuracy_reward": 0.14508928917348385, + "grad_norm": 19.440322875976562, + "kl": 5.359375, + "learning_rate": 1.0910843544780987e-07, + "loss": 0.2964, + "reward": 0.5993303805589676, + "reward_std": 0.20246483385562897, + "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2415 }, { "clip_ratio": 0.0, - "completion_length": 1435.118408203125, + "completion_length": 1784.7366943359375, "epoch": 0.7216787394518707, - "grad_norm": 22.059310913085938, - "kl": 0.3173828125, - "learning_rate": 2.177862177827844e-08, - "loss": 0.0829, - "reward": 0.4542410895228386, - "reward_std": 0.16500470414757729, - "rewards/accuracy_reward": 0.12053571734577417, + "grad_norm": 6.092052936553955, + "kl": 3.6484375, + "learning_rate": 1.088931088913922e-07, + "loss": 0.2092, + "reward": 0.559709832072258, + "reward_std": 0.13023032806813717, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.439174123108387, "step": 2416 }, { "clip_ratio": 0.0, - "completion_length": 1405.3058471679688, + "completion_length": 1755.2813110351562, "epoch": 0.7219774475393921, - "grad_norm": 15.813189506530762, - "kl": 0.2529296875, - "learning_rate": 2.173558716908843e-08, - "loss": 0.0484, - "reward": 0.4062500223517418, - "reward_std": 0.16699782013893127, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 13.594573974609375, + "kl": 3.89453125, + "learning_rate": 1.0867793584544216e-07, + "loss": 0.2299, + "reward": 0.5066964402794838, + "reward_std": 0.12845467031002045, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325892984867096, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2417 }, { "clip_ratio": 0.0, - "completion_length": 1459.7567443847656, + "completion_length": 1769.7054443359375, "epoch": 0.7222761556269136, - "grad_norm": 18.36380958557129, - "kl": 0.29296875, - "learning_rate": 2.169258330880936e-08, - "loss": 0.0885, - "reward": 0.380580373108387, - "reward_std": 0.15589047968387604, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 12.550678253173828, + "kl": 3.90234375, + "learning_rate": 1.084629165440468e-07, + "loss": 0.2321, + "reward": 0.493861623108387, + "reward_std": 0.12634802795946598, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4090401902794838, "step": 2418 }, { "clip_ratio": 0.0, - "completion_length": 1397.8147888183594, + "completion_length": 1724.57373046875, "epoch": 0.722574863714435, - "grad_norm": 26.921178817749023, - "kl": 0.3076171875, - "learning_rate": 2.1649610244225218e-08, - "loss": 0.1133, - "reward": 0.5027901902794838, - "reward_std": 0.20379715785384178, - "rewards/accuracy_reward": 0.16741072200238705, + "grad_norm": 9.8344144821167, + "kl": 4.296875, + "learning_rate": 1.082480512211261e-07, + "loss": 0.2862, + "reward": 0.6138393208384514, + "reward_std": 0.1814941093325615, + "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2419 }, { "clip_ratio": 0.0, - "completion_length": 1361.3527526855469, + "completion_length": 1730.6407165527344, "epoch": 0.7228735718019565, - "grad_norm": 15.358524322509766, - "kl": 0.209716796875, - "learning_rate": 2.1606668022086516e-08, - "loss": 0.0592, - "reward": 0.4877232313156128, - "reward_std": 0.21167884021997452, - "rewards/accuracy_reward": 0.11830358067527413, + "grad_norm": 15.728743553161621, + "kl": 2.37109375, + "learning_rate": 1.0803334011043258e-07, + "loss": 0.1526, + "reward": 0.5552455633878708, + "reward_std": 0.18135413527488708, + "rewards/accuracy_reward": 0.10937500325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3694196492433548, + "rewards/tag_count_reward": 0.4458705559372902, "step": 2420 }, { "clip_ratio": 0.0, - "completion_length": 1413.4197082519531, + "completion_length": 1786.2478637695312, "epoch": 0.723172279889478, - "grad_norm": 19.968002319335938, - "kl": 0.289306640625, - "learning_rate": 2.1563756689110135e-08, - "loss": 0.0696, - "reward": 0.4681919738650322, - "reward_std": 0.22234686464071274, - "rewards/accuracy_reward": 0.12946429289877415, + "grad_norm": 7.246221542358398, + "kl": 3.61328125, + "learning_rate": 1.0781878344555067e-07, + "loss": 0.2163, + "reward": 0.5915178805589676, + "reward_std": 0.19372060522437096, + "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2421 }, { "clip_ratio": 0.0, - "completion_length": 1485.1741943359375, + "completion_length": 1857.40185546875, "epoch": 0.7234709879769995, - "grad_norm": 18.616125106811523, - "kl": 0.34130859375, - "learning_rate": 2.1520876291979435e-08, - "loss": 0.0622, - "reward": 0.3208705484867096, - "reward_std": 0.16980597376823425, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 41.586605072021484, + "kl": 5.296875, + "learning_rate": 1.0760438145989717e-07, + "loss": 0.2696, + "reward": 0.466517873108387, + "reward_std": 0.1800931841135025, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2422 }, { "clip_ratio": 0.0, - "completion_length": 1448.3438415527344, + "completion_length": 1764.63623046875, "epoch": 0.7237696960645209, - "grad_norm": 20.671817779541016, - "kl": 0.30419921875, - "learning_rate": 2.1478026877344085e-08, - "loss": 0.0839, - "reward": 0.3761160895228386, - "reward_std": 0.17178165912628174, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 13.38475227355957, + "kl": 3.103515625, + "learning_rate": 1.0739013438672043e-07, + "loss": 0.1955, + "reward": 0.5072544813156128, + "reward_std": 0.15691293589770794, + "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4335937723517418, "step": 2423 }, { "clip_ratio": 0.0, - "completion_length": 1438.1429443359375, + "completion_length": 1836.5514221191406, "epoch": 0.7240684041520424, - "grad_norm": 21.707809448242188, - "kl": 0.30029296875, - "learning_rate": 2.1435208491820022e-08, - "loss": 0.0662, - "reward": 0.3616071566939354, - "reward_std": 0.1479582991451025, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 39.78252029418945, + "kl": 4.6328125, + "learning_rate": 1.0717604245910012e-07, + "loss": 0.2433, + "reward": 0.4709821715950966, + "reward_std": 0.15256065130233765, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4129464402794838, "step": 2424 }, { "clip_ratio": 0.0, - "completion_length": 1391.4933776855469, + "completion_length": 1765.1920471191406, "epoch": 0.7243671122395638, - "grad_norm": 17.620718002319336, - "kl": 0.28662109375, - "learning_rate": 2.139242118198947e-08, - "loss": 0.0745, - "reward": 0.4213169813156128, - "reward_std": 0.20252827554941177, - "rewards/accuracy_reward": 0.0870535729918629, + "grad_norm": 7.882612228393555, + "kl": 3.5, + "learning_rate": 1.0696210590994736e-07, + "loss": 0.2074, + "reward": 0.5625000149011612, + "reward_std": 0.17608734965324402, + "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.435267873108387, "step": 2425 }, { "clip_ratio": 0.0, - "completion_length": 1438.3996276855469, + "completion_length": 1799.38623046875, "epoch": 0.7246658203270854, - "grad_norm": 19.668800354003906, - "kl": 0.292236328125, - "learning_rate": 2.1349664994400853e-08, - "loss": 0.0598, - "reward": 0.4062500149011612, - "reward_std": 0.15046541392803192, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 5.4424333572387695, + "kl": 3.36328125, + "learning_rate": 1.0674832497200425e-07, + "loss": 0.1955, + "reward": 0.512276791036129, + "reward_std": 0.14732623100280762, + "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2426 }, { "clip_ratio": 0.0, - "completion_length": 1360.2522888183594, + "completion_length": 1715.1741638183594, "epoch": 0.7249645284146068, - "grad_norm": 16.181371688842773, - "kl": 0.24658203125, - "learning_rate": 2.1306939975568662e-08, - "loss": 0.0737, - "reward": 0.4101562649011612, - "reward_std": 0.1774160247296095, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 17.144968032836914, + "kl": 2.59765625, + "learning_rate": 1.0653469987784331e-07, + "loss": 0.1616, + "reward": 0.5440848544239998, + "reward_std": 0.1448826715350151, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4481026977300644, "step": 2427 }, { "clip_ratio": 0.0, - "completion_length": 1422.7254943847656, + "completion_length": 1738.4152526855469, "epoch": 0.7252632365021283, - "grad_norm": 19.84902000427246, - "kl": 0.251953125, - "learning_rate": 2.1264246171973576e-08, - "loss": 0.0712, - "reward": 0.4123884066939354, - "reward_std": 0.19069647416472435, - "rewards/accuracy_reward": 0.07142857322469354, + "grad_norm": 44.91627502441406, + "kl": 2.47265625, + "learning_rate": 1.0632123085986788e-07, + "loss": 0.1716, + "reward": 0.5262276977300644, + "reward_std": 0.14621329680085182, + "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4369419887661934, "step": 2428 }, { "clip_ratio": 0.0, - "completion_length": 1431.9621276855469, + "completion_length": 1770.0804138183594, "epoch": 0.7255619445896497, - "grad_norm": 21.883121490478516, - "kl": 0.267822265625, - "learning_rate": 2.1221583630062228e-08, - "loss": 0.0656, - "reward": 0.3593750223517418, - "reward_std": 0.15603632852435112, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 7.293946266174316, + "kl": 4.0703125, + "learning_rate": 1.0610791815031114e-07, + "loss": 0.2371, + "reward": 0.4603794813156128, + "reward_std": 0.12955988198518753, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2429 }, { "clip_ratio": 0.0, - "completion_length": 1440.6875915527344, + "completion_length": 1827.9554443359375, "epoch": 0.7258606526771713, - "grad_norm": 18.233665466308594, - "kl": 0.29345703125, - "learning_rate": 2.1178952396247302e-08, - "loss": 0.0564, - "reward": 0.4168526977300644, - "reward_std": 0.17751368507742882, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 15.784100532531738, + "kl": 4.1484375, + "learning_rate": 1.058947619812365e-07, + "loss": 0.2094, + "reward": 0.5267857387661934, + "reward_std": 0.17344837822020054, + "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.4129464402794838, "step": 2430 }, { "clip_ratio": 0.0, - "completion_length": 1448.3639221191406, + "completion_length": 1792.8438415527344, "epoch": 0.7261593607646927, - "grad_norm": 21.724233627319336, - "kl": 0.3046875, - "learning_rate": 2.1136352516907426e-08, - "loss": 0.0548, - "reward": 0.3443080559372902, - "reward_std": 0.18131940439343452, - "rewards/accuracy_reward": 0.022321430267766118, + "grad_norm": 16.269140243530273, + "kl": 3.3203125, + "learning_rate": 1.0568176258453712e-07, + "loss": 0.1897, + "reward": 0.4369419813156128, + "reward_std": 0.1579836793243885, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2431 }, { "clip_ratio": 0.0, - "completion_length": 1424.4554443359375, + "completion_length": 1794.4375915527344, "epoch": 0.7264580688522142, - "grad_norm": 20.26696014404297, - "kl": 0.256103515625, - "learning_rate": 2.1093784038387048e-08, - "loss": 0.051, - "reward": 0.4542410969734192, - "reward_std": 0.16632955893874168, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 12.674710273742676, + "kl": 2.4970703125, + "learning_rate": 1.0546892019193523e-07, + "loss": 0.1452, + "reward": 0.5552455633878708, + "reward_std": 0.13266366347670555, + "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4280134066939354, "step": 2432 }, { "clip_ratio": 0.0, - "completion_length": 1439.7411193847656, + "completion_length": 1727.7077026367188, "epoch": 0.7267567769397356, - "grad_norm": 23.169391632080078, - "kl": 0.28466796875, - "learning_rate": 2.105124700699652e-08, - "loss": 0.0845, - "reward": 0.395647332072258, - "reward_std": 0.17786778882145882, - "rewards/accuracy_reward": 0.09151786286383867, + "grad_norm": 27.235422134399414, + "kl": 2.625, + "learning_rate": 1.052562350349826e-07, + "loss": 0.1916, + "reward": 0.5379464626312256, + "reward_std": 0.15297571197152138, + "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.4375000149011612, "step": 2433 }, { "clip_ratio": 0.0, - "completion_length": 1445.9397888183594, + "completion_length": 1768.0915832519531, "epoch": 0.7270554850272571, - "grad_norm": 17.980857849121094, - "kl": 0.277587890625, - "learning_rate": 2.1008741469011986e-08, - "loss": 0.0794, - "reward": 0.3950893059372902, - "reward_std": 0.19426273182034492, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 5.215934753417969, + "kl": 3.517578125, + "learning_rate": 1.0504370734505993e-07, + "loss": 0.2037, + "reward": 0.5390625149011612, + "reward_std": 0.189414344727993, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2434 }, { "clip_ratio": 0.0, - "completion_length": 1392.9241638183594, + "completion_length": 1707.0357971191406, "epoch": 0.7273541931147786, - "grad_norm": 21.36063575744629, - "kl": 0.264404296875, - "learning_rate": 2.096626747067527e-08, - "loss": 0.0804, - "reward": 0.3666294738650322, - "reward_std": 0.18021366372704506, - "rewards/accuracy_reward": 0.04017857299186289, + "grad_norm": 10.802674293518066, + "kl": 2.728515625, + "learning_rate": 1.0483133735337635e-07, + "loss": 0.1941, + "reward": 0.5083705708384514, + "reward_std": 0.15484930016100407, + "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4436384066939354, "step": 2435 }, { "clip_ratio": 0.0, - "completion_length": 1523.62060546875, + "completion_length": 1847.6741943359375, "epoch": 0.7276529012023001, - "grad_norm": 22.1626033782959, - "kl": 0.2685546875, - "learning_rate": 2.0923825058193933e-08, - "loss": 0.0552, - "reward": 0.3945312798023224, - "reward_std": 0.19159283116459846, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 20.80423355102539, + "kl": 3.94140625, + "learning_rate": 1.0461912529096967e-07, + "loss": 0.2002, + "reward": 0.5251116305589676, + "reward_std": 0.16722449474036694, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2436 }, { "clip_ratio": 0.0, - "completion_length": 1503.8750610351562, + "completion_length": 1860.5915832519531, "epoch": 0.7279516092898215, - "grad_norm": 21.497182846069336, - "kl": 0.26708984375, - "learning_rate": 2.0881414277741177e-08, - "loss": 0.0609, - "reward": 0.3833705559372902, - "reward_std": 0.20732494443655014, - "rewards/accuracy_reward": 0.06473214528523386, + "grad_norm": 7.680060863494873, + "kl": 3.71484375, + "learning_rate": 1.0440707138870589e-07, + "loss": 0.1908, + "reward": 0.4944196715950966, + "reward_std": 0.1624172255396843, + "rewards/accuracy_reward": 0.07366071501746774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2437 }, { "clip_ratio": 0.0, - "completion_length": 1462.0491638183594, + "completion_length": 1815.8148193359375, "epoch": 0.728250317377343, - "grad_norm": 20.3282413482666, - "kl": 0.27099609375, - "learning_rate": 2.0839035175455745e-08, - "loss": 0.0713, - "reward": 0.3744419813156128, - "reward_std": 0.16175758466124535, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 12.394641876220703, + "kl": 2.900390625, + "learning_rate": 1.0419517587727872e-07, + "loss": 0.1781, + "reward": 0.4893973469734192, + "reward_std": 0.14169423654675484, + "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.431361623108387, "step": 2438 }, { "clip_ratio": 0.0, - "completion_length": 1458.8616638183594, + "completion_length": 1770.5134887695312, "epoch": 0.7285490254648644, - "grad_norm": 22.34003257751465, - "kl": 0.273681640625, - "learning_rate": 2.0796687797441974e-08, - "loss": 0.0917, - "reward": 0.4118303805589676, - "reward_std": 0.18811574950814247, - "rewards/accuracy_reward": 0.09598214784637094, + "grad_norm": 5.094322681427002, + "kl": 3.796875, + "learning_rate": 1.0398343898720987e-07, + "loss": 0.2265, + "reward": 0.5479910895228386, + "reward_std": 0.16166459023952484, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2439 }, { "clip_ratio": 0.0, - "completion_length": 1446.185302734375, + "completion_length": 1791.227783203125, "epoch": 0.728847733552386, - "grad_norm": 20.93658447265625, - "kl": 0.25830078125, - "learning_rate": 2.075437218976963e-08, - "loss": 0.0896, - "reward": 0.4012276902794838, - "reward_std": 0.19138899073004723, - "rewards/accuracy_reward": 0.06696428777649999, + "grad_norm": 8.830268859863281, + "kl": 3.91796875, + "learning_rate": 1.0377186094884815e-07, + "loss": 0.2223, + "reward": 0.5094866380095482, + "reward_std": 0.17784508690238, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.420200914144516, "step": 2440 }, { "clip_ratio": 0.0, - "completion_length": 1496.7991638183594, + "completion_length": 1761.7969665527344, "epoch": 0.7291464416399074, - "grad_norm": 19.58333396911621, - "kl": 0.29541015625, - "learning_rate": 2.0712088398473963e-08, - "loss": 0.063, - "reward": 0.4196428656578064, - "reward_std": 0.19695661030709743, - "rewards/accuracy_reward": 0.10044643399305642, + "grad_norm": 7.605828285217285, + "kl": 2.3046875, + "learning_rate": 1.0356044199236982e-07, + "loss": 0.1354, + "reward": 0.5552455633878708, + "reward_std": 0.12088223733007908, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.4414062649011612, "step": 2441 }, { "clip_ratio": 0.0, - "completion_length": 1493.8438110351562, + "completion_length": 1795.3237609863281, "epoch": 0.7294451497274289, - "grad_norm": 21.467208862304688, - "kl": 0.25830078125, - "learning_rate": 2.066983646955562e-08, - "loss": 0.064, - "reward": 0.419084832072258, - "reward_std": 0.1901966854929924, - "rewards/accuracy_reward": 0.0915178619325161, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "grad_norm": 17.28439712524414, + "kl": 2.6640625, + "learning_rate": 1.0334918234777809e-07, + "loss": 0.1694, + "reward": 0.5396205708384514, + "reward_std": 0.15077169053256512, + "rewards/accuracy_reward": 0.10044643096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.439174123108387, "step": 2442 }, { "clip_ratio": 0.0, - "completion_length": 1501.26123046875, + "completion_length": 1820.2232666015625, "epoch": 0.7297438578149503, - "grad_norm": 19.311813354492188, - "kl": 0.3232421875, - "learning_rate": 2.0627616448980505e-08, - "loss": 0.0424, - "reward": 0.3917410895228386, - "reward_std": 0.18032475747168064, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 15.56547737121582, + "kl": 3.916015625, + "learning_rate": 1.0313808224490253e-07, + "loss": 0.1993, + "reward": 0.5072544813156128, + "reward_std": 0.1442029308527708, + "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553656578064, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2443 }, { "clip_ratio": 0.0, - "completion_length": 1457.2813110351562, + "completion_length": 1797.32373046875, "epoch": 0.7300425659024719, - "grad_norm": 19.1524600982666, - "kl": 0.245361328125, - "learning_rate": 2.0585428382679893e-08, - "loss": 0.085, - "reward": 0.3945312798023224, - "reward_std": 0.1812089905142784, - "rewards/accuracy_reward": 0.05357143119908869, + "grad_norm": 6.343161106109619, + "kl": 2.859375, + "learning_rate": 1.0292714191339946e-07, + "loss": 0.1739, + "reward": 0.4910714477300644, + "reward_std": 0.16152603551745415, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.340959832072258, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2444 }, { "clip_ratio": 0.0, - "completion_length": 1443.8638916015625, + "completion_length": 1752.4665832519531, "epoch": 0.7303412739899933, - "grad_norm": 17.68728256225586, - "kl": 0.25927734375, - "learning_rate": 2.0543272316550286e-08, - "loss": 0.0784, - "reward": 0.3666294738650322, - "reward_std": 0.17374080792069435, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 7.181160926818848, + "kl": 3.1328125, + "learning_rate": 1.0271636158275143e-07, + "loss": 0.188, + "reward": 0.5217634066939354, + "reward_std": 0.13296915777027607, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4369419887661934, "step": 2445 }, { "clip_ratio": 0.0, - "completion_length": 1422.6451416015625, + "completion_length": 1811.415283203125, "epoch": 0.7306399820775148, - "grad_norm": 18.25535774230957, - "kl": 0.2392578125, - "learning_rate": 2.0501148296453306e-08, - "loss": 0.0681, - "reward": 0.4168526902794838, - "reward_std": 0.1698230467736721, - "rewards/accuracy_reward": 0.0736607147846371, + "grad_norm": 5.441793918609619, + "kl": 3.796875, + "learning_rate": 1.0250574148226654e-07, + "loss": 0.2165, + "reward": 0.5033482387661934, + "reward_std": 0.14979437738656998, + "rewards/accuracy_reward": 0.09598215017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.407366082072258, "step": 2446 }, { "clip_ratio": 0.0, - "completion_length": 1486.9576721191406, + "completion_length": 1810.32373046875, "epoch": 0.7309386901650362, - "grad_norm": 17.87949562072754, - "kl": 0.262451171875, - "learning_rate": 2.0459056368215783e-08, - "loss": 0.0588, - "reward": 0.3671875149011612, - "reward_std": 0.1528852842748165, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 9.2046480178833, + "kl": 3.09765625, + "learning_rate": 1.0229528184107892e-07, + "loss": 0.1598, + "reward": 0.4843750149011612, + "reward_std": 0.12292457930743694, + "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4330357387661934, "step": 2447 }, { "clip_ratio": 0.0, - "completion_length": 1367.529052734375, + "completion_length": 1744.9040832519531, "epoch": 0.7312373982525577, - "grad_norm": 21.501020431518555, - "kl": 0.24267578125, - "learning_rate": 2.0416996577629624e-08, - "loss": 0.068, - "reward": 0.388392873108387, - "reward_std": 0.17282630875706673, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 16.724777221679688, + "kl": 2.740234375, + "learning_rate": 1.0208498288814813e-07, + "loss": 0.1714, + "reward": 0.4776785969734192, + "reward_std": 0.1339544542133808, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3683035895228386, + "rewards/tag_count_reward": 0.450892873108387, "step": 2448 }, { "clip_ratio": 0.0, - "completion_length": 1420.30810546875, + "completion_length": 1726.0670471191406, "epoch": 0.7315361063400792, - "grad_norm": 18.667829513549805, - "kl": 0.251220703125, - "learning_rate": 2.0374968970451728e-08, - "loss": 0.0756, - "reward": 0.4330357313156128, - "reward_std": 0.17843280360102654, - "rewards/accuracy_reward": 0.09821428847499192, + "grad_norm": 9.860130310058594, + "kl": 3.68359375, + "learning_rate": 1.0187484485225864e-07, + "loss": 0.2312, + "reward": 0.547433041036129, + "reward_std": 0.14477303624153137, + "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4335937649011612, "step": 2449 }, { "clip_ratio": 0.0, - "completion_length": 1444.6473693847656, + "completion_length": 1790.884033203125, "epoch": 0.7318348144276007, - "grad_norm": 19.69434928894043, - "kl": 0.254638671875, - "learning_rate": 2.0332973592404023e-08, - "loss": 0.0728, - "reward": 0.400111623108387, - "reward_std": 0.20489004999399185, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 19.418649673461914, + "kl": 4.29296875, + "learning_rate": 1.0166486796202012e-07, + "loss": 0.2452, + "reward": 0.5083705559372902, + "reward_std": 0.1649991013109684, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2450 }, { "clip_ratio": 0.0, - "completion_length": 1464.8906860351562, + "completion_length": 1776.1496276855469, "epoch": 0.7321335225151221, - "grad_norm": 23.504188537597656, - "kl": 0.267333984375, - "learning_rate": 2.0291010489173395e-08, - "loss": 0.1011, - "reward": 0.4012276902794838, - "reward_std": 0.16463207826018333, - "rewards/accuracy_reward": 0.08258929080329835, + "grad_norm": 18.984111785888672, + "kl": 4.9921875, + "learning_rate": 1.0145505244586697e-07, + "loss": 0.2753, + "reward": 0.5189732313156128, + "reward_std": 0.1406539287418127, + "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186383992433548, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2451 }, { "clip_ratio": 0.0, - "completion_length": 1466.8840026855469, + "completion_length": 1774.2255249023438, "epoch": 0.7324322306026436, - "grad_norm": 24.40184783935547, - "kl": 0.247802734375, - "learning_rate": 2.0249079706411532e-08, - "loss": 0.0863, - "reward": 0.450892873108387, - "reward_std": 0.19064459204673767, - "rewards/accuracy_reward": 0.10714285913854837, + "grad_norm": 4.973389625549316, + "kl": 3.83203125, + "learning_rate": 1.0124539853205766e-07, + "loss": 0.2241, + "reward": 0.5290178805589676, + "reward_std": 0.14920012652873993, + "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2452 }, { "clip_ratio": 0.0, - "completion_length": 1493.1340026855469, + "completion_length": 1844.9554443359375, "epoch": 0.732730938690165, - "grad_norm": 16.736913681030273, - "kl": 0.276611328125, - "learning_rate": 2.020718128973507e-08, - "loss": 0.0716, - "reward": 0.3939732313156128, - "reward_std": 0.2179703414440155, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 4.226149559020996, + "kl": 3.421875, + "learning_rate": 1.0103590644867535e-07, + "loss": 0.1858, + "reward": 0.4860491305589676, + "reward_std": 0.17224475741386414, + "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4123884215950966, "step": 2453 }, { "clip_ratio": 0.0, - "completion_length": 1509.8170776367188, + "completion_length": 1851.4308776855469, "epoch": 0.7330296467776866, - "grad_norm": 16.2025146484375, - "kl": 0.2607421875, - "learning_rate": 2.016531528472533e-08, - "loss": 0.0612, - "reward": 0.401785746216774, - "reward_std": 0.21099768206477165, - "rewards/accuracy_reward": 0.07812500349245965, + "grad_norm": 5.98801326751709, + "kl": 3.576171875, + "learning_rate": 1.0082657642362666e-07, + "loss": 0.2057, + "reward": 0.4910714477300644, + "reward_std": 0.18428654968738556, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2454 }, { "clip_ratio": 0.0, - "completion_length": 1494.1496276855469, + "completion_length": 1805.1719665527344, "epoch": 0.733328354865208, - "grad_norm": 21.030914306640625, - "kl": 0.270751953125, - "learning_rate": 2.0123481736928453e-08, - "loss": 0.0897, - "reward": 0.3409598395228386, - "reward_std": 0.19861437380313873, - "rewards/accuracy_reward": 0.02901785750873387, + "grad_norm": 8.540138244628906, + "kl": 3.421875, + "learning_rate": 1.0061740868464227e-07, + "loss": 0.1965, + "reward": 0.4698660969734192, + "reward_std": 0.17062033526599407, + "rewards/accuracy_reward": 0.046875001629814506, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2455 }, { "clip_ratio": 0.0, - "completion_length": 1437.9509582519531, + "completion_length": 1812.634033203125, "epoch": 0.7336270629527294, - "grad_norm": 20.61189079284668, - "kl": 0.247802734375, - "learning_rate": 2.0081680691855245e-08, - "loss": 0.0857, - "reward": 0.4280134066939354, - "reward_std": 0.200772225856781, - "rewards/accuracy_reward": 0.09375000558793545, + "grad_norm": 35.15149688720703, + "kl": 1.94140625, + "learning_rate": 1.0040840345927623e-07, + "loss": 0.1212, + "reward": 0.5279018059372902, + "reward_std": 0.19380417466163635, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2456 }, { "clip_ratio": 0.0, - "completion_length": 1403.8638916015625, + "completion_length": 1686.7098999023438, "epoch": 0.7339257710402509, - "grad_norm": 23.574045181274414, - "kl": 0.256591796875, - "learning_rate": 2.0039912194981123e-08, - "loss": 0.1064, - "reward": 0.4860491305589676, - "reward_std": 0.20602257549762726, - "rewards/accuracy_reward": 0.12276786239817739, + "grad_norm": 30.371028900146484, + "kl": 1.5791015625, + "learning_rate": 1.0019956097490562e-07, + "loss": 0.1114, + "reward": 0.5892857387661934, + "reward_std": 0.16236157529056072, + "rewards/accuracy_reward": 0.13616071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812649011612, + "rewards/tag_count_reward": 0.4531250223517418, "step": 2457 }, { "clip_ratio": 0.0, - "completion_length": 1401.055908203125, + "completion_length": 1738.6585693359375, "epoch": 0.7342244791277723, - "grad_norm": 15.816662788391113, - "kl": 0.23974609375, - "learning_rate": 1.9998176291746127e-08, - "loss": 0.0739, - "reward": 0.4441964477300644, - "reward_std": 0.16661863215267658, - "rewards/accuracy_reward": 0.10267857648432255, + "grad_norm": 22.71346664428711, + "kl": 2.146484375, + "learning_rate": 9.999088145873064e-08, + "loss": 0.1417, + "reward": 0.5619419887661934, + "reward_std": 0.1414557546377182, + "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.4436384215950966, "step": 2458 }, { "clip_ratio": 0.0, - "completion_length": 1449.9286499023438, + "completion_length": 1768.9732971191406, "epoch": 0.7345231872152939, - "grad_norm": 15.239723205566406, - "kl": 0.256591796875, - "learning_rate": 1.9956473027554844e-08, - "loss": 0.0773, - "reward": 0.3839285895228386, - "reward_std": 0.20350854471325874, - "rewards/accuracy_reward": 0.06026785844005644, + "grad_norm": 12.978499412536621, + "kl": 3.859375, + "learning_rate": 9.978236513777421e-08, + "loss": 0.2077, + "reward": 0.5117187574505806, + "reward_std": 0.17660164088010788, + "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2459 }, { "clip_ratio": 0.0, - "completion_length": 1400.6116638183594, + "completion_length": 1711.8282165527344, "epoch": 0.7348218953028153, - "grad_norm": 21.04696273803711, - "kl": 0.222412109375, - "learning_rate": 1.9914802447776314e-08, - "loss": 0.0991, - "reward": 0.4659598469734192, - "reward_std": 0.20279047265648842, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 8.386109352111816, + "kl": 2.7734375, + "learning_rate": 9.957401223888156e-08, + "loss": 0.1661, + "reward": 0.5401785969734192, + "reward_std": 0.12458506878465414, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.365513414144516, + "rewards/tag_count_reward": 0.4464285895228386, "step": 2460 }, { "clip_ratio": 0.0, - "completion_length": 1410.9308776855469, + "completion_length": 1736.68310546875, "epoch": 0.7351206033903368, - "grad_norm": 19.716442108154297, - "kl": 0.269775390625, - "learning_rate": 1.9873164597744045e-08, - "loss": 0.0898, - "reward": 0.4804687798023224, - "reward_std": 0.16579106822609901, - "rewards/accuracy_reward": 0.1562500074505806, + "grad_norm": 7.0329813957214355, + "kl": 4.103515625, + "learning_rate": 9.936582298872023e-08, + "loss": 0.243, + "reward": 0.6149553805589676, + "reward_std": 0.13996917940676212, + "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2461 }, { "clip_ratio": 0.0, - "completion_length": 1481.7656555175781, + "completion_length": 1843.3639221191406, "epoch": 0.7354193114778582, - "grad_norm": 12.566195487976074, - "kl": 0.259033203125, - "learning_rate": 1.9831559522755976e-08, - "loss": 0.0414, - "reward": 0.357142873108387, - "reward_std": 0.1364893615245819, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 7.624131679534912, + "kl": 4.20703125, + "learning_rate": 9.915779761377987e-08, + "loss": 0.2334, + "reward": 0.4838169738650322, + "reward_std": 0.13326245918869972, + "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.4280134066939354, "step": 2462 }, { "clip_ratio": 0.0, - "completion_length": 1388.9643859863281, + "completion_length": 1703.0246276855469, "epoch": 0.7357180195653797, - "grad_norm": 16.93359375, - "kl": 0.267578125, - "learning_rate": 1.9789987268074294e-08, - "loss": 0.0776, - "reward": 0.5212053880095482, - "reward_std": 0.2103702649474144, - "rewards/accuracy_reward": 0.19196429662406445, + "grad_norm": 9.913413047790527, + "kl": 3.69921875, + "learning_rate": 9.894993634037147e-08, + "loss": 0.2067, + "reward": 0.6735491454601288, + "reward_std": 0.18679275922477245, + "rewards/accuracy_reward": 0.2433035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2463 }, { "clip_ratio": 0.0, - "completion_length": 1469.9175109863281, + "completion_length": 1805.6942749023438, "epoch": 0.7360167276529012, - "grad_norm": 16.46976661682129, - "kl": 0.291748046875, - "learning_rate": 1.9748447878925567e-08, - "loss": 0.0787, - "reward": 0.3459821566939354, - "reward_std": 0.1451220829039812, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 22.81328773498535, + "kl": 4.88671875, + "learning_rate": 9.874223939462784e-08, + "loss": 0.2542, + "reward": 0.4642857387661934, + "reward_std": 0.1239042580127716, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714365541935, + "rewards/tag_count_reward": 0.408482164144516, "step": 2464 }, { "clip_ratio": 0.0, - "completion_length": 1458.5647888183594, + "completion_length": 1799.2880249023438, "epoch": 0.7363154357404227, - "grad_norm": 21.197179794311523, - "kl": 0.249755859375, - "learning_rate": 1.97069414005006e-08, - "loss": 0.0762, - "reward": 0.3537946566939354, - "reward_std": 0.1871011033654213, - "rewards/accuracy_reward": 0.020089287078008056, + "grad_norm": 21.951324462890625, + "kl": 2.96875, + "learning_rate": 9.8534707002503e-08, + "loss": 0.1756, + "reward": 0.4492187723517418, + "reward_std": 0.13758663088083267, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4291294887661934, "step": 2465 }, { "clip_ratio": 0.0, - "completion_length": 1492.3661804199219, + "completion_length": 1827.83935546875, "epoch": 0.7366141438279441, - "grad_norm": 15.961751937866211, - "kl": 0.271240234375, - "learning_rate": 1.966546787795433e-08, - "loss": 0.0663, - "reward": 0.3164062574505806, - "reward_std": 0.16967228055000305, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 5.277919769287109, + "kl": 3.609375, + "learning_rate": 9.832733938977164e-08, + "loss": 0.2072, + "reward": 0.451450914144516, + "reward_std": 0.1796775683760643, + "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.411272332072258, "step": 2466 }, { "clip_ratio": 0.0, - "completion_length": 1547.1897888183594, + "completion_length": 1834.5626220703125, "epoch": 0.7369128519154656, - "grad_norm": 18.31294059753418, - "kl": 0.27587890625, - "learning_rate": 1.9624027356405914e-08, - "loss": 0.0771, - "reward": 0.4760044887661934, - "reward_std": 0.1564146690070629, - "rewards/accuracy_reward": 0.1808035783469677, + "grad_norm": 20.796598434448242, + "kl": 2.9375, + "learning_rate": 9.812013678202957e-08, + "loss": 0.1798, + "reward": 0.6054687798023224, + "reward_std": 0.11634495109319687, + "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952008992433548, + "rewards/tag_count_reward": 0.4224330633878708, "step": 2467 }, { "clip_ratio": 0.0, - "completion_length": 1448.6384582519531, + "completion_length": 1768.2813415527344, "epoch": 0.737211560002987, - "grad_norm": 16.263673782348633, - "kl": 0.245361328125, - "learning_rate": 1.9582619880938563e-08, - "loss": 0.0937, - "reward": 0.3320312723517418, - "reward_std": 0.15081966295838356, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 15.44597053527832, + "kl": 3.49609375, + "learning_rate": 9.791309940469281e-08, + "loss": 0.2138, + "reward": 0.4213169813156128, + "reward_std": 0.12068344466388226, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2468 }, { "clip_ratio": 0.0, - "completion_length": 1430.4197082519531, + "completion_length": 1773.6764221191406, "epoch": 0.7375102680905086, - "grad_norm": 16.930828094482422, - "kl": 0.23291015625, - "learning_rate": 1.954124549659955e-08, - "loss": 0.0853, - "reward": 0.4268973469734192, - "reward_std": 0.20167602971196175, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 6.350347518920898, + "kl": 3.2734375, + "learning_rate": 9.770622748299775e-08, + "loss": 0.1928, + "reward": 0.5440848395228386, + "reward_std": 0.16485394537448883, + "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401977300644, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2469 }, { "clip_ratio": 0.0, - "completion_length": 1418.5335693359375, + "completion_length": 1715.4666137695312, "epoch": 0.73780897617803, - "grad_norm": 17.139368057250977, - "kl": 0.24658203125, - "learning_rate": 1.949990424840018e-08, - "loss": 0.0968, - "reward": 0.4375000149011612, - "reward_std": 0.2010096199810505, - "rewards/accuracy_reward": 0.10491072060540318, + "grad_norm": 6.997522354125977, + "kl": 3.44140625, + "learning_rate": 9.749952124200089e-08, + "loss": 0.2275, + "reward": 0.5697544738650322, + "reward_std": 0.14852512441575527, + "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2470 }, { "clip_ratio": 0.0, - "completion_length": 1358.0625610351562, + "completion_length": 1712.7389221191406, "epoch": 0.7381076842655515, - "grad_norm": 19.369150161743164, - "kl": 0.275146484375, - "learning_rate": 1.945859618131564e-08, - "loss": 0.0876, - "reward": 0.4386160969734192, - "reward_std": 0.19326533004641533, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 7.35548734664917, + "kl": 3.73046875, + "learning_rate": 9.72929809065782e-08, + "loss": 0.2412, + "reward": 0.5446428954601288, + "reward_std": 0.16495750844478607, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2471 }, { "clip_ratio": 0.0, - "completion_length": 1412.2255249023438, + "completion_length": 1725.716552734375, "epoch": 0.7384063923530729, - "grad_norm": 21.51898765563965, - "kl": 0.272705078125, - "learning_rate": 1.9417321340285075e-08, - "loss": 0.1021, - "reward": 0.5267857387661934, - "reward_std": 0.18275799229741096, - "rewards/accuracy_reward": 0.17187500977888703, + "grad_norm": 43.503604888916016, + "kl": 5.53515625, + "learning_rate": 9.708660670142538e-08, + "loss": 0.2927, + "reward": 0.6222098395228386, + "reward_std": 0.15264825522899628, + "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2472 }, { "clip_ratio": 0.0, - "completion_length": 1485.2790832519531, + "completion_length": 1775.7545471191406, "epoch": 0.7387051004405945, - "grad_norm": 15.690969467163086, - "kl": 0.250732421875, - "learning_rate": 1.9376079770211486e-08, - "loss": 0.098, - "reward": 0.4274553656578064, - "reward_std": 0.19920217245817184, - "rewards/accuracy_reward": 0.09821429080329835, + "grad_norm": 23.964923858642578, + "kl": 3.30078125, + "learning_rate": 9.688039885105742e-08, + "loss": 0.1757, + "reward": 0.5736607313156128, + "reward_std": 0.16235435381531715, + "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4508928805589676, "step": 2473 }, { "clip_ratio": 0.0, - "completion_length": 1470.6920166015625, + "completion_length": 1817.0045776367188, "epoch": 0.7390038085281159, - "grad_norm": 23.325002670288086, - "kl": 0.32421875, - "learning_rate": 1.9334871515961616e-08, - "loss": 0.0929, - "reward": 0.4263392984867096, - "reward_std": 0.1480097360908985, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 42.71257400512695, + "kl": 6.9140625, + "learning_rate": 9.667435757980808e-08, + "loss": 0.3571, + "reward": 0.5312500149011612, + "reward_std": 0.13582192361354828, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.4062500223517418, "step": 2474 }, { "clip_ratio": 0.0, - "completion_length": 1455.8236999511719, + "completion_length": 1745.5670471191406, "epoch": 0.7393025166156374, - "grad_norm": 20.87278938293457, - "kl": 0.25390625, - "learning_rate": 1.9293696622366034e-08, - "loss": 0.0829, - "reward": 0.4040178805589676, - "reward_std": 0.1839243620634079, - "rewards/accuracy_reward": 0.06696428940631449, + "grad_norm": 22.660520553588867, + "kl": 4.81640625, + "learning_rate": 9.646848311183018e-08, + "loss": 0.2645, + "reward": 0.4810268133878708, + "reward_std": 0.16368885710835457, + "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337053582072258, + "rewards/tag_count_reward": 0.416294664144516, "step": 2475 }, { "clip_ratio": 0.0, - "completion_length": 1508.0826721191406, + "completion_length": 1787.0781860351562, "epoch": 0.7396012247031588, - "grad_norm": 15.614667892456055, - "kl": 0.2705078125, - "learning_rate": 1.9252555134219005e-08, - "loss": 0.078, - "reward": 0.3141741305589676, - "reward_std": 0.17687185481190681, - "rewards/accuracy_reward": 0.01116071455180645, + "grad_norm": 11.5211820602417, + "kl": 3.203125, + "learning_rate": 9.626277567109503e-08, + "loss": 0.1864, + "reward": 0.4520089477300644, + "reward_std": 0.15423032455146313, + "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.303013414144516, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2476 }, { "clip_ratio": 0.0, - "completion_length": 1393.544677734375, + "completion_length": 1703.2880249023438, "epoch": 0.7398999327906803, - "grad_norm": 14.569531440734863, - "kl": 0.231689453125, - "learning_rate": 1.92114470962784e-08, - "loss": 0.0743, - "reward": 0.4481026902794838, - "reward_std": 0.2363801784813404, - "rewards/accuracy_reward": 0.10491072060540318, + "grad_norm": 47.9050407409668, + "kl": 1.58203125, + "learning_rate": 9.6057235481392e-08, + "loss": 0.1335, + "reward": 0.5468750223517418, + "reward_std": 0.20343011990189552, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4508928805589676, "step": 2477 }, { "clip_ratio": 0.0, - "completion_length": 1451.5045471191406, + "completion_length": 1760.9063415527344, "epoch": 0.7401986408782018, - "grad_norm": 19.766706466674805, - "kl": 0.2646484375, - "learning_rate": 1.917037255326575e-08, - "loss": 0.0723, - "reward": 0.3582589477300644, - "reward_std": 0.1707298345863819, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 43.88084030151367, + "kl": 2.48046875, + "learning_rate": 9.585186276632875e-08, + "loss": 0.1498, + "reward": 0.486049123108387, + "reward_std": 0.1505572684109211, + "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125223517418, + "rewards/tag_count_reward": 0.432477705180645, "step": 2478 }, { "clip_ratio": 0.0, - "completion_length": 1455.5246276855469, + "completion_length": 1787.3750915527344, "epoch": 0.7404973489657233, - "grad_norm": 21.634777069091797, - "kl": 0.253173828125, - "learning_rate": 1.912933154986615e-08, - "loss": 0.0844, - "reward": 0.4213169813156128, - "reward_std": 0.22713232785463333, - "rewards/accuracy_reward": 0.08705357741564512, + "grad_norm": 12116.1376953125, + "kl": 23.037109375, + "learning_rate": 9.564665774933075e-08, + "loss": 0.9966, + "reward": 0.5362723395228386, + "reward_std": 0.21773753687739372, + "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342633992433548, + "rewards/tag_count_reward": 0.4179687723517418, "step": 2479 }, { "clip_ratio": 0.0, - "completion_length": 1502.5067749023438, + "completion_length": 1781.1697387695312, "epoch": 0.7407960570532447, - "grad_norm": 21.933643341064453, - "kl": 0.271484375, - "learning_rate": 1.9088324130728163e-08, - "loss": 0.0897, - "reward": 0.3593750074505806, - "reward_std": 0.1836048997938633, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 10.233570098876953, + "kl": 3.302734375, + "learning_rate": 9.54416206536408e-08, + "loss": 0.1815, + "reward": 0.509486623108387, + "reward_std": 0.138621149584651, + "rewards/accuracy_reward": 0.08035714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714402794838, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2480 }, { "clip_ratio": 0.0, - "completion_length": 1505.0491943359375, + "completion_length": 1835.0357971191406, "epoch": 0.7410947651407662, - "grad_norm": 17.610992431640625, - "kl": 0.240966796875, - "learning_rate": 1.9047350340463858e-08, - "loss": 0.0653, - "reward": 0.4921875223517418, - "reward_std": 0.2252787910401821, - "rewards/accuracy_reward": 0.16071429336443543, + "grad_norm": 15.677043914794922, + "kl": 2.337890625, + "learning_rate": 9.523675170231929e-08, + "loss": 0.1415, + "reward": 0.6434151977300644, + "reward_std": 0.22734371572732925, + "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.4425223395228386, "step": 2481 }, { "clip_ratio": 0.0, - "completion_length": 1488.96435546875, + "completion_length": 1766.77685546875, "epoch": 0.7413934732282876, - "grad_norm": 20.761030197143555, - "kl": 0.265625, - "learning_rate": 1.900641022364869e-08, - "loss": 0.1044, - "reward": 0.4497768059372902, - "reward_std": 0.21831408888101578, - "rewards/accuracy_reward": 0.13839286286383867, + "grad_norm": 16.84552574157715, + "kl": 3.46875, + "learning_rate": 9.503205111824344e-08, + "loss": 0.2271, + "reward": 0.5837053805589676, + "reward_std": 0.16218005679547787, + "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2482 }, { "clip_ratio": 0.0, - "completion_length": 1465.5536193847656, + "completion_length": 1785.1295776367188, "epoch": 0.7416921813158092, - "grad_norm": 23.185705184936523, - "kl": 0.273193359375, - "learning_rate": 1.8965503824821493e-08, - "loss": 0.0924, - "reward": 0.3844866156578064, - "reward_std": 0.15811992809176445, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 9.836202621459961, + "kl": 4.35546875, + "learning_rate": 9.482751912410748e-08, + "loss": 0.2488, + "reward": 0.501116082072258, + "reward_std": 0.09269302058964968, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2483 }, { "clip_ratio": 0.0, - "completion_length": 1433.2098999023438, + "completion_length": 1730.0782165527344, "epoch": 0.7419908894033306, - "grad_norm": 20.277729034423828, - "kl": 0.24072265625, - "learning_rate": 1.8924631188484445e-08, - "loss": 0.0881, - "reward": 0.4112723395228386, - "reward_std": 0.16480418108403683, - "rewards/accuracy_reward": 0.07812500093132257, + "grad_norm": 6.52803373336792, + "kl": 3.138671875, + "learning_rate": 9.462315594242223e-08, + "loss": 0.1921, + "reward": 0.5424107313156128, + "reward_std": 0.15229442343115807, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2484 }, { "clip_ratio": 0.0, - "completion_length": 1522.5313110351562, + "completion_length": 1847.9755554199219, "epoch": 0.7422895974908521, - "grad_norm": 17.23532485961914, - "kl": 0.25830078125, - "learning_rate": 1.8883792359102935e-08, - "loss": 0.0896, - "reward": 0.3191964477300644, - "reward_std": 0.1770779862999916, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 24.102272033691406, + "kl": 4.83984375, + "learning_rate": 9.441896179551467e-08, + "loss": 0.2496, + "reward": 0.4492187723517418, + "reward_std": 0.15179810300469398, + "rewards/accuracy_reward": 0.02901785750873387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.420200914144516, "step": 2485 }, { "clip_ratio": 0.0, - "completion_length": 1448.5067749023438, + "completion_length": 1797.5536499023438, "epoch": 0.7425883055783735, - "grad_norm": 16.539165496826172, - "kl": 0.250732421875, - "learning_rate": 1.8842987381105624e-08, - "loss": 0.0633, - "reward": 0.3856026977300644, - "reward_std": 0.16939463838934898, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 35.3285026550293, + "kl": 4.9609375, + "learning_rate": 9.421493690552812e-08, + "loss": 0.2487, + "reward": 0.4832589477300644, + "reward_std": 0.1297183446586132, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4140625223517418, "step": 2486 }, { "clip_ratio": 0.0, - "completion_length": 1578.9130249023438, + "completion_length": 1855.9331665039062, "epoch": 0.742887013665895, - "grad_norm": 19.75218391418457, - "kl": 0.27490234375, - "learning_rate": 1.8802216298884345e-08, - "loss": 0.0695, - "reward": 0.365513414144516, - "reward_std": 0.17991004511713982, - "rewards/accuracy_reward": 0.06250000302679837, + "grad_norm": 8.837518692016602, + "kl": 4.29296875, + "learning_rate": 9.401108149442172e-08, + "loss": 0.2242, + "reward": 0.474330373108387, + "reward_std": 0.1323937438428402, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2487 }, { "clip_ratio": 0.0, - "completion_length": 1514.0558776855469, + "completion_length": 1813.4331359863281, "epoch": 0.7431857217534165, - "grad_norm": 20.72906494140625, - "kl": 0.29150390625, - "learning_rate": 1.8761479156793996e-08, - "loss": 0.0909, - "reward": 0.333147332072258, - "reward_std": 0.16239267960190773, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 5.106071472167969, + "kl": 3.87109375, + "learning_rate": 9.380739578396999e-08, + "loss": 0.2177, + "reward": 0.495535746216774, + "reward_std": 0.16601544246077538, + "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2929687649011612, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2488 }, { "clip_ratio": 0.0, - "completion_length": 1489.2210693359375, + "completion_length": 1793.4800109863281, "epoch": 0.743484429840938, - "grad_norm": 19.023496627807617, - "kl": 0.251708984375, - "learning_rate": 1.8720775999152627e-08, - "loss": 0.0916, - "reward": 0.4380580484867096, - "reward_std": 0.17026307061314583, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 33.44401550292969, + "kl": 2.517578125, + "learning_rate": 9.360387999576314e-08, + "loss": 0.1951, + "reward": 0.5479910895228386, + "reward_std": 0.14646285399794579, + "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544738650322, + "rewards/tag_count_reward": 0.422991082072258, "step": 2489 }, { "clip_ratio": 0.0, - "completion_length": 1527.384033203125, + "completion_length": 1834.5179443359375, "epoch": 0.7437831379284594, - "grad_norm": 13.306137084960938, - "kl": 0.278564453125, - "learning_rate": 1.8680106870241296e-08, - "loss": 0.0628, - "reward": 0.365513414144516, - "reward_std": 0.1711796224117279, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 9.561552047729492, + "kl": 3.609375, + "learning_rate": 9.340053435120648e-08, + "loss": 0.202, + "reward": 0.4609375074505806, + "reward_std": 0.1355385221540928, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.407366082072258, "step": 2490 }, { "clip_ratio": 0.0, - "completion_length": 1517.399658203125, + "completion_length": 1838.0759887695312, "epoch": 0.7440818460159809, - "grad_norm": 21.240888595581055, - "kl": 0.221923828125, - "learning_rate": 1.863947181430399e-08, - "loss": 0.0803, - "reward": 0.4146205484867096, - "reward_std": 0.15788529440760612, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 27.870573043823242, + "kl": 2.29296875, + "learning_rate": 9.319735907151996e-08, + "loss": 0.1288, + "reward": 0.514508955180645, + "reward_std": 0.1199816819280386, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4319196715950966, "step": 2491 }, { "clip_ratio": 0.0, - "completion_length": 1454.9308776855469, + "completion_length": 1829.4018249511719, "epoch": 0.7443805541035023, - "grad_norm": 15.7232027053833, - "kl": 0.2802734375, - "learning_rate": 1.859887087554769e-08, - "loss": 0.0742, - "reward": 0.3973214402794838, - "reward_std": 0.17027786374092102, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 8.88051700592041, + "kl": 3.072265625, + "learning_rate": 9.299435437773845e-08, + "loss": 0.1911, + "reward": 0.5167410969734192, + "reward_std": 0.14815832115709782, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714328289032, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2492 }, { "clip_ratio": 0.0, - "completion_length": 1460.2009582519531, + "completion_length": 1785.5514526367188, "epoch": 0.7446792621910239, - "grad_norm": 16.480937957763672, - "kl": 0.26806640625, - "learning_rate": 1.855830409814225e-08, - "loss": 0.0786, - "reward": 0.3917410895228386, - "reward_std": 0.14875664189457893, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 22.785863876342773, + "kl": 3.16015625, + "learning_rate": 9.279152049071126e-08, + "loss": 0.2108, + "reward": 0.4899553656578064, + "reward_std": 0.1146912183612585, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2493 }, { "clip_ratio": 0.0, - "completion_length": 1420.2098693847656, + "completion_length": 1722.7210693359375, "epoch": 0.7449779702785453, - "grad_norm": 15.533133506774902, - "kl": 0.2529296875, - "learning_rate": 1.851777152622032e-08, - "loss": 0.0918, - "reward": 0.4637276977300644, - "reward_std": 0.20613892376422882, - "rewards/accuracy_reward": 0.1339285778813064, + "grad_norm": 9.721294403076172, + "kl": 4.40625, + "learning_rate": 9.258885763110161e-08, + "loss": 0.2759, + "reward": 0.5385044738650322, + "reward_std": 0.17283786460757256, + "rewards/accuracy_reward": 0.11607143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2494 }, { "clip_ratio": 0.0, - "completion_length": 1491.6652221679688, + "completion_length": 1831.82373046875, "epoch": 0.7452766783660668, - "grad_norm": 15.18786907196045, - "kl": 0.250732421875, - "learning_rate": 1.8477273203877398e-08, - "loss": 0.0893, - "reward": 0.404575914144516, - "reward_std": 0.17783695831894875, - "rewards/accuracy_reward": 0.08258928847499192, + "grad_norm": 22.2852783203125, + "kl": 4.71875, + "learning_rate": 9.238636601938699e-08, + "loss": 0.25, + "reward": 0.5011161044239998, + "reward_std": 0.14202029816806316, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2495 }, { "clip_ratio": 0.0, - "completion_length": 1426.5179138183594, + "completion_length": 1735.7188110351562, "epoch": 0.7455753864535882, - "grad_norm": 15.432900428771973, - "kl": 0.25927734375, - "learning_rate": 1.8436809175171648e-08, - "loss": 0.0914, - "reward": 0.4330357313156128, - "reward_std": 0.2035733200609684, - "rewards/accuracy_reward": 0.11383929220028222, + "grad_norm": 9.159541130065918, + "kl": 4.0546875, + "learning_rate": 9.218404587585824e-08, + "loss": 0.2393, + "reward": 0.549107164144516, + "reward_std": 0.15040829591453075, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2496 }, { "clip_ratio": 0.0, - "completion_length": 1477.6272888183594, + "completion_length": 1769.4487609863281, "epoch": 0.7458740945411098, - "grad_norm": 17.163551330566406, - "kl": 0.2392578125, - "learning_rate": 1.839637948412399e-08, - "loss": 0.0991, - "reward": 0.392857164144516, - "reward_std": 0.1816793605685234, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 4.916046142578125, + "kl": 3.048828125, + "learning_rate": 9.198189742061996e-08, + "loss": 0.172, + "reward": 0.502232164144516, + "reward_std": 0.14042324759066105, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337053582072258, + "rewards/tag_count_reward": 0.435267873108387, "step": 2497 }, { "clip_ratio": 0.0, - "completion_length": 1495.243408203125, + "completion_length": 1760.1139526367188, "epoch": 0.7461728026286312, - "grad_norm": 14.347710609436035, - "kl": 0.2919921875, - "learning_rate": 1.8355984174717993e-08, - "loss": 0.0788, - "reward": 0.3727678880095482, - "reward_std": 0.1743791475892067, - "rewards/accuracy_reward": 0.07812500186264515, + "grad_norm": 9.095718383789062, + "kl": 4.125, + "learning_rate": 9.177992087358996e-08, + "loss": 0.2452, + "reward": 0.5172991305589676, + "reward_std": 0.1437222994863987, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2946428805589676, + "rewards/tag_count_reward": 0.412388414144516, "step": 2498 }, { "clip_ratio": 0.0, - "completion_length": 1358.8259582519531, + "completion_length": 1700.16748046875, "epoch": 0.7464715107161526, - "grad_norm": 16.26650619506836, - "kl": 0.2119140625, - "learning_rate": 1.831562329089974e-08, - "loss": 0.0997, - "reward": 0.4860491380095482, - "reward_std": 0.1765441931784153, - "rewards/accuracy_reward": 0.12723214970901608, + "grad_norm": 22.993492126464844, + "kl": 2.498046875, + "learning_rate": 9.15781164544987e-08, + "loss": 0.1606, + "reward": 0.593191996216774, + "reward_std": 0.17209437116980553, + "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169813156128, + "rewards/tag_count_reward": 0.4414062649011612, "step": 2499 }, { "clip_ratio": 0.0, - "completion_length": 1453.44873046875, + "completion_length": 1814.8505554199219, "epoch": 0.7467702188036741, - "grad_norm": 16.2183780670166, - "kl": 0.23779296875, - "learning_rate": 1.8275296876577944e-08, - "loss": 0.0677, - "reward": 0.4023437649011612, - "reward_std": 0.23276053741574287, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 14.00835132598877, + "kl": 3.890625, + "learning_rate": 9.137648438288972e-08, + "loss": 0.1909, + "reward": 0.5117187723517418, + "reward_std": 0.23183442652225494, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080559372902, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2500 }, { "clip_ratio": 0.0, - "completion_length": 1483.4598693847656, + "completion_length": 1826.1384887695312, "epoch": 0.7470689268911955, - "grad_norm": 16.89532470703125, - "kl": 0.24072265625, - "learning_rate": 1.8235004975623814e-08, - "loss": 0.0931, - "reward": 0.3705357387661934, - "reward_std": 0.21173127368092537, - "rewards/accuracy_reward": 0.04017857415601611, + "grad_norm": 15.795931816101074, + "kl": 2.62890625, + "learning_rate": 9.117502487811907e-08, + "loss": 0.1518, + "reward": 0.4882812723517418, + "reward_std": 0.1846756376326084, + "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.330357164144516, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2501 }, { "clip_ratio": 0.0, - "completion_length": 1365.8661499023438, + "completion_length": 1696.4486999511719, "epoch": 0.747367634978717, - "grad_norm": 14.642939567565918, - "kl": 0.232421875, - "learning_rate": 1.8194747631870942e-08, - "loss": 0.0568, - "reward": 0.4218750149011612, - "reward_std": 0.1666522491723299, - "rewards/accuracy_reward": 0.07589285867288709, + "grad_norm": 16.167802810668945, + "kl": 2.875, + "learning_rate": 9.097373815935472e-08, + "loss": 0.1732, + "reward": 0.4966518059372902, + "reward_std": 0.1530086062848568, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821492433548, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2502 }, { "clip_ratio": 0.0, - "completion_length": 1452.9308776855469, + "completion_length": 1757.5804138183594, "epoch": 0.7476663430662385, - "grad_norm": 20.03021240234375, - "kl": 0.267333984375, - "learning_rate": 1.8154524889115407e-08, - "loss": 0.0888, - "reward": 0.404575914144516, - "reward_std": 0.1847817450761795, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 21.651351928710938, + "kl": 2.45703125, + "learning_rate": 9.077262444557704e-08, + "loss": 0.1477, + "reward": 0.537946455180645, + "reward_std": 0.16259736940264702, + "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2503 }, { "clip_ratio": 0.0, - "completion_length": 1488.8929443359375, + "completion_length": 1814.60498046875, "epoch": 0.74796505115376, - "grad_norm": 16.185766220092773, - "kl": 0.289794921875, - "learning_rate": 1.8114336791115608e-08, - "loss": 0.058, - "reward": 0.353236623108387, - "reward_std": 0.1957206130027771, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 7.348222732543945, + "kl": 3.5, + "learning_rate": 9.057168395557805e-08, + "loss": 0.1798, + "reward": 0.474330373108387, + "reward_std": 0.1610089372843504, + "rewards/accuracy_reward": 0.049107145983725786, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2504 }, { "clip_ratio": 0.0, - "completion_length": 1493.3304443359375, + "completion_length": 1809.9375610351562, "epoch": 0.7482637592412814, - "grad_norm": 17.57107162475586, - "kl": 0.243408203125, - "learning_rate": 1.807418338159224e-08, - "loss": 0.0985, - "reward": 0.3364955484867096, - "reward_std": 0.16298753023147583, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 14.76254940032959, + "kl": 3.16015625, + "learning_rate": 9.03709169079612e-08, + "loss": 0.1818, + "reward": 0.4648437723517418, + "reward_std": 0.15371510200202465, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.4268973469734192, "step": 2505 }, { "clip_ratio": 0.0, - "completion_length": 1475.7522888183594, + "completion_length": 1785.1005249023438, "epoch": 0.7485624673288029, - "grad_norm": 17.779600143432617, - "kl": 0.254150390625, - "learning_rate": 1.803406470422828e-08, - "loss": 0.0945, - "reward": 0.3565848395228386, - "reward_std": 0.18717077001929283, - "rewards/accuracy_reward": 0.029017859138548374, + "grad_norm": 5.664505958557129, + "kl": 4.015625, + "learning_rate": 9.01703235211414e-08, + "loss": 0.2246, + "reward": 0.462611623108387, + "reward_std": 0.1539636105298996, + "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2506 }, { "clip_ratio": 0.0, - "completion_length": 1526.8259887695312, + "completion_length": 1801.2143859863281, "epoch": 0.7488611754163244, - "grad_norm": 14.23062515258789, - "kl": 0.25341796875, - "learning_rate": 1.7993980802668944e-08, - "loss": 0.067, - "reward": 0.3470982313156128, - "reward_std": 0.1691833958029747, - "rewards/accuracy_reward": 0.026785715483129025, + "grad_norm": 5.6104607582092285, + "kl": 3.61328125, + "learning_rate": 8.996990401334473e-08, + "loss": 0.1938, + "reward": 0.4380580559372902, + "reward_std": 0.13196823187172413, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2507 }, { "clip_ratio": 0.0, - "completion_length": 1400.060302734375, + "completion_length": 1664.4822387695312, "epoch": 0.7491598835038459, - "grad_norm": 18.974042892456055, - "kl": 0.232421875, - "learning_rate": 1.7953931720521553e-08, - "loss": 0.104, - "reward": 0.4553571566939354, - "reward_std": 0.17463848739862442, - "rewards/accuracy_reward": 0.10044643562287092, + "grad_norm": 10.017193794250488, + "kl": 2.720703125, + "learning_rate": 8.976965860260777e-08, + "loss": 0.1816, + "reward": 0.5775669813156128, + "reward_std": 0.18173165619373322, + "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.4458705559372902, "step": 2508 }, { "clip_ratio": 0.0, - "completion_length": 1458.477783203125, + "completion_length": 1721.05810546875, "epoch": 0.7494585915913673, - "grad_norm": 20.276601791381836, - "kl": 0.287109375, - "learning_rate": 1.7913917501355606e-08, - "loss": 0.1303, - "reward": 0.3431919813156128, - "reward_std": 0.19655732065439224, - "rewards/accuracy_reward": 0.020089286845177412, + "grad_norm": 16.97480010986328, + "kl": 2.900390625, + "learning_rate": 8.956958750677804e-08, + "loss": 0.1879, + "reward": 0.4765625149011612, + "reward_std": 0.1632473971694708, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4408482387661934, "step": 2509 }, { "clip_ratio": 0.0, - "completion_length": 1561.6161499023438, + "completion_length": 1826.3572082519531, "epoch": 0.7497572996788888, - "grad_norm": 16.1690616607666, - "kl": 0.2880859375, - "learning_rate": 1.7873938188702637e-08, - "loss": 0.0972, - "reward": 0.3270089477300644, - "reward_std": 0.16356050968170166, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 5.590494155883789, + "kl": 4.48046875, + "learning_rate": 8.936969094351318e-08, + "loss": 0.2445, + "reward": 0.4531250223517418, + "reward_std": 0.1315528191626072, + "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.286830373108387, + "rewards/tag_count_reward": 0.4062500149011612, "step": 2510 }, { "clip_ratio": 0.0, - "completion_length": 1460.3772888183594, + "completion_length": 1706.0134887695312, "epoch": 0.7500560077664102, - "grad_norm": 17.294185638427734, - "kl": 0.23486328125, - "learning_rate": 1.783399382605622e-08, - "loss": 0.097, - "reward": 0.4408482313156128, - "reward_std": 0.2199273444712162, - "rewards/accuracy_reward": 0.10267857578583062, + "grad_norm": 9.51931381225586, + "kl": 3.36328125, + "learning_rate": 8.916996913028111e-08, + "loss": 0.2182, + "reward": 0.5412946790456772, + "reward_std": 0.16308420337736607, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4453125149011612, "step": 2511 }, { "clip_ratio": 0.0, - "completion_length": 1426.8817443847656, + "completion_length": 1731.274658203125, "epoch": 0.7503547158539318, - "grad_norm": 18.85512924194336, - "kl": 0.29345703125, - "learning_rate": 1.7794084456871934e-08, - "loss": 0.0942, - "reward": 0.486607164144516, - "reward_std": 0.17993799969553947, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 27.894014358520508, + "kl": 5.06640625, + "learning_rate": 8.897042228435966e-08, + "loss": 0.2963, + "reward": 0.5708705633878708, + "reward_std": 0.14316944405436516, + "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928656578064, + "rewards/tag_count_reward": 0.412388414144516, "step": 2512 }, { "clip_ratio": 0.0, - "completion_length": 1423.2656860351562, + "completion_length": 1665.7701416015625, "epoch": 0.7506534239414532, - "grad_norm": 17.899051666259766, - "kl": 0.27978515625, - "learning_rate": 1.7754210124567215e-08, - "loss": 0.103, - "reward": 0.435267873108387, - "reward_std": 0.18771584704518318, - "rewards/accuracy_reward": 0.1071428656578064, + "grad_norm": 6.558772087097168, + "kl": 4.0703125, + "learning_rate": 8.877105062283608e-08, + "loss": 0.2457, + "reward": 0.5546875298023224, + "reward_std": 0.14785056747496128, + "rewards/accuracy_reward": 0.11160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.4430803805589676, "step": 2513 }, { "clip_ratio": 0.0, - "completion_length": 1455.8795471191406, + "completion_length": 1769.9375610351562, "epoch": 0.7509521320289747, - "grad_norm": 13.993965148925781, - "kl": 0.2568359375, - "learning_rate": 1.771437087252146e-08, - "loss": 0.0825, - "reward": 0.3716517984867096, - "reward_std": 0.19263339415192604, - "rewards/accuracy_reward": 0.058035718742758036, + "grad_norm": 56.47796630859375, + "kl": 6.29296875, + "learning_rate": 8.857185436260731e-08, + "loss": 0.3165, + "reward": 0.4726562723517418, + "reward_std": 0.12784571200609207, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2514 }, { "clip_ratio": 0.0, - "completion_length": 1459.12060546875, + "completion_length": 1806.4621276855469, "epoch": 0.7512508401164961, - "grad_norm": 18.098291397094727, - "kl": 0.29296875, - "learning_rate": 1.767456674407588e-08, - "loss": 0.0767, - "reward": 0.3911830559372902, - "reward_std": 0.20477458462119102, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 16.063987731933594, + "kl": 4.48828125, + "learning_rate": 8.83728337203794e-08, + "loss": 0.2585, + "reward": 0.4977678805589676, + "reward_std": 0.18256431072950363, + "rewards/accuracy_reward": 0.08258929196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/tag_count_reward": 0.4151785969734192, "step": 2515 }, { "clip_ratio": 0.0, - "completion_length": 1423.9018249511719, + "completion_length": 1686.83935546875, "epoch": 0.7515495482040176, - "grad_norm": 20.637866973876953, - "kl": 0.27490234375, - "learning_rate": 1.7634797782533433e-08, - "loss": 0.1373, - "reward": 0.3510044813156128, - "reward_std": 0.15989653766155243, - "rewards/accuracy_reward": 0.0223214291036129, + "grad_norm": 6.521564960479736, + "kl": 3.87109375, + "learning_rate": 8.817398891266716e-08, + "loss": 0.2172, + "reward": 0.4508928805589676, + "reward_std": 0.15058153495192528, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4263392984867096, "step": 2516 }, { "clip_ratio": 0.0, - "completion_length": 1504.2456359863281, + "completion_length": 1746.0782165527344, "epoch": 0.7518482562915391, - "grad_norm": 19.265975952148438, - "kl": 0.276611328125, - "learning_rate": 1.7595064031158868e-08, - "loss": 0.0999, - "reward": 0.4051339477300644, - "reward_std": 0.20318745076656342, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 21.691272735595703, + "kl": 1.21484375, + "learning_rate": 8.797532015579435e-08, + "loss": 0.1006, + "reward": 0.5368303805589676, + "reward_std": 0.14051378518342972, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.452008955180645, "step": 2517 }, { "clip_ratio": 0.0, - "completion_length": 1459.8996276855469, + "completion_length": 1773.32373046875, "epoch": 0.7521469643790606, - "grad_norm": 13.562911987304688, - "kl": 0.25048828125, - "learning_rate": 1.7555365533178628e-08, - "loss": 0.0781, - "reward": 0.4542410895228386, - "reward_std": 0.20880122482776642, - "rewards/accuracy_reward": 0.1339285783469677, + "grad_norm": 22.037166595458984, + "kl": 2.728515625, + "learning_rate": 8.777682766589314e-08, + "loss": 0.1747, + "reward": 0.5597098544239998, + "reward_std": 0.18050232902169228, + "rewards/accuracy_reward": 0.1316964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4280134215950966, "step": 2518 }, { "clip_ratio": 0.0, - "completion_length": 1574.341552734375, + "completion_length": 1881.8237609863281, "epoch": 0.752445672466582, - "grad_norm": 18.430156707763672, - "kl": 0.280029296875, - "learning_rate": 1.751570233178075e-08, - "loss": 0.0942, - "reward": 0.3833705484867096, - "reward_std": 0.18479757010936737, - "rewards/accuracy_reward": 0.053571430034935474, + "grad_norm": 21.99489974975586, + "kl": 2.462890625, + "learning_rate": 8.757851165890376e-08, + "loss": 0.1442, + "reward": 0.473772332072258, + "reward_std": 0.12414992041885853, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.426897332072258, "step": 2519 }, { "clip_ratio": 0.0, - "completion_length": 1500.9687805175781, + "completion_length": 1782.3773193359375, "epoch": 0.7527443805541035, - "grad_norm": 13.695568084716797, - "kl": 0.24658203125, - "learning_rate": 1.7476074470114953e-08, - "loss": 0.0689, - "reward": 0.4012276977300644, - "reward_std": 0.14450164139270782, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 9.935748100280762, + "kl": 3.1796875, + "learning_rate": 8.738037235057477e-08, + "loss": 0.2038, + "reward": 0.503348246216774, + "reward_std": 0.1278661135584116, + "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2520 }, { "clip_ratio": 0.0, - "completion_length": 1465.1585693359375, + "completion_length": 1759.0536499023438, "epoch": 0.753043088641625, - "grad_norm": 16.194019317626953, - "kl": 0.294677734375, - "learning_rate": 1.7436481991292435e-08, - "loss": 0.0995, - "reward": 0.3671875149011612, - "reward_std": 0.16142040863633156, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 13.794991493225098, + "kl": 3.9453125, + "learning_rate": 8.718240995646217e-08, + "loss": 0.256, + "reward": 0.4687500223517418, + "reward_std": 0.15572605654597282, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247768059372902, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2521 }, { "clip_ratio": 0.0, - "completion_length": 1496.5982666015625, + "completion_length": 1799.4911804199219, "epoch": 0.7533417967291465, - "grad_norm": 15.605169296264648, - "kl": 0.24462890625, - "learning_rate": 1.7396924938385933e-08, - "loss": 0.0822, - "reward": 0.3627232313156128, - "reward_std": 0.20971255376935005, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 4.69957971572876, + "kl": 3.5390625, + "learning_rate": 8.698462469192965e-08, + "loss": 0.1799, + "reward": 0.4983259215950966, + "reward_std": 0.19922034069895744, + "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.435825914144516, "step": 2522 }, { "clip_ratio": 0.0, - "completion_length": 1466.7366943359375, + "completion_length": 1769.7768859863281, "epoch": 0.7536405048166679, - "grad_norm": 18.99736213684082, - "kl": 0.259033203125, - "learning_rate": 1.735740335442967e-08, - "loss": 0.1003, - "reward": 0.3945312649011612, - "reward_std": 0.2207082100212574, - "rewards/accuracy_reward": 0.07589285960420966, + "grad_norm": 10.544771194458008, + "kl": 3.453125, + "learning_rate": 8.678701677214837e-08, + "loss": 0.1988, + "reward": 0.5368303805589676, + "reward_std": 0.203833669424057, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2523 }, { "clip_ratio": 0.0, - "completion_length": 1496.6205749511719, + "completion_length": 1867.1786499023438, "epoch": 0.7539392129041894, - "grad_norm": 15.182574272155762, - "kl": 0.296875, - "learning_rate": 1.731791728241923e-08, - "loss": 0.0661, - "reward": 0.3593750074505806, - "reward_std": 0.14617541804909706, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 24.625850677490234, + "kl": 5.01171875, + "learning_rate": 8.658958641209616e-08, + "loss": 0.2479, + "reward": 0.4614955633878708, + "reward_std": 0.12141005881130695, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2524 }, { "clip_ratio": 0.0, - "completion_length": 1480.38623046875, + "completion_length": 1772.3081359863281, "epoch": 0.7542379209917108, - "grad_norm": 20.339271545410156, - "kl": 0.29248046875, - "learning_rate": 1.7278466765311593e-08, - "loss": 0.1053, - "reward": 0.408482164144516, - "reward_std": 0.1919686272740364, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 6.263752460479736, + "kl": 3.361328125, + "learning_rate": 8.639233382655798e-08, + "loss": 0.1929, + "reward": 0.5000000223517418, + "reward_std": 0.1610074806958437, + "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2525 }, { "clip_ratio": 0.0, - "completion_length": 1539.8773193359375, + "completion_length": 1824.2255249023438, "epoch": 0.7545366290792324, - "grad_norm": 16.816402435302734, - "kl": 0.261962890625, - "learning_rate": 1.7239051846025083e-08, - "loss": 0.0844, - "reward": 0.3777901977300644, - "reward_std": 0.17111797630786896, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 22.25933074951172, + "kl": 2.689453125, + "learning_rate": 8.619525923012542e-08, + "loss": 0.1672, + "reward": 0.468191996216774, + "reward_std": 0.14204774051904678, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2526 }, { "clip_ratio": 0.0, - "completion_length": 1416.1004943847656, + "completion_length": 1712.0469665527344, "epoch": 0.7548353371667538, - "grad_norm": 19.136550903320312, - "kl": 0.24609375, - "learning_rate": 1.7199672567439223e-08, - "loss": 0.111, - "reward": 0.5714285969734192, - "reward_std": 0.19611110165715218, - "rewards/accuracy_reward": 0.2433035895228386, + "grad_norm": 29.006227493286133, + "kl": 3.01171875, + "learning_rate": 8.59983628371961e-08, + "loss": 0.1992, + "reward": 0.6941964626312256, + "reward_std": 0.1673908270895481, + "rewards/accuracy_reward": 0.2633928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2527 }, { "clip_ratio": 0.0, - "completion_length": 1438.6607360839844, + "completion_length": 1713.0625915527344, "epoch": 0.7551340452542753, - "grad_norm": 23.620458602905273, - "kl": 0.30322265625, - "learning_rate": 1.7160328972394834e-08, - "loss": 0.1426, - "reward": 0.4285714477300644, - "reward_std": 0.19478188455104828, - "rewards/accuracy_reward": 0.10714286006987095, + "grad_norm": 15.818685531616211, + "kl": 3.248046875, + "learning_rate": 8.580164486197417e-08, + "loss": 0.2141, + "reward": 0.553013414144516, + "reward_std": 0.14394951052963734, + "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.428013414144516, "step": 2528 }, { "clip_ratio": 0.0, - "completion_length": 1489.4844665527344, + "completion_length": 1795.2523193359375, "epoch": 0.7554327533417967, - "grad_norm": 20.668441772460938, - "kl": 0.267333984375, - "learning_rate": 1.7121021103693904e-08, - "loss": 0.0989, - "reward": 0.3738839402794838, - "reward_std": 0.18611356616020203, - "rewards/accuracy_reward": 0.05133928847499192, + "grad_norm": 10.3928861618042, + "kl": 3.48046875, + "learning_rate": 8.560510551846953e-08, + "loss": 0.2159, + "reward": 0.4827009215950966, + "reward_std": 0.16248556785285473, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2529 }, { "clip_ratio": 0.0, - "completion_length": 1547.0380249023438, + "completion_length": 1841.305908203125, "epoch": 0.7557314614293182, - "grad_norm": 19.187158584594727, - "kl": 0.26025390625, - "learning_rate": 1.7081749004099517e-08, - "loss": 0.0902, - "reward": 0.3867187723517418, - "reward_std": 0.20392903313040733, - "rewards/accuracy_reward": 0.06026786146685481, + "grad_norm": 10.75475788116455, + "kl": 4.267578125, + "learning_rate": 8.540874502049758e-08, + "loss": 0.2309, + "reward": 0.4882812798023224, + "reward_std": 0.15325036644935608, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2530 }, { "clip_ratio": 0.0, - "completion_length": 1492.7701721191406, + "completion_length": 1786.5603332519531, "epoch": 0.7560301695168397, - "grad_norm": 18.261594772338867, - "kl": 0.2509765625, - "learning_rate": 1.7042512716335872e-08, - "loss": 0.0941, - "reward": 0.3710937649011612, - "reward_std": 0.1705406755208969, - "rewards/accuracy_reward": 0.05133928777649999, + "grad_norm": 5.4561991691589355, + "kl": 3.59375, + "learning_rate": 8.521256358167936e-08, + "loss": 0.2042, + "reward": 0.4799107313156128, + "reward_std": 0.1281038261950016, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4174107313156128, "step": 2531 }, { "clip_ratio": 0.0, - "completion_length": 1511.8326721191406, + "completion_length": 1762.7054443359375, "epoch": 0.7563288776043612, - "grad_norm": 22.135025024414062, - "kl": 0.28369140625, - "learning_rate": 1.7003312283088228e-08, - "loss": 0.1141, - "reward": 0.3805803656578064, - "reward_std": 0.20089785382151604, - "rewards/accuracy_reward": 0.04464286006987095, + "grad_norm": 7.350508689880371, + "kl": 2.31640625, + "learning_rate": 8.501656141544113e-08, + "loss": 0.1519, + "reward": 0.490513414144516, + "reward_std": 0.15046704933047295, + "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4458705484867096, "step": 2532 }, { "clip_ratio": 0.0, - "completion_length": 1477.9241943359375, + "completion_length": 1769.0759887695312, "epoch": 0.7566275856918826, - "grad_norm": 18.007169723510742, - "kl": 0.256103515625, - "learning_rate": 1.6964147747002783e-08, - "loss": 0.0791, - "reward": 0.4670759066939354, - "reward_std": 0.18778035417199135, - "rewards/accuracy_reward": 0.13616072130389512, + "grad_norm": 13.729381561279297, + "kl": 3.3515625, + "learning_rate": 8.482073873501392e-08, + "loss": 0.1805, + "reward": 0.5959821715950966, + "reward_std": 0.15152987651526928, + "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2533 }, { "clip_ratio": 0.0, - "completion_length": 1487.7098999023438, + "completion_length": 1786.852783203125, "epoch": 0.7569262937794041, - "grad_norm": 17.445024490356445, - "kl": 0.229248046875, - "learning_rate": 1.692501915068674e-08, - "loss": 0.0886, - "reward": 0.3794643059372902, - "reward_std": 0.19419174268841743, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 6.062173843383789, + "kl": 2.94140625, + "learning_rate": 8.462509575343371e-08, + "loss": 0.1535, + "reward": 0.4927455708384514, + "reward_std": 0.18924615904688835, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482142984867096, + "rewards/tag_count_reward": 0.432477705180645, "step": 2534 }, { "clip_ratio": 0.0, - "completion_length": 1430.57373046875, + "completion_length": 1739.4197082519531, "epoch": 0.7572250018669255, - "grad_norm": 15.652399063110352, - "kl": 0.25439453125, - "learning_rate": 1.688592653670815e-08, - "loss": 0.0991, - "reward": 0.4285714514553547, - "reward_std": 0.18320759385824203, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 5.073615550994873, + "kl": 3.423828125, + "learning_rate": 8.442963268354075e-08, + "loss": 0.2045, + "reward": 0.5122768059372902, + "reward_std": 0.12731828913092613, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3415178768336773, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2535 }, { "clip_ratio": 0.0, - "completion_length": 1495.54248046875, + "completion_length": 1804.4107971191406, "epoch": 0.7575237099544471, - "grad_norm": 18.923599243164062, - "kl": 0.296875, - "learning_rate": 1.6846869947595964e-08, - "loss": 0.0916, - "reward": 0.4029017984867096, - "reward_std": 0.22663862630724907, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 17.259685516357422, + "kl": 4.8203125, + "learning_rate": 8.423434973797983e-08, + "loss": 0.2555, + "reward": 0.4972098469734192, + "reward_std": 0.19531960040330887, + "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.407924123108387, "step": 2536 }, { "clip_ratio": 0.0, - "completion_length": 1544.4777526855469, + "completion_length": 1785.587158203125, "epoch": 0.7578224180419685, - "grad_norm": 17.006092071533203, - "kl": 0.2822265625, - "learning_rate": 1.680784942583993e-08, - "loss": 0.0978, - "reward": 0.3604910895228386, - "reward_std": 0.16763410530984402, - "rewards/accuracy_reward": 0.026785716181620955, + "grad_norm": 17.887813568115234, + "kl": 2.78125, + "learning_rate": 8.403924712919965e-08, + "loss": 0.1755, + "reward": 0.4559151977300644, + "reward_std": 0.1422435138374567, + "rewards/accuracy_reward": 0.03348214388824999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053805589676, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2537 }, { "clip_ratio": 0.0, - "completion_length": 1484.3884887695312, + "completion_length": 1731.8304138183594, "epoch": 0.75812112612949, - "grad_norm": 14.95588493347168, - "kl": 0.23876953125, - "learning_rate": 1.6768865013890526e-08, - "loss": 0.0702, - "reward": 0.440290205180645, - "reward_std": 0.19733745232224464, - "rewards/accuracy_reward": 0.10491071967408061, + "grad_norm": 12.730782508850098, + "kl": 2.296875, + "learning_rate": 8.384432506945263e-08, + "loss": 0.1421, + "reward": 0.5680803805589676, + "reward_std": 0.19015506654977798, + "rewards/accuracy_reward": 0.1272321529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4408482387661934, "step": 2538 }, { "clip_ratio": 0.0, - "completion_length": 1528.2054443359375, + "completion_length": 1852.6273193359375, "epoch": 0.7584198342170114, - "grad_norm": 12.371285438537598, - "kl": 0.247314453125, - "learning_rate": 1.672991675415899e-08, - "loss": 0.0542, - "reward": 0.388950914144516, - "reward_std": 0.19647673517465591, - "rewards/accuracy_reward": 0.06696428754366934, + "grad_norm": 14.77025318145752, + "kl": 3.02734375, + "learning_rate": 8.364958377079495e-08, + "loss": 0.1823, + "reward": 0.5133928805589676, + "reward_std": 0.19640015810728073, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4196428805589676, "step": 2539 }, { "clip_ratio": 0.0, - "completion_length": 1491.8348999023438, + "completion_length": 1777.5715026855469, "epoch": 0.758718542304533, - "grad_norm": 17.61628532409668, - "kl": 0.25732421875, - "learning_rate": 1.669100468901722e-08, - "loss": 0.1016, - "reward": 0.4146205559372902, - "reward_std": 0.19364530965685844, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 18.738475799560547, + "kl": 2.6484375, + "learning_rate": 8.34550234450861e-08, + "loss": 0.1773, + "reward": 0.549107164144516, + "reward_std": 0.19636374339461327, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2540 }, { "clip_ratio": 0.0, - "completion_length": 1528.669677734375, + "completion_length": 1849.0045471191406, "epoch": 0.7590172503920544, - "grad_norm": 17.02996826171875, - "kl": 0.2587890625, - "learning_rate": 1.66521288607977e-08, - "loss": 0.089, - "reward": 0.3917410969734192, - "reward_std": 0.17775145173072815, - "rewards/accuracy_reward": 0.06696428963914514, + "grad_norm": 5.070845127105713, + "kl": 3.35546875, + "learning_rate": 8.326064430398849e-08, + "loss": 0.2025, + "reward": 0.4804687723517418, + "reward_std": 0.15960625559091568, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247768059372902, + "rewards/tag_count_reward": 0.4068080559372902, "step": 2541 }, { "clip_ratio": 0.0, - "completion_length": 1444.7121276855469, + "completion_length": 1717.2478332519531, "epoch": 0.7593159584795758, - "grad_norm": 18.848417282104492, - "kl": 0.225341796875, - "learning_rate": 1.6613289311793533e-08, - "loss": 0.1148, - "reward": 0.5050223544239998, - "reward_std": 0.21076873689889908, - "rewards/accuracy_reward": 0.15625000605359674, + "grad_norm": 4.99845027923584, + "kl": 2.58203125, + "learning_rate": 8.306644655896766e-08, + "loss": 0.164, + "reward": 0.6328125298023224, + "reward_std": 0.2145119234919548, + "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3487723395228386, + "rewards/tag_count_reward": 0.4386160969734192, "step": 2542 }, { "clip_ratio": 0.0, - "completion_length": 1566.2166137695312, + "completion_length": 1816.9487609863281, "epoch": 0.7596146665670973, - "grad_norm": 17.30683135986328, - "kl": 0.244384765625, - "learning_rate": 1.6574486084258367e-08, - "loss": 0.1002, - "reward": 0.3710937574505806, - "reward_std": 0.18537639826536179, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 11.495452880859375, + "kl": 3.23828125, + "learning_rate": 8.287243042129182e-08, + "loss": 0.1936, + "reward": 0.5050223395228386, + "reward_std": 0.13667232356965542, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.440290205180645, "step": 2543 }, { "clip_ratio": 0.0, - "completion_length": 1506.7813110351562, + "completion_length": 1780.2009582519531, "epoch": 0.7599133746546187, - "grad_norm": 17.777292251586914, - "kl": 0.244873046875, - "learning_rate": 1.6535719220406264e-08, - "loss": 0.1112, - "reward": 0.3750000074505806, - "reward_std": 0.1778065636754036, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 199.31536865234375, + "kl": 6.16015625, + "learning_rate": 8.267859610203131e-08, + "loss": 0.3194, + "reward": 0.4681919887661934, + "reward_std": 0.1352878287434578, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337053582072258, + "rewards/tag_count_reward": 0.428013414144516, "step": 2544 }, { "clip_ratio": 0.0, - "completion_length": 1483.6139221191406, + "completion_length": 1744.8504943847656, "epoch": 0.7602120827421402, - "grad_norm": 13.123252868652344, - "kl": 0.2509765625, - "learning_rate": 1.6496988762411806e-08, - "loss": 0.084, - "reward": 0.4224330484867096, - "reward_std": 0.1980648934841156, - "rewards/accuracy_reward": 0.09598214668221772, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "grad_norm": 40.7397346496582, + "kl": 5.7421875, + "learning_rate": 8.248494381205903e-08, + "loss": 0.2943, + "reward": 0.5318080484867096, + "reward_std": 0.17522741854190826, + "rewards/accuracy_reward": 0.11383929383009672, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2545 }, { "clip_ratio": 0.0, - "completion_length": 1503.6875915527344, + "completion_length": 1744.3817443847656, "epoch": 0.7605107908296617, - "grad_norm": 15.194839477539062, - "kl": 0.2509765625, - "learning_rate": 1.645829475240994e-08, - "loss": 0.0894, - "reward": 0.387834832072258, - "reward_std": 0.1664637140929699, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 28.998428344726562, + "kl": 4.8046875, + "learning_rate": 8.22914737620497e-08, + "loss": 0.2782, + "reward": 0.534598246216774, + "reward_std": 0.13515443168580532, + "rewards/accuracy_reward": 0.10044643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4341518133878708, "step": 2546 }, { "clip_ratio": 0.0, - "completion_length": 1515.2902526855469, + "completion_length": 1806.4665832519531, "epoch": 0.7608094989171832, - "grad_norm": 16.53972625732422, - "kl": 0.2529296875, - "learning_rate": 1.6419637232495927e-08, - "loss": 0.094, - "reward": 0.490513414144516, - "reward_std": 0.1947273053228855, - "rewards/accuracy_reward": 0.1674107201397419, + "grad_norm": 42.94977569580078, + "kl": 5.84375, + "learning_rate": 8.209818616247963e-08, + "loss": 0.3211, + "reward": 0.5864955559372902, + "reward_std": 0.15437506884336472, + "rewards/accuracy_reward": 0.16741071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2547 }, { "clip_ratio": 0.0, - "completion_length": 1498.4554443359375, + "completion_length": 1772.33935546875, "epoch": 0.7611082070047046, - "grad_norm": 18.909400939941406, - "kl": 0.25244140625, - "learning_rate": 1.6381016244725386e-08, - "loss": 0.0977, - "reward": 0.4252232387661934, - "reward_std": 0.15088247135281563, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 10.259172439575195, + "kl": 3.66796875, + "learning_rate": 8.190508122362694e-08, + "loss": 0.2251, + "reward": 0.5507812649011612, + "reward_std": 0.12391670420765877, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2548 }, { "clip_ratio": 0.0, - "completion_length": 1402.6406860351562, + "completion_length": 1687.5447082519531, "epoch": 0.7614069150922261, - "grad_norm": 18.369739532470703, - "kl": 0.2109375, - "learning_rate": 1.634243183111415e-08, - "loss": 0.1089, - "reward": 0.495535746216774, - "reward_std": 0.2513570375740528, - "rewards/accuracy_reward": 0.13616072130389512, + "grad_norm": 8.613177299499512, + "kl": 3.41015625, + "learning_rate": 8.171215915557075e-08, + "loss": 0.2253, + "reward": 0.627232164144516, + "reward_std": 0.22335196658968925, + "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3593750074505806, + "rewards/tag_count_reward": 0.4464285969734192, "step": 2549 }, { "clip_ratio": 0.0, - "completion_length": 1485.4822387695312, + "completion_length": 1787.3505554199219, "epoch": 0.7617056231797475, - "grad_norm": 16.74264907836914, - "kl": 0.23876953125, - "learning_rate": 1.6303884033638283e-08, - "loss": 0.1079, - "reward": 0.3476562723517418, - "reward_std": 0.1691725291311741, + "grad_norm": 7.240253448486328, + "kl": 3.7578125, + "learning_rate": 8.151942016819141e-08, + "loss": 0.2202, + "reward": 0.4464285969734192, + "reward_std": 0.11437452584505081, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4375000149011612, "step": 2550 }, { "clip_ratio": 0.0, - "completion_length": 1497.8215026855469, + "completion_length": 1809.8817749023438, "epoch": 0.7620043312672691, - "grad_norm": 16.531522750854492, - "kl": 0.26611328125, - "learning_rate": 1.6265372894234024e-08, - "loss": 0.0772, - "reward": 0.3978794887661934, - "reward_std": 0.1990358904004097, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 13.29697036743164, + "kl": 3.26171875, + "learning_rate": 8.132686447117013e-08, + "loss": 0.187, + "reward": 0.5184151902794838, + "reward_std": 0.16947563365101814, + "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2551 }, { "clip_ratio": 0.0, - "completion_length": 1540.0603332519531, + "completion_length": 1774.5112609863281, "epoch": 0.7623030393547905, - "grad_norm": 16.09724235534668, - "kl": 0.279296875, - "learning_rate": 1.6226898454797692e-08, - "loss": 0.104, - "reward": 0.341517873108387, - "reward_std": 0.20617226883769035, - "rewards/accuracy_reward": 0.04464285867288709, + "grad_norm": 9.201582908630371, + "kl": 4.15625, + "learning_rate": 8.113449227398847e-08, + "loss": 0.2491, + "reward": 0.4637277126312256, + "reward_std": 0.15675169229507446, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2968750149011612, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2552 }, { "clip_ratio": 0.0, - "completion_length": 1521.2813415527344, + "completion_length": 1826.8661804199219, "epoch": 0.762601747442312, - "grad_norm": 19.05103302001953, - "kl": 0.261962890625, - "learning_rate": 1.6188460757185727e-08, - "loss": 0.0941, - "reward": 0.3627232238650322, - "reward_std": 0.1943354345858097, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 36.36302947998047, + "kl": 5.66796875, + "learning_rate": 8.094230378592865e-08, + "loss": 0.3073, + "reward": 0.4525669887661934, + "reward_std": 0.15038013085722923, + "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.3945312649011612, "step": 2553 }, { "clip_ratio": 0.0, - "completion_length": 1561.6563110351562, + "completion_length": 1825.9465026855469, "epoch": 0.7629004555298334, - "grad_norm": 14.567628860473633, - "kl": 0.263671875, - "learning_rate": 1.6150059843214587e-08, - "loss": 0.0808, - "reward": 0.3113839402794838, - "reward_std": 0.1678837314248085, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 26.15264129638672, + "kl": 2.6015625, + "learning_rate": 8.075029921607293e-08, + "loss": 0.1607, + "reward": 0.4492187723517418, + "reward_std": 0.14164195954799652, + "rewards/accuracy_reward": 0.02455357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.424665205180645, "step": 2554 }, { "clip_ratio": 0.0, - "completion_length": 1523.1897888183594, + "completion_length": 1802.8415832519531, "epoch": 0.763199163617355, - "grad_norm": 17.45039939880371, - "kl": 0.27978515625, - "learning_rate": 1.6111695754660664e-08, - "loss": 0.0935, - "reward": 0.4196428805589676, - "reward_std": 0.16865506023168564, - "rewards/accuracy_reward": 0.12053572130389512, + "grad_norm": 16.039710998535156, + "kl": 3.78125, + "learning_rate": 8.055847877330332e-08, + "loss": 0.2149, + "reward": 0.5284598469734192, + "reward_std": 0.12426152639091015, + "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071566939354, + "rewards/tag_count_reward": 0.4101562649011612, "step": 2555 }, { "clip_ratio": 0.0, - "completion_length": 1492.4800109863281, + "completion_length": 1782.1317749023438, "epoch": 0.7634978717048764, - "grad_norm": 15.363375663757324, - "kl": 0.23291015625, - "learning_rate": 1.607336853326035e-08, - "loss": 0.0919, - "reward": 0.4062500223517418, - "reward_std": 0.1796344704926014, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 32.790618896484375, + "kl": 2.396484375, + "learning_rate": 8.036684266630176e-08, + "loss": 0.1668, + "reward": 0.5083705708384514, + "reward_std": 0.149326890707016, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3415178656578064, + "rewards/tag_count_reward": 0.428013414144516, "step": 2556 }, { "clip_ratio": 0.0, - "completion_length": 1485.7344055175781, + "completion_length": 1777.6697082519531, "epoch": 0.7637965797923979, - "grad_norm": 17.05866241455078, - "kl": 0.253173828125, - "learning_rate": 1.603507822070993e-08, - "loss": 0.1132, - "reward": 0.376674123108387, - "reward_std": 0.17617788910865784, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 15.758880615234375, + "kl": 3.53515625, + "learning_rate": 8.017539110354965e-08, + "loss": 0.2194, + "reward": 0.483258955180645, + "reward_std": 0.1491873934864998, + "rewards/accuracy_reward": 0.0625000016298145, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2557 }, { "clip_ratio": 0.0, - "completion_length": 1514.4063415527344, + "completion_length": 1768.8058776855469, "epoch": 0.7640952878799193, - "grad_norm": 13.346781730651855, - "kl": 0.2373046875, - "learning_rate": 1.599682485866546e-08, - "loss": 0.0942, - "reward": 0.4704241380095482, - "reward_std": 0.20600362494587898, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 30.996421813964844, + "kl": 3.43359375, + "learning_rate": 7.99841242933273e-08, + "loss": 0.1947, + "reward": 0.5887276977300644, + "reward_std": 0.1620193738490343, + "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2558 }, { "clip_ratio": 0.0, - "completion_length": 1508.1072387695312, + "completion_length": 1755.1429138183594, "epoch": 0.7643939959674408, - "grad_norm": 16.044723510742188, - "kl": 0.23876953125, - "learning_rate": 1.595860848874288e-08, - "loss": 0.0965, - "reward": 0.3694196566939354, - "reward_std": 0.1832047887146473, - "rewards/accuracy_reward": 0.03125000209547579, + "grad_norm": 16.56763458251953, + "kl": 2.830078125, + "learning_rate": 7.979304244371441e-08, + "loss": 0.1736, + "reward": 0.474888414144516, + "reward_std": 0.14932403899729252, + "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4324776902794838, "step": 2559 }, { "clip_ratio": 0.0, - "completion_length": 1517.0291137695312, + "completion_length": 1778.8215026855469, "epoch": 0.7646927040549623, - "grad_norm": 15.723095893859863, - "kl": 0.232666015625, - "learning_rate": 1.592042915251786e-08, - "loss": 0.0881, - "reward": 0.4849330559372902, - "reward_std": 0.20784618332982063, - "rewards/accuracy_reward": 0.15178571827709675, + "grad_norm": 5.034185409545898, + "kl": 3.40234375, + "learning_rate": 7.96021457625893e-08, + "loss": 0.2016, + "reward": 0.5948660969734192, + "reward_std": 0.17062820494174957, + "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2560 }, { "clip_ratio": 0.0, - "completion_length": 1550.6585388183594, + "completion_length": 1801.3973999023438, "epoch": 0.7649914121424838, - "grad_norm": 17.252973556518555, - "kl": 0.2490234375, - "learning_rate": 1.5882286891525753e-08, - "loss": 0.0908, - "reward": 0.4118303805589676, - "reward_std": 0.18879318237304688, - "rewards/accuracy_reward": 0.09151786286383867, + "grad_norm": 27.855850219726562, + "kl": 5.24609375, + "learning_rate": 7.941143445762876e-08, + "loss": 0.274, + "reward": 0.5117187723517418, + "reward_std": 0.12441137991845608, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.420200914144516, "step": 2561 }, { "clip_ratio": 0.0, - "completion_length": 1567.1138916015625, + "completion_length": 1796.2210693359375, "epoch": 0.7652901202300052, - "grad_norm": 15.452845573425293, - "kl": 0.299560546875, - "learning_rate": 1.5844181747261632e-08, - "loss": 0.0848, - "reward": 0.376674123108387, - "reward_std": 0.17977901175618172, - "rewards/accuracy_reward": 0.06250000093132257, + "grad_norm": 53.55901336669922, + "kl": 4.9609375, + "learning_rate": 7.922090873630815e-08, + "loss": 0.262, + "reward": 0.4966517984867096, + "reward_std": 0.14928488247096539, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2562 }, { "clip_ratio": 0.0, - "completion_length": 1547.6853637695312, + "completion_length": 1798.8036499023438, "epoch": 0.7655888283175267, - "grad_norm": 18.01922035217285, - "kl": 0.246826171875, - "learning_rate": 1.580611376118012e-08, - "loss": 0.1031, - "reward": 0.356584832072258, - "reward_std": 0.15242668241262436, - "rewards/accuracy_reward": 0.0401785746216774, + "grad_norm": 47.46797561645508, + "kl": 5.90625, + "learning_rate": 7.903056880590059e-08, + "loss": 0.3221, + "reward": 0.4737723395228386, + "reward_std": 0.1267513409256935, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2563 }, { "clip_ratio": 0.0, - "completion_length": 1579.8683776855469, + "completion_length": 1804.5268859863281, "epoch": 0.7658875364050481, - "grad_norm": 15.281736373901367, - "kl": 0.261962890625, - "learning_rate": 1.5768082974695472e-08, - "loss": 0.1038, - "reward": 0.3370535895228386, - "reward_std": 0.1951485238969326, - "rewards/accuracy_reward": 0.024553572991862893, + "grad_norm": 17.308568954467773, + "kl": 3.91015625, + "learning_rate": 7.884041487347737e-08, + "loss": 0.2156, + "reward": 0.4899553805589676, + "reward_std": 0.1892781015485525, + "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000074505806, + "rewards/tag_count_reward": 0.427455373108387, "step": 2564 }, { "clip_ratio": 0.0, - "completion_length": 1504.3125610351562, + "completion_length": 1805.5223999023438, "epoch": 0.7661862444925697, - "grad_norm": 14.399866104125977, - "kl": 0.243408203125, - "learning_rate": 1.5730089429181474e-08, - "loss": 0.0926, - "reward": 0.3543526977300644, - "reward_std": 0.18700026348233223, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 23.61330223083496, + "kl": 4.26953125, + "learning_rate": 7.865044714590738e-08, + "loss": 0.2447, + "reward": 0.4698660895228386, + "reward_std": 0.186822809278965, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4118303805589676, "step": 2565 }, { "clip_ratio": 0.0, - "completion_length": 1546.2322082519531, + "completion_length": 1799.5156860351562, "epoch": 0.7664849525800911, - "grad_norm": 14.104732513427734, - "kl": 0.275390625, - "learning_rate": 1.5692133165971357e-08, - "loss": 0.0748, - "reward": 0.3755580559372902, - "reward_std": 0.1365160271525383, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 7.196094512939453, + "kl": 3.63671875, + "learning_rate": 7.846066582985678e-08, + "loss": 0.1999, + "reward": 0.4905134215950966, + "reward_std": 0.12475656718015671, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.301897332072258, + "rewards/tag_count_reward": 0.407924123108387, "step": 2566 }, { "clip_ratio": 0.0, - "completion_length": 1499.72998046875, + "completion_length": 1778.9375610351562, "epoch": 0.7667836606676126, - "grad_norm": 17.412355422973633, - "kl": 0.27978515625, - "learning_rate": 1.565421422635782e-08, - "loss": 0.0968, - "reward": 0.3900669813156128, - "reward_std": 0.19148856028914452, - "rewards/accuracy_reward": 0.06250000465661287, + "grad_norm": 20.412097930908203, + "kl": 2.669921875, + "learning_rate": 7.827107113178911e-08, + "loss": 0.1643, + "reward": 0.4983259215950966, + "reward_std": 0.16344180889427662, + "rewards/accuracy_reward": 0.07589286216534674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2567 }, { "clip_ratio": 0.0, - "completion_length": 1547.8728332519531, + "completion_length": 1793.8460693359375, "epoch": 0.767082368755134, - "grad_norm": 21.259206771850586, - "kl": 0.253173828125, - "learning_rate": 1.5616332651592968e-08, - "loss": 0.1229, - "reward": 0.4051339477300644, - "reward_std": 0.14226125925779343, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 40.36150360107422, + "kl": 1.609375, + "learning_rate": 7.808166325796483e-08, + "loss": 0.0974, + "reward": 0.5106027126312256, + "reward_std": 0.09678547270596027, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.4347098469734192, "step": 2568 }, { "clip_ratio": 0.0, - "completion_length": 1485.8572082519531, + "completion_length": 1747.3661499023438, "epoch": 0.7673810768426556, - "grad_norm": 13.497369766235352, - "kl": 0.26171875, - "learning_rate": 1.5578488482888218e-08, - "loss": 0.052, - "reward": 0.4380580633878708, - "reward_std": 0.19732008129358292, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 44.59108352661133, + "kl": 1.6181640625, + "learning_rate": 7.78924424144411e-08, + "loss": 0.1101, + "reward": 0.5781250223517418, + "reward_std": 0.19728682562708855, + "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2569 }, { "clip_ratio": 0.0, - "completion_length": 1540.4308776855469, + "completion_length": 1808.2098999023438, "epoch": 0.767679784930177, - "grad_norm": 19.245925903320312, - "kl": 0.27587890625, - "learning_rate": 1.5540681761414325e-08, - "loss": 0.0929, - "reward": 0.4196428805589676, - "reward_std": 0.17419279739260674, - "rewards/accuracy_reward": 0.0848214328289032, + "grad_norm": 40.747589111328125, + "kl": 2.22265625, + "learning_rate": 7.770340880707163e-08, + "loss": 0.144, + "reward": 0.5223214402794838, + "reward_std": 0.14578425139188766, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214477300644, + "rewards/tag_count_reward": 0.4218750149011612, "step": 2570 }, { "clip_ratio": 0.0, - "completion_length": 1495.9576721191406, + "completion_length": 1742.4978637695312, "epoch": 0.7679784930176985, - "grad_norm": 19.357187271118164, - "kl": 0.28125, - "learning_rate": 1.5502912528301316e-08, - "loss": 0.0913, - "reward": 0.3867187649011612, - "reward_std": 0.1882437076419592, - "rewards/accuracy_reward": 0.055803575087338686, + "grad_norm": 45.561527252197266, + "kl": 1.576171875, + "learning_rate": 7.751456264150657e-08, + "loss": 0.1084, + "reward": 0.494419664144516, + "reward_std": 0.1180021520704031, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4408482387661934, "step": 2571 }, { "clip_ratio": 0.0, - "completion_length": 1531.7009887695312, + "completion_length": 1809.8572082519531, "epoch": 0.7682772011052199, - "grad_norm": 13.71538257598877, - "kl": 0.2470703125, - "learning_rate": 1.5465180824638386e-08, - "loss": 0.0796, - "reward": 0.3783482313156128, - "reward_std": 0.1596193015575409, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 29.935483932495117, + "kl": 2.6171875, + "learning_rate": 7.732590412319193e-08, + "loss": 0.1552, + "reward": 0.4771205559372902, + "reward_std": 0.13989804685115814, + "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4213169887661934, "step": 2572 }, { "clip_ratio": 0.0, - "completion_length": 1534.7790832519531, + "completion_length": 1817.1719360351562, "epoch": 0.7685759091927414, - "grad_norm": 18.00897216796875, - "kl": 0.22998046875, - "learning_rate": 1.542748669147394e-08, - "loss": 0.1116, - "reward": 0.353794664144516, - "reward_std": 0.17921049147844315, - "rewards/accuracy_reward": 0.022321429569274187, + "grad_norm": 13.055599212646484, + "kl": 1.8095703125, + "learning_rate": 7.71374334573697e-08, + "loss": 0.0987, + "reward": 0.4760044813156128, + "reward_std": 0.1261488664895296, + "rewards/accuracy_reward": 0.037946430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2573 }, { "clip_ratio": 0.0, - "completion_length": 1577.1697082519531, + "completion_length": 1833.3103637695312, "epoch": 0.7688746172802629, - "grad_norm": 13.998360633850098, - "kl": 0.25048828125, - "learning_rate": 1.5389830169815514e-08, - "loss": 0.0991, - "reward": 0.3660714402794838, - "reward_std": 0.15261167660355568, - "rewards/accuracy_reward": 0.04464285937137902, + "grad_norm": 8.414482116699219, + "kl": 4.111328125, + "learning_rate": 7.694915084907758e-08, + "loss": 0.2182, + "reward": 0.4581473395228386, + "reward_std": 0.13189874030649662, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2574 }, { "clip_ratio": 0.0, - "completion_length": 1574.5804443359375, + "completion_length": 1815.3973999023438, "epoch": 0.7691733253677844, - "grad_norm": 16.79865074157715, - "kl": 0.27783203125, - "learning_rate": 1.5352211300629693e-08, - "loss": 0.1112, - "reward": 0.4090401977300644, - "reward_std": 0.15528161823749542, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 7.036265850067139, + "kl": 4.5859375, + "learning_rate": 7.676105650314846e-08, + "loss": 0.2386, + "reward": 0.5312500298023224, + "reward_std": 0.10332498699426651, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651902794838, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2575 }, { - "clip_ratio": 0.0, - "completion_length": 1523.9129943847656, - "epoch": 0.7694720334553058, - "grad_norm": 15.801748275756836, - "kl": 0.252685546875, - "learning_rate": 1.5314630124842144e-08, - "loss": 0.1052, - "reward": 0.3247767984867096, - "reward_std": 0.18082084506750107, - "rewards/accuracy_reward": 0.01562500116415322, + "clip_ratio": 0.0, + "completion_length": 1784.4777526855469, + "epoch": 0.7694720334553058, + "grad_norm": 9.23177433013916, + "kl": 4.32421875, + "learning_rate": 7.657315062421071e-08, + "loss": 0.2426, + "reward": 0.455915205180645, + "reward_std": 0.1437736675143242, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4179687723517418, "step": 2576 }, { "clip_ratio": 0.0, - "completion_length": 1513.93310546875, + "completion_length": 1788.805908203125, "epoch": 0.7697707415428273, - "grad_norm": 13.555604934692383, - "kl": 0.264404296875, - "learning_rate": 1.527708668333747e-08, - "loss": 0.0769, - "reward": 0.4062500223517418, - "reward_std": 0.21907616779208183, - "rewards/accuracy_reward": 0.07812500419095159, + "grad_norm": 6.710785388946533, + "kl": 2.85546875, + "learning_rate": 7.638543341668735e-08, + "loss": 0.1562, + "reward": 0.5083705633878708, + "reward_std": 0.17915258556604385, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2577 }, { "clip_ratio": 0.0, - "completion_length": 1500.8884582519531, + "completion_length": 1755.0246276855469, "epoch": 0.7700694496303487, - "grad_norm": 15.015710830688477, - "kl": 0.239990234375, - "learning_rate": 1.5239581016959275e-08, - "loss": 0.1068, - "reward": 0.4196428805589676, - "reward_std": 0.17965098842978477, - "rewards/accuracy_reward": 0.09598214901052415, + "grad_norm": 10.012096405029297, + "kl": 3.578125, + "learning_rate": 7.619790508479637e-08, + "loss": 0.2144, + "reward": 0.5223214626312256, + "reward_std": 0.1272389031946659, + "rewards/accuracy_reward": 0.10267857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607387661934, + "rewards/tag_count_reward": 0.4196428880095482, "step": 2578 }, { "clip_ratio": 0.0, - "completion_length": 1573.0759582519531, + "completion_length": 1823.0648193359375, "epoch": 0.7703681577178703, - "grad_norm": 14.422245025634766, - "kl": 0.295166015625, - "learning_rate": 1.5202113166510057e-08, - "loss": 0.0779, - "reward": 0.4073660895228386, - "reward_std": 0.19798144325613976, - "rewards/accuracy_reward": 0.09375000419095159, + "grad_norm": 13.80045223236084, + "kl": 3.34765625, + "learning_rate": 7.601056583255027e-08, + "loss": 0.2025, + "reward": 0.5055803954601288, + "reward_std": 0.16695699840784073, + "rewards/accuracy_reward": 0.08035714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4252232387661934, "step": 2579 }, { "clip_ratio": 0.0, - "completion_length": 1549.80810546875, + "completion_length": 1825.1652526855469, "epoch": 0.7706668658053917, - "grad_norm": 15.468863487243652, - "kl": 0.267333984375, - "learning_rate": 1.516468317275114e-08, - "loss": 0.0816, - "reward": 0.384486623108387, - "reward_std": 0.17638737708330154, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 10.490732192993164, + "kl": 2.08984375, + "learning_rate": 7.582341586375571e-08, + "loss": 0.1223, + "reward": 0.5206473469734192, + "reward_std": 0.14675583317875862, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2580 }, { "clip_ratio": 0.0, - "completion_length": 1526.4576721191406, + "completion_length": 1766.7701416015625, "epoch": 0.7709655738929132, - "grad_norm": 16.138994216918945, - "kl": 0.2685546875, - "learning_rate": 1.512729107640271e-08, - "loss": 0.0902, - "reward": 0.478236623108387, - "reward_std": 0.1814950332045555, - "rewards/accuracy_reward": 0.14508929336443543, + "grad_norm": 10.774175643920898, + "kl": 3.14453125, + "learning_rate": 7.563645538201355e-08, + "loss": 0.1871, + "reward": 0.5864955559372902, + "reward_std": 0.15112685039639473, + "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.4347098469734192, "step": 2581 }, { "clip_ratio": 0.0, - "completion_length": 1578.8013916015625, + "completion_length": 1810.4978942871094, "epoch": 0.7712642819804346, - "grad_norm": 15.907672882080078, - "kl": 0.233154296875, - "learning_rate": 1.5089936918143703e-08, - "loss": 0.083, - "reward": 0.424107164144516, - "reward_std": 0.1865755282342434, - "rewards/accuracy_reward": 0.08928571501746774, + "grad_norm": 8.275562286376953, + "kl": 2.80078125, + "learning_rate": 7.544968459071851e-08, + "loss": 0.157, + "reward": 0.5223214477300644, + "reward_std": 0.13444994948804379, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2582 }, { "clip_ratio": 0.0, - "completion_length": 1571.2322082519531, + "completion_length": 1811.9710998535156, "epoch": 0.7715629900679561, - "grad_norm": 15.545485496520996, - "kl": 0.253173828125, - "learning_rate": 1.5052620738611764e-08, - "loss": 0.1291, - "reward": 0.3543526902794838, - "reward_std": 0.19625772535800934, - "rewards/accuracy_reward": 0.033482145285233855, + "grad_norm": 19.367002487182617, + "kl": 4.66015625, + "learning_rate": 7.526310369305883e-08, + "loss": 0.27, + "reward": 0.4508928805589676, + "reward_std": 0.15218351781368256, + "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.419642873108387, "step": 2583 }, { "clip_ratio": 0.0, - "completion_length": 1531.5692749023438, + "completion_length": 1807.696533203125, "epoch": 0.7718616981554776, - "grad_norm": 14.933103561401367, - "kl": 0.237548828125, - "learning_rate": 1.501534257840325e-08, - "loss": 0.0903, - "reward": 0.3984375149011612, - "reward_std": 0.20225728675723076, - "rewards/accuracy_reward": 0.07142857392318547, + "grad_norm": 7.166604518890381, + "kl": 3.140625, + "learning_rate": 7.507671289201625e-08, + "loss": 0.1699, + "reward": 0.4988839477300644, + "reward_std": 0.161212844774127, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.427455373108387, "step": 2584 }, { "clip_ratio": 0.0, - "completion_length": 1485.8348999023438, + "completion_length": 1752.4107971191406, "epoch": 0.772160406242999, - "grad_norm": 20.832183837890625, - "kl": 0.26171875, - "learning_rate": 1.4978102478073164e-08, - "loss": 0.1277, - "reward": 0.4107142984867096, - "reward_std": 0.1463041491806507, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 6.1773529052734375, + "kl": 3.90234375, + "learning_rate": 7.489051239036581e-08, + "loss": 0.2378, + "reward": 0.520647332072258, + "reward_std": 0.1087775956839323, + "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.4402901902794838, "step": 2585 }, { "clip_ratio": 0.0, - "completion_length": 1553.4197387695312, + "completion_length": 1827.0647888183594, "epoch": 0.7724591143305205, - "grad_norm": 18.872020721435547, - "kl": 0.2724609375, - "learning_rate": 1.494090047813505e-08, - "loss": 0.0989, - "reward": 0.3733259066939354, - "reward_std": 0.20651863142848015, - "rewards/accuracy_reward": 0.06696429033763707, + "grad_norm": 8.261789321899414, + "kl": 3.92578125, + "learning_rate": 7.470450239067525e-08, + "loss": 0.2194, + "reward": 0.4888393059372902, + "reward_std": 0.18182992190122604, + "rewards/accuracy_reward": 0.07366071571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2586 }, { "clip_ratio": 0.0, - "completion_length": 1553.0759582519531, + "completion_length": 1826.0380249023438, "epoch": 0.7727578224180419, - "grad_norm": 18.754873275756836, - "kl": 0.249267578125, - "learning_rate": 1.4903736619061048e-08, - "loss": 0.0699, - "reward": 0.4135044738650322, - "reward_std": 0.18130825087428093, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 28.2338924407959, + "kl": 4.63671875, + "learning_rate": 7.451868309530523e-08, + "loss": 0.2468, + "reward": 0.486607164144516, + "reward_std": 0.15239087119698524, + "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3532366156578064, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2587 }, { "clip_ratio": 0.0, - "completion_length": 1461.1004943847656, + "completion_length": 1766.7723999023438, "epoch": 0.7730565305055634, - "grad_norm": 15.999414443969727, - "kl": 0.226806640625, - "learning_rate": 1.486661094128182e-08, - "loss": 0.1042, - "reward": 0.450892873108387, - "reward_std": 0.20725472271442413, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 12.279585838317871, + "kl": 3.640625, + "learning_rate": 7.43330547064091e-08, + "loss": 0.2028, + "reward": 0.5530134215950966, + "reward_std": 0.17561093904078007, + "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3504464477300644, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2588 }, { "clip_ratio": 0.0, - "completion_length": 1477.071533203125, + "completion_length": 1713.9041137695312, "epoch": 0.7733552385930849, - "grad_norm": 12.589746475219727, - "kl": 0.24609375, - "learning_rate": 1.4829523485186434e-08, - "loss": 0.1236, - "reward": 0.4157366305589676, - "reward_std": 0.18324162065982819, - "rewards/accuracy_reward": 0.098214291036129, + "grad_norm": 31.374746322631836, + "kl": 2.736328125, + "learning_rate": 7.414761742593217e-08, + "loss": 0.2052, + "reward": 0.5401785895228386, + "reward_std": 0.14525115489959717, + "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.439732164144516, "step": 2589 }, { "clip_ratio": 0.0, - "completion_length": 1500.9286193847656, + "completion_length": 1813.8639221191406, "epoch": 0.7736539466806064, - "grad_norm": 16.778057098388672, - "kl": 0.2412109375, - "learning_rate": 1.4792474291122431e-08, - "loss": 0.089, - "reward": 0.3549107313156128, - "reward_std": 0.16951234638690948, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 7.422126770019531, + "kl": 2.365234375, + "learning_rate": 7.396237145561216e-08, + "loss": 0.1284, + "reward": 0.4654018059372902, + "reward_std": 0.14411384798586369, + "rewards/accuracy_reward": 0.03125000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.4341517984867096, "step": 2590 }, { "clip_ratio": 0.0, - "completion_length": 1572.1920471191406, + "completion_length": 1783.6340026855469, "epoch": 0.7739526547681278, - "grad_norm": 13.875350952148438, - "kl": 0.26025390625, - "learning_rate": 1.475546339939568e-08, - "loss": 0.0915, - "reward": 0.4112723395228386, - "reward_std": 0.18564359471201897, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 40.946285247802734, + "kl": 2.759765625, + "learning_rate": 7.377731699697839e-08, + "loss": 0.1992, + "reward": 0.557477705180645, + "reward_std": 0.16929181851446629, + "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.301897332072258, + "rewards/tag_count_reward": 0.4213169813156128, "step": 2591 }, { "clip_ratio": 0.0, - "completion_length": 1438.3750915527344, + "completion_length": 1776.5335693359375, "epoch": 0.7742513628556493, - "grad_norm": 17.162904739379883, - "kl": 0.249755859375, - "learning_rate": 1.471849085027041e-08, - "loss": 0.1259, - "reward": 0.377232164144516, - "reward_std": 0.2209152653813362, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 6.593284606933594, + "kl": 4.1015625, + "learning_rate": 7.359245425135204e-08, + "loss": 0.2612, + "reward": 0.4793527126312256, + "reward_std": 0.20341967418789864, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928656578064, + "rewards/tag_count_reward": 0.3945312649011612, "step": 2592 }, { "clip_ratio": 0.0, - "completion_length": 1506.4375610351562, + "completion_length": 1720.1764221191406, "epoch": 0.7745500709431707, - "grad_norm": 17.517776489257812, - "kl": 0.252197265625, - "learning_rate": 1.4681556683969153e-08, - "loss": 0.1283, - "reward": 0.4670759066939354, - "reward_std": 0.194686159491539, - "rewards/accuracy_reward": 0.1428571476135403, + "grad_norm": 24.739835739135742, + "kl": 2.53515625, + "learning_rate": 7.340778341984578e-08, + "loss": 0.1774, + "reward": 0.612723246216774, + "reward_std": 0.1848236545920372, + "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2593 }, { "clip_ratio": 0.0, - "completion_length": 1488.904052734375, + "completion_length": 1763.1898193359375, "epoch": 0.7748487790306923, - "grad_norm": 15.955395698547363, - "kl": 0.23095703125, - "learning_rate": 1.4644660940672625e-08, - "loss": 0.1044, - "reward": 0.3906250223517418, - "reward_std": 0.23870540410280228, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 11.416841506958008, + "kl": 2.826171875, + "learning_rate": 7.322330470336313e-08, + "loss": 0.1719, + "reward": 0.5172991156578064, + "reward_std": 0.2113415040075779, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.330357164144516, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2594 }, { "clip_ratio": 0.0, - "completion_length": 1546.94873046875, + "completion_length": 1795.6429443359375, "epoch": 0.7751474871182137, - "grad_norm": 18.46137237548828, - "kl": 0.240234375, - "learning_rate": 1.4607803660519802e-08, - "loss": 0.1276, - "reward": 0.3632812723517418, - "reward_std": 0.1809709332883358, + "grad_norm": 13.142464637756348, + "kl": 3.09765625, + "learning_rate": 7.303901830259901e-08, + "loss": 0.1773, + "reward": 0.4665178805589676, + "reward_std": 0.15197203680872917, "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2595 }, { "clip_ratio": 0.0, - "completion_length": 1512.0090026855469, + "completion_length": 1734.5000610351562, "epoch": 0.7754461952057352, - "grad_norm": 15.797245025634766, - "kl": 0.257080078125, - "learning_rate": 1.45709848836078e-08, - "loss": 0.1089, - "reward": 0.3722098395228386, - "reward_std": 0.18246085569262505, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 4.362053871154785, + "kl": 3.734375, + "learning_rate": 7.2854924418039e-08, + "loss": 0.2392, + "reward": 0.5050223395228386, + "reward_std": 0.15453839674592018, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4335937723517418, "step": 2596 }, { "clip_ratio": 0.0, - "completion_length": 1481.6072387695312, + "completion_length": 1693.88623046875, "epoch": 0.7757449032932566, - "grad_norm": 15.415982246398926, - "kl": 0.262939453125, - "learning_rate": 1.4534204649991815e-08, - "loss": 0.1094, - "reward": 0.3582589402794838, - "reward_std": 0.1915757916867733, - "rewards/accuracy_reward": 0.05357143026776612, + "grad_norm": 3.745897054672241, + "kl": 4.015625, + "learning_rate": 7.267102324995909e-08, + "loss": 0.2567, + "reward": 0.4921875298023224, + "reward_std": 0.15427321940660477, + "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2597 }, { "clip_ratio": 0.0, - "completion_length": 1583.0179138183594, + "completion_length": 1804.8348693847656, "epoch": 0.7760436113807782, - "grad_norm": 17.806982040405273, - "kl": 0.280029296875, - "learning_rate": 1.4497462999685151e-08, - "loss": 0.0925, - "reward": 0.4235491305589676, - "reward_std": 0.18290168046951294, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 26.579376220703125, + "kl": 5.6640625, + "learning_rate": 7.248731499842575e-08, + "loss": 0.3017, + "reward": 0.5742187798023224, + "reward_std": 0.1528733242303133, + "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.303013414144516, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2598 }, { "clip_ratio": 0.0, - "completion_length": 1549.3996276855469, + "completion_length": 1838.2947387695312, "epoch": 0.7763423194682996, - "grad_norm": 17.192237854003906, - "kl": 0.265380859375, - "learning_rate": 1.4460759972659137e-08, - "loss": 0.0774, - "reward": 0.340959832072258, - "reward_std": 0.1738588958978653, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 34.577247619628906, + "kl": 5.78515625, + "learning_rate": 7.230379986329568e-08, + "loss": 0.2852, + "reward": 0.463169664144516, + "reward_std": 0.16504784300923347, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2599 }, { "clip_ratio": 0.0, - "completion_length": 1590.1250610351562, + "completion_length": 1812.9107971191406, "epoch": 0.7766410275558211, - "grad_norm": 18.12316131591797, - "kl": 0.241455078125, - "learning_rate": 1.4424095608843034e-08, - "loss": 0.0954, - "reward": 0.3398437574505806, - "reward_std": 0.17363471910357475, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 29.398542404174805, + "kl": 4.7578125, + "learning_rate": 7.212047804421517e-08, + "loss": 0.2611, + "reward": 0.4369419887661934, + "reward_std": 0.14650611393153667, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.416852705180645, "step": 2600 }, { "clip_ratio": 0.0, - "completion_length": 1437.1339721679688, + "completion_length": 1719.5514221191406, "epoch": 0.7769397356433425, - "grad_norm": 19.37710189819336, - "kl": 0.229248046875, - "learning_rate": 1.4387469948124108e-08, - "loss": 0.113, - "reward": 0.5128348544239998, - "reward_std": 0.19267990812659264, - "rewards/accuracy_reward": 0.15625000558793545, + "grad_norm": 6.415352821350098, + "kl": 3.796875, + "learning_rate": 7.193734974062054e-08, + "loss": 0.2274, + "reward": 0.5664062723517418, + "reward_std": 0.13747592456638813, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3565848469734192, + "rewards/tag_count_reward": 0.428013414144516, "step": 2601 }, { "clip_ratio": 0.0, - "completion_length": 1515.9844360351562, + "completion_length": 1774.3952026367188, "epoch": 0.777238443730864, - "grad_norm": 17.292465209960938, - "kl": 0.236328125, - "learning_rate": 1.4350883030347466e-08, - "loss": 0.0999, - "reward": 0.444196455180645, - "reward_std": 0.17476245388388634, - "rewards/accuracy_reward": 0.10937500232830644, + "grad_norm": 5.695383548736572, + "kl": 3.86328125, + "learning_rate": 7.175441515173733e-08, + "loss": 0.2232, + "reward": 0.5412946790456772, + "reward_std": 0.16458679735660553, + "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2602 }, { "clip_ratio": 0.0, - "completion_length": 1537.2076721191406, + "completion_length": 1755.0045471191406, "epoch": 0.7775371518183855, - "grad_norm": 19.54884910583496, - "kl": 0.245849609375, - "learning_rate": 1.4314334895316094e-08, - "loss": 0.1109, - "reward": 0.3621651902794838, - "reward_std": 0.1814488172531128, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 7.6023759841918945, + "kl": 3.1484375, + "learning_rate": 7.157167447658046e-08, + "loss": 0.1778, + "reward": 0.4737723469734192, + "reward_std": 0.127156350761652, + "rewards/accuracy_reward": 0.03348214412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4402901902794838, "step": 2603 }, { "clip_ratio": 0.0, - "completion_length": 1576.2991943359375, + "completion_length": 1856.9241943359375, "epoch": 0.777835859905907, - "grad_norm": 16.592002868652344, - "kl": 0.26953125, - "learning_rate": 1.4277825582790803e-08, - "loss": 0.0762, - "reward": 0.4023437649011612, - "reward_std": 0.17042800411581993, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 6.773701190948486, + "kl": 3.423828125, + "learning_rate": 7.138912791395402e-08, + "loss": 0.1879, + "reward": 0.498325914144516, + "reward_std": 0.13170508295297623, + "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4112723469734192, "step": 2604 }, { "clip_ratio": 0.0, - "completion_length": 1528.5759582519531, + "completion_length": 1758.7791137695312, "epoch": 0.7781345679934284, - "grad_norm": 18.245071411132812, - "kl": 0.260498046875, - "learning_rate": 1.4241355132490113e-08, - "loss": 0.1049, - "reward": 0.4893973395228386, - "reward_std": 0.2013261616230011, - "rewards/accuracy_reward": 0.16741072246804833, + "grad_norm": 25.66123390197754, + "kl": 3.3984375, + "learning_rate": 7.120677566245056e-08, + "loss": 0.2219, + "reward": 0.5993303805589676, + "reward_std": 0.16810395568609238, + "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2605 }, { "clip_ratio": 0.0, - "completion_length": 1481.8192443847656, + "completion_length": 1692.1317443847656, "epoch": 0.7784332760809499, - "grad_norm": 16.302223205566406, - "kl": 0.249267578125, - "learning_rate": 1.4204923584090312e-08, - "loss": 0.0998, - "reward": 0.4296875149011612, - "reward_std": 0.17735011130571365, - "rewards/accuracy_reward": 0.09598214784637094, + "grad_norm": 23.810937881469727, + "kl": 3.1796875, + "learning_rate": 7.102461792045156e-08, + "loss": 0.2374, + "reward": 0.5161830559372902, + "reward_std": 0.1274232193827629, + "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.420200914144516, "step": 2606 }, { "clip_ratio": 0.0, - "completion_length": 1602.6161499023438, + "completion_length": 1846.4197692871094, "epoch": 0.7787319841684713, - "grad_norm": 19.050840377807617, - "kl": 0.255859375, - "learning_rate": 1.4168530977225374e-08, - "loss": 0.0821, - "reward": 0.401227705180645, - "reward_std": 0.18116624280810356, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 21.621829986572266, + "kl": 2.158203125, + "learning_rate": 7.084265488612686e-08, + "loss": 0.1169, + "reward": 0.5245535895228386, + "reward_std": 0.12732814252376556, + "rewards/accuracy_reward": 0.08928571688011289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.435267873108387, "step": 2607 }, { "clip_ratio": 0.0, - "completion_length": 1494.2188415527344, + "completion_length": 1774.1719970703125, "epoch": 0.7790306922559929, - "grad_norm": 14.877570152282715, - "kl": 0.24853515625, - "learning_rate": 1.4132177351486857e-08, - "loss": 0.1135, - "reward": 0.4620535895228386, - "reward_std": 0.184462558478117, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 5.051156520843506, + "kl": 3.1875, + "learning_rate": 7.066088675743429e-08, + "loss": 0.1873, + "reward": 0.5747768133878708, + "reward_std": 0.16054519265890121, + "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4296875298023224, "step": 2608 }, { "clip_ratio": 0.0, - "completion_length": 1597.0246276855469, + "completion_length": 1798.8460693359375, "epoch": 0.7793294003435143, - "grad_norm": 17.642379760742188, - "kl": 0.28369140625, - "learning_rate": 1.4095862746423959e-08, - "loss": 0.0976, - "reward": 0.377232164144516, - "reward_std": 0.1654631793498993, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 5.270697116851807, + "kl": 4.0546875, + "learning_rate": 7.047931373211979e-08, + "loss": 0.2318, + "reward": 0.502790205180645, + "reward_std": 0.1417302880436182, + "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2968750149011612, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2609 }, { "clip_ratio": 0.0, - "completion_length": 1423.88623046875, + "completion_length": 1682.8817749023438, "epoch": 0.7796281084310358, - "grad_norm": 18.925533294677734, - "kl": 0.211669921875, - "learning_rate": 1.4059587201543422e-08, - "loss": 0.1102, - "reward": 0.4536830633878708, - "reward_std": 0.1795758120715618, - "rewards/accuracy_reward": 0.08258929080329835, + "grad_norm": 5.234367847442627, + "kl": 2.5078125, + "learning_rate": 7.029793600771711e-08, + "loss": 0.1576, + "reward": 0.549107164144516, + "reward_std": 0.17250079475343227, + "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3710937723517418, + "rewards/tag_count_reward": 0.4531250223517418, "step": 2610 }, { "clip_ratio": 0.0, - "completion_length": 1598.7367248535156, + "completion_length": 1781.180908203125, "epoch": 0.7799268165185572, - "grad_norm": 14.966584205627441, - "kl": 0.29833984375, - "learning_rate": 1.4023350756309455e-08, - "loss": 0.0972, - "reward": 0.3052455484867096, - "reward_std": 0.20185226947069168, - "rewards/accuracy_reward": 0.026785716181620955, + "grad_norm": 37.177146911621094, + "kl": 5.71875, + "learning_rate": 7.011675378154728e-08, + "loss": 0.3142, + "reward": 0.4720982387661934, + "reward_std": 0.17425001971423626, + "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.278459832072258, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2611 }, { "clip_ratio": 0.0, - "completion_length": 1548.8103332519531, + "completion_length": 1775.44873046875, "epoch": 0.7802255246060787, - "grad_norm": 14.575711250305176, - "kl": 0.229736328125, - "learning_rate": 1.3987153450143774e-08, - "loss": 0.1201, - "reward": 0.3761160895228386, - "reward_std": 0.19684725627303123, - "rewards/accuracy_reward": 0.03571428847499192, + "grad_norm": 47.473731994628906, + "kl": 5.96875, + "learning_rate": 6.993576725071887e-08, + "loss": 0.3173, + "reward": 0.4609375223517418, + "reward_std": 0.17072391137480736, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2612 }, { "clip_ratio": 0.0, - "completion_length": 1572.4754943847656, + "completion_length": 1838.3125915527344, "epoch": 0.7805242326936002, - "grad_norm": 14.46743392944336, - "kl": 0.238525390625, - "learning_rate": 1.3950995322425519e-08, - "loss": 0.0882, - "reward": 0.341517873108387, - "reward_std": 0.17375875264406204, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 18.83847999572754, + "kl": 4.16015625, + "learning_rate": 6.975497661212759e-08, + "loss": 0.2282, + "reward": 0.4458705559372902, + "reward_std": 0.13977430760860443, + "rewards/accuracy_reward": 0.01562500069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2613 }, { "clip_ratio": 0.0, - "completion_length": 1514.8125915527344, + "completion_length": 1733.5960388183594, "epoch": 0.7808229407811217, - "grad_norm": 17.721481323242188, - "kl": 0.26220703125, - "learning_rate": 1.391487641249115e-08, - "loss": 0.1265, - "reward": 0.3922991156578064, - "reward_std": 0.166348397731781, - "rewards/accuracy_reward": 0.05803571571595967, + "grad_norm": 23.1492977142334, + "kl": 5.8671875, + "learning_rate": 6.957438206245574e-08, + "loss": 0.3431, + "reward": 0.4843750149011612, + "reward_std": 0.14722616225481033, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2614 }, { "clip_ratio": 0.0, - "completion_length": 1604.8973999023438, + "completion_length": 1848.1407165527344, "epoch": 0.7811216488686431, - "grad_norm": 17.54768180847168, - "kl": 0.263916015625, - "learning_rate": 1.3878796759634542e-08, - "loss": 0.1221, - "reward": 0.390066996216774, - "reward_std": 0.18818630650639534, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 16.575899124145508, + "kl": 4.89453125, + "learning_rate": 6.939398379817271e-08, + "loss": 0.2734, + "reward": 0.5368303805589676, + "reward_std": 0.18778608180582523, + "rewards/accuracy_reward": 0.12500000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.411830373108387, "step": 2615 }, { "clip_ratio": 0.0, - "completion_length": 1603.4509582519531, + "completion_length": 1786.3795471191406, "epoch": 0.7814203569561646, - "grad_norm": 19.644100189208984, - "kl": 0.275390625, - "learning_rate": 1.3842756403106787e-08, - "loss": 0.1089, - "reward": 0.356584832072258, - "reward_std": 0.16554837301373482, + "grad_norm": 6.960030555725098, + "kl": 4.00390625, + "learning_rate": 6.921378201553394e-08, + "loss": 0.2487, + "reward": 0.465959832072258, + "reward_std": 0.11664685793220997, "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419738650322, + "rewards/tag_count_reward": 0.4213169813156128, "step": 2616 }, { "clip_ratio": 0.0, - "completion_length": 1585.1406860351562, + "completion_length": 1836.6630554199219, "epoch": 0.781719065043686, - "grad_norm": 15.968189239501953, - "kl": 0.2724609375, - "learning_rate": 1.380675538211627e-08, - "loss": 0.0866, - "reward": 0.325334832072258, - "reward_std": 0.1709197275340557, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 5.789151668548584, + "kl": 3.1328125, + "learning_rate": 6.903377691058135e-08, + "loss": 0.1767, + "reward": 0.4335937723517418, + "reward_std": 0.13940389081835747, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2617 }, { "clip_ratio": 0.0, - "completion_length": 1509.2478332519531, + "completion_length": 1698.52685546875, "epoch": 0.7820177731312076, - "grad_norm": 18.587308883666992, - "kl": 0.258056640625, - "learning_rate": 1.37707937358286e-08, - "loss": 0.1189, - "reward": 0.4827009215950966, - "reward_std": 0.20300813391804695, - "rewards/accuracy_reward": 0.15178572246804833, + "grad_norm": 8.267499923706055, + "kl": 2.880859375, + "learning_rate": 6.8853968679143e-08, + "loss": 0.1685, + "reward": 0.6021205633878708, + "reward_std": 0.17515825852751732, + "rewards/accuracy_reward": 0.16741072479635477, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151828289032, + "rewards/tag_count_reward": 0.4347098469734192, "step": 2618 }, { "clip_ratio": 0.0, - "completion_length": 1567.5804138183594, + "completion_length": 1811.7076721191406, "epoch": 0.782316481218729, - "grad_norm": 15.440577507019043, - "kl": 0.27001953125, - "learning_rate": 1.373487150336648e-08, - "loss": 0.1052, - "reward": 0.3325893059372902, - "reward_std": 0.19791129976511002, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 5.618217468261719, + "kl": 3.3828125, + "learning_rate": 6.86743575168324e-08, + "loss": 0.1962, + "reward": 0.4860491305589676, + "reward_std": 0.1859223209321499, + "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2619 }, { "clip_ratio": 0.0, - "completion_length": 1555.5715026855469, + "completion_length": 1718.4018859863281, "epoch": 0.7826151893062505, - "grad_norm": 17.3128719329834, - "kl": 0.25146484375, - "learning_rate": 1.36989887238098e-08, - "loss": 0.1032, - "reward": 0.3632812723517418, - "reward_std": 0.17486238107085228, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 13.793340682983398, + "kl": 3.310546875, + "learning_rate": 6.849494361904901e-08, + "loss": 0.2114, + "reward": 0.4497768133878708, + "reward_std": 0.1381816752254963, + "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991305589676, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2620 }, { "clip_ratio": 0.0, - "completion_length": 1534.6429443359375, + "completion_length": 1751.2902526855469, "epoch": 0.7829138973937719, - "grad_norm": 18.22102928161621, - "kl": 0.30517578125, - "learning_rate": 1.3663145436195527e-08, - "loss": 0.1325, - "reward": 0.4296875223517418, - "reward_std": 0.18347087875008583, - "rewards/accuracy_reward": 0.12723214668221772, + "grad_norm": 5.818290710449219, + "kl": 3.578125, + "learning_rate": 6.831572718097765e-08, + "loss": 0.2305, + "reward": 0.5440848469734192, + "reward_std": 0.13982860930263996, + "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4146205484867096, "step": 2621 }, { "clip_ratio": 0.0, - "completion_length": 1560.9018249511719, + "completion_length": 1836.6027526855469, "epoch": 0.7832126054812935, - "grad_norm": 17.343181610107422, - "kl": 0.244873046875, - "learning_rate": 1.3627341679517607e-08, - "loss": 0.0994, - "reward": 0.4665178954601288, - "reward_std": 0.19708619266748428, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 8.277383804321289, + "kl": 4.181640625, + "learning_rate": 6.813670839758804e-08, + "loss": 0.213, + "reward": 0.5820312723517418, + "reward_std": 0.1880913209170103, + "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4079241305589676, "step": 2622 }, { "clip_ratio": 0.0, - "completion_length": 1577.3192749023438, + "completion_length": 1832.40185546875, "epoch": 0.7835113135688149, - "grad_norm": 12.5634765625, - "kl": 0.272216796875, - "learning_rate": 1.3591577492727036e-08, - "loss": 0.0944, - "reward": 0.317522332072258, - "reward_std": 0.17884957417845726, - "rewards/accuracy_reward": 0.020089287078008056, + "grad_norm": 6.086855411529541, + "kl": 3.52734375, + "learning_rate": 6.795788746363518e-08, + "loss": 0.1955, + "reward": 0.4497767984867096, + "reward_std": 0.15682524628937244, + "rewards/accuracy_reward": 0.03125000069849193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297433041036129, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2623 }, { "clip_ratio": 0.0, - "completion_length": 1539.0670166015625, - "epoch": 0.7838100216563364, - "grad_norm": 15.269017219543457, - "kl": 0.25146484375, - "learning_rate": 1.3555852914731758e-08, - "loss": 0.1332, - "reward": 0.3381696566939354, - "reward_std": 0.18782580643892288, - "rewards/accuracy_reward": 0.026785715483129025, + "completion_length": 1746.8505249023438, + "epoch": 0.7838100216563364, + "grad_norm": 11.244091987609863, + "kl": 2.6953125, + "learning_rate": 6.777926457365879e-08, + "loss": 0.1722, + "reward": 0.4849330484867096, + "reward_std": 0.15624531917273998, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2624 }, { "clip_ratio": 0.0, - "completion_length": 1430.2277526855469, + "completion_length": 1684.1116638183594, "epoch": 0.7841087297438578, - "grad_norm": 13.785560607910156, - "kl": 0.20703125, - "learning_rate": 1.3520167984396585e-08, - "loss": 0.0952, - "reward": 0.4592634066939354, - "reward_std": 0.13642597571015358, - "rewards/accuracy_reward": 0.08928571757860482, + "grad_norm": 6.74306583404541, + "kl": 3.6953125, + "learning_rate": 6.760083992198293e-08, + "loss": 0.2113, + "reward": 0.5401785969734192, + "reward_std": 0.12515992112457752, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3699776902794838, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2625 }, { "clip_ratio": 0.0, - "completion_length": 1576.1607971191406, + "completion_length": 1810.9866943359375, "epoch": 0.7844074378313793, - "grad_norm": 17.085065841674805, - "kl": 0.248291015625, - "learning_rate": 1.3484522740543237e-08, - "loss": 0.1147, - "reward": 0.4107143059372902, - "reward_std": 0.1683897078037262, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 12.638365745544434, + "kl": 3.79296875, + "learning_rate": 6.742261370271618e-08, + "loss": 0.2366, + "reward": 0.5273437798023224, + "reward_std": 0.13212844729423523, + "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321492433548, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2626 }, { "clip_ratio": 0.0, - "completion_length": 1568.6540832519531, + "completion_length": 1758.3995971679688, "epoch": 0.7847061459189008, - "grad_norm": 15.333107948303223, - "kl": 0.244140625, - "learning_rate": 1.3448917221950262e-08, - "loss": 0.1148, - "reward": 0.3231026828289032, - "reward_std": 0.16000143997371197, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 3.19701886177063, + "kl": 3.078125, + "learning_rate": 6.724458610975131e-08, + "loss": 0.1619, + "reward": 0.431919664144516, + "reward_std": 0.11859615333378315, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186383992433548, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2627 }, { "clip_ratio": 0.0, - "completion_length": 1564.3013916015625, + "completion_length": 1802.1027526855469, "epoch": 0.7850048540064222, - "grad_norm": 14.055878639221191, - "kl": 0.248046875, - "learning_rate": 1.341335146735294e-08, - "loss": 0.1066, - "reward": 0.3822544813156128, - "reward_std": 0.1615758016705513, - "rewards/accuracy_reward": 0.06696428847499192, + "grad_norm": 15.597496032714844, + "kl": 4.9296875, + "learning_rate": 6.70667573367647e-08, + "loss": 0.2659, + "reward": 0.505580373108387, + "reward_std": 0.15213859640061855, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2628 }, { "clip_ratio": 0.0, - "completion_length": 1624.4866333007812, + "completion_length": 1860.4398498535156, "epoch": 0.7853035620939437, - "grad_norm": 12.711446762084961, - "kl": 0.253662109375, - "learning_rate": 1.3377825515443364e-08, - "loss": 0.0868, - "reward": 0.3281250074505806, - "reward_std": 0.16862763464450836, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 9.682565689086914, + "kl": 4.6875, + "learning_rate": 6.688912757721682e-08, + "loss": 0.2466, + "reward": 0.4380580559372902, + "reward_std": 0.13988455198705196, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4157366305589676, "step": 2629 }, { "clip_ratio": 0.0, - "completion_length": 1588.9375305175781, + "completion_length": 1810.0558776855469, "epoch": 0.7856022701814651, - "grad_norm": 12.946649551391602, - "kl": 0.246337890625, - "learning_rate": 1.3342339404870251e-08, - "loss": 0.0974, - "reward": 0.380580373108387, - "reward_std": 0.20753470435738564, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 25.46393394470215, + "kl": 4.2578125, + "learning_rate": 6.671169702435125e-08, + "loss": 0.2389, + "reward": 0.5011160895228386, + "reward_std": 0.16951162740588188, + "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.416294664144516, "step": 2630 }, { "clip_ratio": 0.0, - "completion_length": 1582.3840026855469, + "completion_length": 1796.5179443359375, "epoch": 0.7859009782689866, - "grad_norm": 15.47833251953125, - "kl": 0.23095703125, - "learning_rate": 1.3306893174239025e-08, - "loss": 0.1123, - "reward": 0.436383955180645, - "reward_std": 0.20755577832460403, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 3.847527027130127, + "kl": 2.693359375, + "learning_rate": 6.653446587119513e-08, + "loss": 0.1521, + "reward": 0.5658482387661934, + "reward_std": 0.15450603514909744, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247768059372902, + "rewards/tag_count_reward": 0.4408482313156128, "step": 2631 }, { "clip_ratio": 0.0, - "completion_length": 1567.5736999511719, + "completion_length": 1801.5045471191406, "epoch": 0.786199686356508, - "grad_norm": 13.121114730834961, - "kl": 0.24072265625, - "learning_rate": 1.3271486862111736e-08, - "loss": 0.1031, - "reward": 0.385602705180645, - "reward_std": 0.18344572186470032, - "rewards/accuracy_reward": 0.06250000419095159, + "grad_norm": 12.678567886352539, + "kl": 3.61328125, + "learning_rate": 6.635743431055868e-08, + "loss": 0.1939, + "reward": 0.513950914144516, + "reward_std": 0.15575001761317253, + "rewards/accuracy_reward": 0.08928571618162096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2632 }, { "clip_ratio": 0.0, - "completion_length": 1567.8036499023438, + "completion_length": 1829.4018859863281, "epoch": 0.7864983944440296, - "grad_norm": 14.023200988769531, - "kl": 0.24169921875, - "learning_rate": 1.3236120507006943e-08, - "loss": 0.1021, - "reward": 0.3392857238650322, - "reward_std": 0.19464296475052834, - "rewards/accuracy_reward": 0.017857143888249993, + "grad_norm": 13.165144920349121, + "kl": 3.08203125, + "learning_rate": 6.618060253503471e-08, + "loss": 0.1759, + "reward": 0.4793526977300644, + "reward_std": 0.18314307555556297, + "rewards/accuracy_reward": 0.051339288940653205, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4280134066939354, "step": 2633 }, { "clip_ratio": 0.0, - "completion_length": 1539.9509582519531, + "completion_length": 1746.4598693847656, "epoch": 0.786797102531551, - "grad_norm": 17.043140411376953, - "kl": 0.25732421875, - "learning_rate": 1.3200794147399786e-08, - "loss": 0.1315, - "reward": 0.4101562574505806, - "reward_std": 0.18616418913006783, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 21.526649475097656, + "kl": 3.453125, + "learning_rate": 6.600397073699893e-08, + "loss": 0.2338, + "reward": 0.5145089402794838, + "reward_std": 0.16112812422215939, + "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2634 }, { "clip_ratio": 0.0, - "completion_length": 1425.8527526855469, + "completion_length": 1708.540283203125, "epoch": 0.7870958106190725, - "grad_norm": 16.491037368774414, - "kl": 0.22705078125, - "learning_rate": 1.3165507821721906e-08, - "loss": 0.1277, - "reward": 0.3750000149011612, - "reward_std": 0.19201992079615593, - "rewards/accuracy_reward": 0.026785714784637094, + "grad_norm": 15.20854377746582, + "kl": 2.4765625, + "learning_rate": 6.582753910860952e-08, + "loss": 0.1663, + "reward": 0.4464285895228386, + "reward_std": 0.13517841510474682, + "rewards/accuracy_reward": 0.0200892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482143059372902, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2635 }, { "clip_ratio": 0.0, - "completion_length": 1491.8036499023438, + "completion_length": 1735.0670471191406, "epoch": 0.7873945187065939, - "grad_norm": 18.88667869567871, - "kl": 0.240234375, - "learning_rate": 1.3130261568361334e-08, - "loss": 0.1142, - "reward": 0.3755580559372902, - "reward_std": 0.1925540193915367, - "rewards/accuracy_reward": 0.03571428591385484, + "grad_norm": 7.885658264160156, + "kl": 2.5703125, + "learning_rate": 6.565130784180667e-08, + "loss": 0.1592, + "reward": 0.4782366305589676, + "reward_std": 0.14847423508763313, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4358259066939354, "step": 2636 }, { "clip_ratio": 0.0, - "completion_length": 1553.2745971679688, + "completion_length": 1768.9620971679688, "epoch": 0.7876932267941155, - "grad_norm": 20.516286849975586, - "kl": 0.26708984375, - "learning_rate": 1.309505542566255e-08, - "loss": 0.1154, - "reward": 0.3917410895228386, - "reward_std": 0.22491783276200294, - "rewards/accuracy_reward": 0.08035714412108064, + "grad_norm": 25.886028289794922, + "kl": 2.490234375, + "learning_rate": 6.547527712831274e-08, + "loss": 0.1771, + "reward": 0.528459832072258, + "reward_std": 0.19036472588777542, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839477300644, + "rewards/tag_count_reward": 0.4280134066939354, "step": 2637 }, { "clip_ratio": 0.0, - "completion_length": 1590.8482666015625, + "completion_length": 1810.6407470703125, "epoch": 0.7879919348816369, - "grad_norm": 16.587919235229492, - "kl": 0.258544921875, - "learning_rate": 1.3059889431926408e-08, - "loss": 0.1027, - "reward": 0.3621651902794838, - "reward_std": 0.17805760353803635, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 9.92557430267334, + "kl": 4.03125, + "learning_rate": 6.529944715963204e-08, + "loss": 0.226, + "reward": 0.4960937723517418, + "reward_std": 0.16224156692624092, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937574505806, + "rewards/tag_count_reward": 0.404575914144516, "step": 2638 }, { "clip_ratio": 0.0, - "completion_length": 1536.6741638183594, + "completion_length": 1718.8036193847656, "epoch": 0.7882906429691584, - "grad_norm": 18.634817123413086, - "kl": 0.260498046875, - "learning_rate": 1.3024763625410024e-08, - "loss": 0.1251, - "reward": 0.4486607387661934, - "reward_std": 0.19062812626361847, - "rewards/accuracy_reward": 0.12946429196745157, + "grad_norm": 27.776004791259766, + "kl": 2.564453125, + "learning_rate": 6.512381812705012e-08, + "loss": 0.1647, + "reward": 0.5959821790456772, + "reward_std": 0.14928384125232697, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2639 }, { "clip_ratio": 0.0, - "completion_length": 1532.5536193847656, + "completion_length": 1729.8706359863281, "epoch": 0.7885893510566798, - "grad_norm": 15.980862617492676, - "kl": 0.251708984375, - "learning_rate": 1.2989678044326858e-08, - "loss": 0.1158, - "reward": 0.4101562649011612, - "reward_std": 0.1807083860039711, - "rewards/accuracy_reward": 0.08928571827709675, + "grad_norm": 12.225401878356934, + "kl": 3.00390625, + "learning_rate": 6.494839022163429e-08, + "loss": 0.1905, + "reward": 0.526227705180645, + "reward_std": 0.17418872937560081, + "rewards/accuracy_reward": 0.09598214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2640 }, { "clip_ratio": 0.0, - "completion_length": 1594.7634887695312, + "completion_length": 1812.6072082519531, "epoch": 0.7888880591442013, - "grad_norm": 16.966651916503906, - "kl": 0.283203125, - "learning_rate": 1.2954632726846593e-08, - "loss": 0.1037, - "reward": 0.3169642984867096, - "reward_std": 0.187081441283226, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 10.067612648010254, + "kl": 4.5, + "learning_rate": 6.477316363423297e-08, + "loss": 0.2432, + "reward": 0.4581473395228386, + "reward_std": 0.1527930125594139, + "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714328289032, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2641 }, { "clip_ratio": 0.0, - "completion_length": 1510.9308471679688, + "completion_length": 1692.8014221191406, "epoch": 0.7891867672317228, - "grad_norm": 14.426924705505371, - "kl": 0.22216796875, - "learning_rate": 1.2919627711095066e-08, - "loss": 0.1197, - "reward": 0.400669664144516, - "reward_std": 0.17382289841771126, - "rewards/accuracy_reward": 0.05803571571595967, + "grad_norm": 12.770482063293457, + "kl": 2.802734375, + "learning_rate": 6.459813855547533e-08, + "loss": 0.1898, + "reward": 0.5033482387661934, + "reward_std": 0.1317837107926607, + "rewards/accuracy_reward": 0.06026785867288709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339328289032, + "rewards/tag_count_reward": 0.443080373108387, "step": 2642 }, { "clip_ratio": 0.0, - "completion_length": 1516.3661193847656, + "completion_length": 1779.6541137695312, "epoch": 0.7894854753192443, - "grad_norm": 16.97919464111328, - "kl": 0.247314453125, - "learning_rate": 1.2884663035154337e-08, - "loss": 0.1148, - "reward": 0.423549123108387, - "reward_std": 0.20486875995993614, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 12.473730087280273, + "kl": 3.9765625, + "learning_rate": 6.442331517577168e-08, + "loss": 0.2352, + "reward": 0.5680803880095482, + "reward_std": 0.1985703930258751, + "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2643 }, { "clip_ratio": 0.0, - "completion_length": 1534.1384582519531, + "completion_length": 1763.8728332519531, "epoch": 0.7897841834067657, - "grad_norm": 15.46893310546875, - "kl": 0.2802734375, - "learning_rate": 1.2849738737062521e-08, - "loss": 0.0888, - "reward": 0.3487723395228386, - "reward_std": 0.17399592697620392, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 16.01439666748047, + "kl": 3.8359375, + "learning_rate": 6.42486936853126e-08, + "loss": 0.2176, + "reward": 0.4715401902794838, + "reward_std": 0.14035964012145996, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.431361623108387, "step": 2644 }, { "clip_ratio": 0.0, - "completion_length": 1488.9197082519531, + "completion_length": 1730.2545776367188, "epoch": 0.7900828914942872, - "grad_norm": 19.243274688720703, - "kl": 0.240234375, - "learning_rate": 1.2814854854813839e-08, - "loss": 0.1341, - "reward": 0.4319196566939354, - "reward_std": 0.17727119475603104, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 13.4894437789917, + "kl": 4.796875, + "learning_rate": 6.407427427406919e-08, + "loss": 0.3041, + "reward": 0.5111607387661934, + "reward_std": 0.15011709742248058, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2645 }, { "clip_ratio": 0.0, - "completion_length": 1593.0290832519531, + "completion_length": 1804.3438110351562, "epoch": 0.7903815995818086, - "grad_norm": 16.382936477661133, - "kl": 0.26904296875, - "learning_rate": 1.2780011426358555e-08, - "loss": 0.1098, - "reward": 0.4430803880095482, - "reward_std": 0.19528690353035927, - "rewards/accuracy_reward": 0.13169643585570157, + "grad_norm": 32.61983108520508, + "kl": 4.8203125, + "learning_rate": 6.390005713179277e-08, + "loss": 0.2421, + "reward": 0.5602678656578064, + "reward_std": 0.14925477467477322, + "rewards/accuracy_reward": 0.13839286239817739, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2646 }, { "clip_ratio": 0.0, - "completion_length": 1630.1965026855469, + "completion_length": 1820.66748046875, "epoch": 0.7906803076693302, - "grad_norm": 20.17667007446289, - "kl": 0.28173828125, - "learning_rate": 1.2745208489602877e-08, - "loss": 0.1152, - "reward": 0.4263393133878708, - "reward_std": 0.17420080304145813, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 10.622252464294434, + "kl": 4.8671875, + "learning_rate": 6.372604244801438e-08, + "loss": 0.2661, + "reward": 0.558593787252903, + "reward_std": 0.16292515583336353, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071492433548, + "rewards/tag_count_reward": 0.4045759066939354, "step": 2647 }, { "clip_ratio": 0.0, - "completion_length": 1563.118408203125, + "completion_length": 1771.5045471191406, "epoch": 0.7909790157568516, - "grad_norm": 13.887938499450684, - "kl": 0.260009765625, - "learning_rate": 1.2710446082408993e-08, - "loss": 0.0849, - "reward": 0.448102705180645, - "reward_std": 0.19233566895127296, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 10.448858261108398, + "kl": 3.24609375, + "learning_rate": 6.355223041204496e-08, + "loss": 0.2075, + "reward": 0.5295759066939354, + "reward_std": 0.16583773121237755, + "rewards/accuracy_reward": 0.10044643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3498884066939354, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2648 }, { "clip_ratio": 0.0, - "completion_length": 1479.0759582519531, + "completion_length": 1737.3951416015625, "epoch": 0.7912777238443731, - "grad_norm": 17.261455535888672, - "kl": 0.245849609375, - "learning_rate": 1.267572424259502e-08, - "loss": 0.1338, - "reward": 0.4408482238650322, - "reward_std": 0.13740773312747478, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 15.623565673828125, + "kl": 3.35546875, + "learning_rate": 6.337862121297511e-08, + "loss": 0.2108, + "reward": 0.5385044813156128, + "reward_std": 0.11819391697645187, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2649 }, { "clip_ratio": 0.0, - "completion_length": 1615.3482666015625, + "completion_length": 1834.9911499023438, "epoch": 0.7915764319318945, - "grad_norm": 18.469125747680664, - "kl": 0.26123046875, - "learning_rate": 1.2641043007934887e-08, - "loss": 0.1064, - "reward": 0.361607164144516, - "reward_std": 0.2011253573000431, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 14.918390274047852, + "kl": 3.13671875, + "learning_rate": 6.320521503967444e-08, + "loss": 0.1845, + "reward": 0.497209832072258, + "reward_std": 0.1772791799157858, + "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2650 }, { "clip_ratio": 0.0, - "completion_length": 1519.7947082519531, + "completion_length": 1805.6384887695312, "epoch": 0.7918751400194161, - "grad_norm": 14.840165138244629, - "kl": 0.241943359375, - "learning_rate": 1.260640241615839e-08, - "loss": 0.0969, - "reward": 0.4670759215950966, - "reward_std": 0.18409734219312668, - "rewards/accuracy_reward": 0.1339285746216774, + "grad_norm": 6.5464396476745605, + "kl": 3.9453125, + "learning_rate": 6.303201208079196e-08, + "loss": 0.2275, + "reward": 0.5530134215950966, + "reward_std": 0.14531385898590088, + "rewards/accuracy_reward": 0.13616072502918541, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2651 }, { "clip_ratio": 0.0, - "completion_length": 1546.8906860351562, + "completion_length": 1750.1920166015625, "epoch": 0.7921738481069375, - "grad_norm": 15.975370407104492, - "kl": 0.24072265625, - "learning_rate": 1.2571802504951113e-08, - "loss": 0.1175, - "reward": 0.3805803805589676, - "reward_std": 0.15492428094148636, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 10.428670883178711, + "kl": 3.4296875, + "learning_rate": 6.285901252475556e-08, + "loss": 0.1997, + "reward": 0.4704241305589676, + "reward_std": 0.12357258051633835, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4257812649011612, "step": 2652 }, { "clip_ratio": 0.0, - "completion_length": 1586.3482971191406, + "completion_length": 1788.2143859863281, "epoch": 0.792472556194459, - "grad_norm": 17.914306640625, - "kl": 0.257080078125, - "learning_rate": 1.2537243311954331e-08, - "loss": 0.1158, - "reward": 0.3197544738650322, - "reward_std": 0.15639489144086838, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 16.69830894470215, + "kl": 2.7109375, + "learning_rate": 6.268621655977166e-08, + "loss": 0.1681, + "reward": 0.4458705484867096, + "reward_std": 0.1195985097438097, + "rewards/accuracy_reward": 0.011160715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2653 }, { "clip_ratio": 0.0, - "completion_length": 1484.0111999511719, + "completion_length": 1737.3371276855469, "epoch": 0.7927712642819804, - "grad_norm": 16.6837215423584, - "kl": 0.2451171875, - "learning_rate": 1.2502724874765086e-08, - "loss": 0.1242, - "reward": 0.376674123108387, - "reward_std": 0.14372852444648743, + "grad_norm": 13.762235641479492, + "kl": 3.296875, + "learning_rate": 6.251362437382543e-08, + "loss": 0.2129, + "reward": 0.4665178880095482, + "reward_std": 0.10243247449398041, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.340959832072258, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2654 }, { "clip_ratio": 0.0, - "completion_length": 1508.5514221191406, + "completion_length": 1712.7277526855469, "epoch": 0.7930699723695019, - "grad_norm": 13.134584426879883, - "kl": 0.24072265625, - "learning_rate": 1.2468247230936062e-08, - "loss": 0.1008, - "reward": 0.3934151902794838, - "reward_std": 0.19865359738469124, - "rewards/accuracy_reward": 0.06250000325962901, + "grad_norm": 3.9527676105499268, + "kl": 3.796875, + "learning_rate": 6.234123615468031e-08, + "loss": 0.2408, + "reward": 0.4905134215950966, + "reward_std": 0.1691918671131134, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2655 }, { "clip_ratio": 0.0, - "completion_length": 1498.96435546875, + "completion_length": 1751.6317749023438, "epoch": 0.7933686804570234, - "grad_norm": 17.78189468383789, - "kl": 0.245361328125, - "learning_rate": 1.2433810417975533e-08, - "loss": 0.1301, - "reward": 0.353794664144516, - "reward_std": 0.19010676443576813, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 25.405956268310547, + "kl": 4.2109375, + "learning_rate": 6.216905208987766e-08, + "loss": 0.2351, + "reward": 0.4720982387661934, + "reward_std": 0.15214226208627224, + "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2656 }, { "clip_ratio": 0.0, - "completion_length": 1526.4554138183594, + "completion_length": 1746.3326721191406, "epoch": 0.7936673885445449, - "grad_norm": 15.72220516204834, - "kl": 0.248779296875, - "learning_rate": 1.2399414473347402e-08, - "loss": 0.0981, - "reward": 0.3716518059372902, - "reward_std": 0.1618395820260048, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 36.62749481201172, + "kl": 5.3828125, + "learning_rate": 6.199707236673702e-08, + "loss": 0.2846, + "reward": 0.4704241305589676, + "reward_std": 0.1415501032024622, + "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089402794838, + "rewards/tag_count_reward": 0.4146205484867096, "step": 2657 }, { "clip_ratio": 0.0, - "completion_length": 1517.3594665527344, + "completion_length": 1757.1496276855469, "epoch": 0.7939660966320663, - "grad_norm": 19.383989334106445, - "kl": 0.263671875, - "learning_rate": 1.2365059434471054e-08, - "loss": 0.1215, - "reward": 0.3800223395228386, - "reward_std": 0.1864643581211567, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 55.959503173828125, + "kl": 6.9765625, + "learning_rate": 6.182529717235526e-08, + "loss": 0.3784, + "reward": 0.4838169887661934, + "reward_std": 0.1719274688512087, + "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2658 }, { "clip_ratio": 0.0, - "completion_length": 1501.2031860351562, + "completion_length": 1714.01123046875, "epoch": 0.7942648047195878, - "grad_norm": 13.156661033630371, - "kl": 0.25439453125, - "learning_rate": 1.2330745338721416e-08, - "loss": 0.0815, - "reward": 0.3889509066939354, - "reward_std": 0.19342713057994843, - "rewards/accuracy_reward": 0.06919643143191934, + "grad_norm": 6.323443412780762, + "kl": 3.388671875, + "learning_rate": 6.165372669360708e-08, + "loss": 0.2038, + "reward": 0.5284598469734192, + "reward_std": 0.14461936242878437, + "rewards/accuracy_reward": 0.08928571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544738650322, + "rewards/tag_count_reward": 0.4391741305589676, "step": 2659 }, { "clip_ratio": 0.0, - "completion_length": 1571.2991333007812, + "completion_length": 1756.5023193359375, "epoch": 0.7945635128071092, - "grad_norm": 16.51885223388672, - "kl": 0.255615234375, - "learning_rate": 1.2296472223428888e-08, - "loss": 0.1081, - "reward": 0.3649553656578064, - "reward_std": 0.21018481999635696, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 8.904153823852539, + "kl": 3.830078125, + "learning_rate": 6.148236111714444e-08, + "loss": 0.2172, + "reward": 0.5351562798023224, + "reward_std": 0.1743953749537468, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.423549123108387, "step": 2660 }, { "clip_ratio": 0.0, - "completion_length": 1518.763427734375, + "completion_length": 1720.1116943359375, "epoch": 0.7948622208946308, - "grad_norm": 21.62784767150879, - "kl": 0.262939453125, - "learning_rate": 1.226224012587922e-08, - "loss": 0.1423, - "reward": 0.388392873108387, - "reward_std": 0.17744573578238487, - "rewards/accuracy_reward": 0.066964291036129, + "grad_norm": 9.021265983581543, + "kl": 3.875, + "learning_rate": 6.13112006293961e-08, + "loss": 0.2386, + "reward": 0.5206473395228386, + "reward_std": 0.14494216814637184, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4291294887661934, "step": 2661 }, { "clip_ratio": 0.0, - "completion_length": 1554.1585693359375, + "completion_length": 1818.4509887695312, "epoch": 0.7951609289821522, - "grad_norm": 16.68658447265625, - "kl": 0.2587890625, - "learning_rate": 1.2228049083313596e-08, - "loss": 0.1097, - "reward": 0.369419664144516, - "reward_std": 0.1870456263422966, - "rewards/accuracy_reward": 0.05580357578583062, + "grad_norm": 22.548654556274414, + "kl": 5.16015625, + "learning_rate": 6.114024541656798e-08, + "loss": 0.2705, + "reward": 0.4782366380095482, + "reward_std": 0.14867137372493744, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4068080484867096, "step": 2662 }, { "clip_ratio": 0.0, - "completion_length": 1508.6607666015625, + "completion_length": 1759.4353637695312, "epoch": 0.7954596370696737, - "grad_norm": 16.01185417175293, - "kl": 0.220458984375, - "learning_rate": 1.2193899132928537e-08, - "loss": 0.1142, - "reward": 0.4609375223517418, - "reward_std": 0.23270002007484436, - "rewards/accuracy_reward": 0.10491072037257254, + "grad_norm": 12.398488998413086, + "kl": 3.4609375, + "learning_rate": 6.096949566464268e-08, + "loss": 0.2072, + "reward": 0.5424107387661934, + "reward_std": 0.17735926993191242, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3560267984867096, + "rewards/tag_count_reward": 0.428571455180645, "step": 2663 }, { "clip_ratio": 0.0, - "completion_length": 1535.4107971191406, + "completion_length": 1742.5090026855469, "epoch": 0.7957583451571951, - "grad_norm": 18.085227966308594, - "kl": 0.251708984375, - "learning_rate": 1.2159790311875806e-08, - "loss": 0.0912, - "reward": 0.4977678880095482, - "reward_std": 0.19898298755288124, - "rewards/accuracy_reward": 0.1674107238650322, + "grad_norm": 35.125389099121094, + "kl": 3.6640625, + "learning_rate": 6.079895155937903e-08, + "loss": 0.2617, + "reward": 0.5993303805589676, + "reward_std": 0.18505988642573357, + "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2664 }, { "clip_ratio": 0.0, - "completion_length": 1611.1964721679688, + "completion_length": 1786.9175109863281, "epoch": 0.7960570532447167, - "grad_norm": 19.77228546142578, - "kl": 0.26416015625, - "learning_rate": 1.2125722657262483e-08, - "loss": 0.1127, - "reward": 0.4693080484867096, - "reward_std": 0.20775264129042625, - "rewards/accuracy_reward": 0.15401786426082253, + "grad_norm": 14.177800178527832, + "kl": 3.06640625, + "learning_rate": 6.062861328631241e-08, + "loss": 0.1845, + "reward": 0.5954241380095482, + "reward_std": 0.17716820910573006, + "rewards/accuracy_reward": 0.17633929895237088, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901828289032, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2665 }, { "clip_ratio": 0.0, - "completion_length": 1573.2991638183594, + "completion_length": 1786.2791137695312, "epoch": 0.7963557613322381, - "grad_norm": 16.211872100830078, - "kl": 0.254150390625, - "learning_rate": 1.2091696206150841e-08, - "loss": 0.1085, - "reward": 0.3164062649011612, - "reward_std": 0.14959212765097618, - "rewards/accuracy_reward": 0.0, + "grad_norm": 12.560492515563965, + "kl": 3.0859375, + "learning_rate": 6.045848103075421e-08, + "loss": 0.1964, + "reward": 0.4218750223517418, + "reward_std": 0.1330568864941597, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2666 }, { "clip_ratio": 0.0, - "completion_length": 1594.9755249023438, + "completion_length": 1818.74560546875, "epoch": 0.7966544694197596, - "grad_norm": 17.140920639038086, - "kl": 0.257568359375, - "learning_rate": 1.2057710995558302e-08, - "loss": 0.1167, - "reward": 0.3610491156578064, - "reward_std": 0.1718156486749649, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 14.246768951416016, + "kl": 3.19921875, + "learning_rate": 6.02885549777915e-08, + "loss": 0.1822, + "reward": 0.4720982387661934, + "reward_std": 0.11956014856696129, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2667 }, { "clip_ratio": 0.0, - "completion_length": 1556.9710083007812, + "completion_length": 1807.6005249023438, "epoch": 0.796953177507281, - "grad_norm": 16.631343841552734, - "kl": 0.238525390625, - "learning_rate": 1.2023767062457451e-08, - "loss": 0.1101, - "reward": 0.3777901977300644, - "reward_std": 0.18680329620838165, - "rewards/accuracy_reward": 0.05133928800933063, + "grad_norm": 5.390991687774658, + "kl": 3.1796875, + "learning_rate": 6.011883531228726e-08, + "loss": 0.1888, + "reward": 0.4765625223517418, + "reward_std": 0.1474661510437727, + "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2668 }, { "clip_ratio": 0.0, - "completion_length": 1508.0112609863281, + "completion_length": 1702.1161499023438, "epoch": 0.7972518855948025, - "grad_norm": 17.778074264526367, - "kl": 0.251708984375, - "learning_rate": 1.1989864443775983e-08, - "loss": 0.1238, - "reward": 0.3521205484867096, - "reward_std": 0.19859545677900314, - "rewards/accuracy_reward": 0.022321429336443543, + "grad_norm": 11.103914260864258, + "kl": 3.291015625, + "learning_rate": 5.994932221887991e-08, + "loss": 0.2005, + "reward": 0.4760044813156128, + "reward_std": 0.14368155226111412, + "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2669 }, { "clip_ratio": 0.0, - "completion_length": 1513.9264221191406, + "completion_length": 1740.0290832519531, "epoch": 0.797550593682324, - "grad_norm": 13.279610633850098, - "kl": 0.26611328125, - "learning_rate": 1.1956003176396589e-08, - "loss": 0.108, - "reward": 0.341517873108387, - "reward_std": 0.1875259317457676, - "rewards/accuracy_reward": 0.026785714784637094, + "grad_norm": 12.066716194152832, + "kl": 3.64453125, + "learning_rate": 5.978001588198295e-08, + "loss": 0.2162, + "reward": 0.4575893133878708, + "reward_std": 0.15499232150614262, + "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2670 }, { "clip_ratio": 0.0, - "completion_length": 1567.2857666015625, + "completion_length": 1809.8750915527344, "epoch": 0.7978493017698454, - "grad_norm": 18.190465927124023, - "kl": 0.25537109375, - "learning_rate": 1.1922183297157046e-08, - "loss": 0.1058, - "reward": 0.4436384066939354, - "reward_std": 0.16809552907943726, - "rewards/accuracy_reward": 0.1250000037252903, + "grad_norm": 9.404921531677246, + "kl": 4.16015625, + "learning_rate": 5.961091648578523e-08, + "loss": 0.2347, + "reward": 0.5558035895228386, + "reward_std": 0.1346595361828804, + "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318638414144516, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2671 }, { "clip_ratio": 0.0, - "completion_length": 1453.0223999023438, + "completion_length": 1710.2255249023438, "epoch": 0.7981480098573669, - "grad_norm": 18.767345428466797, - "kl": 0.23779296875, - "learning_rate": 1.188840484285003e-08, - "loss": 0.1195, - "reward": 0.4302455484867096, - "reward_std": 0.23360246047377586, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 9.469446182250977, + "kl": 4.16015625, + "learning_rate": 5.944202421425015e-08, + "loss": 0.265, + "reward": 0.530133955180645, + "reward_std": 0.17563442513346672, + "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4408482387661934, "step": 2672 }, { "clip_ratio": 0.0, - "completion_length": 1562.7232666015625, + "completion_length": 1769.1451721191406, "epoch": 0.7984467179448883, - "grad_norm": 15.55533218383789, - "kl": 0.252685546875, - "learning_rate": 1.185466785022321e-08, - "loss": 0.123, - "reward": 0.442522332072258, - "reward_std": 0.17677440494298935, - "rewards/accuracy_reward": 0.11830357438884676, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "grad_norm": 32.00592041015625, + "kl": 5.65234375, + "learning_rate": 5.927333925111605e-08, + "loss": 0.3045, + "reward": 0.5742187723517418, + "reward_std": 0.1626296602189541, + "rewards/accuracy_reward": 0.14732143469154835, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.426897332072258, "step": 2673 }, { "clip_ratio": 0.0, - "completion_length": 1482.0201721191406, + "completion_length": 1742.9219665527344, "epoch": 0.7987454260324098, - "grad_norm": 15.44336986541748, - "kl": 0.22607421875, - "learning_rate": 1.1820972355979142e-08, - "loss": 0.1026, - "reward": 0.4815848544239998, - "reward_std": 0.23002209141850471, - "rewards/accuracy_reward": 0.12500000232830644, + "grad_norm": 6.323920249938965, + "kl": 3.2890625, + "learning_rate": 5.910486177989571e-08, + "loss": 0.1869, + "reward": 0.5602678656578064, + "reward_std": 0.19033929519355297, + "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.356584832072258, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2674 }, { "clip_ratio": 0.0, - "completion_length": 1597.669677734375, + "completion_length": 1792.6362609863281, "epoch": 0.7990441341199312, - "grad_norm": 16.21099281311035, - "kl": 0.2509765625, - "learning_rate": 1.1787318396775187e-08, - "loss": 0.1399, - "reward": 0.4062500074505806, - "reward_std": 0.2015884481370449, - "rewards/accuracy_reward": 0.09375000232830644, + "grad_norm": 6.123708248138428, + "kl": 4.0390625, + "learning_rate": 5.893659198387593e-08, + "loss": 0.2396, + "reward": 0.5362723544239998, + "reward_std": 0.16536353528499603, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4268973469734192, "step": 2675 }, { "clip_ratio": 0.0, - "completion_length": 1516.76123046875, + "completion_length": 1775.9598999023438, "epoch": 0.7993428422074528, - "grad_norm": 17.11712646484375, - "kl": 0.240966796875, - "learning_rate": 1.1753706009223568e-08, - "loss": 0.1143, - "reward": 0.3604910895228386, - "reward_std": 0.17871475592255592, - "rewards/accuracy_reward": 0.022321428870782256, + "grad_norm": 16.610986709594727, + "kl": 3.65234375, + "learning_rate": 5.8768530046117844e-08, + "loss": 0.22, + "reward": 0.4564732387661934, + "reward_std": 0.1524439938366413, + "rewards/accuracy_reward": 0.029017857974395156, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2676 }, { "clip_ratio": 0.0, - "completion_length": 1531.4755554199219, + "completion_length": 1742.8505249023438, "epoch": 0.7996415502949742, - "grad_norm": 15.954081535339355, - "kl": 0.24072265625, - "learning_rate": 1.1720135229891287e-08, - "loss": 0.099, - "reward": 0.4006696566939354, - "reward_std": 0.18644366040825844, - "rewards/accuracy_reward": 0.06473214575089514, + "grad_norm": 16.505613327026367, + "kl": 3.365234375, + "learning_rate": 5.8600676149456435e-08, + "loss": 0.1915, + "reward": 0.515066996216774, + "reward_std": 0.1809722352772951, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.432477705180645, "step": 2677 }, { "clip_ratio": 0.0, - "completion_length": 1545.3973999023438, + "completion_length": 1718.6295776367188, "epoch": 0.7999402583824957, - "grad_norm": 20.55814552307129, - "kl": 0.251708984375, - "learning_rate": 1.1686606095300033e-08, - "loss": 0.1465, - "reward": 0.4051339477300644, - "reward_std": 0.1601501889526844, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 15.703069686889648, + "kl": 3.220703125, + "learning_rate": 5.8433030476500165e-08, + "loss": 0.1944, + "reward": 0.5150669738650322, + "reward_std": 0.10030623432248831, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4369419813156128, "step": 2678 }, { "clip_ratio": 0.0, - "completion_length": 1561.4241638183594, + "completion_length": 1786.0648193359375, "epoch": 0.8002389664700171, - "grad_norm": 13.445137977600098, - "kl": 0.26611328125, - "learning_rate": 1.165311864192623e-08, - "loss": 0.0974, - "reward": 0.4012276977300644, - "reward_std": 0.1826607771217823, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 5.701539993286133, + "kl": 3.51953125, + "learning_rate": 5.8265593209631145e-08, + "loss": 0.1945, + "reward": 0.4916294887661934, + "reward_std": 0.16214940883219242, + "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2679 }, { "clip_ratio": 0.0, - "completion_length": 1504.5536193847656, + "completion_length": 1770.3639221191406, "epoch": 0.8005376745575387, - "grad_norm": 16.603656768798828, - "kl": 0.2646484375, - "learning_rate": 1.1619672906200955e-08, - "loss": 0.1218, - "reward": 0.381138414144516, - "reward_std": 0.1656007505953312, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 17.66838264465332, + "kl": 2.96875, + "learning_rate": 5.809836453100478e-08, + "loss": 0.188, + "reward": 0.4882812798023224, + "reward_std": 0.15546676144003868, + "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2680 }, { "clip_ratio": 0.0, - "completion_length": 1579.1540832519531, + "completion_length": 1759.1228332519531, "epoch": 0.8008363826450601, - "grad_norm": 16.808286666870117, - "kl": 0.270751953125, - "learning_rate": 1.158626892450988e-08, - "loss": 0.127, - "reward": 0.3643973395228386, - "reward_std": 0.16822464764118195, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 11.211368560791016, + "kl": 2.986328125, + "learning_rate": 5.793134462254939e-08, + "loss": 0.177, + "reward": 0.4882812723517418, + "reward_std": 0.12264377996325493, + "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4280134066939354, "step": 2681 }, { "clip_ratio": 0.0, - "completion_length": 1495.4219360351562, + "completion_length": 1714.0291137695312, "epoch": 0.8011350907325816, - "grad_norm": 14.44174575805664, - "kl": 0.225341796875, - "learning_rate": 1.1552906733193268e-08, - "loss": 0.1225, - "reward": 0.4285714477300644, - "reward_std": 0.17146266251802444, - "rewards/accuracy_reward": 0.08482143259607255, + "grad_norm": 15.79074478149414, + "kl": 3.05859375, + "learning_rate": 5.776453366596634e-08, + "loss": 0.2193, + "reward": 0.5373884215950966, + "reward_std": 0.12578619830310345, + "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.4391741305589676, "step": 2682 }, { "clip_ratio": 0.0, - "completion_length": 1525.9666137695312, + "completion_length": 1723.0201416015625, "epoch": 0.801433798820103, - "grad_norm": 15.564104080200195, - "kl": 0.23193359375, - "learning_rate": 1.1519586368545903e-08, - "loss": 0.1338, - "reward": 0.3744419738650322, - "reward_std": 0.15807993337512016, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 13.78646469116211, + "kl": 2.994140625, + "learning_rate": 5.7597931842729515e-08, + "loss": 0.1952, + "reward": 0.475446455180645, + "reward_std": 0.10618437454104424, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2683 }, { "clip_ratio": 0.0, - "completion_length": 1596.9665832519531, + "completion_length": 1791.6764221191406, "epoch": 0.8017325069076245, - "grad_norm": 14.828018188476562, - "kl": 0.264404296875, - "learning_rate": 1.148630786681708e-08, - "loss": 0.134, - "reward": 0.3465401902794838, - "reward_std": 0.19097115099430084, - "rewards/accuracy_reward": 0.04017857392318547, + "grad_norm": 3.4600656032562256, + "kl": 3.087890625, + "learning_rate": 5.74315393340854e-08, + "loss": 0.1635, + "reward": 0.4793526977300644, + "reward_std": 0.13877943716943264, + "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2684 }, { "clip_ratio": 0.0, - "completion_length": 1451.8214721679688, + "completion_length": 1719.4710693359375, "epoch": 0.802031214995146, - "grad_norm": 15.211158752441406, - "kl": 0.218017578125, - "learning_rate": 1.1453071264210561e-08, - "loss": 0.0933, - "reward": 0.5468750223517418, - "reward_std": 0.19047754257917404, - "rewards/accuracy_reward": 0.19196428847499192, + "grad_norm": 4.950695037841797, + "kl": 2.779296875, + "learning_rate": 5.726535632105281e-08, + "loss": 0.1491, + "reward": 0.643415205180645, + "reward_std": 0.14173772744834423, + "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107238650322, + "rewards/tag_count_reward": 0.4425223395228386, "step": 2685 }, { "clip_ratio": 0.0, - "completion_length": 1620.1250610351562, + "completion_length": 1784.18310546875, "epoch": 0.8023299230826675, - "grad_norm": 15.257050514221191, - "kl": 0.26123046875, - "learning_rate": 1.1419876596884493e-08, - "loss": 0.1133, - "reward": 0.3320312611758709, - "reward_std": 0.17592758312821388, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 7.332329273223877, + "kl": 3.93359375, + "learning_rate": 5.709938298442246e-08, + "loss": 0.2306, + "reward": 0.479352705180645, + "reward_std": 0.1445106640458107, + "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776940047741, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2686 }, { "clip_ratio": 0.0, - "completion_length": 1535.1920166015625, + "completion_length": 1745.4710388183594, "epoch": 0.8026286311701889, - "grad_norm": 21.634109497070312, - "kl": 0.275390625, - "learning_rate": 1.1386723900951427e-08, - "loss": 0.1561, - "reward": 0.3945312649011612, - "reward_std": 0.19309618696570396, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 40.964088439941406, + "kl": 6.6796875, + "learning_rate": 5.6933619504757144e-08, + "loss": 0.3821, + "reward": 0.5111607387661934, + "reward_std": 0.18121357448399067, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "rewards/tag_count_reward": 0.4174107313156128, "step": 2687 }, { "clip_ratio": 0.0, - "completion_length": 1533.8728637695312, + "completion_length": 1769.55810546875, "epoch": 0.8029273392577104, - "grad_norm": 18.284061431884766, - "kl": 0.2529296875, - "learning_rate": 1.1353613212478264e-08, - "loss": 0.1026, - "reward": 0.4291294813156128, - "reward_std": 0.19026632979512215, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 13.422542572021484, + "kl": 4.00390625, + "learning_rate": 5.676806606239132e-08, + "loss": 0.2217, + "reward": 0.5770089477300644, + "reward_std": 0.21027339063584805, + "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2688 }, { "clip_ratio": 0.0, - "completion_length": 1568.0223999023438, + "completion_length": 1752.7924499511719, "epoch": 0.8032260473452318, - "grad_norm": 16.654949188232422, - "kl": 0.261474609375, - "learning_rate": 1.1320544567486157e-08, - "loss": 0.1356, - "reward": 0.368303582072258, - "reward_std": 0.18869969248771667, - "rewards/accuracy_reward": 0.05580357322469354, + "grad_norm": 5.352603435516357, + "kl": 4.3515625, + "learning_rate": 5.660272283743078e-08, + "loss": 0.2667, + "reward": 0.4966518059372902, + "reward_std": 0.1422208696603775, + "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2689 }, { "clip_ratio": 0.0, - "completion_length": 1537.0134582519531, + "completion_length": 1763.1563415527344, "epoch": 0.8035247554327534, - "grad_norm": 15.833990097045898, - "kl": 0.247314453125, - "learning_rate": 1.1287518001950568e-08, - "loss": 0.1087, - "reward": 0.3777901977300644, - "reward_std": 0.2116011194884777, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 16.05196189880371, + "kl": 3.6953125, + "learning_rate": 5.6437590009752844e-08, + "loss": 0.1804, + "reward": 0.5094866305589676, + "reward_std": 0.19575892388820648, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4402901977300644, "step": 2690 }, { "clip_ratio": 0.0, - "completion_length": 1628.4308776855469, + "completion_length": 1813.7634887695312, "epoch": 0.8038234635202748, - "grad_norm": 13.11300277709961, - "kl": 0.29443359375, - "learning_rate": 1.1254533551801176e-08, - "loss": 0.0986, - "reward": 0.3409598357975483, - "reward_std": 0.18715636804699898, - "rewards/accuracy_reward": 0.06696429033763707, + "grad_norm": 30.903993606567383, + "kl": 5.71875, + "learning_rate": 5.627266775900588e-08, + "loss": 0.2992, + "reward": 0.482142873108387, + "reward_std": 0.15746891126036644, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2739955447614193, + "rewards/tag_count_reward": 0.4062500223517418, "step": 2691 }, { "clip_ratio": 0.0, - "completion_length": 1582.2545471191406, + "completion_length": 1823.0313720703125, "epoch": 0.8041221716077963, - "grad_norm": 13.429619789123535, - "kl": 0.235595703125, - "learning_rate": 1.1221591252921803e-08, - "loss": 0.1058, - "reward": 0.4542410895228386, - "reward_std": 0.20451632142066956, - "rewards/accuracy_reward": 0.1316964365541935, + "grad_norm": 17.795106887817383, + "kl": 4.125, + "learning_rate": 5.610795626460901e-08, + "loss": 0.2262, + "reward": 0.5396205633878708, + "reward_std": 0.15288034081459045, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4123884066939354, "step": 2692 }, { "clip_ratio": 0.0, - "completion_length": 1501.2723999023438, + "completion_length": 1690.6161499023438, "epoch": 0.8044208796953177, - "grad_norm": 18.34618377685547, - "kl": 0.268310546875, - "learning_rate": 1.1188691141150453e-08, - "loss": 0.1356, - "reward": 0.3604910895228386, - "reward_std": 0.1799888014793396, - "rewards/accuracy_reward": 0.03125000186264515, + "grad_norm": 15.001956939697266, + "kl": 4.234375, + "learning_rate": 5.594345570575226e-08, + "loss": 0.2642, + "reward": 0.458705373108387, + "reward_std": 0.16571844927966595, + "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2693 }, { "clip_ratio": 0.0, - "completion_length": 1595.1697082519531, + "completion_length": 1803.6384887695312, "epoch": 0.8047195877828393, - "grad_norm": 17.59939956665039, - "kl": 0.25341796875, - "learning_rate": 1.1155833252279235e-08, - "loss": 0.1199, - "reward": 0.4218750298023224, - "reward_std": 0.18260269612073898, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 6.8132500648498535, + "kl": 4.01171875, + "learning_rate": 5.5779166261396173e-08, + "loss": 0.215, + "reward": 0.5385044887661934, + "reward_std": 0.14865673705935478, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2694 }, { "clip_ratio": 0.0, - "completion_length": 1574.7545471191406, + "completion_length": 1798.7991943359375, "epoch": 0.8050182958703607, - "grad_norm": 15.45522403717041, - "kl": 0.266357421875, - "learning_rate": 1.1123017622054293e-08, - "loss": 0.1249, - "reward": 0.4218750298023224, - "reward_std": 0.17662622779607773, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 8.135534286499023, + "kl": 2.9140625, + "learning_rate": 5.561508811027146e-08, + "loss": 0.1735, + "reward": 0.5407366380095482, + "reward_std": 0.16322751715779305, + "rewards/accuracy_reward": 0.11383929080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.4268973469734192, "step": 2695 }, { "clip_ratio": 0.0, - "completion_length": 1578.5000915527344, + "completion_length": 1775.2567749023438, "epoch": 0.8053170039578822, - "grad_norm": 16.187744140625, - "kl": 0.2705078125, - "learning_rate": 1.1090244286175832e-08, - "loss": 0.1234, - "reward": 0.2952009066939354, - "reward_std": 0.1392081305384636, - "rewards/accuracy_reward": 0.0, + "grad_norm": 11.127165794372559, + "kl": 4.046875, + "learning_rate": 5.545122143087916e-08, + "loss": 0.2369, + "reward": 0.416852705180645, + "reward_std": 0.11688901856541634, + "rewards/accuracy_reward": 0.006696428870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952009066939354, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2696 }, { "clip_ratio": 0.0, - "completion_length": 1572.4643859863281, + "completion_length": 1775.5938415527344, "epoch": 0.8056157120454036, - "grad_norm": 14.064720153808594, - "kl": 0.26416015625, - "learning_rate": 1.1057513280298008e-08, - "loss": 0.1056, - "reward": 0.4241071566939354, - "reward_std": 0.19464444369077682, - "rewards/accuracy_reward": 0.11607143399305642, + "grad_norm": 10.633901596069336, + "kl": 3.65234375, + "learning_rate": 5.528756640149004e-08, + "loss": 0.2172, + "reward": 0.5479910969734192, + "reward_std": 0.16496592015028, + "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.4185268133878708, "step": 2697 }, { "clip_ratio": 0.0, - "completion_length": 1596.8884582519531, + "completion_length": 1841.1317749023438, "epoch": 0.8059144201329251, - "grad_norm": 15.601053237915039, - "kl": 0.26708984375, - "learning_rate": 1.1024824640028969e-08, - "loss": 0.1023, - "reward": 0.3671875149011612, - "reward_std": 0.19144003465771675, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 8.39854621887207, + "kl": 3.82421875, + "learning_rate": 5.512412320014484e-08, + "loss": 0.2115, + "reward": 0.4854910895228386, + "reward_std": 0.1501553263515234, + "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306919664144516, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2698 }, { "clip_ratio": 0.0, - "completion_length": 1543.4777526855469, + "completion_length": 1688.93310546875, "epoch": 0.8062131282204466, - "grad_norm": 18.789770126342773, - "kl": 0.240478515625, - "learning_rate": 1.0992178400930751e-08, - "loss": 0.1372, - "reward": 0.4263393059372902, - "reward_std": 0.1943255253136158, - "rewards/accuracy_reward": 0.08258928824216127, + "grad_norm": 15.52246379852295, + "kl": 2.91015625, + "learning_rate": 5.496089200465376e-08, + "loss": 0.2087, + "reward": 0.541852705180645, + "reward_std": 0.15498312935233116, + "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500223517418, + "rewards/tag_count_reward": 0.439174123108387, "step": 2699 }, { "clip_ratio": 0.0, - "completion_length": 1494.15185546875, + "completion_length": 1743.8371276855469, "epoch": 0.8065118363079681, - "grad_norm": 18.397201538085938, - "kl": 0.22900390625, - "learning_rate": 1.0959574598519244e-08, - "loss": 0.1273, - "reward": 0.4497768133878708, - "reward_std": 0.18687286972999573, - "rewards/accuracy_reward": 0.10714286123402417, + "grad_norm": 9.185998916625977, + "kl": 3.23046875, + "learning_rate": 5.4797872992596225e-08, + "loss": 0.1906, + "reward": 0.571986623108387, + "reward_std": 0.14406094700098038, + "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.4380580559372902, "step": 2700 }, { "clip_ratio": 0.0, - "completion_length": 1571.7835388183594, + "completion_length": 1751.2277526855469, "epoch": 0.8068105443954895, - "grad_norm": 14.816556930541992, - "kl": 0.254638671875, - "learning_rate": 1.0927013268264202e-08, - "loss": 0.0966, - "reward": 0.4101562723517418, - "reward_std": 0.2007622867822647, - "rewards/accuracy_reward": 0.10044643189758062, + "grad_norm": 9.003678321838379, + "kl": 3.8203125, + "learning_rate": 5.4635066341321006e-08, + "loss": 0.2414, + "reward": 0.5390625298023224, + "reward_std": 0.16397096775472164, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2701 }, { "clip_ratio": 0.0, - "completion_length": 1560.66748046875, + "completion_length": 1788.4308776855469, "epoch": 0.807109252483011, - "grad_norm": 18.17963409423828, - "kl": 0.242431640625, - "learning_rate": 1.089449444558917e-08, - "loss": 0.1266, - "reward": 0.4497768059372902, - "reward_std": 0.1722804196178913, - "rewards/accuracy_reward": 0.1160714328289032, + "grad_norm": 22.01239013671875, + "kl": 4.171875, + "learning_rate": 5.447247222794585e-08, + "loss": 0.237, + "reward": 0.5619420036673546, + "reward_std": 0.14808448404073715, + "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2702 }, { "clip_ratio": 0.0, - "completion_length": 1653.5111999511719, + "completion_length": 1831.1920471191406, "epoch": 0.8074079605705324, - "grad_norm": 13.243773460388184, - "kl": 0.251708984375, - "learning_rate": 1.0862018165871417e-08, - "loss": 0.0894, - "reward": 0.3448660895228386, - "reward_std": 0.16607152670621872, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 15.586058616638184, + "kl": 4.89453125, + "learning_rate": 5.431009082935709e-08, + "loss": 0.2662, + "reward": 0.462053582072258, + "reward_std": 0.13528158329427242, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875074505806, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2703 }, { "clip_ratio": 0.0, - "completion_length": 1562.9732971191406, + "completion_length": 1773.5023193359375, "epoch": 0.807706668658054, - "grad_norm": 14.310193061828613, - "kl": 0.24609375, - "learning_rate": 1.0829584464441965e-08, - "loss": 0.1048, - "reward": 0.4330357387661934, - "reward_std": 0.1954818181693554, - "rewards/accuracy_reward": 0.10714286379516125, + "grad_norm": 24.383127212524414, + "kl": 4.65625, + "learning_rate": 5.414792232220983e-08, + "loss": 0.2733, + "reward": 0.5172991305589676, + "reward_std": 0.17912793532013893, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4056919813156128, "step": 2704 }, { "clip_ratio": 0.0, - "completion_length": 1528.7991943359375, + "completion_length": 1716.1161499023438, "epoch": 0.8080053767455754, - "grad_norm": 13.56981086730957, - "kl": 0.258056640625, - "learning_rate": 1.0797193376585517e-08, - "loss": 0.1315, - "reward": 0.349330373108387, - "reward_std": 0.18687626346945763, + "grad_norm": 16.167301177978516, + "kl": 3.61328125, + "learning_rate": 5.398596688292758e-08, + "loss": 0.2262, + "reward": 0.459263414144516, + "reward_std": 0.13561873324215412, "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2705 }, { "clip_ratio": 0.0, - "completion_length": 1549.7277221679688, + "completion_length": 1695.3594665527344, "epoch": 0.8083040848330969, - "grad_norm": 17.64231300354004, - "kl": 0.2490234375, - "learning_rate": 1.0764844937540374e-08, - "loss": 0.1313, - "reward": 0.3437500223517418, - "reward_std": 0.1713394559919834, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 17.307071685791016, + "kl": 3.07421875, + "learning_rate": 5.382422468770187e-08, + "loss": 0.2097, + "reward": 0.466517873108387, + "reward_std": 0.14342264831066132, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.439732164144516, "step": 2706 }, { "clip_ratio": 0.0, - "completion_length": 1591.0848999023438, + "completion_length": 1773.6094360351562, "epoch": 0.8086027929206183, - "grad_norm": 14.032493591308594, - "kl": 0.2353515625, - "learning_rate": 1.0732539182498484e-08, - "loss": 0.088, - "reward": 0.4414062649011612, - "reward_std": 0.24102932214736938, - "rewards/accuracy_reward": 0.11383929289877415, + "grad_norm": 7.1943817138671875, + "kl": 2.28515625, + "learning_rate": 5.366269591249242e-08, + "loss": 0.1376, + "reward": 0.5820312723517418, + "reward_std": 0.20988063886761665, + "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4369419887661934, "step": 2707 }, { "clip_ratio": 0.0, - "completion_length": 1489.9554443359375, + "completion_length": 1661.8951721191406, "epoch": 0.8089015010081398, - "grad_norm": 13.58255386352539, - "kl": 0.226318359375, - "learning_rate": 1.0700276146605348e-08, - "loss": 0.107, - "reward": 0.474330373108387, - "reward_std": 0.20465492829680443, - "rewards/accuracy_reward": 0.13169643376022577, + "grad_norm": 4.238203525543213, + "kl": 3.45703125, + "learning_rate": 5.350138073302673e-08, + "loss": 0.199, + "reward": 0.5881696715950966, + "reward_std": 0.16262873448431492, + "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2708 }, { "clip_ratio": 0.0, - "completion_length": 1525.24560546875, + "completion_length": 1764.0826721191406, "epoch": 0.8092002090956613, - "grad_norm": 17.431318283081055, - "kl": 0.250244140625, - "learning_rate": 1.0668055864959962e-08, - "loss": 0.1282, - "reward": 0.3582589477300644, - "reward_std": 0.17994196712970734, - "rewards/accuracy_reward": 0.03571428847499192, + "grad_norm": 7.046487808227539, + "kl": 3.283203125, + "learning_rate": 5.334027932479981e-08, + "loss": 0.1859, + "reward": 0.4765625298023224, + "reward_std": 0.1326500754803419, + "rewards/accuracy_reward": 0.04017857415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4363839402794838, "step": 2709 }, { "clip_ratio": 0.0, - "completion_length": 1610.83935546875, + "completion_length": 1746.6719360351562, "epoch": 0.8094989171831828, - "grad_norm": 20.052976608276367, - "kl": 0.2529296875, - "learning_rate": 1.063587837261487e-08, - "loss": 0.1258, - "reward": 0.3856026977300644, - "reward_std": 0.16493575274944305, - "rewards/accuracy_reward": 0.04910714388824999, + "grad_norm": 11.575340270996094, + "kl": 2.869140625, + "learning_rate": 5.317939186307435e-08, + "loss": 0.1937, + "reward": 0.5156250223517418, + "reward_std": 0.13782999105751514, + "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.439732164144516, "step": 2710 }, { "clip_ratio": 0.0, - "completion_length": 1620.9130249023438, + "completion_length": 1828.8906860351562, "epoch": 0.8097976252707042, - "grad_norm": 17.028200149536133, - "kl": 0.281982421875, - "learning_rate": 1.060374370457599e-08, - "loss": 0.1137, - "reward": 0.3119419813156128, - "reward_std": 0.1577567122876644, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 16.950571060180664, + "kl": 4.99609375, + "learning_rate": 5.301871852287995e-08, + "loss": 0.2663, + "reward": 0.431361623108387, + "reward_std": 0.14767584018409252, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2711 }, { "clip_ratio": 0.0, - "completion_length": 1592.0536499023438, + "completion_length": 1763.6295471191406, "epoch": 0.8100963333582257, - "grad_norm": 17.8891658782959, - "kl": 0.275390625, - "learning_rate": 1.0571651895802714e-08, - "loss": 0.1237, - "reward": 0.3376116305589676, - "reward_std": 0.17523618042469025, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 3.0434694290161133, + "kl": 3.646484375, + "learning_rate": 5.2858259479013565e-08, + "loss": 0.2056, + "reward": 0.472098246216774, + "reward_std": 0.1568599808961153, + "rewards/accuracy_reward": 0.03794643119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.4341517984867096, "step": 2712 }, { "clip_ratio": 0.0, - "completion_length": 1591.341552734375, + "completion_length": 1807.1116943359375, "epoch": 0.8103950414457471, - "grad_norm": 13.770801544189453, - "kl": 0.26513671875, - "learning_rate": 1.0539602981207796e-08, - "loss": 0.1408, - "reward": 0.3643973395228386, - "reward_std": 0.2244427166879177, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 34.010902404785156, + "kl": 5.30078125, + "learning_rate": 5.269801490603898e-08, + "loss": 0.2875, + "reward": 0.4570312798023224, + "reward_std": 0.1838645115494728, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4079241305589676, "step": 2713 }, { "clip_ratio": 0.0, - "completion_length": 1619.3259582519531, + "completion_length": 1780.1228637695312, "epoch": 0.8106937495332686, - "grad_norm": 11.823457717895508, - "kl": 0.256103515625, - "learning_rate": 1.0507596995657286e-08, - "loss": 0.11, - "reward": 0.3705357313156128, - "reward_std": 0.1745704896748066, - "rewards/accuracy_reward": 0.05580357299186289, + "grad_norm": 4.481138706207275, + "kl": 3.33984375, + "learning_rate": 5.253798497828643e-08, + "loss": 0.1995, + "reward": 0.4838169813156128, + "reward_std": 0.14566761441528797, + "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.428013414144516, "step": 2714 }, { "clip_ratio": 0.0, - "completion_length": 1577.3683776855469, + "completion_length": 1801.4644165039062, "epoch": 0.8109924576207901, - "grad_norm": 17.797191619873047, - "kl": 0.24560546875, - "learning_rate": 1.0475633973970571e-08, - "loss": 0.0956, - "reward": 0.4263393059372902, - "reward_std": 0.17268449813127518, - "rewards/accuracy_reward": 0.08705357415601611, + "grad_norm": 6.651227951049805, + "kl": 3.6484375, + "learning_rate": 5.2378169869852854e-08, + "loss": 0.2174, + "reward": 0.5251116305589676, + "reward_std": 0.18003006651997566, + "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857313156128, + "rewards/tag_count_reward": 0.4179687723517418, "step": 2715 }, { "clip_ratio": 0.0, - "completion_length": 1541.7098693847656, + "completion_length": 1728.3237609863281, "epoch": 0.8112911657083115, - "grad_norm": 18.6142635345459, - "kl": 0.253173828125, - "learning_rate": 1.0443713950920308e-08, - "loss": 0.152, - "reward": 0.4720982387661934, - "reward_std": 0.1767292283475399, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 5.555882453918457, + "kl": 4.171875, + "learning_rate": 5.221856975460154e-08, + "loss": 0.263, + "reward": 0.5926339626312256, + "reward_std": 0.13673717714846134, + "rewards/accuracy_reward": 0.16071429592557251, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.431919664144516, "step": 2716 }, { "clip_ratio": 0.0, - "completion_length": 1582.2857666015625, + "completion_length": 1825.7857971191406, "epoch": 0.811589873795833, - "grad_norm": 15.930620193481445, - "kl": 0.2392578125, - "learning_rate": 1.0411836961232311e-08, - "loss": 0.1035, - "reward": 0.3900669738650322, - "reward_std": 0.20111983641982079, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 3.1283352375030518, + "kl": 3.3828125, + "learning_rate": 5.205918480616156e-08, + "loss": 0.1969, + "reward": 0.4676339477300644, + "reward_std": 0.15566161088645458, + "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2717 }, { "clip_ratio": 0.0, - "completion_length": 1567.4665832519531, + "completion_length": 1734.7634582519531, "epoch": 0.8118885818833544, - "grad_norm": 11.309741020202637, - "kl": 0.25537109375, - "learning_rate": 1.038000303958565e-08, - "loss": 0.1091, - "reward": 0.385044664144516, - "reward_std": 0.20938246697187424, - "rewards/accuracy_reward": 0.06919643236324191, + "grad_norm": 4.940022945404053, + "kl": 3.25390625, + "learning_rate": 5.190001519792825e-08, + "loss": 0.189, + "reward": 0.5055803880095482, + "reward_std": 0.1674402318894863, + "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482164144516, + "rewards/tag_count_reward": 0.427455373108387, "step": 2718 }, { "clip_ratio": 0.0, - "completion_length": 1526.0938415527344, + "completion_length": 1723.1630554199219, "epoch": 0.812187289970876, - "grad_norm": 14.549836158752441, - "kl": 0.255859375, - "learning_rate": 1.0348212220612518e-08, - "loss": 0.1166, - "reward": 0.411830373108387, - "reward_std": 0.1930905021727085, - "rewards/accuracy_reward": 0.08705357438884676, + "grad_norm": 12.245909690856934, + "kl": 2.671875, + "learning_rate": 5.174106110306259e-08, + "loss": 0.1627, + "reward": 0.5357143059372902, + "reward_std": 0.17117277346551418, + "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4352678805589676, "step": 2719 }, { "clip_ratio": 0.0, - "completion_length": 1550.8661499023438, + "completion_length": 1788.6139221191406, "epoch": 0.8124859980583974, - "grad_norm": 17.450672149658203, - "kl": 0.22607421875, - "learning_rate": 1.0316464538898179e-08, - "loss": 0.125, - "reward": 0.4614955633878708, - "reward_std": 0.2300839088857174, - "rewards/accuracy_reward": 0.10937500558793545, + "grad_norm": 3.9004311561584473, + "kl": 3.390625, + "learning_rate": 5.158232269449089e-08, + "loss": 0.2017, + "reward": 0.5764509215950966, + "reward_std": 0.20167012326419353, + "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3521205559372902, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2720 }, { "clip_ratio": 0.0, - "completion_length": 1551.7187805175781, + "completion_length": 1730.7969360351562, "epoch": 0.8127847061459189, - "grad_norm": 16.649581909179688, - "kl": 0.231201171875, - "learning_rate": 1.0284760028981021e-08, - "loss": 0.1064, - "reward": 0.3945312723517418, - "reward_std": 0.1674941349774599, - "rewards/accuracy_reward": 0.05580357206054032, + "grad_norm": 8.614546775817871, + "kl": 3.1328125, + "learning_rate": 5.14238001449051e-08, + "loss": 0.1972, + "reward": 0.5000000223517418, + "reward_std": 0.1376197785139084, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2721 }, { "clip_ratio": 0.0, - "completion_length": 1610.6540832519531, + "completion_length": 1792.0581359863281, "epoch": 0.8130834142334403, - "grad_norm": 18.31049156188965, - "kl": 0.2568359375, - "learning_rate": 1.0253098725352455e-08, - "loss": 0.1239, - "reward": 0.4051339477300644, - "reward_std": 0.17697210237383842, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 12.848841667175293, + "kl": 3.375, + "learning_rate": 5.126549362676227e-08, + "loss": 0.2104, + "reward": 0.536830373108387, + "reward_std": 0.1582099962979555, + "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.431919664144516, "step": 2722 }, { "clip_ratio": 0.0, - "completion_length": 1522.08935546875, + "completion_length": 1760.4777526855469, "epoch": 0.8133821223209619, - "grad_norm": 10.594727516174316, - "kl": 0.231201171875, - "learning_rate": 1.0221480662456843e-08, - "loss": 0.1077, - "reward": 0.4626116305589676, - "reward_std": 0.20625482872128487, - "rewards/accuracy_reward": 0.12500000465661287, + "grad_norm": 7.018321990966797, + "kl": 3.08984375, + "learning_rate": 5.110740331228422e-08, + "loss": 0.1674, + "reward": 0.5664062798023224, + "reward_std": 0.1776423417031765, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.432477705180645, "step": 2723 }, { "clip_ratio": 0.0, - "completion_length": 1481.5514221191406, + "completion_length": 1660.5513916015625, "epoch": 0.8136808304084833, - "grad_norm": 14.569714546203613, - "kl": 0.226318359375, - "learning_rate": 1.0189905874691574e-08, - "loss": 0.1394, - "reward": 0.4157366305589676, - "reward_std": 0.1787715144455433, - "rewards/accuracy_reward": 0.05803571850992739, + "grad_norm": 24.32977867126465, + "kl": 4.1171875, + "learning_rate": 5.094952937345787e-08, + "loss": 0.2407, + "reward": 0.5200893059372902, + "reward_std": 0.15926779992878437, + "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357700914144516, + "rewards/tag_count_reward": 0.4375000223517418, "step": 2724 }, { "clip_ratio": 0.0, - "completion_length": 1601.7523193359375, + "completion_length": 1794.9152526855469, "epoch": 0.8139795384960048, - "grad_norm": 14.801002502441406, - "kl": 0.247314453125, - "learning_rate": 1.0158374396406887e-08, - "loss": 0.1129, - "reward": 0.3203125149011612, - "reward_std": 0.1651533506810665, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 6.6254987716674805, + "kl": 3.3984375, + "learning_rate": 5.079187198203444e-08, + "loss": 0.1889, + "reward": 0.4430803805589676, + "reward_std": 0.13320189528167248, + "rewards/accuracy_reward": 0.01785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4252232387661934, "step": 2725 }, { "clip_ratio": 0.0, - "completion_length": 1640.8572082519531, + "completion_length": 1808.6875915527344, "epoch": 0.8142782465835262, - "grad_norm": 16.69544219970703, - "kl": 0.258544921875, - "learning_rate": 1.012688626190596e-08, - "loss": 0.1221, - "reward": 0.3889509066939354, - "reward_std": 0.1609344743192196, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 7.362576484680176, + "kl": 4.1953125, + "learning_rate": 5.06344313095298e-08, + "loss": 0.2413, + "reward": 0.4966518059372902, + "reward_std": 0.14513612538576126, + "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "rewards/tag_count_reward": 0.4118303805589676, "step": 2726 }, { "clip_ratio": 0.0, - "completion_length": 1635.7790832519531, + "completion_length": 1810.8639221191406, "epoch": 0.8145769546710477, - "grad_norm": 15.35004711151123, - "kl": 0.27099609375, - "learning_rate": 1.0095441505444808e-08, - "loss": 0.1214, - "reward": 0.4051339477300644, - "reward_std": 0.1573411487042904, - "rewards/accuracy_reward": 0.10937500488944352, + "grad_norm": 9.73324966430664, + "kl": 4.55859375, + "learning_rate": 5.047720752722404e-08, + "loss": 0.2525, + "reward": 0.5145089477300644, + "reward_std": 0.11431975848972797, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589328289032, + "rewards/tag_count_reward": 0.4029017984867096, "step": 2727 }, { "clip_ratio": 0.0, - "completion_length": 1555.41748046875, + "completion_length": 1762.24560546875, "epoch": 0.8148756627585692, - "grad_norm": 13.531876564025879, - "kl": 0.233154296875, - "learning_rate": 1.0064040161232218e-08, - "loss": 0.1174, - "reward": 0.4051339477300644, - "reward_std": 0.18142017163336277, - "rewards/accuracy_reward": 0.0736607201397419, + "grad_norm": 12.345558166503906, + "kl": 2.9296875, + "learning_rate": 5.032020080616109e-08, + "loss": 0.1764, + "reward": 0.5184151977300644, + "reward_std": 0.1498600821942091, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4335937723517418, "step": 2728 }, { "clip_ratio": 0.0, - "completion_length": 1547.3527526855469, + "completion_length": 1732.26123046875, "epoch": 0.8151743708460907, - "grad_norm": 13.139270782470703, - "kl": 0.24658203125, - "learning_rate": 1.0032682263429787e-08, - "loss": 0.1313, - "reward": 0.3359375074505806, - "reward_std": 0.14073672704398632, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 5.124916076660156, + "kl": 4.0234375, + "learning_rate": 5.0163411317148934e-08, + "loss": 0.243, + "reward": 0.4520089477300644, + "reward_std": 0.11838055960834026, + "rewards/accuracy_reward": 0.026785715017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2729 }, { "clip_ratio": 0.0, - "completion_length": 1535.4308776855469, + "completion_length": 1721.9844665527344, "epoch": 0.8154730789336121, - "grad_norm": 13.83054256439209, - "kl": 0.22509765625, - "learning_rate": 1.0001367846151849e-08, - "loss": 0.1033, - "reward": 0.4302455484867096, - "reward_std": 0.16104257851839066, - "rewards/accuracy_reward": 0.08705357508733869, + "grad_norm": 15.696487426757812, + "kl": 2.51171875, + "learning_rate": 5.000683923075924e-08, + "loss": 0.1662, + "reward": 0.5329241380095482, + "reward_std": 0.13920924719423056, + "rewards/accuracy_reward": 0.10044643306173384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2730 }, { "clip_ratio": 0.0, - "completion_length": 1692.4777526855469, + "completion_length": 1908.6161499023438, "epoch": 0.8157717870211336, - "grad_norm": 11.23704719543457, - "kl": 0.28076171875, - "learning_rate": 9.970096943465389e-09, - "loss": 0.0988, - "reward": 0.2873883992433548, - "reward_std": 0.15812735259532928, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 6.437333583831787, + "kl": 3.548828125, + "learning_rate": 4.9850484717326946e-08, + "loss": 0.1893, + "reward": 0.4213169813156128, + "reward_std": 0.1636672541499138, + "rewards/accuracy_reward": 0.024553572526201606, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.278459832072258, + "rewards/tag_count_reward": 0.3967634066939354, "step": 2731 }, { "clip_ratio": 0.0, - "completion_length": 1528.1429443359375, + "completion_length": 1783.9688110351562, "epoch": 0.816070495108655, - "grad_norm": 13.01476764678955, - "kl": 0.20458984375, - "learning_rate": 9.938869589390109e-09, - "loss": 0.1103, - "reward": 0.4369419738650322, - "reward_std": 0.2371559925377369, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 19.7557430267334, + "kl": 2.359375, + "learning_rate": 4.9694347946950545e-08, + "loss": 0.1494, + "reward": 0.5368303954601288, + "reward_std": 0.19813701510429382, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.356584832072258, + "rewards/tag_count_reward": 0.4386160969734192, "step": 2732 }, { "clip_ratio": 0.0, - "completion_length": 1526.1920471191406, + "completion_length": 1766.6406860351562, "epoch": 0.8163692031961766, - "grad_norm": 13.276006698608398, - "kl": 0.2666015625, - "learning_rate": 9.907685817898315e-09, - "loss": 0.1036, - "reward": 0.381138414144516, - "reward_std": 0.16360164061188698, - "rewards/accuracy_reward": 0.060267859138548374, + "grad_norm": 13.689188003540039, + "kl": 3.04296875, + "learning_rate": 4.9538429089491576e-08, + "loss": 0.1847, + "reward": 0.4854910895228386, + "reward_std": 0.13879569806158543, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2733 }, { "clip_ratio": 0.0, - "completion_length": 1530.8795166015625, + "completion_length": 1695.8014221191406, "epoch": 0.816667911283698, - "grad_norm": 16.29642105102539, - "kl": 0.2314453125, - "learning_rate": 9.876545662914865e-09, - "loss": 0.1274, - "reward": 0.4129464402794838, - "reward_std": 0.20239246636629105, - "rewards/accuracy_reward": 0.07812500302679837, + "grad_norm": 24.04045867919922, + "kl": 2.02734375, + "learning_rate": 4.9382728314574324e-08, + "loss": 0.1453, + "reward": 0.5463169813156128, + "reward_std": 0.151080759242177, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4503348469734192, "step": 2734 }, { "clip_ratio": 0.0, - "completion_length": 1572.9933776855469, + "completion_length": 1822.2679443359375, "epoch": 0.8169666193712195, - "grad_norm": 16.537506103515625, - "kl": 0.2255859375, - "learning_rate": 9.845449158317215e-09, - "loss": 0.1215, - "reward": 0.4146205559372902, - "reward_std": 0.21106087788939476, - "rewards/accuracy_reward": 0.0758928582072258, + "grad_norm": 21.430410385131836, + "kl": 2.912109375, + "learning_rate": 4.9227245791586075e-08, + "loss": 0.169, + "reward": 0.5044643133878708, + "reward_std": 0.16666945070028305, + "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2735 }, { "clip_ratio": 0.0, - "completion_length": 1604.8415832519531, + "completion_length": 1820.8907165527344, "epoch": 0.8172653274587409, - "grad_norm": 15.69312858581543, - "kl": 0.2353515625, - "learning_rate": 9.814396337935327e-09, - "loss": 0.1084, - "reward": 0.4017857313156128, - "reward_std": 0.2132973074913025, - "rewards/accuracy_reward": 0.06919643189758062, + "grad_norm": 14.485987663269043, + "kl": 2.98046875, + "learning_rate": 4.9071981689676634e-08, + "loss": 0.164, + "reward": 0.5290178880095482, + "reward_std": 0.16725178435444832, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2736 }, { "clip_ratio": 0.0, - "completion_length": 1514.0648193359375, + "completion_length": 1694.6675109863281, "epoch": 0.8175640355462624, - "grad_norm": 13.367655754089355, - "kl": 0.23779296875, - "learning_rate": 9.783387235551599e-09, - "loss": 0.1013, - "reward": 0.3426339402794838, - "reward_std": 0.15391997806727886, - "rewards/accuracy_reward": 0.008928571827709675, + "grad_norm": 6.408156394958496, + "kl": 3.6796875, + "learning_rate": 4.8916936177758e-08, + "loss": 0.2245, + "reward": 0.4419643059372902, + "reward_std": 0.11656048893928528, + "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2737 }, { "clip_ratio": 0.0, - "completion_length": 1576.69873046875, + "completion_length": 1760.5848999023438, "epoch": 0.8178627436337839, - "grad_norm": 12.596209526062012, - "kl": 0.2451171875, - "learning_rate": 9.752421884900914e-09, - "loss": 0.1223, - "reward": 0.3911830484867096, - "reward_std": 0.21502719447016716, - "rewards/accuracy_reward": 0.05580357275903225, + "grad_norm": 5.579799175262451, + "kl": 3.4609375, + "learning_rate": 4.8762109424504566e-08, + "loss": 0.188, + "reward": 0.5066964477300644, + "reward_std": 0.1982294637709856, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/tag_count_reward": 0.439732164144516, "step": 2738 }, { "clip_ratio": 0.0, - "completion_length": 1600.7857971191406, + "completion_length": 1786.9978332519531, "epoch": 0.8181614517213054, - "grad_norm": 16.797449111938477, - "kl": 0.24267578125, - "learning_rate": 9.721500319670523e-09, - "loss": 0.1153, - "reward": 0.4062500223517418, - "reward_std": 0.17846156656742096, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 5.808907508850098, + "kl": 4.00390625, + "learning_rate": 4.8607501598352614e-08, + "loss": 0.2357, + "reward": 0.5239955484867096, + "reward_std": 0.14542509242892265, + "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2739 }, { "clip_ratio": 0.0, - "completion_length": 1625.7969360351562, + "completion_length": 1867.72998046875, "epoch": 0.8184601598088268, - "grad_norm": 14.260807037353516, - "kl": 0.249755859375, - "learning_rate": 9.690622573500067e-09, - "loss": 0.0811, - "reward": 0.3822544887661934, - "reward_std": 0.187506515532732, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 18.80755615234375, + "kl": 4.4140625, + "learning_rate": 4.845311286750034e-08, + "loss": 0.232, + "reward": 0.499441996216774, + "reward_std": 0.16835850104689598, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4146205484867096, "step": 2740 }, { "clip_ratio": 0.0, - "completion_length": 1532.5938415527344, + "completion_length": 1693.1473999023438, "epoch": 0.8187588678963483, - "grad_norm": 14.529183387756348, - "kl": 0.24755859375, - "learning_rate": 9.659788679981517e-09, - "loss": 0.1326, - "reward": 0.3727678656578064, - "reward_std": 0.19718341529369354, - "rewards/accuracy_reward": 0.053571430034935474, + "grad_norm": 6.932196617126465, + "kl": 4.265625, + "learning_rate": 4.829894339990759e-08, + "loss": 0.2736, + "reward": 0.4977678805589676, + "reward_std": 0.11465507559478283, + "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964477300644, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2741 }, { "clip_ratio": 0.0, - "completion_length": 1585.8013916015625, + "completion_length": 1730.8282165527344, "epoch": 0.8190575759838697, - "grad_norm": 16.774139404296875, - "kl": 0.2568359375, - "learning_rate": 9.628998672659111e-09, - "loss": 0.1323, - "reward": 0.3404018059372902, - "reward_std": 0.18767249584197998, - "rewards/accuracy_reward": 0.026785715017467737, + "grad_norm": 14.599860191345215, + "kl": 3.203125, + "learning_rate": 4.8144993363295554e-08, + "loss": 0.1721, + "reward": 0.4654018059372902, + "reward_std": 0.1146368533372879, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4453125223517418, "step": 2742 }, { "clip_ratio": 0.0, - "completion_length": 1580.8215026855469, + "completion_length": 1799.7210693359375, "epoch": 0.8193562840713913, - "grad_norm": 17.77239990234375, - "kl": 0.26708984375, - "learning_rate": 9.59825258502936e-09, - "loss": 0.1372, - "reward": 0.3844866305589676, - "reward_std": 0.19426696375012398, - "rewards/accuracy_reward": 0.06919643259607255, + "grad_norm": 12.48774528503418, + "kl": 4.96875, + "learning_rate": 4.7991262925146796e-08, + "loss": 0.2771, + "reward": 0.4877232387661934, + "reward_std": 0.14502577856183052, + "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4252232387661934, "step": 2743 }, { "clip_ratio": 0.0, - "completion_length": 1652.8415832519531, + "completion_length": 1864.7991943359375, "epoch": 0.8196549921589127, - "grad_norm": 12.155685424804688, - "kl": 0.26904296875, - "learning_rate": 9.567550450541012e-09, - "loss": 0.0938, - "reward": 0.3593750223517418, - "reward_std": 0.1648692861199379, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 15.269882202148438, + "kl": 4.9453125, + "learning_rate": 4.783775225270506e-08, + "loss": 0.262, + "reward": 0.4620535895228386, + "reward_std": 0.12518161162734032, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357238650322, + "rewards/tag_count_reward": 0.4062500149011612, "step": 2744 }, { "clip_ratio": 0.0, - "completion_length": 1586.8058776855469, + "completion_length": 1771.19873046875, "epoch": 0.8199537002464342, - "grad_norm": 15.286232948303223, - "kl": 0.253173828125, - "learning_rate": 9.536892302594957e-09, - "loss": 0.1076, - "reward": 0.380580373108387, - "reward_std": 0.19862747564911842, - "rewards/accuracy_reward": 0.06696428591385484, + "grad_norm": 7.249768257141113, + "kl": 3.34765625, + "learning_rate": 4.768446151297478e-08, + "loss": 0.2055, + "reward": 0.493861623108387, + "reward_std": 0.17075259797275066, + "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2745 }, { "clip_ratio": 0.0, - "completion_length": 1523.9018249511719, + "completion_length": 1720.6853637695312, "epoch": 0.8202524083339556, - "grad_norm": 18.901100158691406, - "kl": 0.231689453125, - "learning_rate": 9.50627817454427e-09, - "loss": 0.1418, - "reward": 0.4642857387661934, - "reward_std": 0.19716470688581467, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 10.755812644958496, + "kl": 2.3515625, + "learning_rate": 4.753139087272134e-08, + "loss": 0.1564, + "reward": 0.5753348395228386, + "reward_std": 0.16246528550982475, + "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345982164144516, + "rewards/tag_count_reward": 0.4414062723517418, "step": 2746 }, { "clip_ratio": 0.0, - "completion_length": 1538.6340026855469, + "completion_length": 1733.08935546875, "epoch": 0.8205511164214772, - "grad_norm": 13.828526496887207, - "kl": 0.239013671875, - "learning_rate": 9.475708099694125e-09, - "loss": 0.1106, - "reward": 0.3666294813156128, - "reward_std": 0.17756206169724464, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 26.904523849487305, + "kl": 2.625, + "learning_rate": 4.737854049847062e-08, + "loss": 0.1824, + "reward": 0.502232164144516, + "reward_std": 0.13225445337593555, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.4397321566939354, "step": 2747 }, { "clip_ratio": 0.0, - "completion_length": 1565.69873046875, + "completion_length": 1750.7969360351562, "epoch": 0.8208498245089986, - "grad_norm": 16.344032287597656, - "kl": 0.25537109375, - "learning_rate": 9.445182111301747e-09, - "loss": 0.1278, - "reward": 0.3783482238650322, - "reward_std": 0.2255045734345913, - "rewards/accuracy_reward": 0.05803571664728224, + "grad_norm": 8.477344512939453, + "kl": 3.41796875, + "learning_rate": 4.722591055650873e-08, + "loss": 0.2192, + "reward": 0.5106026977300644, + "reward_std": 0.19650974869728088, + "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.423549123108387, "step": 2748 }, { "clip_ratio": 0.0, - "completion_length": 1551.82373046875, + "completion_length": 1760.3840026855469, "epoch": 0.8211485325965201, - "grad_norm": 13.001303672790527, - "kl": 0.249755859375, - "learning_rate": 9.414700242576434e-09, - "loss": 0.1008, - "reward": 0.3956473395228386, - "reward_std": 0.21206364035606384, - "rewards/accuracy_reward": 0.04910714365541935, + "grad_norm": 3.753859519958496, + "kl": 3.125, + "learning_rate": 4.7073501212882175e-08, + "loss": 0.1788, + "reward": 0.4642857387661934, + "reward_std": 0.16779973357915878, + "rewards/accuracy_reward": 0.04241071594879031, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401977300644, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2749 }, { "clip_ratio": 0.0, - "completion_length": 1512.4509582519531, + "completion_length": 1737.915283203125, "epoch": 0.8214472406840415, - "grad_norm": 14.51625919342041, - "kl": 0.220947265625, - "learning_rate": 9.384262526679487e-09, - "loss": 0.1343, - "reward": 0.3420759066939354, - "reward_std": 0.1405128724873066, - "rewards/accuracy_reward": 0.0022321429569274187, + "grad_norm": 19.342784881591797, + "kl": 3.30078125, + "learning_rate": 4.6921312633397435e-08, + "loss": 0.2043, + "reward": 0.4330357387661934, + "reward_std": 0.10739328898489475, + "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2750 }, { "clip_ratio": 0.0, - "completion_length": 1522.7589721679688, + "completion_length": 1728.9822387695312, "epoch": 0.821745948771563, - "grad_norm": 16.549142837524414, - "kl": 0.235595703125, - "learning_rate": 9.353868996724123e-09, - "loss": 0.1267, - "reward": 0.4129464477300644, - "reward_std": 0.21338489279150963, - "rewards/accuracy_reward": 0.08482143329456449, + "grad_norm": 21.5251522064209, + "kl": 2.783203125, + "learning_rate": 4.676934498362062e-08, + "loss": 0.1855, + "reward": 0.5691964626312256, + "reward_std": 0.16651872545480728, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4486607313156128, "step": 2751 }, { "clip_ratio": 0.0, - "completion_length": 1555.5045166015625, + "completion_length": 1762.44873046875, "epoch": 0.8220446568590845, - "grad_norm": 15.006261825561523, - "kl": 0.217041015625, - "learning_rate": 9.323519685775561e-09, - "loss": 0.1351, - "reward": 0.365513414144516, - "reward_std": 0.1795053817331791, - "rewards/accuracy_reward": 0.017857143422588706, + "grad_norm": 10.28956413269043, + "kl": 3.09375, + "learning_rate": 4.66175984288778e-08, + "loss": 0.1942, + "reward": 0.4531250223517418, + "reward_std": 0.1373790167272091, + "rewards/accuracy_reward": 0.024553571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562723517418, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2752 }, { "clip_ratio": 0.0, - "completion_length": 1600.5781860351562, + "completion_length": 1811.2322387695312, "epoch": 0.822343364946606, - "grad_norm": 14.692523956298828, - "kl": 0.266357421875, - "learning_rate": 9.293214626850837e-09, - "loss": 0.1244, - "reward": 0.3381696566939354, - "reward_std": 0.15712940320372581, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 4.028258323669434, + "kl": 4.015625, + "learning_rate": 4.646607313425419e-08, + "loss": 0.2332, + "reward": 0.482700914144516, + "reward_std": 0.14489728212356567, + "rewards/accuracy_reward": 0.05803571571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297991082072258, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2753 }, { "clip_ratio": 0.0, - "completion_length": 1486.2344360351562, + "completion_length": 1698.6808776855469, "epoch": 0.8226420730341274, - "grad_norm": 17.513696670532227, - "kl": 0.22509765625, - "learning_rate": 9.26295385291891e-09, - "loss": 0.1796, - "reward": 0.3789062723517418, - "reward_std": 0.18667571991682053, - "rewards/accuracy_reward": 0.04241071711294353, + "grad_norm": 9.155792236328125, + "kl": 4.0703125, + "learning_rate": 4.631476926459454e-08, + "loss": 0.2402, + "reward": 0.4670759066939354, + "reward_std": 0.14625351503491402, + "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4291294887661934, "step": 2754 }, { "clip_ratio": 0.0, - "completion_length": 1569.6540832519531, + "completion_length": 1766.26123046875, "epoch": 0.8229407811216489, - "grad_norm": 13.248551368713379, - "kl": 0.232666015625, - "learning_rate": 9.232737396900542e-09, - "loss": 0.1245, - "reward": 0.4620535895228386, - "reward_std": 0.20858993381261826, - "rewards/accuracy_reward": 0.1383928619325161, + "grad_norm": 18.793636322021484, + "kl": 4.75390625, + "learning_rate": 4.616368698450271e-08, + "loss": 0.246, + "reward": 0.5463169887661934, + "reward_std": 0.17295417562127113, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2755 }, { "clip_ratio": 0.0, - "completion_length": 1571.8795471191406, + "completion_length": 1757.1116943359375, "epoch": 0.8232394892091703, - "grad_norm": 13.340867042541504, - "kl": 0.2314453125, - "learning_rate": 9.202565291668252e-09, - "loss": 0.1312, - "reward": 0.3811384066939354, - "reward_std": 0.2286497876048088, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 12.060697555541992, + "kl": 4.03125, + "learning_rate": 4.601282645834126e-08, + "loss": 0.2253, + "reward": 0.486049123108387, + "reward_std": 0.18290539272129536, + "rewards/accuracy_reward": 0.058035715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.428013414144516, "step": 2756 }, { "clip_ratio": 0.0, - "completion_length": 1545.7701721191406, + "completion_length": 1816.8215026855469, "epoch": 0.8235381972966918, - "grad_norm": 16.095064163208008, - "kl": 0.213623046875, - "learning_rate": 9.172437570046342e-09, - "loss": 0.1344, - "reward": 0.4090401902794838, - "reward_std": 0.2062562182545662, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 10.645362854003906, + "kl": 3.12890625, + "learning_rate": 4.5862187850231715e-08, + "loss": 0.1804, + "reward": 0.5189732387661934, + "reward_std": 0.17457383126020432, + "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4207589477300644, "step": 2757 }, { "clip_ratio": 0.0, - "completion_length": 1488.4174499511719, + "completion_length": 1682.3281860351562, "epoch": 0.8238369053842133, - "grad_norm": 14.913588523864746, - "kl": 0.212890625, - "learning_rate": 9.142354264810842e-09, - "loss": 0.1416, - "reward": 0.3789062649011612, - "reward_std": 0.1844697706401348, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 8.564470291137695, + "kl": 3.83203125, + "learning_rate": 4.571177132405421e-08, + "loss": 0.2386, + "reward": 0.4888393059372902, + "reward_std": 0.17207377217710018, + "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562574505806, + "rewards/tag_count_reward": 0.4375000223517418, "step": 2758 }, { "clip_ratio": 0.0, - "completion_length": 1624.5826721191406, + "completion_length": 1824.087158203125, "epoch": 0.8241356134717347, - "grad_norm": 11.568017959594727, - "kl": 0.2509765625, - "learning_rate": 9.112315408689414e-09, - "loss": 0.1124, - "reward": 0.3627232238650322, - "reward_std": 0.2047719769179821, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 9.462615966796875, + "kl": 3.92578125, + "learning_rate": 4.556157704344707e-08, + "loss": 0.2122, + "reward": 0.489397332072258, + "reward_std": 0.15512429550290108, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553656578064, + "rewards/tag_count_reward": 0.420200914144516, "step": 2759 }, { "clip_ratio": 0.0, - "completion_length": 1593.6250610351562, + "completion_length": 1709.977783203125, "epoch": 0.8244343215592562, - "grad_norm": 14.136724472045898, - "kl": 0.25537109375, - "learning_rate": 9.08232103436139e-09, - "loss": 0.1317, - "reward": 0.420758955180645, - "reward_std": 0.18457452580332756, - "rewards/accuracy_reward": 0.12276786123402417, + "grad_norm": 13.1007661819458, + "kl": 3.0234375, + "learning_rate": 4.541160517180695e-08, + "loss": 0.1915, + "reward": 0.5915178954601288, + "reward_std": 0.17268083058297634, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297991082072258, + "rewards/tag_count_reward": 0.4375000223517418, "step": 2760 }, { "clip_ratio": 0.0, - "completion_length": 1667.2879943847656, + "completion_length": 1849.9264221191406, "epoch": 0.8247330296467776, - "grad_norm": 12.692039489746094, - "kl": 0.2548828125, - "learning_rate": 9.052371174457735e-09, - "loss": 0.0961, - "reward": 0.3225446566939354, - "reward_std": 0.19119861349463463, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 5.7390265464782715, + "kl": 3.7578125, + "learning_rate": 4.526185587228867e-08, + "loss": 0.1996, + "reward": 0.4508928880095482, + "reward_std": 0.18440180644392967, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196492433548, + "rewards/tag_count_reward": 0.4174107387661934, "step": 2761 }, { "clip_ratio": 0.0, - "completion_length": 1542.9755249023438, + "completion_length": 1797.2054443359375, "epoch": 0.8250317377342992, - "grad_norm": 13.144784927368164, - "kl": 0.228759765625, - "learning_rate": 9.02246586156093e-09, - "loss": 0.1307, - "reward": 0.3900669813156128, - "reward_std": 0.1784358099102974, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 4.378124237060547, + "kl": 4.25390625, + "learning_rate": 4.511232930780465e-08, + "loss": 0.2535, + "reward": 0.4860491305589676, + "reward_std": 0.12500005587935448, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2762 }, { "clip_ratio": 0.0, - "completion_length": 1594.4197082519531, + "completion_length": 1779.680908203125, "epoch": 0.8253304458218206, - "grad_norm": 12.052157402038574, - "kl": 0.224853515625, - "learning_rate": 8.992605128205044e-09, - "loss": 0.107, - "reward": 0.3744419738650322, - "reward_std": 0.21648241579532623, - "rewards/accuracy_reward": 0.040178573690354824, + "grad_norm": 2.2467453479766846, + "kl": 2.603515625, + "learning_rate": 4.496302564102522e-08, + "loss": 0.1524, + "reward": 0.4916294813156128, + "reward_std": 0.20204727724194527, + "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.4335937798023224, "step": 2763 }, { "clip_ratio": 0.0, - "completion_length": 1638.9465026855469, + "completion_length": 1822.1451416015625, "epoch": 0.8256291539093421, - "grad_norm": 12.106576919555664, - "kl": 0.24072265625, - "learning_rate": 8.962789006875616e-09, - "loss": 0.1167, - "reward": 0.4603794887661934, - "reward_std": 0.21171562001109123, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 13.633927345275879, + "kl": 2.81640625, + "learning_rate": 4.4813945034378075e-08, + "loss": 0.1703, + "reward": 0.5613839402794838, + "reward_std": 0.1623811051249504, + "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2764 }, { "clip_ratio": 0.0, - "completion_length": 1644.4219665527344, + "completion_length": 1785.76123046875, "epoch": 0.8259278619968635, - "grad_norm": 10.255744934082031, - "kl": 0.309814453125, - "learning_rate": 8.933017530009668e-09, - "loss": 0.0888, - "reward": 0.3426339328289032, - "reward_std": 0.17234552651643753, - "rewards/accuracy_reward": 0.046875000931322575, + "grad_norm": 18.230432510375977, + "kl": 2.8984375, + "learning_rate": 4.4665087650048336e-08, + "loss": 0.1808, + "reward": 0.4799107387661934, + "reward_std": 0.13493396900594234, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589402794838, + "rewards/tag_count_reward": 0.428571455180645, "step": 2765 }, { "clip_ratio": 0.0, - "completion_length": 1526.0625915527344, + "completion_length": 1704.9197387695312, "epoch": 0.826226570084385, - "grad_norm": 12.821746826171875, - "kl": 0.243408203125, - "learning_rate": 8.90329072999566e-09, - "loss": 0.1108, - "reward": 0.4034598395228386, - "reward_std": 0.19647909700870514, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 5.058344841003418, + "kl": 3.64453125, + "learning_rate": 4.4516453649978304e-08, + "loss": 0.2266, + "reward": 0.5011161044239998, + "reward_std": 0.16021873615682125, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2766 }, { "clip_ratio": 0.0, - "completion_length": 1690.5335388183594, + "completion_length": 1856.7590026855469, "epoch": 0.8265252781719065, - "grad_norm": 14.258857727050781, - "kl": 0.269287109375, - "learning_rate": 8.873608639173419e-09, - "loss": 0.108, - "reward": 0.3152901902794838, - "reward_std": 0.17972948774695396, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 20.13213539123535, + "kl": 4.75, + "learning_rate": 4.436804319586709e-08, + "loss": 0.2462, + "reward": 0.428013414144516, + "reward_std": 0.1505781952291727, + "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.4056919813156128, "step": 2767 }, { "clip_ratio": 0.0, - "completion_length": 1620.0826416015625, + "completion_length": 1811.3193054199219, "epoch": 0.826823986259428, - "grad_norm": 11.526544570922852, - "kl": 0.25537109375, - "learning_rate": 8.843971289834157e-09, - "loss": 0.084, - "reward": 0.3504464402794838, - "reward_std": 0.20914439111948013, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 4.587947368621826, + "kl": 3.54296875, + "learning_rate": 4.421985644917078e-08, + "loss": 0.2035, + "reward": 0.4687500223517418, + "reward_std": 0.19177458062767982, + "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357238650322, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2768 }, { "clip_ratio": 0.0, - "completion_length": 1616.7969665527344, + "completion_length": 1758.3393859863281, "epoch": 0.8271226943469494, - "grad_norm": 13.605602264404297, - "kl": 0.273681640625, - "learning_rate": 8.814378714220421e-09, - "loss": 0.1276, - "reward": 0.4631696715950966, - "reward_std": 0.18097121641039848, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 15.049426078796387, + "kl": 3.84765625, + "learning_rate": 4.4071893571102106e-08, + "loss": 0.2578, + "reward": 0.5803571715950966, + "reward_std": 0.1423000916838646, + "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2769 }, { "clip_ratio": 0.0, - "completion_length": 1605.8639221191406, + "completion_length": 1789.8193054199219, "epoch": 0.8274214024344709, - "grad_norm": 14.871074676513672, - "kl": 0.240478515625, - "learning_rate": 8.784830944526005e-09, - "loss": 0.1151, - "reward": 0.4095982387661934, - "reward_std": 0.19729825481772423, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 12.692499160766602, + "kl": 4.83984375, + "learning_rate": 4.392415472263003e-08, + "loss": 0.2733, + "reward": 0.5178571790456772, + "reward_std": 0.16736486367881298, + "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2770 }, { "clip_ratio": 0.0, - "completion_length": 1629.6652526855469, + "completion_length": 1776.5447082519531, "epoch": 0.8277201105219923, - "grad_norm": 11.85389518737793, - "kl": 0.26123046875, - "learning_rate": 8.755328012896002e-09, - "loss": 0.1086, - "reward": 0.3761160746216774, - "reward_std": 0.19249200820922852, - "rewards/accuracy_reward": 0.06250000488944352, + "grad_norm": 8.108077049255371, + "kl": 4.140625, + "learning_rate": 4.3776640064480006e-08, + "loss": 0.2426, + "reward": 0.4877232313156128, + "reward_std": 0.17130259796977043, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2771 }, { "clip_ratio": 0.0, - "completion_length": 1519.6340026855469, + "completion_length": 1760.24560546875, "epoch": 0.8280188186095139, - "grad_norm": 13.007307052612305, - "kl": 0.250244140625, - "learning_rate": 8.725869951426723e-09, - "loss": 0.1323, - "reward": 0.404017873108387, - "reward_std": 0.23751064389944077, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 10.073029518127441, + "kl": 4.69140625, + "learning_rate": 4.362934975713362e-08, + "loss": 0.2737, + "reward": 0.4860491305589676, + "reward_std": 0.19546386413276196, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2772 }, { "clip_ratio": 0.0, - "completion_length": 1549.8817749023438, + "completion_length": 1746.1741943359375, "epoch": 0.8283175266970353, - "grad_norm": 13.906777381896973, - "kl": 0.245849609375, - "learning_rate": 8.696456792165619e-09, - "loss": 0.1335, - "reward": 0.4687500149011612, - "reward_std": 0.23507516831159592, - "rewards/accuracy_reward": 0.13839286286383867, + "grad_norm": 5.034422874450684, + "kl": 3.4921875, + "learning_rate": 4.348228396082809e-08, + "loss": 0.199, + "reward": 0.6054687649011612, + "reward_std": 0.23008305206894875, + "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2773 }, { "clip_ratio": 0.0, - "completion_length": 1632.9398193359375, + "completion_length": 1800.4644165039062, "epoch": 0.8286162347845568, - "grad_norm": 11.192665100097656, - "kl": 0.250732421875, - "learning_rate": 8.667088567111348e-09, - "loss": 0.1003, - "reward": 0.3939732387661934, - "reward_std": 0.19699643924832344, - "rewards/accuracy_reward": 0.07142857508733869, + "grad_norm": 6.503471374511719, + "kl": 2.771484375, + "learning_rate": 4.3335442835556735e-08, + "loss": 0.1608, + "reward": 0.5022321566939354, + "reward_std": 0.15359157510101795, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4263392984867096, "step": 2774 }, { "clip_ratio": 0.0, - "completion_length": 1585.2143859863281, + "completion_length": 1776.6005554199219, "epoch": 0.8289149428720782, - "grad_norm": 12.146256446838379, - "kl": 0.227783203125, - "learning_rate": 8.637765308213668e-09, - "loss": 0.1192, - "reward": 0.3833705484867096, - "reward_std": 0.2352929711341858, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 6.779937744140625, + "kl": 2.984375, + "learning_rate": 4.318882654106834e-08, + "loss": 0.1681, + "reward": 0.5005580484867096, + "reward_std": 0.1902486328035593, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2775 }, { "clip_ratio": 0.0, - "completion_length": 1478.9152526855469, + "completion_length": 1700.7991943359375, "epoch": 0.8292136509595998, - "grad_norm": 10.638873100280762, - "kl": 0.232666015625, - "learning_rate": 8.608487047373381e-09, - "loss": 0.0989, - "reward": 0.5189732387661934, - "reward_std": 0.20748920366168022, - "rewards/accuracy_reward": 0.1718750074505806, + "grad_norm": 8.174077987670898, + "kl": 3.84375, + "learning_rate": 4.3042435236866906e-08, + "loss": 0.224, + "reward": 0.6004464477300644, + "reward_std": 0.1744564939290285, + "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982387661934, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2776 }, { "clip_ratio": 0.0, - "completion_length": 1572.9911499023438, + "completion_length": 1752.5960388183594, "epoch": 0.8295123590471212, - "grad_norm": 12.153519630432129, - "kl": 0.23779296875, - "learning_rate": 8.5792538164424e-09, - "loss": 0.1088, - "reward": 0.3900669813156128, - "reward_std": 0.20118067041039467, - "rewards/accuracy_reward": 0.0580357164144516, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "grad_norm": 7.453596591949463, + "kl": 3.22265625, + "learning_rate": 4.2896269082212e-08, + "loss": 0.196, + "reward": 0.5200893133878708, + "reward_std": 0.1831824593245983, + "rewards/accuracy_reward": 0.08705357671715319, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4330357387661934, "step": 2777 }, { "clip_ratio": 0.0, - "completion_length": 1534.1897888183594, + "completion_length": 1715.3795471191406, "epoch": 0.8298110671346427, - "grad_norm": 14.396917343139648, - "kl": 0.257568359375, - "learning_rate": 8.55006564722358e-09, - "loss": 0.0985, - "reward": 0.507254496216774, - "reward_std": 0.2322000376880169, - "rewards/accuracy_reward": 0.17410715389996767, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "grad_norm": 9.621438026428223, + "kl": 3.1953125, + "learning_rate": 4.2750328236117906e-08, + "loss": 0.1982, + "reward": 0.6339286044239998, + "reward_std": 0.18803077191114426, + "rewards/accuracy_reward": 0.1941964328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4397321566939354, "step": 2778 }, { "clip_ratio": 0.0, - "completion_length": 1643.9844360351562, + "completion_length": 1774.2054443359375, "epoch": 0.8301097752221641, - "grad_norm": 13.384730339050293, - "kl": 0.21728515625, - "learning_rate": 8.520922571470806e-09, - "loss": 0.1221, - "reward": 0.3537946566939354, - "reward_std": 0.18176624551415443, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 7.126408576965332, + "kl": 3.06640625, + "learning_rate": 4.2604612857354036e-08, + "loss": 0.1858, + "reward": 0.475446455180645, + "reward_std": 0.16569902002811432, + "rewards/accuracy_reward": 0.03794643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4375000223517418, "step": 2779 }, { "clip_ratio": 0.0, - "completion_length": 1594.9732666015625, + "completion_length": 1811.5000915527344, "epoch": 0.8304084833096856, - "grad_norm": 14.51622486114502, - "kl": 0.264404296875, - "learning_rate": 8.491824620888904e-09, - "loss": 0.128, - "reward": 0.337611623108387, - "reward_std": 0.18602729588747025, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 23.061695098876953, + "kl": 4.48046875, + "learning_rate": 4.245912310444452e-08, + "loss": 0.2537, + "reward": 0.4458705559372902, + "reward_std": 0.18903403356671333, + "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4034598395228386, "step": 2780 }, { "clip_ratio": 0.0, - "completion_length": 1621.587158203125, + "completion_length": 1842.8125915527344, "epoch": 0.8307071913972071, - "grad_norm": 15.766510963439941, - "kl": 0.29736328125, - "learning_rate": 8.462771827133563e-09, - "loss": 0.1201, - "reward": 0.4190848395228386, - "reward_std": 0.20209532976150513, - "rewards/accuracy_reward": 0.08705357555299997, + "grad_norm": 6.052119731903076, + "kl": 3.13671875, + "learning_rate": 4.231385913566782e-08, + "loss": 0.1792, + "reward": 0.522879496216774, + "reward_std": 0.16933592408895493, + "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2781 }, { "clip_ratio": 0.0, - "completion_length": 1700.8348693847656, + "completion_length": 1835.8415832519531, "epoch": 0.8310058994847286, - "grad_norm": 12.709386825561523, - "kl": 0.23583984375, - "learning_rate": 8.433764221811385e-09, - "loss": 0.1115, - "reward": 0.3549107313156128, - "reward_std": 0.18617719039320946, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 7.794976711273193, + "kl": 3.8828125, + "learning_rate": 4.2168821109056926e-08, + "loss": 0.2124, + "reward": 0.4709821715950966, + "reward_std": 0.15543937124311924, + "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357238650322, + "rewards/tag_count_reward": 0.4174107387661934, "step": 2782 }, { "clip_ratio": 0.0, - "completion_length": 1639.0536499023438, + "completion_length": 1753.6050109863281, "epoch": 0.83130460757225, - "grad_norm": 16.39934539794922, - "kl": 0.26318359375, - "learning_rate": 8.404801836479808e-09, - "loss": 0.1419, - "reward": 0.4458705559372902, - "reward_std": 0.1989843063056469, - "rewards/accuracy_reward": 0.11830357788130641, + "grad_norm": 8.775994300842285, + "kl": 4.0234375, + "learning_rate": 4.202400918239904e-08, + "loss": 0.2569, + "reward": 0.5429687723517418, + "reward_std": 0.14894486032426357, + "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669664144516, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2783 }, { "clip_ratio": 0.0, - "completion_length": 1621.3415832519531, + "completion_length": 1803.9755249023438, "epoch": 0.8316033156597715, - "grad_norm": 13.603652954101562, - "kl": 0.250732421875, - "learning_rate": 8.375884702647062e-09, - "loss": 0.1232, - "reward": 0.3320312649011612, - "reward_std": 0.18190258741378784, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 10.189074516296387, + "kl": 4.318359375, + "learning_rate": 4.1879423513235315e-08, + "loss": 0.2311, + "reward": 0.4447544813156128, + "reward_std": 0.12989217415452003, + "rewards/accuracy_reward": 0.0223214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2784 }, { "clip_ratio": 0.0, - "completion_length": 1532.3639221191406, + "completion_length": 1739.3460693359375, "epoch": 0.8319020237472929, - "grad_norm": 13.075509071350098, - "kl": 0.245361328125, - "learning_rate": 8.347012851772173e-09, - "loss": 0.1178, - "reward": 0.4726562723517418, - "reward_std": 0.2072011511772871, - "rewards/accuracy_reward": 0.1316964328289032, + "grad_norm": 6.270489692687988, + "kl": 3.0, + "learning_rate": 4.173506425886086e-08, + "loss": 0.1655, + "reward": 0.5781250298023224, + "reward_std": 0.14418099261820316, + "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2785 }, { "clip_ratio": 0.0, - "completion_length": 1530.1295166015625, + "completion_length": 1716.6697692871094, "epoch": 0.8322007318348145, - "grad_norm": 17.76808738708496, - "kl": 0.2744140625, - "learning_rate": 8.318186315264857e-09, - "loss": 0.1619, - "reward": 0.3861607313156128, - "reward_std": 0.19023917987942696, - "rewards/accuracy_reward": 0.053571430733427405, + "grad_norm": 8.07413101196289, + "kl": 4.03125, + "learning_rate": 4.159093157632429e-08, + "loss": 0.2421, + "reward": 0.5011160895228386, + "reward_std": 0.17074968665838242, + "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2786 }, { "clip_ratio": 0.0, - "completion_length": 1642.7813110351562, + "completion_length": 1861.7702026367188, "epoch": 0.8324994399223359, - "grad_norm": 10.922505378723145, - "kl": 0.289794921875, - "learning_rate": 8.289405124485571e-09, - "loss": 0.102, - "reward": 0.352678582072258, - "reward_std": 0.17747922241687775, - "rewards/accuracy_reward": 0.0535714291036129, + "grad_norm": 14.408561706542969, + "kl": 4.7265625, + "learning_rate": 4.144702562242785e-08, + "loss": 0.2497, + "reward": 0.4843750149011612, + "reward_std": 0.16984355077147484, + "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071492433548, + "rewards/tag_count_reward": 0.4017857313156128, "step": 2787 }, { "clip_ratio": 0.0, - "completion_length": 1538.9978637695312, + "completion_length": 1718.3036499023438, "epoch": 0.8327981480098574, - "grad_norm": 16.97675895690918, - "kl": 0.255126953125, - "learning_rate": 8.260669310745444e-09, - "loss": 0.1268, - "reward": 0.4090401977300644, - "reward_std": 0.18580953404307365, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 4.0564141273498535, + "kl": 3.41796875, + "learning_rate": 4.130334655372722e-08, + "loss": 0.1969, + "reward": 0.487723246216774, + "reward_std": 0.14724065363407135, + "rewards/accuracy_reward": 0.06919643003493547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2788 }, { "clip_ratio": 0.0, - "completion_length": 1612.6585693359375, + "completion_length": 1767.4732971191406, "epoch": 0.8330968560973788, - "grad_norm": 14.014906883239746, - "kl": 0.290283203125, - "learning_rate": 8.231978905306204e-09, - "loss": 0.1216, - "reward": 0.352678582072258, - "reward_std": 0.201266847550869, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 13.340109825134277, + "kl": 3.0, + "learning_rate": 4.115989452653101e-08, + "loss": 0.1906, + "reward": 0.5005580559372902, + "reward_std": 0.15062959492206573, + "rewards/accuracy_reward": 0.08035714458674192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357313156128, + "rewards/tag_count_reward": 0.4202009066939354, "step": 2789 }, { "clip_ratio": 0.0, - "completion_length": 1511.4911499023438, + "completion_length": 1713.8036804199219, "epoch": 0.8333955641849004, - "grad_norm": 15.98135757446289, - "kl": 0.241943359375, - "learning_rate": 8.203333939380207e-09, - "loss": 0.1384, - "reward": 0.4402901977300644, - "reward_std": 0.17116467095911503, - "rewards/accuracy_reward": 0.09821428847499192, + "grad_norm": 11.27281665802002, + "kl": 3.28515625, + "learning_rate": 4.101666969690104e-08, + "loss": 0.2106, + "reward": 0.5479911044239998, + "reward_std": 0.15904320031404495, + "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3420759066939354, + "rewards/tag_count_reward": 0.427455373108387, "step": 2790 }, { "clip_ratio": 0.0, - "completion_length": 1604.0647888183594, + "completion_length": 1769.5804443359375, "epoch": 0.8336942722724218, - "grad_norm": 16.035985946655273, - "kl": 0.267822265625, - "learning_rate": 8.174734444130382e-09, - "loss": 0.1415, - "reward": 0.420200914144516, - "reward_std": 0.18106476962566376, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 7.325322151184082, + "kl": 4.3203125, + "learning_rate": 4.087367222065191e-08, + "loss": 0.2469, + "reward": 0.5111607313156128, + "reward_std": 0.1578002106398344, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4062500149011612, "step": 2791 }, { "clip_ratio": 0.0, - "completion_length": 1647.7210388183594, + "completion_length": 1805.24560546875, "epoch": 0.8339929803599433, - "grad_norm": 14.340829849243164, - "kl": 0.25048828125, - "learning_rate": 8.146180450670153e-09, - "loss": 0.1351, - "reward": 0.3660714402794838, - "reward_std": 0.18106476217508316, - "rewards/accuracy_reward": 0.05133928940631449, + "grad_norm": 8.508357048034668, + "kl": 3.8671875, + "learning_rate": 4.073090225335077e-08, + "loss": 0.1999, + "reward": 0.4916294813156128, + "reward_std": 0.16134170815348625, + "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.420200914144516, "step": 2792 }, { "clip_ratio": 0.0, - "completion_length": 1646.4375610351562, + "completion_length": 1820.8237609863281, "epoch": 0.8342916884474647, - "grad_norm": 15.260019302368164, - "kl": 0.29052734375, - "learning_rate": 8.117671990063485e-09, - "loss": 0.1288, - "reward": 0.3537946492433548, - "reward_std": 0.189312145113945, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 6.702682018280029, + "kl": 4.33203125, + "learning_rate": 4.058835995031742e-08, + "loss": 0.2384, + "reward": 0.4838169887661934, + "reward_std": 0.1821067165583372, + "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3002232238650322, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2793 }, { "clip_ratio": 0.0, - "completion_length": 1582.060302734375, + "completion_length": 1753.2835693359375, "epoch": 0.8345903965349862, - "grad_norm": 14.09676742553711, - "kl": 0.2587890625, - "learning_rate": 8.089209093324784e-09, - "loss": 0.1485, - "reward": 0.423549123108387, - "reward_std": 0.19569650292396545, - "rewards/accuracy_reward": 0.10491072246804833, + "grad_norm": 7.991125583648682, + "kl": 3.328125, + "learning_rate": 4.044604546662392e-08, + "loss": 0.204, + "reward": 0.550781287252903, + "reward_std": 0.16032158955931664, + "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2794 }, { "clip_ratio": 0.0, - "completion_length": 1586.0781860351562, + "completion_length": 1772.5625610351562, "epoch": 0.8348891046225076, - "grad_norm": 15.846341133117676, - "kl": 0.2626953125, - "learning_rate": 8.060791791418886e-09, - "loss": 0.1221, - "reward": 0.4118303805589676, - "reward_std": 0.20412638783454895, - "rewards/accuracy_reward": 0.07366071688011289, + "grad_norm": 13.692147254943848, + "kl": 2.90625, + "learning_rate": 4.030395895709443e-08, + "loss": 0.195, + "reward": 0.5083705559372902, + "reward_std": 0.14044577442109585, + "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4391741305589676, "step": 2795 }, { "clip_ratio": 0.0, - "completion_length": 1628.5380249023438, + "completion_length": 1813.9487609863281, "epoch": 0.8351878127100292, - "grad_norm": 16.714340209960938, - "kl": 0.232421875, - "learning_rate": 8.032420115261046e-09, - "loss": 0.1305, - "reward": 0.4213169813156128, - "reward_std": 0.2155333273112774, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 4.373557090759277, + "kl": 3.44140625, + "learning_rate": 4.016210057630523e-08, + "loss": 0.1953, + "reward": 0.502232164144516, + "reward_std": 0.1702708825469017, + "rewards/accuracy_reward": 0.07366071920841932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.428571455180645, "step": 2796 }, { "clip_ratio": 0.0, - "completion_length": 1538.8326721191406, + "completion_length": 1740.2947082519531, "epoch": 0.8354865207975506, - "grad_norm": 13.590836524963379, - "kl": 0.21484375, - "learning_rate": 8.004094095716841e-09, - "loss": 0.1208, - "reward": 0.3750000149011612, - "reward_std": 0.21087028086185455, - "rewards/accuracy_reward": 0.03348214435391128, + "grad_norm": 12.968836784362793, + "kl": 2.59375, + "learning_rate": 4.00204704785842e-08, + "loss": 0.1494, + "reward": 0.4988839477300644, + "reward_std": 0.17823892831802368, + "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.443080373108387, "step": 2797 }, { "clip_ratio": 0.0, - "completion_length": 1561.8772888183594, + "completion_length": 1769.4085388183594, "epoch": 0.8357852288850721, - "grad_norm": 9.992125511169434, - "kl": 0.21533203125, - "learning_rate": 7.975813763602218e-09, - "loss": 0.0983, - "reward": 0.4665178656578064, - "reward_std": 0.2117176540195942, - "rewards/accuracy_reward": 0.12276786006987095, + "grad_norm": 18.100200653076172, + "kl": 4.03515625, + "learning_rate": 3.987906881801109e-08, + "loss": 0.2161, + "reward": 0.5558035969734192, + "reward_std": 0.2106059268116951, + "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3437500149011612, + "rewards/tag_count_reward": 0.4218750149011612, "step": 2798 }, { "clip_ratio": 0.0, - "completion_length": 1551.9933471679688, + "completion_length": 1752.9531860351562, "epoch": 0.8360839369725935, - "grad_norm": 14.453038215637207, - "kl": 0.27099609375, - "learning_rate": 7.94757914968342e-09, - "loss": 0.1262, - "reward": 0.5267857387661934, - "reward_std": 0.18370817601680756, - "rewards/accuracy_reward": 0.20089286379516125, + "grad_norm": 24.076189041137695, + "kl": 4.9765625, + "learning_rate": 3.97378957484171e-08, + "loss": 0.2729, + "reward": 0.6188616305589676, + "reward_std": 0.15001879073679447, + "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.415736623108387, "step": 2799 }, { "clip_ratio": 0.0, - "completion_length": 1543.9755249023438, + "completion_length": 1757.7009887695312, "epoch": 0.836382645060115, - "grad_norm": 17.44178009033203, - "kl": 0.2607421875, - "learning_rate": 7.91939028467692e-09, - "loss": 0.1361, - "reward": 0.3655133992433548, - "reward_std": 0.21364489942789078, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 25.359663009643555, + "kl": 4.5859375, + "learning_rate": 3.9596951423384596e-08, + "loss": 0.2659, + "reward": 0.4832589477300644, + "reward_std": 0.19660427793860435, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4140625223517418, "step": 2800 }, { "clip_ratio": 0.0, - "completion_length": 1574.9509582519531, + "completion_length": 1767.0156860351562, "epoch": 0.8366813531476365, - "grad_norm": 15.991138458251953, - "kl": 0.254638671875, - "learning_rate": 7.89124719924944e-09, - "loss": 0.1409, - "reward": 0.5898437649011612, - "reward_std": 0.21256392821669579, - "rewards/accuracy_reward": 0.2589285895228386, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "grad_norm": 13.452777862548828, + "kl": 4.53515625, + "learning_rate": 3.94562359962472e-08, + "loss": 0.2648, + "reward": 0.718191996216774, + "reward_std": 0.19460217095911503, + "rewards/accuracy_reward": 0.2879464440047741, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4302455559372902, "step": 2801 }, { "clip_ratio": 0.0, - "completion_length": 1685.7255249023438, + "completion_length": 1863.6697692871094, "epoch": 0.8369800612351579, - "grad_norm": 12.31572437286377, - "kl": 0.241455078125, - "learning_rate": 7.86314992401792e-09, - "loss": 0.1028, - "reward": 0.4252232238650322, - "reward_std": 0.18398860469460487, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 28.666614532470703, + "kl": 4.2109375, + "learning_rate": 3.93157496200896e-08, + "loss": 0.2109, + "reward": 0.5429687798023224, + "reward_std": 0.16688236221671104, + "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2802 }, { "clip_ratio": 0.0, - "completion_length": 1612.5603332519531, + "completion_length": 1801.5000610351562, "epoch": 0.8372787693226794, - "grad_norm": 13.246668815612793, - "kl": 0.27490234375, - "learning_rate": 7.835098489549419e-09, - "loss": 0.1241, - "reward": 0.4034598395228386, - "reward_std": 0.1794111207127571, - "rewards/accuracy_reward": 0.08705357508733869, + "grad_norm": 4.676569938659668, + "kl": 3.70703125, + "learning_rate": 3.917549244774709e-08, + "loss": 0.2008, + "reward": 0.4893973469734192, + "reward_std": 0.10330197587609291, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2803 }, { "clip_ratio": 0.0, - "completion_length": 1591.2545166015625, + "completion_length": 1796.9732971191406, "epoch": 0.8375774774102008, - "grad_norm": 12.196746826171875, - "kl": 0.229736328125, - "learning_rate": 7.807092926361152e-09, - "loss": 0.1113, - "reward": 0.3565848395228386, - "reward_std": 0.20065505802631378, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 9.16559886932373, + "kl": 3.6953125, + "learning_rate": 3.903546463180577e-08, + "loss": 0.2251, + "reward": 0.4408482313156128, + "reward_std": 0.15753020904958248, + "rewards/accuracy_reward": 0.033482145285233855, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.4073660895228386, "step": 2804 }, { "clip_ratio": 0.0, - "completion_length": 1576.3348693847656, + "completion_length": 1743.8750610351562, "epoch": 0.8378761854977224, - "grad_norm": 12.154316902160645, - "kl": 0.263671875, - "learning_rate": 7.779133264920446e-09, - "loss": 0.0937, - "reward": 0.4229910895228386, - "reward_std": 0.17533785849809647, - "rewards/accuracy_reward": 0.1049107164144516, + "grad_norm": 29.317298889160156, + "kl": 2.93359375, + "learning_rate": 3.889566632460223e-08, + "loss": 0.1926, + "reward": 0.5407366380095482, + "reward_std": 0.14402149058878422, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3180803656578064, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2805 }, { "clip_ratio": 0.0, - "completion_length": 1615.7098999023438, + "completion_length": 1784.9710388183594, "epoch": 0.8381748935852438, - "grad_norm": 15.249969482421875, - "kl": 0.249267578125, - "learning_rate": 7.751219535644648e-09, - "loss": 0.1158, - "reward": 0.3995535969734192, - "reward_std": 0.19137867912650108, - "rewards/accuracy_reward": 0.07366071734577417, + "grad_norm": 10.296802520751953, + "kl": 3.52734375, + "learning_rate": 3.8756097678223246e-08, + "loss": 0.197, + "reward": 0.521205373108387, + "reward_std": 0.20996534451842308, + "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2806 }, { "clip_ratio": 0.0, - "completion_length": 1521.8304138183594, + "completion_length": 1732.69873046875, "epoch": 0.8384736016727653, - "grad_norm": 11.174019813537598, - "kl": 0.21142578125, - "learning_rate": 7.723351768901171e-09, - "loss": 0.1121, - "reward": 0.4458705559372902, - "reward_std": 0.18330435454845428, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 16.911495208740234, + "kl": 3.23828125, + "learning_rate": 3.8616758844505854e-08, + "loss": 0.2049, + "reward": 0.535156287252903, + "reward_std": 0.14113422855734825, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2807 }, { "clip_ratio": 0.0, - "completion_length": 1526.2300109863281, + "completion_length": 1731.1116638183594, "epoch": 0.8387723097602867, - "grad_norm": 16.20201873779297, - "kl": 0.211181640625, - "learning_rate": 7.695529995007416e-09, - "loss": 0.1595, - "reward": 0.4268973395228386, - "reward_std": 0.22240883484482765, - "rewards/accuracy_reward": 0.0870535746216774, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "grad_norm": 14.176715850830078, + "kl": 2.4052734375, + "learning_rate": 3.847764997503708e-08, + "loss": 0.1675, + "reward": 0.537388414144516, + "reward_std": 0.18824812024831772, + "rewards/accuracy_reward": 0.10267857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2808 }, { "clip_ratio": 0.0, - "completion_length": 1568.5848999023438, + "completion_length": 1775.8348999023438, "epoch": 0.8390710178478082, - "grad_norm": 12.446717262268066, - "kl": 0.224365234375, - "learning_rate": 7.66775424423074e-09, - "loss": 0.1315, - "reward": 0.4079241305589676, - "reward_std": 0.19289325550198555, - "rewards/accuracy_reward": 0.07589286402799189, + "grad_norm": 10.756903648376465, + "kl": 3.265625, + "learning_rate": 3.83387712211537e-08, + "loss": 0.1964, + "reward": 0.5111607387661934, + "reward_std": 0.15385663695633411, + "rewards/accuracy_reward": 0.08482143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2809 }, { "clip_ratio": 0.0, - "completion_length": 1627.0335388183594, + "completion_length": 1758.2701416015625, "epoch": 0.8393697259353297, - "grad_norm": 13.479866981506348, - "kl": 0.253173828125, - "learning_rate": 7.640024546788448e-09, - "loss": 0.129, - "reward": 0.4151785895228386, - "reward_std": 0.18174635246396065, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 17.201772689819336, + "kl": 3.79296875, + "learning_rate": 3.820012273394224e-08, + "loss": 0.2336, + "reward": 0.5636160969734192, + "reward_std": 0.14474987797439098, + "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.294642873108387, + "rewards/tag_count_reward": 0.4252232387661934, "step": 2810 }, { "clip_ratio": 0.0, - "completion_length": 1574.1697387695312, + "completion_length": 1759.4465026855469, "epoch": 0.8396684340228512, - "grad_norm": 11.932221412658691, - "kl": 0.21142578125, - "learning_rate": 7.612340932847711e-09, - "loss": 0.1262, - "reward": 0.4213169887661934, - "reward_std": 0.25017375126481056, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 11.889358520507812, + "kl": 3.53125, + "learning_rate": 3.806170466423855e-08, + "loss": 0.1892, + "reward": 0.5385044887661934, + "reward_std": 0.20358259975910187, + "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2811 }, { "clip_ratio": 0.0, - "completion_length": 1525.7902221679688, + "completion_length": 1690.9152526855469, "epoch": 0.8399671421103726, - "grad_norm": 14.782822608947754, - "kl": 0.25, - "learning_rate": 7.584703432525608e-09, - "loss": 0.1307, - "reward": 0.4525669887661934, - "reward_std": 0.16518716514110565, - "rewards/accuracy_reward": 0.12053571827709675, + "grad_norm": 8.393016815185547, + "kl": 3.30859375, + "learning_rate": 3.792351716262804e-08, + "loss": 0.2163, + "reward": 0.5770089477300644, + "reward_std": 0.12579200975596905, + "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312723517418, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2812 }, { "clip_ratio": 0.0, - "completion_length": 1577.4509582519531, + "completion_length": 1777.9956359863281, "epoch": 0.8402658501978941, - "grad_norm": 14.988221168518066, - "kl": 0.25732421875, - "learning_rate": 7.557112075889032e-09, - "loss": 0.1372, - "reward": 0.3766741305589676, - "reward_std": 0.16587775945663452, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 8.034296035766602, + "kl": 4.0703125, + "learning_rate": 3.778556037944516e-08, + "loss": 0.2399, + "reward": 0.4854910969734192, + "reward_std": 0.11718124151229858, + "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4319196715950966, "step": 2813 }, { "clip_ratio": 0.0, - "completion_length": 1632.1585388183594, + "completion_length": 1811.7746276855469, "epoch": 0.8405645582854155, - "grad_norm": 13.373738288879395, - "kl": 0.24658203125, - "learning_rate": 7.529566892954659e-09, - "loss": 0.1292, - "reward": 0.3537946492433548, - "reward_std": 0.17423346638679504, - "rewards/accuracy_reward": 0.0535714328289032, + "grad_norm": 15.394757270812988, + "kl": 4.9765625, + "learning_rate": 3.764783446477329e-08, + "loss": 0.2787, + "reward": 0.4637276977300644, + "reward_std": 0.12020159512758255, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3002232238650322, + "rewards/tag_count_reward": 0.4146205633878708, "step": 2814 }, { "clip_ratio": 0.0, - "completion_length": 1562.7121276855469, + "completion_length": 1747.3772888183594, "epoch": 0.8408632663729371, - "grad_norm": 14.704797744750977, - "kl": 0.22607421875, - "learning_rate": 7.502067913688952e-09, - "loss": 0.1292, - "reward": 0.341517873108387, - "reward_std": 0.17057053744792938, - "rewards/accuracy_reward": 0.008928572060540318, + "grad_norm": 6.204975605010986, + "kl": 3.70703125, + "learning_rate": 3.7510339568444766e-08, + "loss": 0.23, + "reward": 0.4570312723517418, + "reward_std": 0.13439291529357433, + "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325892984867096, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2815 }, { "clip_ratio": 0.0, - "completion_length": 1573.6920471191406, + "completion_length": 1736.1116943359375, "epoch": 0.8411619744604585, - "grad_norm": 11.92954158782959, - "kl": 0.228759765625, - "learning_rate": 7.474615168008125e-09, - "loss": 0.1381, - "reward": 0.4715401977300644, - "reward_std": 0.19319279119372368, - "rewards/accuracy_reward": 0.14955357648432255, + "grad_norm": 8.071277618408203, + "kl": 4.376953125, + "learning_rate": 3.7373075840040625e-08, + "loss": 0.2444, + "reward": 0.5892857313156128, + "reward_std": 0.1566626150161028, + "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2816 }, { "clip_ratio": 0.0, - "completion_length": 1655.5580749511719, + "completion_length": 1754.7500610351562, "epoch": 0.84146068254798, - "grad_norm": 18.911205291748047, - "kl": 0.30517578125, - "learning_rate": 7.447208685778045e-09, - "loss": 0.132, - "reward": 0.4224330484867096, - "reward_std": 0.1743636541068554, - "rewards/accuracy_reward": 0.11830357555299997, + "grad_norm": 21.16781997680664, + "kl": 5.0234375, + "learning_rate": 3.723604342889022e-08, + "loss": 0.2835, + "reward": 0.558035746216774, + "reward_std": 0.15717510506510735, + "rewards/accuracy_reward": 0.14285714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.4151785895228386, "step": 2817 }, { "clip_ratio": 0.0, - "completion_length": 1628.6875610351562, + "completion_length": 1827.1161499023438, "epoch": 0.8417593906355014, - "grad_norm": 14.30870532989502, - "kl": 0.248291015625, - "learning_rate": 7.419848496814296e-09, - "loss": 0.1463, - "reward": 0.3091517984867096, - "reward_std": 0.16924520581960678, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 26.719785690307617, + "kl": 5.19140625, + "learning_rate": 3.709924248407148e-08, + "loss": 0.2564, + "reward": 0.4453125149011612, + "reward_std": 0.1519513614475727, + "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875074505806, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2818 }, { "clip_ratio": 0.0, - "completion_length": 1558.8482666015625, + "completion_length": 1738.41748046875, "epoch": 0.842058098723023, - "grad_norm": 13.419061660766602, - "kl": 0.2548828125, - "learning_rate": 7.392534630882091e-09, - "loss": 0.1336, - "reward": 0.3794642984867096, - "reward_std": 0.16753199696540833, - "rewards/accuracy_reward": 0.04910714365541935, + "grad_norm": 6.47824764251709, + "kl": 3.75390625, + "learning_rate": 3.6962673154410457e-08, + "loss": 0.2191, + "reward": 0.4843750223517418, + "reward_std": 0.11692117527127266, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571492433548, + "rewards/tag_count_reward": 0.4352678805589676, "step": 2819 }, { "clip_ratio": 0.0, - "completion_length": 1657.2232666015625, + "completion_length": 1782.1630249023438, "epoch": 0.8423568068105444, - "grad_norm": 13.031111717224121, - "kl": 0.291748046875, - "learning_rate": 7.3652671176962165e-09, - "loss": 0.1098, - "reward": 0.3577008992433548, - "reward_std": 0.20809736847877502, - "rewards/accuracy_reward": 0.06250000116415322, + "grad_norm": 22.394481658935547, + "kl": 4.875, + "learning_rate": 3.682633558848108e-08, + "loss": 0.2574, + "reward": 0.4821428805589676, + "reward_std": 0.15902639366686344, + "rewards/accuracy_reward": 0.06919643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952008992433548, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2820 }, { "clip_ratio": 0.0, - "completion_length": 1607.13623046875, + "completion_length": 1785.6719665527344, "epoch": 0.8426555148980659, - "grad_norm": 13.186058044433594, - "kl": 0.23828125, - "learning_rate": 7.33804598692106e-09, - "loss": 0.1303, - "reward": 0.439732164144516, - "reward_std": 0.18840665370225906, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 25.251895904541016, + "kl": 2.78125, + "learning_rate": 3.66902299346053e-08, + "loss": 0.1716, + "reward": 0.5440848469734192, + "reward_std": 0.1367931067943573, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4414062723517418, "step": 2821 }, { "clip_ratio": 0.0, - "completion_length": 1608.13623046875, + "completion_length": 1811.1719665527344, "epoch": 0.8429542229855873, - "grad_norm": 12.36564826965332, - "kl": 0.240478515625, - "learning_rate": 7.310871268170565e-09, - "loss": 0.1259, - "reward": 0.3906250223517418, - "reward_std": 0.20008156448602676, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 5.394992828369141, + "kl": 4.23828125, + "learning_rate": 3.6554356340852826e-08, + "loss": 0.2367, + "reward": 0.490513414144516, + "reward_std": 0.1593309435993433, + "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.423549123108387, "step": 2822 }, { "clip_ratio": 0.0, - "completion_length": 1579.4375915527344, + "completion_length": 1695.4576416015625, "epoch": 0.8432529310731088, - "grad_norm": 15.967065811157227, - "kl": 0.244873046875, - "learning_rate": 7.283742991008129e-09, - "loss": 0.1489, - "reward": 0.5150669887661934, - "reward_std": 0.19416387751698494, - "rewards/accuracy_reward": 0.17410715529695153, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.340959832072258, + "grad_norm": 15.126260757446289, + "kl": 2.8359375, + "learning_rate": 3.6418714955040644e-08, + "loss": 0.1952, + "reward": 0.628348246216774, + "reward_std": 0.1594865694642067, + "rewards/accuracy_reward": 0.18526786845177412, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4430803880095482, "step": 2823 }, { "clip_ratio": 0.0, - "completion_length": 1524.0648193359375, + "completion_length": 1687.1741638183594, "epoch": 0.8435516391606303, - "grad_norm": 16.138280868530273, - "kl": 0.236572265625, - "learning_rate": 7.256661184946694e-09, - "loss": 0.146, - "reward": 0.459263414144516, - "reward_std": 0.18847481161355972, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 13.725850105285645, + "kl": 2.72265625, + "learning_rate": 3.628330592473347e-08, + "loss": 0.1807, + "reward": 0.565290205180645, + "reward_std": 0.13737407699227333, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526977300644, + "rewards/tag_count_reward": 0.4492187723517418, "step": 2824 }, { "clip_ratio": 0.0, - "completion_length": 1531.0402526855469, + "completion_length": 1661.8639221191406, "epoch": 0.8438503472481518, - "grad_norm": 12.82735824584961, - "kl": 0.21630859375, - "learning_rate": 7.229625879448575e-09, - "loss": 0.1333, - "reward": 0.4877232313156128, - "reward_std": 0.17211240902543068, - "rewards/accuracy_reward": 0.145089291036129, + "grad_norm": 27.825334548950195, + "kl": 2.48828125, + "learning_rate": 3.614812939724288e-08, + "loss": 0.1973, + "reward": 0.5982142984867096, + "reward_std": 0.1384568866342306, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.444196455180645, "step": 2825 }, { "clip_ratio": 0.0, - "completion_length": 1516.9777526855469, + "completion_length": 1680.8371276855469, "epoch": 0.8441490553356732, - "grad_norm": 18.338987350463867, - "kl": 0.236328125, - "learning_rate": 7.202637103925557e-09, - "loss": 0.1738, - "reward": 0.4034598469734192, - "reward_std": 0.17672087624669075, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 8.884342193603516, + "kl": 2.513671875, + "learning_rate": 3.601318551962779e-08, + "loss": 0.1639, + "reward": 0.5167411044239998, + "reward_std": 0.13020514883100986, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4341518133878708, "step": 2826 }, { "clip_ratio": 0.0, - "completion_length": 1558.2255249023438, + "completion_length": 1795.41748046875, "epoch": 0.8444477634231947, - "grad_norm": 12.144181251525879, - "kl": 0.234130859375, - "learning_rate": 7.1756948877387865e-09, - "loss": 0.1027, - "reward": 0.4570312798023224, - "reward_std": 0.21803918480873108, - "rewards/accuracy_reward": 0.10937500465661287, + "grad_norm": 4.7163262367248535, + "kl": 3.07421875, + "learning_rate": 3.587847443869393e-08, + "loss": 0.1752, + "reward": 0.602678582072258, + "reward_std": 0.24865194782614708, + "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.435267873108387, "step": 2827 }, { "clip_ratio": 0.0, - "completion_length": 1600.2745971679688, + "completion_length": 1760.5134582519531, "epoch": 0.8447464715107161, - "grad_norm": 14.924469947814941, - "kl": 0.245361328125, - "learning_rate": 7.148799260198735e-09, - "loss": 0.1425, - "reward": 0.3911830633878708, - "reward_std": 0.2061803601682186, - "rewards/accuracy_reward": 0.07142857508733869, + "grad_norm": 18.098613739013672, + "kl": 4.6015625, + "learning_rate": 3.574399630099367e-08, + "loss": 0.2707, + "reward": 0.4843750223517418, + "reward_std": 0.15744599886238575, + "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4174107387661934, "step": 2828 }, { "clip_ratio": 0.0, - "completion_length": 1623.3906860351562, + "completion_length": 1845.9063110351562, "epoch": 0.8450451795982377, - "grad_norm": 12.433372497558594, - "kl": 0.28076171875, - "learning_rate": 7.121950250565217e-09, - "loss": 0.121, - "reward": 0.3939732387661934, - "reward_std": 0.1699027679860592, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 17.663509368896484, + "kl": 5.265625, + "learning_rate": 3.560975125282609e-08, + "loss": 0.2827, + "reward": 0.5094866380095482, + "reward_std": 0.1355590522289276, + "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2829 }, { "clip_ratio": 0.0, - "completion_length": 1572.0313415527344, + "completion_length": 1727.9197387695312, "epoch": 0.8453438876857591, - "grad_norm": 12.08668041229248, - "kl": 0.232666015625, - "learning_rate": 7.095147888047337e-09, - "loss": 0.1124, - "reward": 0.4129464402794838, - "reward_std": 0.18612005934119225, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 6.348749160766602, + "kl": 3.4765625, + "learning_rate": 3.5475739440236685e-08, + "loss": 0.2117, + "reward": 0.5479910895228386, + "reward_std": 0.1451041642576456, + "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2830 }, { "clip_ratio": 0.0, - "completion_length": 1600.74560546875, + "completion_length": 1815.4085998535156, "epoch": 0.8456425957732806, - "grad_norm": 14.211133003234863, - "kl": 0.26513671875, - "learning_rate": 7.068392201803419e-09, - "loss": 0.1096, - "reward": 0.3688616305589676, - "reward_std": 0.1645352765917778, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 3.880139112472534, + "kl": 3.54296875, + "learning_rate": 3.534196100901709e-08, + "loss": 0.2059, + "reward": 0.4720982313156128, + "reward_std": 0.12230434268712997, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2831 }, { "clip_ratio": 0.0, - "completion_length": 1514.16748046875, + "completion_length": 1724.9688110351562, "epoch": 0.845941303860802, - "grad_norm": 13.525617599487305, - "kl": 0.264404296875, - "learning_rate": 7.041683220941036e-09, - "loss": 0.1278, - "reward": 0.454799123108387, - "reward_std": 0.16747530177235603, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 14.58448314666748, + "kl": 4.65625, + "learning_rate": 3.520841610470518e-08, + "loss": 0.2733, + "reward": 0.556919664144516, + "reward_std": 0.17150320298969746, + "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2832 }, { "clip_ratio": 0.0, - "completion_length": 1638.1384582519531, + "completion_length": 1770.3840026855469, "epoch": 0.8462400119483235, - "grad_norm": 11.068228721618652, - "kl": 0.255615234375, - "learning_rate": 7.01502097451695e-09, - "loss": 0.1199, - "reward": 0.4358259066939354, - "reward_std": 0.17292535305023193, - "rewards/accuracy_reward": 0.11383929336443543, + "grad_norm": 4.627825736999512, + "kl": 4.23046875, + "learning_rate": 3.507510487258475e-08, + "loss": 0.2299, + "reward": 0.5563616305589676, + "reward_std": 0.14239880815148354, + "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.4291294813156128, "step": 2833 }, { "clip_ratio": 0.0, - "completion_length": 1528.7188110351562, + "completion_length": 1763.6005249023438, "epoch": 0.846538720035845, - "grad_norm": 13.73755168914795, - "kl": 0.251220703125, - "learning_rate": 6.988405491537053e-09, - "loss": 0.128, - "reward": 0.3984375223517418, - "reward_std": 0.18110502883791924, + "grad_norm": 24.971054077148438, + "kl": 5.4296875, + "learning_rate": 3.494202745768526e-08, + "loss": 0.3124, + "reward": 0.4771205559372902, + "reward_std": 0.19081398472189903, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4101562649011612, "step": 2834 }, { "clip_ratio": 0.0, - "completion_length": 1653.8505249023438, + "completion_length": 1818.9710693359375, "epoch": 0.8468374281233665, - "grad_norm": 14.965655326843262, - "kl": 0.268798828125, - "learning_rate": 6.9618368009563885e-09, - "loss": 0.1223, - "reward": 0.400111623108387, - "reward_std": 0.17025798559188843, - "rewards/accuracy_reward": 0.0825892873108387, + "grad_norm": 14.570950508117676, + "kl": 4.47265625, + "learning_rate": 3.480918400478194e-08, + "loss": 0.2367, + "reward": 0.5072544887661934, + "reward_std": 0.1360940933227539, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.420200914144516, "step": 2835 }, { "clip_ratio": 0.0, - "completion_length": 1575.4375610351562, + "completion_length": 1762.0157165527344, "epoch": 0.8471361362108879, - "grad_norm": 13.957771301269531, - "kl": 0.220703125, - "learning_rate": 6.935314931679109e-09, - "loss": 0.1302, - "reward": 0.3554687649011612, - "reward_std": 0.1813652254641056, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 10.195008277893066, + "kl": 3.46484375, + "learning_rate": 3.4676574658395543e-08, + "loss": 0.1947, + "reward": 0.4458705484867096, + "reward_std": 0.12237017042934895, + "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3376116156578064, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2836 }, { "clip_ratio": 0.0, - "completion_length": 1622.3326416015625, + "completion_length": 1820.0425109863281, "epoch": 0.8474348442984094, - "grad_norm": 12.854877471923828, - "kl": 0.243896484375, - "learning_rate": 6.908839912558373e-09, - "loss": 0.1073, - "reward": 0.4090401977300644, - "reward_std": 0.20629842951893806, - "rewards/accuracy_reward": 0.09375000349245965, + "grad_norm": 12.358247756958008, + "kl": 4.20703125, + "learning_rate": 3.4544199562791866e-08, + "loss": 0.2186, + "reward": 0.5530134215950966, + "reward_std": 0.18798187002539635, + "rewards/accuracy_reward": 0.12276785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2837 }, { "clip_ratio": 0.0, - "completion_length": 1636.2210998535156, + "completion_length": 1843.7009582519531, "epoch": 0.8477335523859308, - "grad_norm": 12.81078815460205, - "kl": 0.263671875, - "learning_rate": 6.88241177239644e-09, - "loss": 0.1049, - "reward": 0.455915205180645, - "reward_std": 0.22223465517163277, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 9.467040061950684, + "kl": 4.46484375, + "learning_rate": 3.44120588619822e-08, + "loss": 0.2407, + "reward": 0.5770089402794838, + "reward_std": 0.18057376332581043, + "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2838 }, { "clip_ratio": 0.0, - "completion_length": 1541.4152526855469, + "completion_length": 1781.6741638183594, "epoch": 0.8480322604734524, - "grad_norm": 11.666107177734375, - "kl": 0.240966796875, - "learning_rate": 6.856030539944507e-09, - "loss": 0.0921, - "reward": 0.5039062798023224, - "reward_std": 0.22055545076727867, - "rewards/accuracy_reward": 0.1651785832364112, + "grad_norm": 4.170590877532959, + "kl": 3.71875, + "learning_rate": 3.428015269972254e-08, + "loss": 0.1974, + "reward": 0.573102705180645, + "reward_std": 0.1913245003670454, + "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2839 }, { "clip_ratio": 0.0, - "completion_length": 1644.7768859863281, + "completion_length": 1763.8728637695312, "epoch": 0.8483309685609738, - "grad_norm": 12.191849708557129, - "kl": 0.240478515625, - "learning_rate": 6.829696243902783e-09, - "loss": 0.1167, - "reward": 0.3973214477300644, - "reward_std": 0.22547805309295654, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 4.940290451049805, + "kl": 3.72265625, + "learning_rate": 3.4148481219513916e-08, + "loss": 0.2111, + "reward": 0.5033482387661934, + "reward_std": 0.17690503410995007, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4185267984867096, "step": 2840 }, { "clip_ratio": 0.0, - "completion_length": 1545.7299499511719, + "completion_length": 1738.4777526855469, "epoch": 0.8486296766484953, - "grad_norm": 12.646320343017578, - "kl": 0.220703125, - "learning_rate": 6.803408912920417e-09, - "loss": 0.1298, - "reward": 0.469866082072258, - "reward_std": 0.17739191092550755, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 14.314323425292969, + "kl": 2.587890625, + "learning_rate": 3.401704456460208e-08, + "loss": 0.1491, + "reward": 0.5680803805589676, + "reward_std": 0.14791902713477612, + "rewards/accuracy_reward": 0.13616072060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732387661934, + "rewards/tag_count_reward": 0.4319196566939354, "step": 2841 }, { "clip_ratio": 0.0, - "completion_length": 1554.1161193847656, + "completion_length": 1750.9130249023438, "epoch": 0.8489283847360167, - "grad_norm": 12.555505752563477, - "kl": 0.222900390625, - "learning_rate": 6.777168575595416e-09, - "loss": 0.1249, - "reward": 0.424665205180645, - "reward_std": 0.1786670759320259, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 7.777742385864258, + "kl": 3.05078125, + "learning_rate": 3.388584287797708e-08, + "loss": 0.1697, + "reward": 0.5212053805589676, + "reward_std": 0.12548474594950676, + "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333147332072258, + "rewards/tag_count_reward": 0.4341517984867096, "step": 2842 }, { "clip_ratio": 0.0, - "completion_length": 1577.1273193359375, + "completion_length": 1792.2322082519531, "epoch": 0.8492270928235381, - "grad_norm": 15.373595237731934, - "kl": 0.2314453125, - "learning_rate": 6.750975260474717e-09, - "loss": 0.1261, - "reward": 0.4335937723517418, - "reward_std": 0.18431542068719864, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 10.728815078735352, + "kl": 3.3046875, + "learning_rate": 3.3754876302373584e-08, + "loss": 0.1954, + "reward": 0.5401786044239998, + "reward_std": 0.15113292448222637, + "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4285714402794838, "step": 2843 }, { "clip_ratio": 0.0, - "completion_length": 1530.5826721191406, + "completion_length": 1740.18310546875, "epoch": 0.8495258009110597, - "grad_norm": 18.95304298400879, - "kl": 0.22314453125, - "learning_rate": 6.724828996054083e-09, - "loss": 0.1676, - "reward": 0.479352705180645, - "reward_std": 0.2283444181084633, - "rewards/accuracy_reward": 0.1294642873108387, + "grad_norm": 14.258487701416016, + "kl": 2.84765625, + "learning_rate": 3.362414498027041e-08, + "loss": 0.1908, + "reward": 0.5703125223517418, + "reward_std": 0.159937035292387, + "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349888414144516, + "rewards/tag_count_reward": 0.4386160895228386, "step": 2844 }, { "clip_ratio": 0.0, - "completion_length": 1525.6629943847656, + "completion_length": 1727.9420471191406, "epoch": 0.8498245089985811, - "grad_norm": 14.158774375915527, - "kl": 0.22265625, - "learning_rate": 6.698729810778064e-09, - "loss": 0.1495, - "reward": 0.3671875149011612, - "reward_std": 0.1769343763589859, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 9.724454879760742, + "kl": 3.046875, + "learning_rate": 3.349364905389032e-08, + "loss": 0.1951, + "reward": 0.4598214477300644, + "reward_std": 0.16079909354448318, + "rewards/accuracy_reward": 0.03348214505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2845 }, { "clip_ratio": 0.0, - "completion_length": 1693.1139526367188, + "completion_length": 1811.3304443359375, "epoch": 0.8501232170861026, - "grad_norm": 11.500258445739746, - "kl": 0.241943359375, - "learning_rate": 6.672677733040038e-09, - "loss": 0.0926, - "reward": 0.3208705484867096, - "reward_std": 0.1730261668562889, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 6.999212741851807, + "kl": 4.05859375, + "learning_rate": 3.336338866520019e-08, + "loss": 0.245, + "reward": 0.4475446715950966, + "reward_std": 0.1660367138683796, + "rewards/accuracy_reward": 0.026785715948790312, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.420758955180645, "step": 2846 }, { "clip_ratio": 0.0, - "completion_length": 1563.1094360351562, + "completion_length": 1747.8282165527344, "epoch": 0.850421925173624, - "grad_norm": 11.718170166015625, - "kl": 0.21923828125, - "learning_rate": 6.646672791182112e-09, - "loss": 0.1158, - "reward": 0.407924123108387, - "reward_std": 0.22847989574074745, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 6.5364837646484375, + "kl": 3.38671875, + "learning_rate": 3.323336395591056e-08, + "loss": 0.2158, + "reward": 0.5396205559372902, + "reward_std": 0.21204089932143688, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4369419813156128, "step": 2847 }, { "clip_ratio": 0.0, - "completion_length": 1573.4777526855469, + "completion_length": 1749.9353637695312, "epoch": 0.8507206332611456, - "grad_norm": 12.516551971435547, - "kl": 0.2451171875, - "learning_rate": 6.620715013495093e-09, - "loss": 0.1074, - "reward": 0.4274553880095482, - "reward_std": 0.18444900773465633, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 3.7056779861450195, + "kl": 3.658203125, + "learning_rate": 3.310357506747546e-08, + "loss": 0.2161, + "reward": 0.5479910969734192, + "reward_std": 0.17284457385540009, + "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4319196715950966, "step": 2848 }, { "clip_ratio": 0.0, - "completion_length": 1538.8906860351562, + "completion_length": 1706.5558776855469, "epoch": 0.851019341348667, - "grad_norm": 13.318621635437012, - "kl": 0.247802734375, - "learning_rate": 6.594804428218526e-09, - "loss": 0.1277, - "reward": 0.4547991305589676, - "reward_std": 0.16560348123311996, - "rewards/accuracy_reward": 0.1116071455180645, + "grad_norm": 10.318049430847168, + "kl": 3.67578125, + "learning_rate": 3.297402214109263e-08, + "loss": 0.2286, + "reward": 0.5664062649011612, + "reward_std": 0.1484462097287178, + "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4391741305589676, "step": 2849 }, { "clip_ratio": 0.0, - "completion_length": 1562.9152221679688, + "completion_length": 1779.9688110351562, "epoch": 0.8513180494361885, - "grad_norm": 10.31305980682373, - "kl": 0.2255859375, - "learning_rate": 6.568941063540561e-09, - "loss": 0.1034, - "reward": 0.4654018059372902, - "reward_std": 0.2156667485833168, - "rewards/accuracy_reward": 0.14285715040750802, + "grad_norm": 16.316965103149414, + "kl": 3.81640625, + "learning_rate": 3.284470531770281e-08, + "loss": 0.1954, + "reward": 0.5792410969734192, + "reward_std": 0.1879999078810215, + "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4229910969734192, "step": 2850 }, { "clip_ratio": 0.0, - "completion_length": 1508.9286193847656, + "completion_length": 1698.9465026855469, "epoch": 0.8516167575237099, - "grad_norm": 17.3002872467041, - "kl": 0.20166015625, - "learning_rate": 6.543124947598022e-09, - "loss": 0.1294, - "reward": 0.5468750298023224, - "reward_std": 0.1815769448876381, - "rewards/accuracy_reward": 0.1852678693830967, + "grad_norm": 11.02546215057373, + "kl": 4.48828125, + "learning_rate": 3.2715624737990114e-08, + "loss": 0.2647, + "reward": 0.6227678880095482, + "reward_std": 0.16439899429678917, + "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.361607164144516, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2851 }, { "clip_ratio": 0.0, - "completion_length": 1528.5982666015625, + "completion_length": 1700.6563110351562, "epoch": 0.8519154656112314, - "grad_norm": 11.721868515014648, - "kl": 0.224365234375, - "learning_rate": 6.517356108476313e-09, - "loss": 0.1348, - "reward": 0.3989955559372902, - "reward_std": 0.1673626098781824, - "rewards/accuracy_reward": 0.06473214412108064, + "grad_norm": 14.703664779663086, + "kl": 4.44140625, + "learning_rate": 3.258678054238156e-08, + "loss": 0.2763, + "reward": 0.513950914144516, + "reward_std": 0.16689816117286682, + "rewards/accuracy_reward": 0.09151785890571773, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4224330484867096, "step": 2852 }, { "clip_ratio": 0.0, - "completion_length": 1564.7813110351562, + "completion_length": 1759.7388916015625, "epoch": 0.8522141736987529, - "grad_norm": 13.473031997680664, - "kl": 0.2177734375, - "learning_rate": 6.491634574209398e-09, - "loss": 0.1138, - "reward": 0.4609375149011612, - "reward_std": 0.2058698572218418, - "rewards/accuracy_reward": 0.12723214738070965, + "grad_norm": 8.985806465148926, + "kl": 3.41796875, + "learning_rate": 3.2458172871046993e-08, + "loss": 0.1969, + "reward": 0.5998884290456772, + "reward_std": 0.15873457677662373, + "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.4369419813156128, "step": 2853 }, { "clip_ratio": 0.0, - "completion_length": 1636.8416137695312, + "completion_length": 1737.6116943359375, "epoch": 0.8525128817862744, - "grad_norm": 16.384923934936523, - "kl": 0.2255859375, - "learning_rate": 6.465960372779783e-09, - "loss": 0.1386, - "reward": 0.3828125223517418, - "reward_std": 0.22754962742328644, - "rewards/accuracy_reward": 0.05357142933644354, + "grad_norm": 7.6839470863342285, + "kl": 3.392578125, + "learning_rate": 3.2329801863898914e-08, + "loss": 0.2112, + "reward": 0.5133928880095482, + "reward_std": 0.22248199954628944, + "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4352678805589676, "step": 2854 }, { "clip_ratio": 0.0, - "completion_length": 1576.7634582519531, + "completion_length": 1737.8013916015625, "epoch": 0.8528115898737958, - "grad_norm": 19.036884307861328, - "kl": 0.2255859375, - "learning_rate": 6.440333532118502e-09, - "loss": 0.1442, - "reward": 0.3844866305589676, - "reward_std": 0.18796968087553978, - "rewards/accuracy_reward": 0.055803575087338686, + "grad_norm": 11.61851692199707, + "kl": 3.09375, + "learning_rate": 3.2201667660592514e-08, + "loss": 0.2034, + "reward": 0.4994419887661934, + "reward_std": 0.13821317628026009, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4369419887661934, "step": 2855 }, { "clip_ratio": 0.0, - "completion_length": 1599.8505249023438, + "completion_length": 1794.0782470703125, "epoch": 0.8531102979613173, - "grad_norm": 14.864404678344727, - "kl": 0.242431640625, - "learning_rate": 6.414754080105017e-09, - "loss": 0.0977, - "reward": 0.4419643059372902, - "reward_std": 0.24081972986459732, - "rewards/accuracy_reward": 0.13616071944124997, + "grad_norm": 5.741254806518555, + "kl": 4.39453125, + "learning_rate": 3.207377040052509e-08, + "loss": 0.2476, + "reward": 0.5664062649011612, + "reward_std": 0.1770616639405489, + "rewards/accuracy_reward": 0.15401786682195961, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3058035746216774, + "rewards/tag_count_reward": 0.412388414144516, "step": 2856 }, { "clip_ratio": 0.0, - "completion_length": 1656.7880249023438, + "completion_length": 1815.8795471191406, "epoch": 0.8534090060488387, - "grad_norm": 11.519875526428223, - "kl": 0.239013671875, - "learning_rate": 6.389222044567283e-09, - "loss": 0.1199, - "reward": 0.3275669813156128, - "reward_std": 0.15988541767001152, - "rewards/accuracy_reward": 0.013392857508733869, + "grad_norm": 13.1563081741333, + "kl": 4.5390625, + "learning_rate": 3.194611022283642e-08, + "loss": 0.2439, + "reward": 0.4308035895228386, + "reward_std": 0.1460218634456396, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.4107143059372902, "step": 2857 }, { "clip_ratio": 0.0, - "completion_length": 1591.3951721191406, + "completion_length": 1799.8326721191406, "epoch": 0.8537077141363603, - "grad_norm": 16.172548294067383, - "kl": 0.269287109375, - "learning_rate": 6.363737453281659e-09, - "loss": 0.146, - "reward": 0.3727678805589676, - "reward_std": 0.18493405357003212, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 18.441537857055664, + "kl": 5.6953125, + "learning_rate": 3.18186872664083e-08, + "loss": 0.2954, + "reward": 0.4575893059372902, + "reward_std": 0.12582561746239662, + "rewards/accuracy_reward": 0.0446428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4129464402794838, "step": 2858 }, { "clip_ratio": 0.0, - "completion_length": 1591.5848999023438, + "completion_length": 1758.341552734375, "epoch": 0.8540064222238817, - "grad_norm": 14.64763355255127, - "kl": 0.233154296875, - "learning_rate": 6.338300333972857e-09, - "loss": 0.1375, - "reward": 0.4162946715950966, - "reward_std": 0.1927139349281788, - "rewards/accuracy_reward": 0.09598214668221772, + "grad_norm": 6.884352684020996, + "kl": 3.1015625, + "learning_rate": 3.169150166986429e-08, + "loss": 0.1871, + "reward": 0.5318080633878708, + "reward_std": 0.14498044177889824, + "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4335937723517418, "step": 2859 }, { "clip_ratio": 0.0, - "completion_length": 1640.57373046875, + "completion_length": 1769.0536499023438, "epoch": 0.8543051303114032, - "grad_norm": 11.373429298400879, - "kl": 0.23583984375, - "learning_rate": 6.3129107143139904e-09, - "loss": 0.1232, - "reward": 0.3856026902794838, - "reward_std": 0.15309536829590797, - "rewards/accuracy_reward": 0.0758928582072258, + "grad_norm": 6.953408718109131, + "kl": 3.6796875, + "learning_rate": 3.156455357156995e-08, + "loss": 0.2183, + "reward": 0.4972098469734192, + "reward_std": 0.09995691105723381, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.4257812649011612, "step": 2860 }, { "clip_ratio": 0.0, - "completion_length": 1611.5447082519531, + "completion_length": 1785.52685546875, "epoch": 0.8546038383989246, - "grad_norm": 10.585898399353027, - "kl": 0.262939453125, - "learning_rate": 6.287568621926481e-09, - "loss": 0.1291, - "reward": 0.3515625223517418, - "reward_std": 0.17754123359918594, - "rewards/accuracy_reward": 0.049107146449387074, + "grad_norm": 11.905069351196289, + "kl": 4.20703125, + "learning_rate": 3.143784310963241e-08, + "loss": 0.2276, + "reward": 0.4681919738650322, + "reward_std": 0.13415836542844772, + "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4190848395228386, "step": 2861 }, { "clip_ratio": 0.0, - "completion_length": 1564.8594055175781, + "completion_length": 1751.0134887695312, "epoch": 0.8549025464864461, - "grad_norm": 13.19269847869873, - "kl": 0.23291015625, - "learning_rate": 6.262274084380021e-09, - "loss": 0.1226, - "reward": 0.4196428805589676, - "reward_std": 0.1702243834733963, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 27.463790893554688, + "kl": 3.7578125, + "learning_rate": 3.1311370421900104e-08, + "loss": 0.241, + "reward": 0.5329241380095482, + "reward_std": 0.14076276309788227, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2862 }, { "clip_ratio": 0.0, - "completion_length": 1592.3259582519531, + "completion_length": 1793.9933776855469, "epoch": 0.8552012545739676, - "grad_norm": 13.260708808898926, - "kl": 0.235595703125, - "learning_rate": 6.237027129192613e-09, - "loss": 0.1057, - "reward": 0.365513414144516, - "reward_std": 0.18319441750645638, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 11.871695518493652, + "kl": 3.796875, + "learning_rate": 3.118513564596306e-08, + "loss": 0.2236, + "reward": 0.4983259215950966, + "reward_std": 0.17828672751784325, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2863 }, { "clip_ratio": 0.0, - "completion_length": 1635.200927734375, + "completion_length": 1809.4130249023438, "epoch": 0.8554999626614891, - "grad_norm": 16.35620880126953, - "kl": 0.2451171875, - "learning_rate": 6.211827783830442e-09, - "loss": 0.1415, - "reward": 0.3928571566939354, - "reward_std": 0.20820242166519165, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 11.549400329589844, + "kl": 3.748046875, + "learning_rate": 3.105913891915221e-08, + "loss": 0.2179, + "reward": 0.5530134066939354, + "reward_std": 0.19054916314780712, + "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4257812723517418, "step": 2864 }, { "clip_ratio": 0.0, - "completion_length": 1647.4532165527344, + "completion_length": 1831.8572387695312, "epoch": 0.8557986707490105, - "grad_norm": 13.57750129699707, - "kl": 0.275390625, - "learning_rate": 6.186676075707942e-09, - "loss": 0.1243, - "reward": 0.3777901977300644, - "reward_std": 0.1731134131550789, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 6.425656795501709, + "kl": 5.0703125, + "learning_rate": 3.093338037853971e-08, + "loss": 0.2816, + "reward": 0.482142873108387, + "reward_std": 0.14176494255661964, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651977300644, + "rewards/tag_count_reward": 0.3973214477300644, "step": 2865 }, { "clip_ratio": 0.0, - "completion_length": 1630.2545471191406, + "completion_length": 1749.2634887695312, "epoch": 0.856097378836532, - "grad_norm": 13.99380874633789, - "kl": 0.2880859375, - "learning_rate": 6.161572032187718e-09, - "loss": 0.1266, - "reward": 0.349888414144516, - "reward_std": 0.15632053092122078, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 16.93515968322754, + "kl": 4.54296875, + "learning_rate": 3.0807860160938585e-08, + "loss": 0.2579, + "reward": 0.4793526977300644, + "reward_std": 0.14044139534235, + "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.303013414144516, + "rewards/tag_count_reward": 0.4168526977300644, "step": 2866 }, { "clip_ratio": 0.0, - "completion_length": 1629.80810546875, + "completion_length": 1799.22998046875, "epoch": 0.8563960869240534, - "grad_norm": 13.302021980285645, - "kl": 0.27587890625, - "learning_rate": 6.1365156805804786e-09, - "loss": 0.1277, - "reward": 0.3621651977300644, - "reward_std": 0.20807256922125816, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 10.220853805541992, + "kl": 5.01171875, + "learning_rate": 3.0682578402902394e-08, + "loss": 0.2834, + "reward": 0.4933035969734192, + "reward_std": 0.15539340302348137, + "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.424107164144516, "step": 2867 }, { "clip_ratio": 0.0, - "completion_length": 1587.5246276855469, + "completion_length": 1763.6161804199219, "epoch": 0.856694795011575, - "grad_norm": 15.256914138793945, - "kl": 0.250732421875, - "learning_rate": 6.111507048145098e-09, - "loss": 0.1212, - "reward": 0.501116082072258, - "reward_std": 0.18085945025086403, - "rewards/accuracy_reward": 0.165178582072258, + "grad_norm": 7.331393718719482, + "kl": 3.234375, + "learning_rate": 3.055753524072549e-08, + "loss": 0.2141, + "reward": 0.6060268208384514, + "reward_std": 0.15215518698096275, + "rewards/accuracy_reward": 0.17857143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2868 }, { "clip_ratio": 0.0, - "completion_length": 1610.4308776855469, + "completion_length": 1775.1451721191406, "epoch": 0.8569935030990964, - "grad_norm": 15.477853775024414, - "kl": 0.259765625, - "learning_rate": 6.086546162088524e-09, - "loss": 0.1398, - "reward": 0.4101562723517418, - "reward_std": 0.19676457345485687, - "rewards/accuracy_reward": 0.07812500488944352, + "grad_norm": 11.532556533813477, + "kl": 3.38671875, + "learning_rate": 3.043273081044262e-08, + "loss": 0.2169, + "reward": 0.5200893059372902, + "reward_std": 0.16754340194165707, + "rewards/accuracy_reward": 0.09151785913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2869 }, { "clip_ratio": 0.0, - "completion_length": 1632.8683776855469, + "completion_length": 1804.7165832519531, "epoch": 0.8572922111866179, - "grad_norm": 12.541914939880371, - "kl": 0.282470703125, - "learning_rate": 6.061633049565734e-09, - "loss": 0.1224, - "reward": 0.3225446566939354, - "reward_std": 0.1771504320204258, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 11.502321243286133, + "kl": 4.51171875, + "learning_rate": 3.030816524782867e-08, + "loss": 0.2482, + "reward": 0.4397321715950966, + "reward_std": 0.15233152359724045, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091518059372902, + "rewards/tag_count_reward": 0.4174107387661934, "step": 2870 }, { "clip_ratio": 0.0, - "completion_length": 1560.7545471191406, + "completion_length": 1762.6473999023438, "epoch": 0.8575909192741393, - "grad_norm": 15.184069633483887, - "kl": 0.23828125, - "learning_rate": 6.036767737679765e-09, - "loss": 0.1418, - "reward": 0.4229910969734192, - "reward_std": 0.2146899364888668, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 18.157146453857422, + "kl": 2.869140625, + "learning_rate": 3.0183838688398826e-08, + "loss": 0.1877, + "reward": 0.5100446566939354, + "reward_std": 0.1675581280142069, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2871 }, { "clip_ratio": 0.0, - "completion_length": 1612.2411499023438, + "completion_length": 1769.7567749023438, "epoch": 0.8578896273616609, - "grad_norm": 14.322404861450195, - "kl": 0.255126953125, - "learning_rate": 6.0119502534816456e-09, - "loss": 0.148, - "reward": 0.4430803805589676, - "reward_std": 0.21417028829455376, - "rewards/accuracy_reward": 0.11160715017467737, + "grad_norm": 11.073336601257324, + "kl": 2.91015625, + "learning_rate": 3.005975126740823e-08, + "loss": 0.165, + "reward": 0.571986623108387, + "reward_std": 0.16309433057904243, + "rewards/accuracy_reward": 0.13169643562287092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.440290205180645, "step": 2872 }, { "clip_ratio": 0.0, - "completion_length": 1593.8370971679688, + "completion_length": 1818.5759582519531, "epoch": 0.8581883354491823, - "grad_norm": 12.585186004638672, - "kl": 0.262939453125, - "learning_rate": 5.9871806239703506e-09, - "loss": 0.1033, - "reward": 0.431919664144516, - "reward_std": 0.1941440049558878, - "rewards/accuracy_reward": 0.09821428940631449, + "grad_norm": 6.701773166656494, + "kl": 3.693359375, + "learning_rate": 2.993590311985175e-08, + "loss": 0.2021, + "reward": 0.5156250223517418, + "reward_std": 0.15023531764745712, + "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.424107164144516, "step": 2873 }, { "clip_ratio": 0.0, - "completion_length": 1567.1361999511719, + "completion_length": 1700.0781860351562, "epoch": 0.8584870435367038, - "grad_norm": 12.835759162902832, - "kl": 0.24853515625, - "learning_rate": 5.962458876092813e-09, - "loss": 0.1098, - "reward": 0.4977678805589676, - "reward_std": 0.18465480208396912, - "rewards/accuracy_reward": 0.15178571757860482, + "grad_norm": 11.576151847839355, + "kl": 2.4609375, + "learning_rate": 2.981229438046406e-08, + "loss": 0.1528, + "reward": 0.6099330633878708, + "reward_std": 0.13702805899083614, + "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3459821566939354, + "rewards/tag_count_reward": 0.4425223469734192, "step": 2874 }, { "clip_ratio": 0.0, - "completion_length": 1575.4620971679688, + "completion_length": 1724.3170471191406, "epoch": 0.8587857516242252, - "grad_norm": 15.668646812438965, - "kl": 0.23974609375, - "learning_rate": 5.937785036743892e-09, - "loss": 0.1413, - "reward": 0.4196428805589676, - "reward_std": 0.1935274600982666, - "rewards/accuracy_reward": 0.08035714738070965, + "grad_norm": 6.459187984466553, + "kl": 3.1953125, + "learning_rate": 2.968892518371946e-08, + "loss": 0.2099, + "reward": 0.5546875149011612, + "reward_std": 0.1941454131156206, + "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857387661934, + "rewards/tag_count_reward": 0.443080373108387, "step": 2875 }, { "clip_ratio": 0.0, - "completion_length": 1603.9487609863281, + "completion_length": 1776.1719360351562, "epoch": 0.8590844597117467, - "grad_norm": 12.720071792602539, - "kl": 0.272705078125, - "learning_rate": 5.913159132766271e-09, - "loss": 0.113, - "reward": 0.337611623108387, - "reward_std": 0.1981237642467022, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 8.698086738586426, + "kl": 4.51171875, + "learning_rate": 2.9565795663831357e-08, + "loss": 0.2484, + "reward": 0.4620535969734192, + "reward_std": 0.15839644148945808, + "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.428571455180645, "step": 2876 }, { "clip_ratio": 0.0, - "completion_length": 1611.5402526855469, + "completion_length": 1791.9353332519531, "epoch": 0.8593831677992682, - "grad_norm": 12.402349472045898, - "kl": 0.29296875, - "learning_rate": 5.8885811909505534e-09, - "loss": 0.1146, - "reward": 0.4771205559372902, - "reward_std": 0.19334588572382927, - "rewards/accuracy_reward": 0.16294643701985478, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3141741156578064, + "grad_norm": 24.997295379638672, + "kl": 4.76953125, + "learning_rate": 2.9442905954752768e-08, + "loss": 0.2623, + "reward": 0.5993303880095482, + "reward_std": 0.18251998722553253, + "rewards/accuracy_reward": 0.18526786006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4140625223517418, "step": 2877 }, { "clip_ratio": 0.0, - "completion_length": 1657.37060546875, + "completion_length": 1822.5514221191406, "epoch": 0.8596818758867897, - "grad_norm": 12.04929256439209, - "kl": 0.29052734375, - "learning_rate": 5.864051238035106e-09, - "loss": 0.1112, - "reward": 0.4726562723517418, - "reward_std": 0.2056572027504444, - "rewards/accuracy_reward": 0.16294643469154835, + "grad_norm": 8.625914573669434, + "kl": 4.06640625, + "learning_rate": 2.932025619017553e-08, + "loss": 0.2276, + "reward": 0.5987723618745804, + "reward_std": 0.19428959116339684, + "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.420200914144516, "step": 2878 }, { "clip_ratio": 0.0, - "completion_length": 1553.3861999511719, + "completion_length": 1765.0402526855469, "epoch": 0.8599805839743111, - "grad_norm": 10.274303436279297, - "kl": 0.208984375, - "learning_rate": 5.8395693007061266e-09, - "loss": 0.1134, - "reward": 0.3789062574505806, - "reward_std": 0.17618810385465622, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 9.86147689819336, + "kl": 3.6171875, + "learning_rate": 2.9197846503530632e-08, + "loss": 0.1936, + "reward": 0.4419643059372902, + "reward_std": 0.12080085463821888, + "rewards/accuracy_reward": 0.020089285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562574505806, + "rewards/tag_count_reward": 0.4218750223517418, "step": 2879 }, { "clip_ratio": 0.0, - "completion_length": 1609.3594055175781, + "completion_length": 1773.3817749023438, "epoch": 0.8602792920618326, - "grad_norm": 13.843689918518066, - "kl": 0.2841796875, - "learning_rate": 5.8151354055975656e-09, - "loss": 0.1241, - "reward": 0.405133955180645, - "reward_std": 0.2243237979710102, - "rewards/accuracy_reward": 0.0915178582072258, + "grad_norm": 14.128868103027344, + "kl": 4.4921875, + "learning_rate": 2.9075677027987826e-08, + "loss": 0.2685, + "reward": 0.4799107387661934, + "reward_std": 0.15419276431202888, + "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4107142984867096, "step": 2880 }, { "clip_ratio": 0.0, - "completion_length": 1556.7098999023438, + "completion_length": 1748.8080749511719, "epoch": 0.860578000149354, - "grad_norm": 13.423145294189453, - "kl": 0.243896484375, - "learning_rate": 5.790749579291093e-09, - "loss": 0.1327, - "reward": 0.3420759066939354, - "reward_std": 0.18767234310507774, - "rewards/accuracy_reward": 0.022321428870782256, + "grad_norm": 6.208010673522949, + "kl": 4.17578125, + "learning_rate": 2.8953747896455465e-08, + "loss": 0.2376, + "reward": 0.4693080484867096, + "reward_std": 0.16070923022925854, + "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544738650322, + "rewards/tag_count_reward": 0.4068080559372902, "step": 2881 }, { "clip_ratio": 0.0, - "completion_length": 1615.0603332519531, + "completion_length": 1847.79248046875, "epoch": 0.8608767082368756, - "grad_norm": 12.520122528076172, - "kl": 0.28564453125, - "learning_rate": 5.766411848316111e-09, - "loss": 0.1105, - "reward": 0.329241082072258, - "reward_std": 0.16489263251423836, - "rewards/accuracy_reward": 0.0133928582072258, + "grad_norm": 3.9581429958343506, + "kl": 4.9140625, + "learning_rate": 2.883205924158055e-08, + "loss": 0.2563, + "reward": 0.412946455180645, + "reward_std": 0.14471461437642574, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.3995535895228386, "step": 2882 }, { "clip_ratio": 0.0, - "completion_length": 1600.2723693847656, + "completion_length": 1755.2947082519531, "epoch": 0.861175416324397, - "grad_norm": 14.403778076171875, - "kl": 0.237548828125, - "learning_rate": 5.742122239149683e-09, - "loss": 0.1288, - "reward": 0.443638414144516, - "reward_std": 0.22406217455863953, - "rewards/accuracy_reward": 0.09821428847499192, + "grad_norm": 9.693257331848145, + "kl": 3.033203125, + "learning_rate": 2.8710611195748414e-08, + "loss": 0.189, + "reward": 0.5680803805589676, + "reward_std": 0.1811600998044014, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2883 }, { "clip_ratio": 0.0, - "completion_length": 1591.32373046875, + "completion_length": 1762.8661499023438, "epoch": 0.8614741244119185, - "grad_norm": 12.675947189331055, - "kl": 0.223876953125, - "learning_rate": 5.717880778216516e-09, - "loss": 0.1186, - "reward": 0.4419642984867096, - "reward_std": 0.18430664390325546, - "rewards/accuracy_reward": 0.10714286123402417, + "grad_norm": 13.252032279968262, + "kl": 2.912109375, + "learning_rate": 2.858940389108258e-08, + "loss": 0.1773, + "reward": 0.550223246216774, + "reward_std": 0.1731241550296545, + "rewards/accuracy_reward": 0.12276786146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.427455373108387, "step": 2884 }, { "clip_ratio": 0.0, - "completion_length": 1533.4509582519531, + "completion_length": 1700.6630249023438, "epoch": 0.8617728324994399, - "grad_norm": 14.3395414352417, - "kl": 0.2236328125, - "learning_rate": 5.693687491888943e-09, - "loss": 0.132, - "reward": 0.4363839402794838, - "reward_std": 0.24857207387685776, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 9.025867462158203, + "kl": 1.9765625, + "learning_rate": 2.8468437459444717e-08, + "loss": 0.1166, + "reward": 0.5664062798023224, + "reward_std": 0.23437238484621048, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3537946566939354, + "rewards/tag_count_reward": 0.4458705559372902, "step": 2885 }, { "clip_ratio": 0.0, - "completion_length": 1607.13623046875, + "completion_length": 1798.5134887695312, "epoch": 0.8620715405869613, - "grad_norm": 13.971430778503418, - "kl": 0.2294921875, - "learning_rate": 5.66954240648691e-09, - "loss": 0.1349, - "reward": 0.337053582072258, - "reward_std": 0.189704030752182, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 12.436566352844238, + "kl": 3.51171875, + "learning_rate": 2.834771203243455e-08, + "loss": 0.2076, + "reward": 0.431361623108387, + "reward_std": 0.1361066121608019, + "rewards/accuracy_reward": 0.01562500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4157366305589676, "step": 2886 }, { "clip_ratio": 0.0, - "completion_length": 1598.3014221191406, + "completion_length": 1788.5693054199219, "epoch": 0.8623702486744829, - "grad_norm": 12.55685806274414, - "kl": 0.24755859375, - "learning_rate": 5.645445548277872e-09, - "loss": 0.099, - "reward": 0.416852705180645, - "reward_std": 0.17590845003724098, - "rewards/accuracy_reward": 0.08258928963914514, + "grad_norm": 6.05305290222168, + "kl": 2.984375, + "learning_rate": 2.822722774138936e-08, + "loss": 0.1635, + "reward": 0.502232164144516, + "reward_std": 0.12791924364864826, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4174107313156128, "step": 2887 }, { "clip_ratio": 0.0, - "completion_length": 1553.6384582519531, + "completion_length": 1692.63623046875, "epoch": 0.8626689567620043, - "grad_norm": 14.256936073303223, - "kl": 0.250732421875, - "learning_rate": 5.621396943476864e-09, - "loss": 0.1481, - "reward": 0.4670759066939354, - "reward_std": 0.19655340164899826, - "rewards/accuracy_reward": 0.14508929220028222, + "grad_norm": 22.986787796020508, + "kl": 2.8125, + "learning_rate": 2.810698471738432e-08, + "loss": 0.1917, + "reward": 0.6216518133878708, + "reward_std": 0.14770063944160938, + "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.443080373108387, "step": 2888 }, { "clip_ratio": 0.0, - "completion_length": 1558.4911804199219, + "completion_length": 1770.7322082519531, "epoch": 0.8629676648495258, - "grad_norm": 11.303112030029297, - "kl": 0.232177734375, - "learning_rate": 5.597396618246419e-09, - "loss": 0.108, - "reward": 0.3437500149011612, - "reward_std": 0.16683512926101685, - "rewards/accuracy_reward": 0.020089287078008056, + "grad_norm": 5.892788410186768, + "kl": 3.8671875, + "learning_rate": 2.7986983091232098e-08, + "loss": 0.2131, + "reward": 0.4581473395228386, + "reward_std": 0.15718774870038033, + "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4179687649011612, "step": 2889 }, { "clip_ratio": 0.0, - "completion_length": 1606.7389221191406, + "completion_length": 1766.7813110351562, "epoch": 0.8632663729370472, - "grad_norm": 12.05361557006836, - "kl": 0.25732421875, - "learning_rate": 5.5734445986965254e-09, - "loss": 0.1083, - "reward": 0.4095982313156128, - "reward_std": 0.1789424680173397, - "rewards/accuracy_reward": 0.08035714388825, + "grad_norm": 15.965812683105469, + "kl": 2.9765625, + "learning_rate": 2.7867222993482626e-08, + "loss": 0.1681, + "reward": 0.5306919887661934, + "reward_std": 0.15369248017668724, + "rewards/accuracy_reward": 0.10044643562287092, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2890 }, { "clip_ratio": 0.0, - "completion_length": 1629.4911193847656, + "completion_length": 1797.2902526855469, "epoch": 0.8635650810245687, - "grad_norm": 11.474428176879883, - "kl": 0.254638671875, - "learning_rate": 5.549540910884648e-09, - "loss": 0.1204, - "reward": 0.4224330633878708, - "reward_std": 0.1887640692293644, - "rewards/accuracy_reward": 0.10491072200238705, + "grad_norm": 6.335684776306152, + "kl": 3.974609375, + "learning_rate": 2.774770455442324e-08, + "loss": 0.2259, + "reward": 0.5502232387661934, + "reward_std": 0.18266365304589272, + "rewards/accuracy_reward": 0.12276786658912897, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3175223395228386, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2891 }, { "clip_ratio": 0.0, - "completion_length": 1522.3728332519531, + "completion_length": 1708.0134582519531, "epoch": 0.8638637891120902, - "grad_norm": 13.080491065979004, - "kl": 0.252685546875, - "learning_rate": 5.525685580815631e-09, - "loss": 0.1486, - "reward": 0.3716518059372902, - "reward_std": 0.18890369683504105, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 11.674386024475098, + "kl": 4.46875, + "learning_rate": 2.7628427904078156e-08, + "loss": 0.263, + "reward": 0.4720982313156128, + "reward_std": 0.1682984046638012, + "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.431919664144516, "step": 2892 }, { "clip_ratio": 0.0, - "completion_length": 1571.3504943847656, + "completion_length": 1746.9933776855469, "epoch": 0.8641624971996117, - "grad_norm": 14.034618377685547, - "kl": 0.236328125, - "learning_rate": 5.501878634441759e-09, - "loss": 0.1672, - "reward": 0.364397332072258, - "reward_std": 0.2170727215707302, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 10.590655326843262, + "kl": 4.69140625, + "learning_rate": 2.7509393172208794e-08, + "loss": 0.2688, + "reward": 0.478794664144516, + "reward_std": 0.20007147267460823, + "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2893 }, { "clip_ratio": 0.0, - "completion_length": 1673.7969360351562, + "completion_length": 1807.8482971191406, "epoch": 0.8644612052871331, - "grad_norm": 15.276105880737305, - "kl": 0.234619140625, - "learning_rate": 5.478120097662653e-09, - "loss": 0.1341, - "reward": 0.3141741156578064, - "reward_std": 0.18245990201830864, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 13.005739212036133, + "kl": 4.26171875, + "learning_rate": 2.7390600488313265e-08, + "loss": 0.2341, + "reward": 0.4347098395228386, + "reward_std": 0.12821108847856522, + "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.4190848469734192, "step": 2894 }, { "clip_ratio": 0.0, - "completion_length": 1582.7969055175781, + "completion_length": 1756.9308776855469, "epoch": 0.8647599133746546, - "grad_norm": 15.94528579711914, - "kl": 0.221923828125, - "learning_rate": 5.454409996325255e-09, - "loss": 0.1259, - "reward": 0.4620535895228386, - "reward_std": 0.2403925657272339, - "rewards/accuracy_reward": 0.13169643888249993, + "grad_norm": 9.230589866638184, + "kl": 2.9375, + "learning_rate": 2.727204998162627e-08, + "loss": 0.176, + "reward": 0.6004464626312256, + "reward_std": 0.20939970389008522, + "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4464285969734192, "step": 2895 }, { "clip_ratio": 0.0, - "completion_length": 1648.0447082519531, + "completion_length": 1810.5246276855469, "epoch": 0.865058621462176, - "grad_norm": 11.991477966308594, - "kl": 0.217041015625, - "learning_rate": 5.430748356223847e-09, - "loss": 0.0992, - "reward": 0.3828125298023224, - "reward_std": 0.17421892285346985, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 13.729401588439941, + "kl": 2.658203125, + "learning_rate": 2.7153741781119232e-08, + "loss": 0.1406, + "reward": 0.490513414144516, + "reward_std": 0.10783707350492477, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2896 }, { "clip_ratio": 0.0, - "completion_length": 1668.8594665527344, + "completion_length": 1802.5425109863281, "epoch": 0.8653573295496976, - "grad_norm": 14.430756568908691, - "kl": 0.306396484375, - "learning_rate": 5.407135203099983e-09, - "loss": 0.13, - "reward": 0.3917410895228386, - "reward_std": 0.21917220205068588, - "rewards/accuracy_reward": 0.10044643376022577, + "grad_norm": 11.336731910705566, + "kl": 3.828125, + "learning_rate": 2.7035676015499913e-08, + "loss": 0.2178, + "reward": 0.5245535895228386, + "reward_std": 0.15631429478526115, + "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.291294664144516, + "rewards/tag_count_reward": 0.4241071566939354, "step": 2897 }, { "clip_ratio": 0.0, - "completion_length": 1650.5335388183594, + "completion_length": 1787.0001220703125, "epoch": 0.865656037637219, - "grad_norm": 13.939082145690918, - "kl": 0.2724609375, - "learning_rate": 5.383570562642437e-09, - "loss": 0.1461, - "reward": 0.3844866305589676, - "reward_std": 0.16787252202630043, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 16.975807189941406, + "kl": 4.53125, + "learning_rate": 2.691785281321218e-08, + "loss": 0.2668, + "reward": 0.4933035895228386, + "reward_std": 0.1127147227525711, + "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937723517418, + "rewards/tag_count_reward": 0.419642873108387, "step": 2898 }, { "clip_ratio": 0.0, - "completion_length": 1590.2210388183594, + "completion_length": 1756.33935546875, "epoch": 0.8659547457247405, - "grad_norm": 14.977103233337402, - "kl": 0.274169921875, - "learning_rate": 5.3600544604872345e-09, - "loss": 0.1459, - "reward": 0.431361623108387, - "reward_std": 0.21347641199827194, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 9.351993560791016, + "kl": 3.51171875, + "learning_rate": 2.6800272302436174e-08, + "loss": 0.2228, + "reward": 0.5675223544239998, + "reward_std": 0.1866442058235407, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4380580484867096, "step": 2899 }, { "clip_ratio": 0.0, - "completion_length": 1560.2344360351562, + "completion_length": 1708.5134582519531, "epoch": 0.8662534538122619, - "grad_norm": 15.770025253295898, - "kl": 0.24658203125, - "learning_rate": 5.336586922217606e-09, - "loss": 0.1445, - "reward": 0.4056919813156128, - "reward_std": 0.21135051921010017, - "rewards/accuracy_reward": 0.06919643003493547, + "grad_norm": 7.9692254066467285, + "kl": 3.8515625, + "learning_rate": 2.6682934611088032e-08, + "loss": 0.2288, + "reward": 0.5212053805589676, + "reward_std": 0.15833407267928123, + "rewards/accuracy_reward": 0.08928571990691125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.431919664144516, "step": 2900 }, { "clip_ratio": 0.0, - "completion_length": 1573.0469360351562, + "completion_length": 1715.5245971679688, "epoch": 0.8665521618997835, - "grad_norm": 19.755407333374023, - "kl": 0.310791015625, - "learning_rate": 5.313167973363913e-09, - "loss": 0.1431, - "reward": 0.4503348395228386, - "reward_std": 0.20519373938441277, - "rewards/accuracy_reward": 0.11160714668221772, + "grad_norm": 14.396929740905762, + "kl": 4.49609375, + "learning_rate": 2.6565839866819567e-08, + "loss": 0.2526, + "reward": 0.5295759215950966, + "reward_std": 0.16348044201731682, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2901 }, { "clip_ratio": 0.0, - "completion_length": 1634.993408203125, + "completion_length": 1812.9375610351562, "epoch": 0.8668508699873049, - "grad_norm": 11.503289222717285, - "kl": 0.2705078125, - "learning_rate": 5.289797639403687e-09, - "loss": 0.1036, - "reward": 0.345424123108387, - "reward_std": 0.16331008449196815, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 10.073188781738281, + "kl": 3.697265625, + "learning_rate": 2.6448988197018434e-08, + "loss": 0.2222, + "reward": 0.493861623108387, + "reward_std": 0.14102050475776196, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2985491156578064, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2902 }, { "clip_ratio": 0.0, - "completion_length": 1580.7746276855469, + "completion_length": 1711.2143859863281, "epoch": 0.8671495780748264, - "grad_norm": 13.726664543151855, - "kl": 0.238037109375, - "learning_rate": 5.2664759457615614e-09, - "loss": 0.1534, - "reward": 0.3772321566939354, - "reward_std": 0.20300839841365814, - "rewards/accuracy_reward": 0.040178573690354824, + "grad_norm": 8.379871368408203, + "kl": 3.6015625, + "learning_rate": 2.6332379728807806e-08, + "loss": 0.235, + "reward": 0.4860491305589676, + "reward_std": 0.1665329448878765, + "rewards/accuracy_reward": 0.05133928870782256, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2903 }, { "clip_ratio": 0.0, - "completion_length": 1636.2991638183594, + "completion_length": 1742.7701721191406, "epoch": 0.8674482861623478, - "grad_norm": 13.435416221618652, - "kl": 0.26416015625, - "learning_rate": 5.243202917809248e-09, - "loss": 0.1238, - "reward": 0.3984375074505806, - "reward_std": 0.15543904341757298, - "rewards/accuracy_reward": 0.078125, + "grad_norm": 9.887556076049805, + "kl": 4.150390625, + "learning_rate": 2.6216014589046235e-08, + "loss": 0.2633, + "reward": 0.5089285895228386, + "reward_std": 0.11807173304259777, + "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2904 }, { "clip_ratio": 0.0, - "completion_length": 1616.3616943359375, + "completion_length": 1808.384033203125, "epoch": 0.8677469942498693, - "grad_norm": 15.66750431060791, - "kl": 0.25146484375, - "learning_rate": 5.21997858086553e-09, - "loss": 0.1338, - "reward": 0.3722098395228386, - "reward_std": 0.15964319184422493, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 13.65540885925293, + "kl": 4.48046875, + "learning_rate": 2.6099892904327653e-08, + "loss": 0.2657, + "reward": 0.4659598395228386, + "reward_std": 0.13506022468209267, + "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.419084832072258, "step": 2905 }, { "clip_ratio": 0.0, - "completion_length": 1578.0983276367188, + "completion_length": 1735.9532165527344, "epoch": 0.8680457023373908, - "grad_norm": 14.354774475097656, - "kl": 0.22900390625, - "learning_rate": 5.196802960196189e-09, - "loss": 0.1553, - "reward": 0.4135044813156128, - "reward_std": 0.1799180917441845, - "rewards/accuracy_reward": 0.08705357578583062, + "grad_norm": 10.001546859741211, + "kl": 3.91015625, + "learning_rate": 2.5984014800980948e-08, + "loss": 0.2438, + "reward": 0.5323660895228386, + "reward_std": 0.13430923223495483, + "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2906 }, { "clip_ratio": 0.0, - "completion_length": 1645.72998046875, + "completion_length": 1790.6630249023438, "epoch": 0.8683444104249123, - "grad_norm": 12.39021110534668, - "kl": 0.236083984375, - "learning_rate": 5.173676081014044e-09, - "loss": 0.1079, - "reward": 0.4101562649011612, - "reward_std": 0.18723544850945473, - "rewards/accuracy_reward": 0.08705357648432255, + "grad_norm": 23.342836380004883, + "kl": 5.203125, + "learning_rate": 2.5868380405070223e-08, + "loss": 0.2715, + "reward": 0.4860491305589676, + "reward_std": 0.11992811970412731, + "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4101562723517418, "step": 2907 }, { "clip_ratio": 0.0, - "completion_length": 1640.4130249023438, + "completion_length": 1817.872802734375, "epoch": 0.8686431185124337, - "grad_norm": 10.390195846557617, - "kl": 0.241455078125, - "learning_rate": 5.150597968478882e-09, - "loss": 0.1096, - "reward": 0.3247767984867096, - "reward_std": 0.18146073073148727, - "rewards/accuracy_reward": 0.020089287078008056, + "grad_norm": 31.480501174926758, + "kl": 6.1875, + "learning_rate": 2.575298984239441e-08, + "loss": 0.317, + "reward": 0.4257812649011612, + "reward_std": 0.1364359501749277, + "rewards/accuracy_reward": 0.024553573224693537, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.4012276977300644, "step": 2908 }, { "clip_ratio": 0.0, - "completion_length": 1655.3527526855469, + "completion_length": 1829.2121276855469, "epoch": 0.8689418265999552, - "grad_norm": 12.241140365600586, - "kl": 0.243408203125, - "learning_rate": 5.127568647697406e-09, - "loss": 0.1129, - "reward": 0.353794664144516, - "reward_std": 0.18107172474265099, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 24.802799224853516, + "kl": 5.359375, + "learning_rate": 2.5637843238487027e-08, + "loss": 0.267, + "reward": 0.5117187723517418, + "reward_std": 0.18450995348393917, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.4224330559372902, "step": 2909 }, { "clip_ratio": 0.0, - "completion_length": 1675.6250610351562, + "completion_length": 1787.22998046875, "epoch": 0.8692405346874766, - "grad_norm": 13.01878547668457, - "kl": 0.28662109375, - "learning_rate": 5.104588143723271e-09, - "loss": 0.1076, - "reward": 0.377232164144516, - "reward_std": 0.17827618680894375, - "rewards/accuracy_reward": 0.0647321455180645, + "grad_norm": 7.211831569671631, + "kl": 4.0703125, + "learning_rate": 2.5522940718616354e-08, + "loss": 0.2249, + "reward": 0.4866071790456772, + "reward_std": 0.13803520798683167, + "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000074505806, + "rewards/tag_count_reward": 0.4241071715950966, "step": 2910 }, { "clip_ratio": 0.0, - "completion_length": 1644.51123046875, + "completion_length": 1766.055908203125, "epoch": 0.8695392427749982, - "grad_norm": 14.091371536254883, - "kl": 0.300048828125, - "learning_rate": 5.081656481557017e-09, - "loss": 0.1448, - "reward": 0.3431919813156128, - "reward_std": 0.1783677339553833, - "rewards/accuracy_reward": 0.031250000931322575, + "grad_norm": 10.406194686889648, + "kl": 5.12890625, + "learning_rate": 2.5408282407785085e-08, + "loss": 0.2952, + "reward": 0.4520089402794838, + "reward_std": 0.13141670264303684, + "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2911 }, { "clip_ratio": 0.0, - "completion_length": 1596.5023193359375, + "completion_length": 1778.7991943359375, "epoch": 0.8698379508625196, - "grad_norm": 13.65075969696045, - "kl": 0.266357421875, - "learning_rate": 5.05877368614604e-09, - "loss": 0.1224, - "reward": 0.415178582072258, - "reward_std": 0.2151094563305378, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 3.437451124191284, + "kl": 3.673828125, + "learning_rate": 2.52938684307302e-08, + "loss": 0.2092, + "reward": 0.5066964402794838, + "reward_std": 0.13929622434079647, + "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2912 }, { "clip_ratio": 0.0, - "completion_length": 1615.1384582519531, + "completion_length": 1801.1563415527344, "epoch": 0.8701366589500411, - "grad_norm": 13.00080680847168, - "kl": 0.26318359375, - "learning_rate": 5.035939782384569e-09, - "loss": 0.1309, - "reward": 0.4291294887661934, - "reward_std": 0.20719130337238312, - "rewards/accuracy_reward": 0.10714286426082253, + "grad_norm": 23.609464645385742, + "kl": 5.4765625, + "learning_rate": 2.5179698911922842e-08, + "loss": 0.3168, + "reward": 0.5167411044239998, + "reward_std": 0.1806628257036209, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.3984375223517418, "step": 2913 }, { "clip_ratio": 0.0, - "completion_length": 1653.1161499023438, + "completion_length": 1829.7388916015625, "epoch": 0.8704353670375625, - "grad_norm": 12.6930570602417, - "kl": 0.275390625, - "learning_rate": 5.0131547951136775e-09, - "loss": 0.1166, - "reward": 0.384486623108387, - "reward_std": 0.16531528159976006, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 17.717758178710938, + "kl": 5.77734375, + "learning_rate": 2.506577397556839e-08, + "loss": 0.296, + "reward": 0.487165205180645, + "reward_std": 0.13587961718440056, + "rewards/accuracy_reward": 0.08258928591385484, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937574505806, + "rewards/tag_count_reward": 0.404575914144516, "step": 2914 }, { "clip_ratio": 0.0, - "completion_length": 1571.7657165527344, + "completion_length": 1782.0848999023438, "epoch": 0.870734075125084, - "grad_norm": 13.123603820800781, - "kl": 0.24365234375, - "learning_rate": 4.990418749121178e-09, - "loss": 0.1319, - "reward": 0.4676339626312256, - "reward_std": 0.2035743035376072, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 89.76165771484375, + "kl": 3.30859375, + "learning_rate": 2.495209374560589e-08, + "loss": 0.2081, + "reward": 0.5736607536673546, + "reward_std": 0.19215238094329834, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2915 }, { "clip_ratio": 0.0, - "completion_length": 1526.5938110351562, - "epoch": 0.8710327832126055, - "grad_norm": 10.301504135131836, - "kl": 0.25634765625, - "learning_rate": 4.9677316691416615e-09, - "loss": 0.1259, - "reward": 0.4347098395228386, - "reward_std": 0.1895349696278572, - "rewards/accuracy_reward": 0.0959821492433548, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "completion_length": 1709.2166137695312, + "epoch": 0.8710327832126055, + "grad_norm": 3.1122586727142334, + "kl": 2.77734375, + "learning_rate": 2.4838658345708307e-08, + "loss": 0.1557, + "reward": 0.5150669813156128, + "reward_std": 0.14744308032095432, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4347098395228386, "step": 2916 }, { "clip_ratio": 0.0, - "completion_length": 1596.4554138183594, + "completion_length": 1782.3125610351562, "epoch": 0.871331491300127, - "grad_norm": 13.427556037902832, - "kl": 0.24951171875, - "learning_rate": 4.945093579856463e-09, - "loss": 0.1283, - "reward": 0.4369419887661934, - "reward_std": 0.2043488509953022, - "rewards/accuracy_reward": 0.11160714854486287, + "grad_norm": 8.200718879699707, + "kl": 2.6875, + "learning_rate": 2.4725467899282315e-08, + "loss": 0.1557, + "reward": 0.561383955180645, + "reward_std": 0.17512557283043861, + "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4296875149011612, "step": 2917 }, { "clip_ratio": 0.0, - "completion_length": 1622.63623046875, + "completion_length": 1821.0893859863281, "epoch": 0.8716301993876484, - "grad_norm": 10.79004955291748, - "kl": 0.27880859375, - "learning_rate": 4.922504505893582e-09, - "loss": 0.1129, - "reward": 0.3309151902794838, - "reward_std": 0.1974078267812729, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 13.514201164245605, + "kl": 2.96484375, + "learning_rate": 2.461252252946791e-08, + "loss": 0.1747, + "reward": 0.4648437723517418, + "reward_std": 0.1687844954431057, + "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2929687574505806, + "rewards/tag_count_reward": 0.4157366156578064, "step": 2918 }, { "clip_ratio": 0.0, - "completion_length": 1589.4732971191406, + "completion_length": 1734.6451721191406, "epoch": 0.8719289074751699, - "grad_norm": 11.31098747253418, - "kl": 0.267578125, - "learning_rate": 4.899964471827744e-09, - "loss": 0.1248, - "reward": 0.3510044813156128, - "reward_std": 0.18175174295902252, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 17.268817901611328, + "kl": 2.423828125, + "learning_rate": 2.449982235913872e-08, + "loss": 0.1662, + "reward": 0.5323660969734192, + "reward_std": 0.20988789200782776, + "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4386160969734192, "step": 2919 }, { "clip_ratio": 0.0, - "completion_length": 1710.5826721191406, + "completion_length": 1888.1094665527344, "epoch": 0.8722276155626913, - "grad_norm": 12.823222160339355, - "kl": 0.260986328125, - "learning_rate": 4.877473502180269e-09, - "loss": 0.1012, - "reward": 0.3398437574505806, - "reward_std": 0.18589911609888077, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 5.91530704498291, + "kl": 2.869140625, + "learning_rate": 2.4387367510901342e-08, + "loss": 0.1492, + "reward": 0.4453125149011612, + "reward_std": 0.11745469272136688, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544738650322, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2920 }, { "clip_ratio": 0.0, - "completion_length": 1524.7813110351562, + "completion_length": 1662.2411499023438, "epoch": 0.8725263236502129, - "grad_norm": 12.257218360900879, - "kl": 0.229736328125, - "learning_rate": 4.855031621419143e-09, - "loss": 0.1304, - "reward": 0.4029018059372902, - "reward_std": 0.17749993689358234, - "rewards/accuracy_reward": 0.05580357415601611, + "grad_norm": 20.541122436523438, + "kl": 1.78515625, + "learning_rate": 2.427515810709571e-08, + "loss": 0.123, + "reward": 0.5094866380095482, + "reward_std": 0.12080631777644157, + "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982238650322, + "rewards/tag_count_reward": 0.4536830559372902, "step": 2921 }, { "clip_ratio": 0.0, - "completion_length": 1615.6183776855469, + "completion_length": 1790.8482971191406, "epoch": 0.8728250317377343, - "grad_norm": 11.125859260559082, - "kl": 0.2646484375, - "learning_rate": 4.832638853958937e-09, - "loss": 0.1105, - "reward": 0.4162946715950966, - "reward_std": 0.1795135922729969, - "rewards/accuracy_reward": 0.09598214644938707, + "grad_norm": 14.063139915466309, + "kl": 3.62109375, + "learning_rate": 2.4163194269794686e-08, + "loss": 0.2145, + "reward": 0.5200893059372902, + "reward_std": 0.15847296826541424, + "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4129464477300644, "step": 2922 }, { "clip_ratio": 0.0, - "completion_length": 1477.9509887695312, + "completion_length": 1619.0625610351562, "epoch": 0.8731237398252558, - "grad_norm": 15.16588306427002, - "kl": 0.223388671875, - "learning_rate": 4.8102952241607764e-09, - "loss": 0.1597, - "reward": 0.4525669887661934, - "reward_std": 0.1797909215092659, - "rewards/accuracy_reward": 0.09821429220028222, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526977300644, + "grad_norm": 18.12051010131836, + "kl": 2.103515625, + "learning_rate": 2.4051476120803883e-08, + "loss": 0.1477, + "reward": 0.5602678954601288, + "reward_std": 0.13968565501272678, + "rewards/accuracy_reward": 0.11160714644938707, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4486607387661934, "step": 2923 }, { "clip_ratio": 0.0, - "completion_length": 1611.2969360351562, + "completion_length": 1784.4777526855469, "epoch": 0.8734224479127772, - "grad_norm": 12.0881986618042, - "kl": 0.2724609375, - "learning_rate": 4.788000756332339e-09, - "loss": 0.1165, - "reward": 0.344866082072258, - "reward_std": 0.19979829341173172, - "rewards/accuracy_reward": 0.03348214365541935, + "grad_norm": 18.13631248474121, + "kl": 2.9140625, + "learning_rate": 2.3940003781661693e-08, + "loss": 0.1683, + "reward": 0.487723246216774, + "reward_std": 0.16260675713419914, + "rewards/accuracy_reward": 0.058035718742758036, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3113839402794838, + "rewards/tag_count_reward": 0.4296875223517418, "step": 2924 }, { "clip_ratio": 0.0, - "completion_length": 1580.9375610351562, + "completion_length": 1788.4532470703125, "epoch": 0.8737211560002988, - "grad_norm": 13.115598678588867, - "kl": 0.24609375, - "learning_rate": 4.765755474727839e-09, - "loss": 0.1406, - "reward": 0.383928582072258, - "reward_std": 0.1762678548693657, - "rewards/accuracy_reward": 0.051339288242161274, + "grad_norm": 4.15519905090332, + "kl": 3.7265625, + "learning_rate": 2.3828777373639196e-08, + "loss": 0.2088, + "reward": 0.4821428880095482, + "reward_std": 0.15781277790665627, + "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.332589291036129, + "rewards/tag_count_reward": 0.419642873108387, "step": 2925 }, { "clip_ratio": 0.0, - "completion_length": 1543.9264221191406, + "completion_length": 1731.6764221191406, "epoch": 0.8740198640878202, - "grad_norm": 11.49685001373291, - "kl": 0.19921875, - "learning_rate": 4.743559403547942e-09, - "loss": 0.1252, - "reward": 0.4068080559372902, - "reward_std": 0.21442000940442085, - "rewards/accuracy_reward": 0.05357143096625805, + "grad_norm": 11.197689056396484, + "kl": 2.5078125, + "learning_rate": 2.371779701773971e-08, + "loss": 0.162, + "reward": 0.5217634290456772, + "reward_std": 0.17779066413640976, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.353236623108387, + "rewards/tag_count_reward": 0.4458705559372902, "step": 2926 }, { "clip_ratio": 0.0, - "completion_length": 1628.0291137695312, + "completion_length": 1761.0848999023438, "epoch": 0.8743185721753417, - "grad_norm": 12.326004981994629, - "kl": 0.263671875, - "learning_rate": 4.721412566939803e-09, - "loss": 0.1394, - "reward": 0.4045759066939354, - "reward_std": 0.19287248328328133, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 5.045070648193359, + "kl": 3.57421875, + "learning_rate": 2.3607062834699017e-08, + "loss": 0.2252, + "reward": 0.5401785969734192, + "reward_std": 0.17989646829664707, + "rewards/accuracy_reward": 0.10937500838190317, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4308035969734192, "step": 2927 }, { "clip_ratio": 0.0, - "completion_length": 1546.5067443847656, + "completion_length": 1755.65185546875, "epoch": 0.8746172802628631, - "grad_norm": 14.756258964538574, - "kl": 0.24267578125, - "learning_rate": 4.699314988997027e-09, - "loss": 0.1369, - "reward": 0.3722098395228386, - "reward_std": 0.17092572152614594, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 14.022778511047363, + "kl": 3.19921875, + "learning_rate": 2.3496574944985136e-08, + "loss": 0.2116, + "reward": 0.4843750298023224, + "reward_std": 0.1548378560692072, + "rewards/accuracy_reward": 0.04241071711294353, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169813156128, + "rewards/tag_count_reward": 0.4419643059372902, "step": 2928 }, { "clip_ratio": 0.0, - "completion_length": 1458.8393249511719, + "completion_length": 1613.7300109863281, "epoch": 0.8749159883503845, - "grad_norm": 12.826144218444824, - "kl": 0.2392578125, - "learning_rate": 4.677266693759585e-09, - "loss": 0.1369, - "reward": 0.4386160895228386, - "reward_std": 0.1933407261967659, - "rewards/accuracy_reward": 0.08482143096625805, + "grad_norm": 8.716017723083496, + "kl": 3.453125, + "learning_rate": 2.3386333468797924e-08, + "loss": 0.234, + "reward": 0.5368303880095482, + "reward_std": 0.17432688735425472, + "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.353794664144516, + "rewards/tag_count_reward": 0.4408482387661934, "step": 2929 }, { "clip_ratio": 0.0, - "completion_length": 1673.1451416015625, + "completion_length": 1815.9286499023438, "epoch": 0.8752146964379061, - "grad_norm": 11.104615211486816, - "kl": 0.294921875, - "learning_rate": 4.6552677052138835e-09, - "loss": 0.1121, - "reward": 0.3934151977300644, - "reward_std": 0.18998592346906662, - "rewards/accuracy_reward": 0.09821428963914514, + "grad_norm": 43.31866455078125, + "kl": 5.8515625, + "learning_rate": 2.3276338526069417e-08, + "loss": 0.3139, + "reward": 0.534598246216774, + "reward_std": 0.16836016438901424, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952008992433548, + "rewards/tag_count_reward": 0.4029017984867096, "step": 2930 }, { "clip_ratio": 0.0, - "completion_length": 1622.6027526855469, + "completion_length": 1798.8728637695312, "epoch": 0.8755134045254275, - "grad_norm": 13.527922630310059, - "kl": 0.242919921875, - "learning_rate": 4.63331804729265e-09, - "loss": 0.1228, - "reward": 0.3398437723517418, - "reward_std": 0.1883390098810196, - "rewards/accuracy_reward": 0.022321428870782256, + "grad_norm": 9.349672317504883, + "kl": 4.46875, + "learning_rate": 2.3166590236463253e-08, + "loss": 0.248, + "reward": 0.479352705180645, + "reward_std": 0.1662589255720377, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2931 }, { "clip_ratio": 0.0, - "completion_length": 1592.0536499023438, + "completion_length": 1771.3259582519531, "epoch": 0.875812112612949, - "grad_norm": 14.226743698120117, - "kl": 0.257568359375, - "learning_rate": 4.611417743874968e-09, - "loss": 0.1456, - "reward": 0.4771205633878708, - "reward_std": 0.22928787022829056, + "grad_norm": 45.313385009765625, + "kl": 5.58203125, + "learning_rate": 2.305708871937484e-08, + "loss": 0.3028, + "reward": 0.5664062723517418, + "reward_std": 0.1782206017524004, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.334263414144516, + "rewards/tag_count_reward": 0.4235491305589676, "step": 2932 }, { "clip_ratio": 0.0, - "completion_length": 1570.3683776855469, + "completion_length": 1778.3951721191406, "epoch": 0.8761108207004704, - "grad_norm": 13.2888765335083, - "kl": 0.235595703125, - "learning_rate": 4.589566818786228e-09, - "loss": 0.1037, - "reward": 0.4079241305589676, - "reward_std": 0.22367654368281364, - "rewards/accuracy_reward": 0.08482143469154835, + "grad_norm": 22.255882263183594, + "kl": 5.15234375, + "learning_rate": 2.294783409393114e-08, + "loss": 0.2913, + "reward": 0.4760044813156128, + "reward_std": 0.20515748113393784, + "rewards/accuracy_reward": 0.06250000488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.4135044813156128, "step": 2933 }, { "clip_ratio": 0.0, - "completion_length": 1627.1451416015625, + "completion_length": 1802.6116638183594, "epoch": 0.8764095287879919, - "grad_norm": 13.67842960357666, - "kl": 0.235107421875, - "learning_rate": 4.567765295798082e-09, - "loss": 0.1229, - "reward": 0.3593750223517418, - "reward_std": 0.20183765515685081, - "rewards/accuracy_reward": 0.03794643096625805, + "grad_norm": 19.747251510620117, + "kl": 4.20703125, + "learning_rate": 2.283882647899041e-08, + "loss": 0.218, + "reward": 0.4771205633878708, + "reward_std": 0.1563674509525299, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4302455484867096, "step": 2934 }, { "clip_ratio": 0.0, - "completion_length": 1631.5402221679688, + "completion_length": 1783.9063415527344, "epoch": 0.8767082368755134, - "grad_norm": 13.490703582763672, - "kl": 0.26513671875, - "learning_rate": 4.546013198628457e-09, - "loss": 0.1345, - "reward": 0.3549107313156128, - "reward_std": 0.20872655138373375, - "rewards/accuracy_reward": 0.02901785890571773, + "grad_norm": 7.149814605712891, + "kl": 4.01953125, + "learning_rate": 2.2730065993142283e-08, + "loss": 0.2336, + "reward": 0.482142873108387, + "reward_std": 0.1735913883894682, + "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.435267873108387, "step": 2935 }, { "clip_ratio": 0.0, - "completion_length": 1600.7790832519531, + "completion_length": 1735.1295471191406, "epoch": 0.8770069449630349, - "grad_norm": 11.82103157043457, - "kl": 0.273681640625, - "learning_rate": 4.524310550941512e-09, - "loss": 0.1396, - "reward": 0.5000000223517418, - "reward_std": 0.18130707740783691, - "rewards/accuracy_reward": 0.18526786682195961, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "grad_norm": 8.458356857299805, + "kl": 4.57421875, + "learning_rate": 2.2621552754707562e-08, + "loss": 0.2651, + "reward": 0.6294643133878708, + "reward_std": 0.14252236299216747, + "rewards/accuracy_reward": 0.20089286286383867, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4285714477300644, "step": 2936 }, { "clip_ratio": 0.0, - "completion_length": 1655.4219360351562, + "completion_length": 1792.9978332519531, "epoch": 0.8773056530505563, - "grad_norm": 11.565348625183105, - "kl": 0.271240234375, - "learning_rate": 4.502657376347579e-09, - "loss": 0.1103, - "reward": 0.3454241156578064, - "reward_std": 0.14942358061671257, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 16.847572326660156, + "kl": 4.9375, + "learning_rate": 2.25132868817379e-08, + "loss": 0.2557, + "reward": 0.4681919887661934, + "reward_std": 0.11617675051093102, + "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.309709832072258, + "rewards/tag_count_reward": 0.4213169813156128, "step": 2937 }, { "clip_ratio": 0.0, - "completion_length": 1660.8415832519531, + "completion_length": 1801.0089721679688, "epoch": 0.8776043611380778, - "grad_norm": 14.992732048034668, - "kl": 0.24365234375, - "learning_rate": 4.481053698403203e-09, - "loss": 0.1356, - "reward": 0.3515625223517418, - "reward_std": 0.15970000624656677, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 9.531877517700195, + "kl": 4.625, + "learning_rate": 2.2405268492016015e-08, + "loss": 0.2494, + "reward": 0.463169664144516, + "reward_std": 0.1300626490265131, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4185268059372902, "step": 2938 }, { "clip_ratio": 0.0, - "completion_length": 1599.5090026855469, + "completion_length": 1779.6920166015625, "epoch": 0.8779030692255992, - "grad_norm": 11.918253898620605, - "kl": 0.279296875, - "learning_rate": 4.459499540611078e-09, - "loss": 0.1316, - "reward": 0.318080373108387, - "reward_std": 0.1818283125758171, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 4.441956996917725, + "kl": 4.31640625, + "learning_rate": 2.229749770305539e-08, + "loss": 0.242, + "reward": 0.455915205180645, + "reward_std": 0.1365038137882948, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.420200914144516, "step": 2939 }, { "clip_ratio": 0.0, - "completion_length": 1574.0536499023438, + "completion_length": 1797.0826721191406, "epoch": 0.8782017773131208, - "grad_norm": 15.44473648071289, - "kl": 0.22265625, - "learning_rate": 4.437994926419991e-09, - "loss": 0.138, - "reward": 0.3833705484867096, - "reward_std": 0.1717231199145317, - "rewards/accuracy_reward": 0.04687500209547579, + "grad_norm": 13.035404205322266, + "kl": 2.236328125, + "learning_rate": 2.2189974632099955e-08, + "loss": 0.1537, + "reward": 0.4810268059372902, + "reward_std": 0.09315787442028522, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.4430803805589676, "step": 2940 }, { "clip_ratio": 0.0, - "completion_length": 1611.341552734375, + "completion_length": 1745.415283203125, "epoch": 0.8785004854006422, - "grad_norm": 15.464777946472168, - "kl": 0.226806640625, - "learning_rate": 4.416539879224873e-09, - "loss": 0.1356, - "reward": 0.5262276977300644, - "reward_std": 0.23328286781907082, - "rewards/accuracy_reward": 0.19642858020961285, + "grad_norm": 9.836873054504395, + "kl": 2.74609375, + "learning_rate": 2.2082699396124364e-08, + "loss": 0.1659, + "reward": 0.663504496216774, + "reward_std": 0.1744316667318344, + "rewards/accuracy_reward": 0.22321429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4402901977300644, "step": 2941 }, { "clip_ratio": 0.0, - "completion_length": 1586.0848999023438, + "completion_length": 1776.7925109863281, "epoch": 0.8787991934881637, - "grad_norm": 27.332942962646484, - "kl": 0.753662109375, - "learning_rate": 4.3951344223667144e-09, - "loss": 0.1421, - "reward": 0.3699776977300644, - "reward_std": 0.1862749569118023, - "rewards/accuracy_reward": 0.029017859138548374, + "grad_norm": 13.220605850219727, + "kl": 3.07421875, + "learning_rate": 2.197567211183357e-08, + "loss": 0.1766, + "reward": 0.4927455708384514, + "reward_std": 0.1520634312182665, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.4347098469734192, "step": 2942 }, { "clip_ratio": 0.0, - "completion_length": 1579.4755249023438, + "completion_length": 1749.2121276855469, "epoch": 0.8790979015756851, - "grad_norm": 13.61863899230957, - "kl": 0.23876953125, - "learning_rate": 4.3737785791325366e-09, - "loss": 0.1416, - "reward": 0.3694196566939354, - "reward_std": 0.20163971185684204, - "rewards/accuracy_reward": 0.026785715483129025, + "grad_norm": 6.716597080230713, + "kl": 3.1767578125, + "learning_rate": 2.186889289566268e-08, + "loss": 0.1721, + "reward": 0.4960937723517418, + "reward_std": 0.1546249557286501, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.4425223395228386, "step": 2943 }, { "clip_ratio": 0.0, - "completion_length": 1621.29248046875, + "completion_length": 1778.3861999511719, "epoch": 0.8793966096632067, - "grad_norm": 12.612519264221191, - "kl": 0.241943359375, - "learning_rate": 4.3524723727554226e-09, - "loss": 0.1297, - "reward": 0.4720982387661934, - "reward_std": 0.20927994325757027, - "rewards/accuracy_reward": 0.1361607164144516, + "grad_norm": 14.6886625289917, + "kl": 3.05078125, + "learning_rate": 2.1762361863777114e-08, + "loss": 0.1776, + "reward": 0.5708705633878708, + "reward_std": 0.16768885403871536, + "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375223517418, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2944 }, { "clip_ratio": 0.0, - "completion_length": 1602.1340026855469, + "completion_length": 1756.1540832519531, "epoch": 0.8796953177507281, - "grad_norm": 11.086021423339844, - "kl": 0.276123046875, - "learning_rate": 4.331215826414419e-09, - "loss": 0.1281, - "reward": 0.3828125149011612, - "reward_std": 0.17424282431602478, - "rewards/accuracy_reward": 0.06919643259607255, + "grad_norm": 5.546716690063477, + "kl": 3.890625, + "learning_rate": 2.1656079132072098e-08, + "loss": 0.2314, + "reward": 0.5089285895228386, + "reward_std": 0.13526790775358677, + "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2945 }, { "clip_ratio": 0.0, - "completion_length": 1656.9264221191406, + "completion_length": 1769.1943054199219, "epoch": 0.8799940258382496, - "grad_norm": 11.719416618347168, - "kl": 0.2783203125, - "learning_rate": 4.3100089632345736e-09, - "loss": 0.1151, - "reward": 0.4179687649011612, - "reward_std": 0.19071536883711815, - "rewards/accuracy_reward": 0.11383928917348385, + "grad_norm": 9.530869483947754, + "kl": 3.150390625, + "learning_rate": 2.155004481617287e-08, + "loss": 0.1896, + "reward": 0.5580357238650322, + "reward_std": 0.1627616547048092, + "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.4330357313156128, "step": 2946 }, { "clip_ratio": 0.0, - "completion_length": 1632.6898193359375, + "completion_length": 1775.8661499023438, "epoch": 0.880292733925771, - "grad_norm": 24.769166946411133, - "kl": 0.324951171875, - "learning_rate": 4.2888518062868914e-09, - "loss": 0.1468, - "reward": 0.3733259066939354, - "reward_std": 0.20282499492168427, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 4.572495937347412, + "kl": 3.53125, + "learning_rate": 2.1444259031434458e-08, + "loss": 0.2076, + "reward": 0.4866071790456772, + "reward_std": 0.1618131473660469, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4285714402794838, "step": 2947 }, { "clip_ratio": 0.0, - "completion_length": 1601.3170166015625, + "completion_length": 1798.9911193847656, "epoch": 0.8805914420132925, - "grad_norm": 14.225119590759277, - "kl": 0.25341796875, - "learning_rate": 4.267744378588256e-09, - "loss": 0.144, - "reward": 0.3872767984867096, - "reward_std": 0.18777664378285408, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 6.047021865844727, + "kl": 4.08984375, + "learning_rate": 2.133872189294128e-08, + "loss": 0.2408, + "reward": 0.521205373108387, + "reward_std": 0.1670054979622364, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089402794838, + "rewards/tag_count_reward": 0.4319196566939354, "step": 2948 }, { "clip_ratio": 0.0, - "completion_length": 1499.8973999023438, + "completion_length": 1685.97998046875, "epoch": 0.880890150100814, - "grad_norm": 13.252019882202148, - "kl": 0.202392578125, - "learning_rate": 4.246686703101493e-09, - "loss": 0.134, - "reward": 0.463727705180645, - "reward_std": 0.21310625970363617, - "rewards/accuracy_reward": 0.08928572246804833, + "grad_norm": 3.665703058242798, + "kl": 1.806640625, + "learning_rate": 2.1233433515507466e-08, + "loss": 0.1244, + "reward": 0.550781287252903, + "reward_std": 0.1722656711935997, + "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3744419813156128, + "rewards/tag_count_reward": 0.4481026902794838, "step": 2949 }, { "clip_ratio": 0.0, - "completion_length": 1565.96435546875, + "completion_length": 1736.3170471191406, "epoch": 0.8811888581883355, - "grad_norm": 15.391103744506836, - "kl": 0.2470703125, - "learning_rate": 4.2256788027353e-09, - "loss": 0.1469, - "reward": 0.4542410969734192, - "reward_std": 0.17912666499614716, - "rewards/accuracy_reward": 0.11830357694998384, + "grad_norm": 7.223992347717285, + "kl": 3.658203125, + "learning_rate": 2.1128394013676497e-08, + "loss": 0.2365, + "reward": 0.5725446790456772, + "reward_std": 0.1554953046143055, + "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4363839477300644, "step": 2950 }, { "clip_ratio": 0.0, - "completion_length": 1609.41748046875, + "completion_length": 1719.5670471191406, "epoch": 0.8814875662758569, - "grad_norm": 13.12231159210205, - "kl": 0.264892578125, - "learning_rate": 4.2047207003441995e-09, - "loss": 0.1599, - "reward": 0.392857164144516, - "reward_std": 0.20500428974628448, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 4.55910587310791, + "kl": 3.94140625, + "learning_rate": 2.1023603501721e-08, + "loss": 0.2597, + "reward": 0.5390625223517418, + "reward_std": 0.17795638740062714, + "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4341518059372902, "step": 2951 }, { "clip_ratio": 0.0, - "completion_length": 1637.5156860351562, + "completion_length": 1800.57373046875, "epoch": 0.8817862743633784, - "grad_norm": 13.943061828613281, - "kl": 0.27587890625, - "learning_rate": 4.18381241872855e-09, - "loss": 0.1316, - "reward": 0.352678582072258, - "reward_std": 0.1898425780236721, - "rewards/accuracy_reward": 0.053571430034935474, + "grad_norm": 17.533878326416016, + "kl": 4.78125, + "learning_rate": 2.0919062093642748e-08, + "loss": 0.2545, + "reward": 0.486049123108387, + "reward_std": 0.1829831525683403, + "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071492433548, + "rewards/tag_count_reward": 0.403459832072258, "step": 2952 }, { "clip_ratio": 0.0, - "completion_length": 1609.5000915527344, + "completion_length": 1799.2389221191406, "epoch": 0.8820849824508998, - "grad_norm": 12.198466300964355, - "kl": 0.248779296875, - "learning_rate": 4.1629539806345314e-09, - "loss": 0.1058, - "reward": 0.4637276902794838, - "reward_std": 0.23575520515441895, - "rewards/accuracy_reward": 0.1227678619325161, + "grad_norm": 3.896245002746582, + "kl": 3.525390625, + "learning_rate": 2.0814769903172657e-08, + "loss": 0.1931, + "reward": 0.5786830708384514, + "reward_std": 0.20268763601779938, + "rewards/accuracy_reward": 0.14955357508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598246216774, + "rewards/tag_count_reward": 0.4291294887661934, "step": 2953 }, { "clip_ratio": 0.0, - "completion_length": 1697.0603332519531, + "completion_length": 1863.2031860351562, "epoch": 0.8823836905384214, - "grad_norm": 11.046046257019043, - "kl": 0.27490234375, - "learning_rate": 4.142145408754061e-09, - "loss": 0.1014, - "reward": 0.3013392984867096, - "reward_std": 0.16690774261951447, - "rewards/accuracy_reward": 0.004464285913854837, + "grad_norm": 9.193558692932129, + "kl": 4.49609375, + "learning_rate": 2.0710727043770303e-08, + "loss": 0.2302, + "reward": 0.4285714477300644, + "reward_std": 0.1481450628489256, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2968750149011612, + "rewards/tag_count_reward": 0.415178582072258, "step": 2954 }, { "clip_ratio": 0.0, - "completion_length": 1605.6138916015625, + "completion_length": 1763.9531860351562, "epoch": 0.8826823986259428, - "grad_norm": 12.511260032653809, - "kl": 0.2685546875, - "learning_rate": 4.121386725724835e-09, - "loss": 0.134, - "reward": 0.4062500149011612, - "reward_std": 0.18485190905630589, - "rewards/accuracy_reward": 0.0870535746216774, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.3169642984867096, + "grad_norm": 18.595230102539062, + "kl": 5.09375, + "learning_rate": 2.0606933628624174e-08, + "loss": 0.301, + "reward": 0.525111623108387, + "reward_std": 0.17531871423125267, + "rewards/accuracy_reward": 0.11160714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4135044887661934, "step": 2955 }, { "clip_ratio": 0.0, - "completion_length": 1621.4777526855469, + "completion_length": 1756.9107971191406, "epoch": 0.8829811067134643, - "grad_norm": 13.802545547485352, - "kl": 0.263427734375, - "learning_rate": 4.100677954130271e-09, - "loss": 0.1482, - "reward": 0.3699776902794838, - "reward_std": 0.19251956790685654, - "rewards/accuracy_reward": 0.05357143119908869, + "grad_norm": 6.613658905029297, + "kl": 4.76953125, + "learning_rate": 2.0503389770651357e-08, + "loss": 0.269, + "reward": 0.4910714477300644, + "reward_std": 0.1360656190663576, + "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062574505806, + "rewards/tag_count_reward": 0.428571455180645, "step": 2956 }, { "clip_ratio": 0.0, - "completion_length": 1620.1563415527344, + "completion_length": 1774.3304443359375, "epoch": 0.8832798148009857, - "grad_norm": 13.88314151763916, - "kl": 0.254150390625, - "learning_rate": 4.080019116499467e-09, - "loss": 0.1588, - "reward": 0.4034598395228386, - "reward_std": 0.19790156185626984, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 17.44745635986328, + "kl": 4.703125, + "learning_rate": 2.0400095582497333e-08, + "loss": 0.2429, + "reward": 0.5050223618745804, + "reward_std": 0.1283576302230358, + "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.420200914144516, "step": 2957 }, { "clip_ratio": 0.0, - "completion_length": 1707.5692749023438, + "completion_length": 1838.9085693359375, "epoch": 0.8835785228885072, - "grad_norm": 12.02652359008789, - "kl": 0.2958984375, - "learning_rate": 4.059410235307231e-09, - "loss": 0.1085, - "reward": 0.3242187574505806, - "reward_std": 0.19873931631445885, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 35.27434539794922, + "kl": 6.1328125, + "learning_rate": 2.0297051176536158e-08, + "loss": 0.3377, + "reward": 0.4375000223517418, + "reward_std": 0.15447713807225227, + "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2996651902794838, + "rewards/tag_count_reward": 0.4062500074505806, "step": 2958 }, { "clip_ratio": 0.0, - "completion_length": 1669.1161193847656, + "completion_length": 1861.2188415527344, "epoch": 0.8838772309760287, - "grad_norm": 13.071640014648438, - "kl": 0.2587890625, - "learning_rate": 4.038851332973986e-09, - "loss": 0.126, - "reward": 0.419642873108387, - "reward_std": 0.2250657081604004, - "rewards/accuracy_reward": 0.1049107201397419, + "grad_norm": 6.645206451416016, + "kl": 4.0078125, + "learning_rate": 2.019425666486993e-08, + "loss": 0.2221, + "reward": 0.5597098544239998, + "reward_std": 0.21596502512693405, + "rewards/accuracy_reward": 0.14508929592557251, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2959 }, { "clip_ratio": 0.0, - "completion_length": 1624.3036499023438, + "completion_length": 1795.9755249023438, "epoch": 0.8841759390635502, - "grad_norm": 12.874189376831055, - "kl": 0.259765625, - "learning_rate": 4.018342431865818e-09, - "loss": 0.1415, - "reward": 0.4162946715950966, - "reward_std": 0.20030834525823593, - "rewards/accuracy_reward": 0.0870535746216774, + "grad_norm": 6.7406134605407715, + "kl": 3.65234375, + "learning_rate": 2.0091712159329087e-08, + "loss": 0.1953, + "reward": 0.5117187723517418, + "reward_std": 0.162877157330513, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2960 }, { "clip_ratio": 0.0, - "completion_length": 1698.8951721191406, + "completion_length": 1876.0982971191406, "epoch": 0.8844746471510716, - "grad_norm": 11.248661994934082, - "kl": 0.283203125, - "learning_rate": 3.997883554294406e-09, - "loss": 0.1199, - "reward": 0.3398437649011612, - "reward_std": 0.23327895253896713, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 15.827983856201172, + "kl": 4.734375, + "learning_rate": 1.9989417771472035e-08, + "loss": 0.2644, + "reward": 0.4921875149011612, + "reward_std": 0.24312212690711021, + "rewards/accuracy_reward": 0.09598214458674192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952009066939354, + "rewards/tag_count_reward": 0.396205373108387, "step": 2961 }, { "clip_ratio": 0.0, - "completion_length": 1570.0714721679688, + "completion_length": 1739.0157165527344, "epoch": 0.8847733552385931, - "grad_norm": 10.875776290893555, - "kl": 0.26611328125, - "learning_rate": 3.977474722516983e-09, - "loss": 0.1224, - "reward": 0.443080373108387, - "reward_std": 0.16008334420621395, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 6.16668701171875, + "kl": 3.732421875, + "learning_rate": 1.9887373612584912e-08, + "loss": 0.2107, + "reward": 0.5491071790456772, + "reward_std": 0.12249704450368881, + "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482238650322, + "rewards/tag_count_reward": 0.4196428805589676, "step": 2962 }, { "clip_ratio": 0.0, - "completion_length": 1628.5023193359375, + "completion_length": 1812.7813415527344, "epoch": 0.8850720633261145, - "grad_norm": 11.785562515258789, - "kl": 0.259765625, - "learning_rate": 3.957115958736373e-09, - "loss": 0.1068, - "reward": 0.3392857313156128, - "reward_std": 0.1761317476630211, - "rewards/accuracy_reward": 0.017857144121080637, + "grad_norm": 3.481977701187134, + "kl": 3.9765625, + "learning_rate": 1.9785579793681866e-08, + "loss": 0.2334, + "reward": 0.4503348469734192, + "reward_std": 0.17221926152706146, + "rewards/accuracy_reward": 0.042410714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.407924123108387, "step": 2963 }, { "clip_ratio": 0.0, - "completion_length": 1479.0045166015625, + "completion_length": 1646.5759582519531, "epoch": 0.8853707714136361, - "grad_norm": 11.797910690307617, - "kl": 0.259033203125, - "learning_rate": 3.9368072851009185e-09, - "loss": 0.1476, - "reward": 0.5083705484867096, - "reward_std": 0.2072592992335558, - "rewards/accuracy_reward": 0.1651785746216774, + "grad_norm": 31.167213439941406, + "kl": 2.60546875, + "learning_rate": 1.968403642550459e-08, + "loss": 0.1945, + "reward": 0.5954241305589676, + "reward_std": 0.15118969045579433, + "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.443638414144516, "step": 2964 }, { "clip_ratio": 0.0, - "completion_length": 1582.18310546875, + "completion_length": 1787.9197387695312, "epoch": 0.8856694795011575, - "grad_norm": 10.527793884277344, - "kl": 0.260498046875, - "learning_rate": 3.916548723704455e-09, - "loss": 0.1324, - "reward": 0.3632812723517418, - "reward_std": 0.1826188936829567, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 7.259477615356445, + "kl": 3.046875, + "learning_rate": 1.9582743618522273e-08, + "loss": 0.1485, + "reward": 0.4520089477300644, + "reward_std": 0.1397387720644474, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2965 }, { "clip_ratio": 0.0, - "completion_length": 1619.0179138183594, + "completion_length": 1790.8639221191406, "epoch": 0.885968187588679, - "grad_norm": 12.413215637207031, - "kl": 0.2607421875, - "learning_rate": 3.896340296586309e-09, - "loss": 0.1066, - "reward": 0.388392873108387, - "reward_std": 0.1862165778875351, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 17.066267013549805, + "kl": 2.126953125, + "learning_rate": 1.9481701482931546e-08, + "loss": 0.1373, + "reward": 0.5050223544239998, + "reward_std": 0.14415013790130615, + "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2966 }, { "clip_ratio": 0.0, - "completion_length": 1639.3884582519531, + "completion_length": 1790.665283203125, "epoch": 0.8862668956762004, - "grad_norm": 12.610956192016602, - "kl": 0.256103515625, - "learning_rate": 3.87618202573129e-09, - "loss": 0.1261, - "reward": 0.3934151902794838, - "reward_std": 0.15643484517931938, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 9.304194450378418, + "kl": 3.41796875, + "learning_rate": 1.9380910128656448e-08, + "loss": 0.1993, + "reward": 0.4994419887661934, + "reward_std": 0.13237342424690723, + "rewards/accuracy_reward": 0.08482143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4146205559372902, "step": 2967 }, { "clip_ratio": 0.0, - "completion_length": 1531.7790832519531, + "completion_length": 1732.9978637695312, "epoch": 0.886565603763722, - "grad_norm": 13.27579116821289, - "kl": 0.232666015625, - "learning_rate": 3.856073933069598e-09, - "loss": 0.1379, - "reward": 0.3833705559372902, - "reward_std": 0.19079622998833656, - "rewards/accuracy_reward": 0.035714288242161274, + "grad_norm": 25.842185974121094, + "kl": 2.55078125, + "learning_rate": 1.928036966534799e-08, + "loss": 0.1695, + "reward": 0.4453125223517418, + "reward_std": 0.13349316455423832, + "rewards/accuracy_reward": 0.020089286845177412, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.4252232313156128, "step": 2968 }, { "clip_ratio": 0.0, - "completion_length": 1630.9777526855469, + "completion_length": 1807.9242248535156, "epoch": 0.8868643118512434, - "grad_norm": 11.837176322937012, - "kl": 0.30517578125, - "learning_rate": 3.8360160404768745e-09, - "loss": 0.1082, - "reward": 0.3822544813156128, - "reward_std": 0.15306618437170982, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 16.512422561645508, + "kl": 3.30078125, + "learning_rate": 1.9180080202384374e-08, + "loss": 0.179, + "reward": 0.5000000223517418, + "reward_std": 0.12078810669481754, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.419642873108387, "step": 2969 }, { "clip_ratio": 0.0, - "completion_length": 1523.3527526855469, + "completion_length": 1701.9710693359375, "epoch": 0.8871630199387649, - "grad_norm": 15.604958534240723, - "kl": 0.228515625, - "learning_rate": 3.816008369774154e-09, - "loss": 0.1651, - "reward": 0.3750000074505806, - "reward_std": 0.16556290537118912, - "rewards/accuracy_reward": 0.01785714295692742, + "grad_norm": 13.941060066223145, + "kl": 2.203125, + "learning_rate": 1.908004184887077e-08, + "loss": 0.1268, + "reward": 0.4642857313156128, + "reward_std": 0.11777893453836441, + "rewards/accuracy_reward": 0.017857143422588706, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3571428656578064, + "rewards/tag_count_reward": 0.4464285895228386, "step": 2970 }, { "clip_ratio": 0.0, - "completion_length": 1618.4286804199219, + "completion_length": 1748.9532165527344, "epoch": 0.8874617280262863, - "grad_norm": 12.559847831726074, - "kl": 0.24658203125, - "learning_rate": 3.796050942727796e-09, - "loss": 0.1297, - "reward": 0.3956473395228386, - "reward_std": 0.18693815544247627, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 17.219240188598633, + "kl": 2.8828125, + "learning_rate": 1.8980254713638983e-08, + "loss": 0.1782, + "reward": 0.5279017984867096, + "reward_std": 0.14571836963295937, + "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.443080373108387, "step": 2971 }, { "clip_ratio": 0.0, - "completion_length": 1701.6786499023438, + "completion_length": 1898.57373046875, "epoch": 0.8877604361138077, - "grad_norm": 12.526589393615723, - "kl": 0.22998046875, - "learning_rate": 3.776143781049551e-09, - "loss": 0.1181, - "reward": 0.3532366305589676, - "reward_std": 0.16629188507795334, - "rewards/accuracy_reward": 0.0401785746216774, + "grad_norm": 6.729400157928467, + "kl": 3.34375, + "learning_rate": 1.8880718905247756e-08, + "loss": 0.1674, + "reward": 0.4564732313156128, + "reward_std": 0.11615396291017532, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.4162946566939354, "step": 2972 }, { "clip_ratio": 0.0, - "completion_length": 1664.7054138183594, + "completion_length": 1811.5045471191406, "epoch": 0.8880591442013293, - "grad_norm": 12.233601570129395, - "kl": 0.254150390625, - "learning_rate": 3.756286906396438e-09, - "loss": 0.1159, - "reward": 0.4369419887661934, - "reward_std": 0.1861405409872532, - "rewards/accuracy_reward": 0.13839286426082253, + "grad_norm": 7.560126304626465, + "kl": 3.953125, + "learning_rate": 1.8781434531982193e-08, + "loss": 0.2412, + "reward": 0.5636161044239998, + "reward_std": 0.19242243096232414, + "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.298549123108387, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2973 }, { "clip_ratio": 0.0, - "completion_length": 1610.9978332519531, + "completion_length": 1770.1942749023438, "epoch": 0.8883578522888507, - "grad_norm": 10.44741153717041, - "kl": 0.220703125, - "learning_rate": 3.736480340370801e-09, - "loss": 0.1388, - "reward": 0.509486623108387, - "reward_std": 0.23156652972102165, - "rewards/accuracy_reward": 0.18526786658912897, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "grad_norm": 8.634564399719238, + "kl": 3.75, + "learning_rate": 1.8682401701854005e-08, + "loss": 0.2257, + "reward": 0.6171875298023224, + "reward_std": 0.1864912286400795, + "rewards/accuracy_reward": 0.1964285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4207589402794838, "step": 2974 }, { "clip_ratio": 0.0, - "completion_length": 1599.99560546875, + "completion_length": 1732.1607971191406, "epoch": 0.8886565603763722, - "grad_norm": 11.330629348754883, - "kl": 0.21728515625, - "learning_rate": 3.7167241045202468e-09, - "loss": 0.1195, - "reward": 0.3716517984867096, - "reward_std": 0.21560485288500786, - "rewards/accuracy_reward": 0.04017857299186289, + "grad_norm": 8.900465965270996, + "kl": 3.435546875, + "learning_rate": 1.8583620522601234e-08, + "loss": 0.2249, + "reward": 0.4821428805589676, + "reward_std": 0.1603305283933878, + "rewards/accuracy_reward": 0.04910714388824999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.4330357387661934, "step": 2975 }, { "clip_ratio": 0.0, - "completion_length": 1505.716552734375, + "completion_length": 1720.6452026367188, "epoch": 0.8889552684638936, - "grad_norm": 12.591317176818848, - "kl": 0.2041015625, - "learning_rate": 3.697018220337611e-09, - "loss": 0.1361, - "reward": 0.506696455180645, - "reward_std": 0.2279038019478321, - "rewards/accuracy_reward": 0.15401786658912897, + "grad_norm": 11.141268730163574, + "kl": 3.39453125, + "learning_rate": 1.8485091101688055e-08, + "loss": 0.2218, + "reward": 0.622209832072258, + "reward_std": 0.197070199996233, + "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.352678582072258, + "rewards/tag_count_reward": 0.4324776977300644, "step": 2976 }, { "clip_ratio": 0.0, - "completion_length": 1575.9465026855469, + "completion_length": 1708.51123046875, "epoch": 0.8892539765514151, - "grad_norm": 13.250279426574707, - "kl": 0.22509765625, - "learning_rate": 3.6773627092609725e-09, - "loss": 0.1474, - "reward": 0.480468787252903, - "reward_std": 0.2084486410021782, - "rewards/accuracy_reward": 0.14732143399305642, + "grad_norm": 12.334859848022461, + "kl": 4.6875, + "learning_rate": 1.8386813546304862e-08, + "loss": 0.2838, + "reward": 0.5731026977300644, + "reward_std": 0.15574834123253822, + "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3331473395228386, + "rewards/tag_count_reward": 0.4213169813156128, "step": 2977 }, { "clip_ratio": 0.0, - "completion_length": 1580.68310546875, + "completion_length": 1761.8706359863281, "epoch": 0.8895526846389366, - "grad_norm": 10.428624153137207, - "kl": 0.214599609375, - "learning_rate": 3.6577575926736105e-09, - "loss": 0.1176, - "reward": 0.4765625223517418, - "reward_std": 0.2427169717848301, - "rewards/accuracy_reward": 0.14508928963914514, + "grad_norm": 7.9743218421936035, + "kl": 3.248046875, + "learning_rate": 1.8288787963368053e-08, + "loss": 0.184, + "reward": 0.6171875298023224, + "reward_std": 0.23580187559127808, + "rewards/accuracy_reward": 0.1897321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4274553805589676, "step": 2978 }, { "clip_ratio": 0.0, - "completion_length": 1598.5045471191406, + "completion_length": 1776.3616943359375, "epoch": 0.8898513927264581, - "grad_norm": 11.654958724975586, - "kl": 0.218994140625, - "learning_rate": 3.6382028919039464e-09, - "loss": 0.1413, - "reward": 0.427455373108387, - "reward_std": 0.21903003752231598, - "rewards/accuracy_reward": 0.0892857164144516, + "grad_norm": 22.566457748413086, + "kl": 4.9140625, + "learning_rate": 1.8191014459519733e-08, + "loss": 0.2881, + "reward": 0.5022321566939354, + "reward_std": 0.16758285649120808, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4084821566939354, "step": 2979 }, { "clip_ratio": 0.0, - "completion_length": 1729.6340026855469, + "completion_length": 1825.5737609863281, "epoch": 0.8901501008139795, - "grad_norm": 10.219399452209473, - "kl": 0.256591796875, - "learning_rate": 3.6186986282255973e-09, - "loss": 0.1154, - "reward": 0.2879464402794838, - "reward_std": 0.19237860292196274, - "rewards/accuracy_reward": 0.017857143422588706, + "grad_norm": 44.56852340698242, + "kl": 5.8828125, + "learning_rate": 1.8093493141127984e-08, + "loss": 0.3137, + "reward": 0.428013414144516, + "reward_std": 0.16742690466344357, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2700892984867096, + "rewards/tag_count_reward": 0.3967634066939354, "step": 2980 }, { "clip_ratio": 0.0, - "completion_length": 1591.2522888183594, + "completion_length": 1765.1764526367188, "epoch": 0.890448808901501, - "grad_norm": 12.082350730895996, - "kl": 0.21630859375, - "learning_rate": 3.599244822857289e-09, - "loss": 0.122, - "reward": 0.3911830559372902, - "reward_std": 0.15273521281778812, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 7.553714752197266, + "kl": 4.1484375, + "learning_rate": 1.7996224114286446e-08, + "loss": 0.2453, + "reward": 0.4737723395228386, + "reward_std": 0.11272862739861012, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044813156128, + "rewards/tag_count_reward": 0.431361623108387, "step": 2981 }, { "clip_ratio": 0.0, - "completion_length": 1573.2478637695312, + "completion_length": 1765.3148193359375, "epoch": 0.8907475169890224, - "grad_norm": 11.627763748168945, - "kl": 0.204345703125, - "learning_rate": 3.5798414969628434e-09, - "loss": 0.1157, - "reward": 0.420758955180645, - "reward_std": 0.2209169715642929, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 19.89854621887207, + "kl": 4.50390625, + "learning_rate": 1.7899207484814217e-08, + "loss": 0.2607, + "reward": 0.5273437574505806, + "reward_std": 0.19599131867289543, + "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982313156128, + "rewards/tag_count_reward": 0.4246651977300644, "step": 2982 }, { "clip_ratio": 0.0, - "completion_length": 1681.0715026855469, + "completion_length": 1829.44873046875, "epoch": 0.891046225076544, - "grad_norm": 10.441166877746582, - "kl": 0.241943359375, - "learning_rate": 3.5604886716511795e-09, - "loss": 0.1037, - "reward": 0.3643973469734192, - "reward_std": 0.22804228961467743, - "rewards/accuracy_reward": 0.0580357164144516, + "grad_norm": 17.692481994628906, + "kl": 4.50390625, + "learning_rate": 1.78024433582559e-08, + "loss": 0.2467, + "reward": 0.4832589402794838, + "reward_std": 0.21332794427871704, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.4140625149011612, "step": 2983 }, { "clip_ratio": 0.0, - "completion_length": 1665.26123046875, + "completion_length": 1802.1764221191406, "epoch": 0.8913449331640654, - "grad_norm": 10.362273216247559, - "kl": 0.251220703125, - "learning_rate": 3.5411863679762956e-09, - "loss": 0.1176, - "reward": 0.302455373108387, - "reward_std": 0.17634929716587067, - "rewards/accuracy_reward": 0.0133928582072258, + "grad_norm": 23.558889389038086, + "kl": 5.62109375, + "learning_rate": 1.7705931839881476e-08, + "loss": 0.2927, + "reward": 0.4308035895228386, + "reward_std": 0.15326131507754326, + "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2890625074505806, + "rewards/tag_count_reward": 0.4040178805589676, "step": 2984 }, { "clip_ratio": 0.0, - "completion_length": 1616.0603332519531, + "completion_length": 1766.0179443359375, "epoch": 0.8916436412515869, - "grad_norm": 12.91524600982666, - "kl": 0.2255859375, - "learning_rate": 3.521934606937177e-09, - "loss": 0.1266, - "reward": 0.3632812649011612, - "reward_std": 0.19943008199334145, - "rewards/accuracy_reward": 0.0334821455180645, + "grad_norm": 16.315526962280273, + "kl": 3.86328125, + "learning_rate": 1.7609673034685885e-08, + "loss": 0.2083, + "reward": 0.4849330559372902, + "reward_std": 0.1722598746418953, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3297991156578064, + "rewards/tag_count_reward": 0.4268973395228386, "step": 2985 }, { "clip_ratio": 0.0, - "completion_length": 1643.5759887695312, + "completion_length": 1770.6116943359375, "epoch": 0.8919423493391083, - "grad_norm": 11.187044143676758, - "kl": 0.236328125, - "learning_rate": 3.5027334094778758e-09, - "loss": 0.1245, - "reward": 0.3643973395228386, - "reward_std": 0.20951270684599876, - "rewards/accuracy_reward": 0.05357143119908869, + "grad_norm": 5.627732753753662, + "kl": 3.76171875, + "learning_rate": 1.751366704738938e-08, + "loss": 0.212, + "reward": 0.4910714477300644, + "reward_std": 0.17865683883428574, + "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310825914144516, + "rewards/tag_count_reward": 0.4263393059372902, "step": 2986 }, { "clip_ratio": 0.0, - "completion_length": 1572.1272888183594, + "completion_length": 1760.9107666015625, "epoch": 0.8922410574266298, - "grad_norm": 13.561488151550293, - "kl": 0.213623046875, - "learning_rate": 3.4835827964873942e-09, - "loss": 0.1471, - "reward": 0.4029017984867096, - "reward_std": 0.23285957798361778, - "rewards/accuracy_reward": 0.06919643096625805, + "grad_norm": 3.4575247764587402, + "kl": 4.0546875, + "learning_rate": 1.741791398243697e-08, + "loss": 0.2264, + "reward": 0.5011160969734192, + "reward_std": 0.18788821622729301, + "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.4229910895228386, "step": 2987 }, { "clip_ratio": 0.0, - "completion_length": 1589.18310546875, + "completion_length": 1787.1407165527344, "epoch": 0.8925397655141513, - "grad_norm": 12.232766151428223, - "kl": 0.220703125, - "learning_rate": 3.464482788799733e-09, - "loss": 0.1236, - "reward": 0.4542410969734192, - "reward_std": 0.17584041506052017, - "rewards/accuracy_reward": 0.1250000074505806, + "grad_norm": 6.4187541007995605, + "kl": 4.140625, + "learning_rate": 1.7322413943998666e-08, + "loss": 0.2338, + "reward": 0.5474330559372902, + "reward_std": 0.1545313596725464, + "rewards/accuracy_reward": 0.13616072130389512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4112723395228386, "step": 2988 }, { "clip_ratio": 0.0, - "completion_length": 1607.2255249023438, + "completion_length": 1778.4822387695312, "epoch": 0.8928384736016728, - "grad_norm": 12.141207695007324, - "kl": 0.221923828125, - "learning_rate": 3.445433407193837e-09, - "loss": 0.1336, - "reward": 0.3789062723517418, - "reward_std": 0.18904399871826172, - "rewards/accuracy_reward": 0.06026785937137902, + "grad_norm": 6.561193466186523, + "kl": 3.31640625, + "learning_rate": 1.7227167035969188e-08, + "loss": 0.1937, + "reward": 0.4893973469734192, + "reward_std": 0.15546206384897232, + "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.411272332072258, "step": 2989 }, { "clip_ratio": 0.0, - "completion_length": 1583.41748046875, + "completion_length": 1749.9375610351562, "epoch": 0.8931371816891942, - "grad_norm": 11.814096450805664, - "kl": 0.27783203125, - "learning_rate": 3.426434672393541e-09, - "loss": 0.1236, - "reward": 0.3984375223517418, - "reward_std": 0.17844903096556664, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 11.033413887023926, + "kl": 3.0859375, + "learning_rate": 1.7132173361967704e-08, + "loss": 0.1851, + "reward": 0.5418526902794838, + "reward_std": 0.16524029150605202, + "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.428013414144516, "step": 2990 }, { "clip_ratio": 0.0, - "completion_length": 1669.3416137695312, + "completion_length": 1816.8103637695312, "epoch": 0.8934358897767157, - "grad_norm": 10.642291069030762, - "kl": 0.245849609375, - "learning_rate": 3.4074866050676277e-09, - "loss": 0.1008, - "reward": 0.3906250223517418, - "reward_std": 0.14395221136510372, - "rewards/accuracy_reward": 0.07589285937137902, + "grad_norm": 10.531332969665527, + "kl": 4.046875, + "learning_rate": 1.7037433025338138e-08, + "loss": 0.2373, + "reward": 0.4988839477300644, + "reward_std": 0.1308812964707613, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4095982313156128, "step": 2991 }, { "clip_ratio": 0.0, - "completion_length": 1563.3281860351562, + "completion_length": 1735.6764221191406, "epoch": 0.8937345978642371, - "grad_norm": 11.863767623901367, - "kl": 0.2353515625, - "learning_rate": 3.3885892258297353e-09, - "loss": 0.131, - "reward": 0.3537946566939354, - "reward_std": 0.20892281085252762, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 3.7154836654663086, + "kl": 3.07421875, + "learning_rate": 1.6942946129148676e-08, + "loss": 0.1677, + "reward": 0.4637276977300644, + "reward_std": 0.15698243957012892, + "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.423549123108387, "step": 2992 }, { "clip_ratio": 0.0, - "completion_length": 1607.4442749023438, + "completion_length": 1754.1942443847656, "epoch": 0.8940333059517587, - "grad_norm": 11.397245407104492, - "kl": 0.20947265625, - "learning_rate": 3.369742555238353e-09, - "loss": 0.1206, - "reward": 0.3554687649011612, - "reward_std": 0.18389492481946945, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 20.348743438720703, + "kl": 3.02734375, + "learning_rate": 1.6848712776191765e-08, + "loss": 0.1971, + "reward": 0.4441964477300644, + "reward_std": 0.14076597429811954, + "rewards/accuracy_reward": 0.013392857741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3420758992433548, + "rewards/tag_count_reward": 0.4308035895228386, "step": 2993 }, { "clip_ratio": 0.0, - "completion_length": 1650.0536499023438, + "completion_length": 1851.2701721191406, "epoch": 0.8943320140392801, - "grad_norm": 10.632128715515137, - "kl": 0.2216796875, - "learning_rate": 3.350946613796807e-09, - "loss": 0.1071, - "reward": 0.3750000149011612, - "reward_std": 0.18283455818891525, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 8.984735488891602, + "kl": 3.22265625, + "learning_rate": 1.6754733068984035e-08, + "loss": 0.1707, + "reward": 0.4648437649011612, + "reward_std": 0.13831565342843533, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.415736623108387, "step": 2994 }, { "clip_ratio": 0.0, - "completion_length": 1558.9129943847656, + "completion_length": 1720.7210693359375, "epoch": 0.8946307221268016, - "grad_norm": 14.085603713989258, - "kl": 0.263427734375, - "learning_rate": 3.332201421953257e-09, - "loss": 0.1655, - "reward": 0.3621651977300644, - "reward_std": 0.21492353454232216, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 9.068717956542969, + "kl": 3.7734375, + "learning_rate": 1.6661007109766283e-08, + "loss": 0.2381, + "reward": 0.4849330559372902, + "reward_std": 0.17593863233923912, + "rewards/accuracy_reward": 0.05357143213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4313616305589676, "step": 2995 }, { "clip_ratio": 0.0, - "completion_length": 1641.7947387695312, + "completion_length": 1759.6295166015625, "epoch": 0.894929430214323, - "grad_norm": 11.939397811889648, - "kl": 0.268310546875, - "learning_rate": 3.313507000100618e-09, - "loss": 0.1046, - "reward": 0.4207589402794838, - "reward_std": 0.1945769041776657, - "rewards/accuracy_reward": 0.10714286472648382, + "grad_norm": 6.387885093688965, + "kl": 3.314453125, + "learning_rate": 1.6567535000503092e-08, + "loss": 0.1854, + "reward": 0.580915205180645, + "reward_std": 0.20527694001793861, + "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4246651902794838, "step": 2996 }, { "clip_ratio": 0.0, - "completion_length": 1604.1429138183594, + "completion_length": 1803.1764221191406, "epoch": 0.8952281383018446, - "grad_norm": 12.5408935546875, - "kl": 0.275146484375, - "learning_rate": 3.294863368576595e-09, - "loss": 0.1181, - "reward": 0.3404018059372902, - "reward_std": 0.17506415396928787, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 3.8826239109039307, + "kl": 4.03125, + "learning_rate": 1.6474316842882973e-08, + "loss": 0.2262, + "reward": 0.4609375223517418, + "reward_std": 0.17871228978037834, + "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089402794838, + "rewards/tag_count_reward": 0.4185268133878708, "step": 2997 }, { "clip_ratio": 0.0, - "completion_length": 1700.88623046875, + "completion_length": 1864.72998046875, "epoch": 0.895526846389366, - "grad_norm": 9.74870777130127, - "kl": 0.22607421875, - "learning_rate": 3.27627054766364e-09, - "loss": 0.0959, - "reward": 0.3504464402794838, - "reward_std": 0.1807398907840252, - "rewards/accuracy_reward": 0.031250001629814506, + "grad_norm": 5.681728839874268, + "kl": 4.2421875, + "learning_rate": 1.63813527383182e-08, + "loss": 0.2383, + "reward": 0.4497767984867096, + "reward_std": 0.16325656697154045, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4029017984867096, "step": 2998 }, { "clip_ratio": 0.0, - "completion_length": 1550.7813110351562, + "completion_length": 1703.1116943359375, "epoch": 0.8958255544768875, - "grad_norm": 11.371783256530762, - "kl": 0.21142578125, - "learning_rate": 3.257728557588901e-09, - "loss": 0.1191, - "reward": 0.360491082072258, - "reward_std": 0.17018113285303116, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 8.530016899108887, + "kl": 3.26171875, + "learning_rate": 1.6288642787944507e-08, + "loss": 0.1854, + "reward": 0.4676339477300644, + "reward_std": 0.14518877491354942, + "rewards/accuracy_reward": 0.037946431431919336, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4296875298023224, "step": 2999 }, { "clip_ratio": 0.0, - "completion_length": 1611.90185546875, + "completion_length": 1781.8952026367188, "epoch": 0.8961242625644089, - "grad_norm": 11.615347862243652, - "kl": 0.227294921875, - "learning_rate": 3.2392374185242653e-09, - "loss": 0.1293, - "reward": 0.4252232313156128, - "reward_std": 0.17605966702103615, - "rewards/accuracy_reward": 0.09821428917348385, + "grad_norm": 7.317563533782959, + "kl": 3.83203125, + "learning_rate": 1.6196187092621327e-08, + "loss": 0.2195, + "reward": 0.5145089477300644, + "reward_std": 0.1594829149544239, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4140625149011612, "step": 3000 }, { "clip_ratio": 0.0, - "completion_length": 1637.1228332519531, + "completion_length": 1784.9063415527344, "epoch": 0.8964229706519304, - "grad_norm": 9.753799438476562, - "kl": 0.24267578125, - "learning_rate": 3.220797150586263e-09, - "loss": 0.11, - "reward": 0.400669664144516, - "reward_std": 0.18881821259856224, - "rewards/accuracy_reward": 0.0982142873108387, + "grad_norm": 24.33642578125, + "kl": 5.44140625, + "learning_rate": 1.6103985752931314e-08, + "loss": 0.3198, + "reward": 0.5122768133878708, + "reward_std": 0.16183251701295376, + "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553656578064, + "rewards/tag_count_reward": 0.4095982313156128, "step": 3001 }, { "clip_ratio": 0.0, - "completion_length": 1573.2947387695312, + "completion_length": 1724.8973999023438, "epoch": 0.8967216787394519, - "grad_norm": 10.81359577178955, - "kl": 0.215576171875, - "learning_rate": 3.2024077738361077e-09, - "loss": 0.1094, - "reward": 0.395647332072258, - "reward_std": 0.1563160978257656, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 7.091845989227295, + "kl": 3.2333984375, + "learning_rate": 1.601203886918054e-08, + "loss": 0.1751, + "reward": 0.506138414144516, + "reward_std": 0.11216121260076761, + "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4369419813156128, "step": 3002 }, { "clip_ratio": 0.0, - "completion_length": 1598.4688110351562, + "completion_length": 1765.618408203125, "epoch": 0.8970203868269734, - "grad_norm": 14.945013046264648, - "kl": 0.210205078125, - "learning_rate": 3.184069308279641e-09, - "loss": 0.1277, - "reward": 0.3911830559372902, - "reward_std": 0.19105515629053116, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 21.163101196289062, + "kl": 3.357421875, + "learning_rate": 1.5920346541398205e-08, + "loss": 0.2151, + "reward": 0.5133928805589676, + "reward_std": 0.17487531155347824, + "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080559372902, + "rewards/tag_count_reward": 0.435267873108387, "step": 3003 }, { "clip_ratio": 0.0, - "completion_length": 1655.5223999023438, + "completion_length": 1825.7456359863281, "epoch": 0.8973190949144948, - "grad_norm": 14.073664665222168, - "kl": 0.245361328125, - "learning_rate": 3.1657817738673097e-09, - "loss": 0.1167, - "reward": 0.3744419813156128, - "reward_std": 0.1903778240084648, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 5.242889881134033, + "kl": 4.80078125, + "learning_rate": 1.582890886933655e-08, + "loss": 0.2813, + "reward": 0.4743303805589676, + "reward_std": 0.12895838357508183, + "rewards/accuracy_reward": 0.05580357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4185268059372902, "step": 3004 }, { "clip_ratio": 0.0, - "completion_length": 1508.4174499511719, + "completion_length": 1696.6183471679688, "epoch": 0.8976178030020163, - "grad_norm": 11.16243839263916, - "kl": 0.253662109375, - "learning_rate": 3.147545190494161e-09, - "loss": 0.1302, - "reward": 0.357700914144516, - "reward_std": 0.1609322540462017, - "rewards/accuracy_reward": 0.022321429569274187, + "grad_norm": 15.135379791259766, + "kl": 4.1640625, + "learning_rate": 1.5737725952470805e-08, + "loss": 0.2561, + "reward": 0.4765625298023224, + "reward_std": 0.12551194243133068, + "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.436383955180645, "step": 3005 }, { "clip_ratio": 0.0, - "completion_length": 1551.2701416015625, + "completion_length": 1744.9197387695312, "epoch": 0.8979165110895377, - "grad_norm": 11.37264633178711, - "kl": 0.237060546875, - "learning_rate": 3.1293595779998184e-09, - "loss": 0.1082, - "reward": 0.4693080559372902, - "reward_std": 0.22560475766658783, - "rewards/accuracy_reward": 0.1183035783469677, + "grad_norm": 5.449563980102539, + "kl": 3.953125, + "learning_rate": 1.5646797889999093e-08, + "loss": 0.2307, + "reward": 0.5429687574505806, + "reward_std": 0.1960749812424183, + "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044813156128, + "rewards/tag_count_reward": 0.4268973469734192, "step": 3006 }, { "clip_ratio": 0.0, - "completion_length": 1666.1495971679688, + "completion_length": 1808.3036804199219, "epoch": 0.8982152191770593, - "grad_norm": 11.489145278930664, - "kl": 0.285400390625, - "learning_rate": 3.1112249561684288e-09, - "loss": 0.1297, - "reward": 0.3604910895228386, - "reward_std": 0.19548142701387405, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 40.83533477783203, + "kl": 6.3828125, + "learning_rate": 1.5556124780842144e-08, + "loss": 0.3468, + "reward": 0.4520089477300644, + "reward_std": 0.16220754757523537, + "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553656578064, + "rewards/tag_count_reward": 0.3939732313156128, "step": 3007 }, { "clip_ratio": 0.0, - "completion_length": 1633.5982971191406, + "completion_length": 1789.8572387695312, "epoch": 0.8985139272645807, - "grad_norm": 12.857306480407715, - "kl": 0.27099609375, - "learning_rate": 3.0931413447286947e-09, - "loss": 0.1067, - "reward": 0.4263393059372902, - "reward_std": 0.1812010407447815, - "rewards/accuracy_reward": 0.1071428582072258, + "grad_norm": 16.653467178344727, + "kl": 4.16796875, + "learning_rate": 1.5465706723643474e-08, + "loss": 0.23, + "reward": 0.5379464626312256, + "reward_std": 0.1597521211951971, + "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3008 }, { "clip_ratio": 0.0, - "completion_length": 1612.0558776855469, + "completion_length": 1767.9130554199219, "epoch": 0.8988126353521022, - "grad_norm": 13.030545234680176, - "kl": 0.248779296875, - "learning_rate": 3.0751087633538174e-09, - "loss": 0.1267, - "reward": 0.3722098395228386, - "reward_std": 0.15734185092151165, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 8.859665870666504, + "kl": 3.5703125, + "learning_rate": 1.5375543816769087e-08, + "loss": 0.2082, + "reward": 0.4838169887661934, + "reward_std": 0.11428386904299259, + "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312723517418, + "rewards/tag_count_reward": 0.4347098395228386, "step": 3009 }, { "clip_ratio": 0.0, - "completion_length": 1666.9308471679688, + "completion_length": 1773.87060546875, "epoch": 0.8991113434396236, - "grad_norm": 12.017329216003418, - "kl": 0.23046875, - "learning_rate": 3.057127231661466e-09, - "loss": 0.1321, - "reward": 0.3337053656578064, - "reward_std": 0.19290436431765556, - "rewards/accuracy_reward": 0.02455357206054032, + "grad_norm": 13.51382064819336, + "kl": 3.103515625, + "learning_rate": 1.528563615830733e-08, + "loss": 0.1675, + "reward": 0.4893973469734192, + "reward_std": 0.15845802798867226, + "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3010 }, { "clip_ratio": 0.0, - "completion_length": 1568.22998046875, + "completion_length": 1756.0670471191406, "epoch": 0.8994100515271451, - "grad_norm": 13.147826194763184, - "kl": 0.21630859375, - "learning_rate": 3.0391967692137866e-09, - "loss": 0.1673, - "reward": 0.4056919813156128, - "reward_std": 0.16123561933636665, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 14.859163284301758, + "kl": 5.16015625, + "learning_rate": 1.519598384606893e-08, + "loss": 0.3, + "reward": 0.4966518208384514, + "reward_std": 0.13064716011285782, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "rewards/tag_count_reward": 0.4073660895228386, "step": 3011 }, { - "clip_ratio": 0.0, - "completion_length": 1649.1429138183594, - "epoch": 0.8997087596146666, - "grad_norm": 12.675135612487793, - "kl": 0.2529296875, - "learning_rate": 3.021317395517381e-09, - "loss": 0.116, - "reward": 0.4224330559372902, - "reward_std": 0.20696276426315308, - "rewards/accuracy_reward": 0.11607143469154835, + "clip_ratio": 0.0, + "completion_length": 1805.6786499023438, + "epoch": 0.8997087596146666, + "grad_norm": 12.079204559326172, + "kl": 4.96484375, + "learning_rate": 1.5106586977586904e-08, + "loss": 0.2839, + "reward": 0.5368303880095482, + "reward_std": 0.16945725865662098, + "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.306361623108387, + "rewards/tag_count_reward": 0.4162946715950966, "step": 3012 }, { "clip_ratio": 0.0, - "completion_length": 1552.3929443359375, + "completion_length": 1767.6429443359375, "epoch": 0.9000074677021881, - "grad_norm": 14.014389038085938, - "kl": 0.22314453125, - "learning_rate": 3.0034891300232347e-09, - "loss": 0.1417, - "reward": 0.5217634215950966, - "reward_std": 0.20181823149323463, - "rewards/accuracy_reward": 0.17410715017467737, + "grad_norm": 6.974257946014404, + "kl": 2.837890625, + "learning_rate": 1.501744565011617e-08, + "loss": 0.1681, + "reward": 0.6277901977300644, + "reward_std": 0.17926384136080742, + "rewards/accuracy_reward": 0.19196429336443543, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "rewards/tag_count_reward": 0.4358259066939354, "step": 3013 }, { "clip_ratio": 0.0, - "completion_length": 1681.0603332519531, + "completion_length": 1811.9509582519531, "epoch": 0.9003061757897095, - "grad_norm": 11.603867530822754, - "kl": 0.2822265625, - "learning_rate": 2.9857119921267715e-09, - "loss": 0.1018, - "reward": 0.3761160895228386, - "reward_std": 0.2232108786702156, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 20.885343551635742, + "kl": 5.484375, + "learning_rate": 1.492855996063386e-08, + "loss": 0.3143, + "reward": 0.4743303805589676, + "reward_std": 0.1958291307091713, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4029018059372902, "step": 3014 }, { "clip_ratio": 0.0, - "completion_length": 1690.26123046875, + "completion_length": 1822.6697082519531, "epoch": 0.9006048838772309, - "grad_norm": 14.27543830871582, - "kl": 0.22998046875, - "learning_rate": 2.967986001167755e-09, - "loss": 0.1223, - "reward": 0.3822544813156128, - "reward_std": 0.20689989253878593, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 11.043693542480469, + "kl": 3.26953125, + "learning_rate": 1.4839930005838775e-08, + "loss": 0.1846, + "reward": 0.5206473544239998, + "reward_std": 0.14596332795917988, + "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3015 }, { "clip_ratio": 0.0, - "completion_length": 1607.8304138183594, + "completion_length": 1799.7523193359375, "epoch": 0.9009035919647524, - "grad_norm": 11.557488441467285, - "kl": 0.279052734375, - "learning_rate": 2.9503111764303366e-09, - "loss": 0.1164, - "reward": 0.4280134066939354, - "reward_std": 0.20781145989894867, - "rewards/accuracy_reward": 0.12276786286383867, + "grad_norm": 7.681408405303955, + "kl": 4.0546875, + "learning_rate": 1.4751555882151683e-08, + "loss": 0.2197, + "reward": 0.5641741305589676, + "reward_std": 0.20423424988985062, + "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.412388414144516, "step": 3016 }, { "clip_ratio": 0.0, - "completion_length": 1601.7791137695312, + "completion_length": 1744.52685546875, "epoch": 0.9012023000522739, - "grad_norm": 14.359674453735352, - "kl": 0.224365234375, - "learning_rate": 2.9326875371430025e-09, - "loss": 0.1328, - "reward": 0.4257812649011612, - "reward_std": 0.19316628202795982, - "rewards/accuracy_reward": 0.10044643469154835, + "grad_norm": 11.086936950683594, + "kl": 3.40625, + "learning_rate": 1.4663437685715014e-08, + "loss": 0.2169, + "reward": 0.545758955180645, + "reward_std": 0.18009774573147297, + "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4252232387661934, "step": 3017 }, { "clip_ratio": 0.0, - "completion_length": 1586.6607666015625, + "completion_length": 1757.9487609863281, "epoch": 0.9015010081397954, - "grad_norm": 13.289493560791016, - "kl": 0.21875, - "learning_rate": 2.9151151024785212e-09, - "loss": 0.127, - "reward": 0.4843750298023224, - "reward_std": 0.23883289471268654, - "rewards/accuracy_reward": 0.14955357951112092, + "grad_norm": 6.577908992767334, + "kl": 4.16015625, + "learning_rate": 1.4575575512392608e-08, + "loss": 0.2548, + "reward": 0.5831473544239998, + "reward_std": 0.2279868759214878, + "rewards/accuracy_reward": 0.16517858067527413, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4179687723517418, "step": 3018 }, { "clip_ratio": 0.0, - "completion_length": 1599.2098999023438, + "completion_length": 1802.2076721191406, "epoch": 0.9017997162273168, - "grad_norm": 12.96336555480957, - "kl": 0.250732421875, - "learning_rate": 2.897593891554001e-09, - "loss": 0.1338, - "reward": 0.403459832072258, - "reward_std": 0.1796436607837677, - "rewards/accuracy_reward": 0.06696428940631449, + "grad_norm": 10.551751136779785, + "kl": 4.45703125, + "learning_rate": 1.4487969457770005e-08, + "loss": 0.2568, + "reward": 0.4715401977300644, + "reward_std": 0.12084257788956165, + "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.4179687649011612, "step": 3019 }, { "clip_ratio": 0.0, - "completion_length": 1659.4554138183594, + "completion_length": 1745.399658203125, "epoch": 0.9020984243148383, - "grad_norm": 10.839662551879883, - "kl": 0.240234375, - "learning_rate": 2.8801239234307993e-09, - "loss": 0.1211, - "reward": 0.4620535969734192, - "reward_std": 0.19701942428946495, - "rewards/accuracy_reward": 0.1495535783469677, + "grad_norm": 21.559125900268555, + "kl": 2.9453125, + "learning_rate": 1.4400619617153998e-08, + "loss": 0.197, + "reward": 0.601562537252903, + "reward_std": 0.18469862639904022, + "rewards/accuracy_reward": 0.17187500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4296875223517418, "step": 3020 }, { "clip_ratio": 0.0, - "completion_length": 1600.0826416015625, + "completion_length": 1797.2076721191406, "epoch": 0.9023971324023597, - "grad_norm": 11.184205055236816, - "kl": 0.230224609375, - "learning_rate": 2.8627052171145195e-09, - "loss": 0.1354, - "reward": 0.434709832072258, - "reward_std": 0.1811411716043949, - "rewards/accuracy_reward": 0.11160715157166123, + "grad_norm": 7.436116695404053, + "kl": 3.30859375, + "learning_rate": 1.4313526085572596e-08, + "loss": 0.1834, + "reward": 0.561941996216774, + "reward_std": 0.1673073936253786, + "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4213169887661934, "step": 3021 }, { "clip_ratio": 0.0, - "completion_length": 1535.8460388183594, + "completion_length": 1719.8281860351562, "epoch": 0.9026958404898813, - "grad_norm": 12.204748153686523, - "kl": 0.2412109375, - "learning_rate": 2.845337791555014e-09, - "loss": 0.1439, - "reward": 0.4073660895228386, - "reward_std": 0.21668827161192894, - "rewards/accuracy_reward": 0.07366071688011289, + "grad_norm": 12.858040809631348, + "kl": 3.8125, + "learning_rate": 1.4226688957775068e-08, + "loss": 0.2291, + "reward": 0.5027901977300644, + "reward_std": 0.1566948276013136, + "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4268973395228386, "step": 3022 }, { "clip_ratio": 0.0, - "completion_length": 1558.0022888183594, + "completion_length": 1715.0067749023438, "epoch": 0.9029945485774027, - "grad_norm": 15.248337745666504, - "kl": 0.259765625, - "learning_rate": 2.8280216656463405e-09, - "loss": 0.1595, - "reward": 0.436941996216774, - "reward_std": 0.1842067986726761, - "rewards/accuracy_reward": 0.10491071827709675, + "grad_norm": 8.969083786010742, + "kl": 4.052734375, + "learning_rate": 1.4140108328231703e-08, + "loss": 0.2399, + "reward": 0.546316996216774, + "reward_std": 0.15149638429284096, + "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.428013414144516, "step": 3023 }, { "clip_ratio": 0.0, - "completion_length": 1593.6228332519531, + "completion_length": 1759.7523193359375, "epoch": 0.9032932566649242, - "grad_norm": 12.841617584228516, - "kl": 0.263916015625, - "learning_rate": 2.810756858226737e-09, - "loss": 0.1474, - "reward": 0.3459821566939354, - "reward_std": 0.20335038751363754, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 11.13490104675293, + "kl": 2.9375, + "learning_rate": 1.4053784291133686e-08, + "loss": 0.1927, + "reward": 0.4854910895228386, + "reward_std": 0.1499284766614437, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357238650322, + "rewards/tag_count_reward": 0.4386160895228386, "step": 3024 }, { "clip_ratio": 0.0, - "completion_length": 1630.169677734375, + "completion_length": 1778.4978332519531, "epoch": 0.9035919647524456, - "grad_norm": 13.740464210510254, - "kl": 0.239013671875, - "learning_rate": 2.7935433880786396e-09, - "loss": 0.1282, - "reward": 0.4296875223517418, - "reward_std": 0.19011279195547104, - "rewards/accuracy_reward": 0.08928571734577417, + "grad_norm": 3.653440237045288, + "kl": 3.8359375, + "learning_rate": 1.3967716940393198e-08, + "loss": 0.2544, + "reward": 0.530133955180645, + "reward_std": 0.18014714866876602, + "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4185267984867096, "step": 3025 }, { "clip_ratio": 0.0, - "completion_length": 1648.8840026855469, + "completion_length": 1806.9085388183594, "epoch": 0.9038906728399672, - "grad_norm": 12.71583080291748, - "kl": 0.27197265625, - "learning_rate": 2.7763812739285962e-09, - "loss": 0.1361, - "reward": 0.3331473395228386, - "reward_std": 0.17785682529211044, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 4.947422981262207, + "kl": 4.1796875, + "learning_rate": 1.3881906369642982e-08, + "loss": 0.2415, + "reward": 0.4609375223517418, + "reward_std": 0.1501184031367302, + "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937649011612, + "rewards/tag_count_reward": 0.4207589402794838, "step": 3026 }, { "clip_ratio": 0.0, - "completion_length": 1585.4286499023438, + "completion_length": 1709.4688110351562, "epoch": 0.9041893809274886, - "grad_norm": 10.86838436126709, - "kl": 0.2265625, - "learning_rate": 2.759270534447311e-09, - "loss": 0.1075, - "reward": 0.345982164144516, - "reward_std": 0.18253270350396633, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 7.598712921142578, + "kl": 3.859375, + "learning_rate": 1.3796352672236556e-08, + "loss": 0.232, + "reward": 0.4603794887661934, + "reward_std": 0.14632592536509037, + "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4268973395228386, "step": 3027 }, { "clip_ratio": 0.0, - "completion_length": 1618.1563415527344, + "completion_length": 1767.2098999023438, "epoch": 0.9044880890150101, - "grad_norm": 14.081441879272461, - "kl": 0.230712890625, - "learning_rate": 2.74221118824961e-09, - "loss": 0.152, - "reward": 0.4040178805589676, - "reward_std": 0.1649579256772995, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 9.425708770751953, + "kl": 3.85546875, + "learning_rate": 1.371105594124805e-08, + "loss": 0.2343, + "reward": 0.5066964477300644, + "reward_std": 0.12787888757884502, + "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4263392984867096, "step": 3028 }, { "clip_ratio": 0.0, - "completion_length": 1640.41748046875, + "completion_length": 1792.0358276367188, "epoch": 0.9047867971025315, - "grad_norm": 12.1456880569458, - "kl": 0.234375, - "learning_rate": 2.725203253894365e-09, - "loss": 0.1297, - "reward": 0.3560268059372902, - "reward_std": 0.23022255301475525, - "rewards/accuracy_reward": 0.04241071571595967, + "grad_norm": 7.950504779815674, + "kl": 4.50390625, + "learning_rate": 1.3626016269471823e-08, + "loss": 0.2516, + "reward": 0.483258955180645, + "reward_std": 0.1965232789516449, + "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4185268059372902, "step": 3029 }, { "clip_ratio": 0.0, - "completion_length": 1508.9554443359375, + "completion_length": 1667.6987609863281, "epoch": 0.905085505190053, - "grad_norm": 12.045793533325195, - "kl": 0.203369140625, - "learning_rate": 2.708246749884546e-09, - "loss": 0.1528, - "reward": 0.409040205180645, - "reward_std": 0.19325312227010727, - "rewards/accuracy_reward": 0.066964291036129, + "grad_norm": 7.137703895568848, + "kl": 2.84765625, + "learning_rate": 1.3541233749422732e-08, + "loss": 0.1731, + "reward": 0.5083705559372902, + "reward_std": 0.12266553938388824, + "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.342075914144516, + "rewards/tag_count_reward": 0.4458705484867096, "step": 3030 }, { "clip_ratio": 0.0, - "completion_length": 1581.5781860351562, + "completion_length": 1749.6942749023438, "epoch": 0.9053842132775745, - "grad_norm": 13.547330856323242, - "kl": 0.225830078125, - "learning_rate": 2.6913416946671873e-09, - "loss": 0.1366, - "reward": 0.4938616305589676, - "reward_std": 0.21458016708493233, - "rewards/accuracy_reward": 0.1584821492433548, + "grad_norm": 10.648491859436035, + "kl": 4.376953125, + "learning_rate": 1.3456708473335936e-08, + "loss": 0.2488, + "reward": 0.6004464626312256, + "reward_std": 0.19203465059399605, + "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794738650322, + "rewards/tag_count_reward": 0.4308035895228386, "step": 3031 }, { "clip_ratio": 0.0, - "completion_length": 1619.8326416015625, + "completion_length": 1747.9063415527344, "epoch": 0.905682921365096, - "grad_norm": 14.223465919494629, - "kl": 0.20947265625, - "learning_rate": 2.6744881066333104e-09, - "loss": 0.1409, - "reward": 0.3655134066939354, - "reward_std": 0.18179205805063248, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 12.397019386291504, + "kl": 3.0234375, + "learning_rate": 1.3372440533166551e-08, + "loss": 0.175, + "reward": 0.4827009215950966, + "reward_std": 0.1601248849183321, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3498883992433548, + "rewards/tag_count_reward": 0.446986623108387, "step": 3032 }, { "clip_ratio": 0.0, - "completion_length": 1601.6272888183594, + "completion_length": 1771.4286499023438, "epoch": 0.9059816294526174, - "grad_norm": 13.600160598754883, - "kl": 0.2021484375, - "learning_rate": 2.6576860041179703e-09, - "loss": 0.1184, - "reward": 0.3856026977300644, - "reward_std": 0.19383610785007477, - "rewards/accuracy_reward": 0.04017857275903225, + "grad_norm": 9.668246269226074, + "kl": 3.3359375, + "learning_rate": 1.3288430020589853e-08, + "loss": 0.2089, + "reward": 0.502790205180645, + "reward_std": 0.1742841601371765, + "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.4380580559372902, "step": 3033 }, { "clip_ratio": 0.0, - "completion_length": 1610.6317443847656, + "completion_length": 1750.419677734375, "epoch": 0.9062803375401389, - "grad_norm": 13.119230270385742, - "kl": 0.21875, - "learning_rate": 2.640935405400224e-09, - "loss": 0.1421, - "reward": 0.3577009066939354, - "reward_std": 0.17707864567637444, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 15.599602699279785, + "kl": 2.484375, + "learning_rate": 1.3204677027001122e-08, + "loss": 0.162, + "reward": 0.5122768059372902, + "reward_std": 0.1495446301996708, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3420759066939354, + "rewards/tag_count_reward": 0.4497768059372902, "step": 3034 }, { "clip_ratio": 0.0, - "completion_length": 1491.83935546875, + "completion_length": 1680.7031860351562, "epoch": 0.9065790456276603, - "grad_norm": 13.099777221679688, - "kl": 0.193603515625, - "learning_rate": 2.624236328703061e-09, - "loss": 0.1317, - "reward": 0.5050223469734192, - "reward_std": 0.2109372466802597, - "rewards/accuracy_reward": 0.13839286309666932, + "grad_norm": 6.781495094299316, + "kl": 2.71484375, + "learning_rate": 1.3121181643515306e-08, + "loss": 0.1947, + "reward": 0.5859375149011612, + "reward_std": 0.16371503099799156, + "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3666294813156128, + "rewards/tag_count_reward": 0.4475446715950966, "step": 3035 }, { "clip_ratio": 0.0, - "completion_length": 1674.0000915527344, + "completion_length": 1825.415283203125, "epoch": 0.9068777537151819, - "grad_norm": 11.664702415466309, - "kl": 0.236083984375, - "learning_rate": 2.607588792193449e-09, - "loss": 0.1335, - "reward": 0.4224330559372902, - "reward_std": 0.2102738954126835, - "rewards/accuracy_reward": 0.1160714365541935, + "grad_norm": 11.519110679626465, + "kl": 4.69921875, + "learning_rate": 1.3037943960967245e-08, + "loss": 0.2582, + "reward": 0.5390625298023224, + "reward_std": 0.16831624880433083, + "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3063616156578064, + "rewards/tag_count_reward": 0.4162946566939354, "step": 3036 }, { "clip_ratio": 0.0, - "completion_length": 1589.6183776855469, + "completion_length": 1725.13623046875, "epoch": 0.9071764618027033, - "grad_norm": 12.671123504638672, - "kl": 0.268310546875, - "learning_rate": 2.590992813982279e-09, - "loss": 0.1339, - "reward": 0.4157366156578064, - "reward_std": 0.20129886642098427, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 4.192883491516113, + "kl": 3.50390625, + "learning_rate": 1.2954964069911396e-08, + "loss": 0.2139, + "reward": 0.5452009215950966, + "reward_std": 0.15532734617590904, + "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3037 }, { "clip_ratio": 0.0, - "completion_length": 1665.3572082519531, + "completion_length": 1805.3326721191406, "epoch": 0.9074751698902248, - "grad_norm": 12.019387245178223, - "kl": 0.26220703125, - "learning_rate": 2.5744484121243414e-09, - "loss": 0.1093, - "reward": 0.4268973395228386, - "reward_std": 0.18306365981698036, - "rewards/accuracy_reward": 0.12276786379516125, + "grad_norm": 11.080338478088379, + "kl": 4.55078125, + "learning_rate": 1.2872242060621707e-08, + "loss": 0.2447, + "reward": 0.5541295036673546, + "reward_std": 0.13940712064504623, + "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294664144516, + "rewards/tag_count_reward": 0.4224330633878708, "step": 3038 }, { "clip_ratio": 0.0, - "completion_length": 1612.4420776367188, + "completion_length": 1740.2745971679688, "epoch": 0.9077738779777462, - "grad_norm": 15.794790267944336, - "kl": 0.245849609375, - "learning_rate": 2.5579556046183338e-09, - "loss": 0.1516, - "reward": 0.4034598395228386, - "reward_std": 0.15864012576639652, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 9.880365371704102, + "kl": 5.671875, + "learning_rate": 1.278977802309167e-08, + "loss": 0.3287, + "reward": 0.506138414144516, + "reward_std": 0.12769347988069057, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705559372902, + "rewards/tag_count_reward": 0.4190848395228386, "step": 3039 }, { "clip_ratio": 0.0, - "completion_length": 1516.7857666015625, + "completion_length": 1696.2322082519531, "epoch": 0.9080725860652678, - "grad_norm": 15.666000366210938, - "kl": 0.23046875, - "learning_rate": 2.541514409406803e-09, - "loss": 0.1662, - "reward": 0.431919664144516, - "reward_std": 0.1809094101190567, - "rewards/accuracy_reward": 0.08258929220028222, + "grad_norm": 7.809533596038818, + "kl": 4.68359375, + "learning_rate": 1.2707572047034015e-08, + "loss": 0.2759, + "reward": 0.5156250223517418, + "reward_std": 0.14715279079973698, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3493303656578064, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3040 }, { "clip_ratio": 0.0, - "completion_length": 1580.6741943359375, + "completion_length": 1754.4442749023438, "epoch": 0.9083712941527892, - "grad_norm": 10.123638153076172, - "kl": 0.23828125, - "learning_rate": 2.525124844376164e-09, - "loss": 0.1273, - "reward": 0.3850446566939354, - "reward_std": 0.18772533908486366, - "rewards/accuracy_reward": 0.07142857275903225, + "grad_norm": 28.321964263916016, + "kl": 5.5078125, + "learning_rate": 1.262562422188082e-08, + "loss": 0.3082, + "reward": 0.494419664144516, + "reward_std": 0.16195397078990936, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4073660895228386, "step": 3041 }, { "clip_ratio": 0.0, - "completion_length": 1594.6898193359375, + "completion_length": 1783.243408203125, "epoch": 0.9086700022403107, - "grad_norm": 12.884642601013184, - "kl": 0.21630859375, - "learning_rate": 2.5087869273566597e-09, - "loss": 0.1257, - "reward": 0.4536830484867096, - "reward_std": 0.20864815264940262, - "rewards/accuracy_reward": 0.12500000558793545, + "grad_norm": 18.60158920288086, + "kl": 4.6484375, + "learning_rate": 1.25439346367833e-08, + "loss": 0.2489, + "reward": 0.5781250298023224, + "reward_std": 0.1846358235925436, + "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4196428805589676, "step": 3042 }, { "clip_ratio": 0.0, - "completion_length": 1611.02685546875, + "completion_length": 1759.8884887695312, "epoch": 0.9089687103278321, - "grad_norm": 9.613454818725586, - "kl": 0.24267578125, - "learning_rate": 2.492500676122333e-09, - "loss": 0.1075, - "reward": 0.411272332072258, - "reward_std": 0.1595790833234787, - "rewards/accuracy_reward": 0.1071428619325161, + "grad_norm": 13.930624008178711, + "kl": 4.5078125, + "learning_rate": 1.2462503380611667e-08, + "loss": 0.2436, + "reward": 0.5306919887661934, + "reward_std": 0.09980147518217564, + "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3043 }, { "clip_ratio": 0.0, - "completion_length": 1608.0022888183594, + "completion_length": 1729.2098999023438, "epoch": 0.9092674184153536, - "grad_norm": 11.473465919494629, - "kl": 0.235107421875, - "learning_rate": 2.47626610839104e-09, - "loss": 0.1487, - "reward": 0.3610491305589676, - "reward_std": 0.1845119595527649, - "rewards/accuracy_reward": 0.04687500302679837, + "grad_norm": 7.3565874099731445, + "kl": 3.76171875, + "learning_rate": 1.2381330541955198e-08, + "loss": 0.2193, + "reward": 0.5223214402794838, + "reward_std": 0.19376079738140106, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.4330357313156128, "step": 3044 }, { "clip_ratio": 0.0, - "completion_length": 1561.2433776855469, + "completion_length": 1697.0313415527344, "epoch": 0.909566126502875, - "grad_norm": 13.545825004577637, - "kl": 0.22900390625, - "learning_rate": 2.4600832418243957e-09, - "loss": 0.1554, - "reward": 0.435267873108387, - "reward_std": 0.18626410514116287, - "rewards/accuracy_reward": 0.10044643399305642, + "grad_norm": 6.694425106048584, + "kl": 4.1171875, + "learning_rate": 1.2300416209121977e-08, + "loss": 0.2357, + "reward": 0.541852705180645, + "reward_std": 0.15285667032003403, + "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4324776977300644, "step": 3045 }, { "clip_ratio": 0.0, - "completion_length": 1664.3929138183594, + "completion_length": 1810.3148498535156, "epoch": 0.9098648345903966, - "grad_norm": 12.555281639099121, - "kl": 0.27197265625, - "learning_rate": 2.443952094027768e-09, - "loss": 0.1338, - "reward": 0.3113839402794838, - "reward_std": 0.1662411279976368, - "rewards/accuracy_reward": 0.006696428870782256, + "grad_norm": 5.282022476196289, + "kl": 4.31640625, + "learning_rate": 1.221976047013884e-08, + "loss": 0.2247, + "reward": 0.4536830484867096, + "reward_std": 0.15884272940456867, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3046875149011612, + "rewards/tag_count_reward": 0.4246651977300644, "step": 3046 }, { "clip_ratio": 0.0, - "completion_length": 1680.9531860351562, + "completion_length": 1796.7188415527344, "epoch": 0.910163542677918, - "grad_norm": 11.696270942687988, - "kl": 0.229248046875, - "learning_rate": 2.427872682550269e-09, - "loss": 0.1285, - "reward": 0.3264508992433548, - "reward_std": 0.1860903762280941, - "rewards/accuracy_reward": 0.011160715017467737, + "grad_norm": 25.243209838867188, + "kl": 2.900390625, + "learning_rate": 1.2139363412751347e-08, + "loss": 0.2018, + "reward": 0.4419643059372902, + "reward_std": 0.13291450589895248, + "rewards/accuracy_reward": 0.013392857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3047 }, { "clip_ratio": 0.0, - "completion_length": 1539.24560546875, + "completion_length": 1701.4844360351562, "epoch": 0.9104622507654395, - "grad_norm": 11.100549697875977, - "kl": 0.241943359375, - "learning_rate": 2.4118450248847235e-09, - "loss": 0.1143, - "reward": 0.4324776977300644, - "reward_std": 0.19365349784493446, - "rewards/accuracy_reward": 0.10491071757860482, + "grad_norm": 8.00151252746582, + "kl": 4.0234375, + "learning_rate": 1.2059225124423617e-08, + "loss": 0.2593, + "reward": 0.565848246216774, + "reward_std": 0.17314192466437817, + "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4296875149011612, "step": 3048 }, { "clip_ratio": 0.0, - "completion_length": 1633.9219665527344, + "completion_length": 1769.62060546875, "epoch": 0.9107609588529609, - "grad_norm": 14.73744010925293, - "kl": 0.27001953125, - "learning_rate": 2.395869138467649e-09, - "loss": 0.1554, - "reward": 0.3978794813156128, - "reward_std": 0.16143113374710083, + "grad_norm": 5.522336006164551, + "kl": 4.046875, + "learning_rate": 1.1979345692338245e-08, + "loss": 0.2419, + "reward": 0.5033482313156128, + "reward_std": 0.11757203377783298, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3049 }, { "clip_ratio": 0.0, - "completion_length": 1647.7076721191406, + "completion_length": 1800.149658203125, "epoch": 0.9110596669404825, - "grad_norm": 10.51854133605957, - "kl": 0.240966796875, - "learning_rate": 2.379945040679243e-09, - "loss": 0.1121, - "reward": 0.468191996216774, - "reward_std": 0.19152260944247246, - "rewards/accuracy_reward": 0.1517857164144516, + "grad_norm": 6.179238319396973, + "kl": 4.08203125, + "learning_rate": 1.1899725203396216e-08, + "loss": 0.2348, + "reward": 0.6026785969734192, + "reward_std": 0.17641312070190907, + "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3050 }, { "clip_ratio": 0.0, - "completion_length": 1701.8504943847656, + "completion_length": 1830.51123046875, "epoch": 0.9113583750280039, - "grad_norm": 12.33753776550293, - "kl": 0.23681640625, - "learning_rate": 2.3640727488433766e-09, - "loss": 0.1372, - "reward": 0.3593750074505806, - "reward_std": 0.1803232654929161, - "rewards/accuracy_reward": 0.05133928661234677, + "grad_norm": 6.5983734130859375, + "kl": 4.78125, + "learning_rate": 1.1820363744216883e-08, + "loss": 0.2758, + "reward": 0.4492187649011612, + "reward_std": 0.13487210683524609, + "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3080357164144516, + "rewards/tag_count_reward": 0.400111623108387, "step": 3051 }, { "clip_ratio": 0.0, - "completion_length": 1626.0625305175781, + "completion_length": 1785.0313415527344, "epoch": 0.9116570831155254, - "grad_norm": 12.263229370117188, - "kl": 0.23681640625, - "learning_rate": 2.348252280227525e-09, - "loss": 0.1128, - "reward": 0.3599330484867096, - "reward_std": 0.19322320818901062, - "rewards/accuracy_reward": 0.03794642956927419, + "grad_norm": 4.358333587646484, + "kl": 3.80078125, + "learning_rate": 1.1741261401137626e-08, + "loss": 0.2286, + "reward": 0.4453125149011612, + "reward_std": 0.1487284917384386, + "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.416294664144516, "step": 3052 }, { "clip_ratio": 0.0, - "completion_length": 1703.4911499023438, + "completion_length": 1799.5536499023438, "epoch": 0.9119557912030468, - "grad_norm": 12.642193794250488, - "kl": 0.249267578125, - "learning_rate": 2.332483652042827e-09, - "loss": 0.1303, - "reward": 0.428571455180645, - "reward_std": 0.17950456216931343, - "rewards/accuracy_reward": 0.1250000037252903, + "grad_norm": 4.497030258178711, + "kl": 4.21484375, + "learning_rate": 1.1662418260214136e-08, + "loss": 0.2276, + "reward": 0.5513393133878708, + "reward_std": 0.16147803515195847, + "rewards/accuracy_reward": 0.13392857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714402794838, + "rewards/tag_count_reward": 0.4174107313156128, "step": 3053 }, { "clip_ratio": 0.0, - "completion_length": 1638.5134887695312, + "completion_length": 1747.040283203125, "epoch": 0.9122544992905683, - "grad_norm": 9.858652114868164, - "kl": 0.237548828125, - "learning_rate": 2.3167668814439933e-09, - "loss": 0.123, - "reward": 0.3515625149011612, - "reward_std": 0.1769014336168766, - "rewards/accuracy_reward": 0.04910714388824999, + "grad_norm": 7.4948410987854, + "kl": 3.5390625, + "learning_rate": 1.1583834407219966e-08, + "loss": 0.2219, + "reward": 0.5005580559372902, + "reward_std": 0.1525675356388092, + "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.431361623108387, "step": 3054 }, { "clip_ratio": 0.0, - "completion_length": 1595.8527221679688, + "completion_length": 1801.0960388183594, "epoch": 0.9125532073780898, - "grad_norm": 12.092644691467285, - "kl": 0.23486328125, - "learning_rate": 2.30110198552933e-09, - "loss": 0.1229, - "reward": 0.4888393059372902, - "reward_std": 0.19782837852835655, - "rewards/accuracy_reward": 0.1651785746216774, + "grad_norm": 4.306365489959717, + "kl": 3.921875, + "learning_rate": 1.150550992764665e-08, + "loss": 0.2151, + "reward": 0.585937537252903, + "reward_std": 0.17312992364168167, + "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.416294664144516, "step": 3055 }, { "clip_ratio": 0.0, - "completion_length": 1447.65185546875, + "completion_length": 1640.2902526855469, "epoch": 0.9128519154656113, - "grad_norm": 12.078157424926758, - "kl": 0.1875, - "learning_rate": 2.2854889813407218e-09, - "loss": 0.1268, - "reward": 0.4748884066939354, - "reward_std": 0.16672789677977562, - "rewards/accuracy_reward": 0.1116071492433548, + "grad_norm": 11.753826141357422, + "kl": 3.318359375, + "learning_rate": 1.1427444906703609e-08, + "loss": 0.2259, + "reward": 0.5664062798023224, + "reward_std": 0.1418941468000412, + "rewards/accuracy_reward": 0.12500000651925802, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3632812723517418, + "rewards/tag_count_reward": 0.4414062649011612, "step": 3056 }, { "clip_ratio": 0.0, - "completion_length": 1614.6340026855469, + "completion_length": 1771.5045166015625, "epoch": 0.9131506235531327, - "grad_norm": 12.23386001586914, - "kl": 0.220947265625, - "learning_rate": 2.2699278858635652e-09, - "loss": 0.1295, - "reward": 0.4910714477300644, - "reward_std": 0.22717121988534927, - "rewards/accuracy_reward": 0.15848215040750802, + "grad_norm": 12.52258014678955, + "kl": 2.4765625, + "learning_rate": 1.1349639429317825e-08, + "loss": 0.1535, + "reward": 0.6238839626312256, + "reward_std": 0.19463452324271202, + "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325892984867096, + "rewards/tag_count_reward": 0.4363839477300644, "step": 3057 }, { "clip_ratio": 0.0, - "completion_length": 1604.2188110351562, + "completion_length": 1791.9331665039062, "epoch": 0.9134493316406541, - "grad_norm": 11.846099853515625, - "kl": 0.227783203125, - "learning_rate": 2.2544187160268235e-09, - "loss": 0.121, - "reward": 0.3621651977300644, - "reward_std": 0.1804756037890911, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 3.472938060760498, + "kl": 3.248046875, + "learning_rate": 1.1272093580134118e-08, + "loss": 0.1694, + "reward": 0.4793526977300644, + "reward_std": 0.12507658638060093, + "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901977300644, + "rewards/tag_count_reward": 0.428013414144516, "step": 3058 }, { "clip_ratio": 0.0, - "completion_length": 1640.0313415527344, + "completion_length": 1815.5291137695312, "epoch": 0.9137480397281756, - "grad_norm": 10.770573616027832, - "kl": 0.220703125, - "learning_rate": 2.238961488702956e-09, - "loss": 0.1226, - "reward": 0.345424123108387, - "reward_std": 0.18902868777513504, - "rewards/accuracy_reward": 0.026785715017467737, + "grad_norm": 4.629681587219238, + "kl": 4.2578125, + "learning_rate": 1.1194807443514781e-08, + "loss": 0.2538, + "reward": 0.4146205559372902, + "reward_std": 0.13787997514009476, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.3900669813156128, "step": 3059 }, { "clip_ratio": 0.0, - "completion_length": 1532.8193054199219, + "completion_length": 1745.524658203125, "epoch": 0.9140467478156971, - "grad_norm": 11.954042434692383, - "kl": 0.228515625, - "learning_rate": 2.2235562207078952e-09, - "loss": 0.1406, - "reward": 0.427455373108387, - "reward_std": 0.15876271948218346, - "rewards/accuracy_reward": 0.09821429220028222, + "grad_norm": 16.71782112121582, + "kl": 3.63671875, + "learning_rate": 1.1117781103539475e-08, + "loss": 0.2186, + "reward": 0.5518973395228386, + "reward_std": 0.15606126189231873, + "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3060 }, { "clip_ratio": 0.0, - "completion_length": 1595.6406860351562, + "completion_length": 1699.4420166015625, "epoch": 0.9143454559032186, - "grad_norm": 11.359992980957031, - "kl": 0.223388671875, - "learning_rate": 2.2082029288010727e-09, - "loss": 0.1304, - "reward": 0.368861623108387, - "reward_std": 0.21010566875338554, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 10.47619915008545, + "kl": 3.46875, + "learning_rate": 1.1041014644005364e-08, + "loss": 0.2168, + "reward": 0.498883955180645, + "reward_std": 0.180588249117136, + "rewards/accuracy_reward": 0.06696428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.431919664144516, "step": 3061 }, { "clip_ratio": 0.0, - "completion_length": 1619.9554138183594, + "completion_length": 1749.5023193359375, "epoch": 0.91464416399074, - "grad_norm": 12.22728157043457, - "kl": 0.223876953125, - "learning_rate": 2.1929016296853676e-09, - "loss": 0.1211, - "reward": 0.3398437723517418, - "reward_std": 0.18717363849282265, - "rewards/accuracy_reward": 0.020089285913854837, + "grad_norm": 16.9040584564209, + "kl": 3.40234375, + "learning_rate": 1.0964508148426837e-08, + "loss": 0.2171, + "reward": 0.462611623108387, + "reward_std": 0.1347369384020567, + "rewards/accuracy_reward": 0.0290178582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4335937723517418, "step": 3062 }, { "clip_ratio": 0.0, - "completion_length": 1564.3973693847656, + "completion_length": 1708.7255249023438, "epoch": 0.9149428720782615, - "grad_norm": 12.92938232421875, - "kl": 0.23681640625, - "learning_rate": 2.1776523400070855e-09, - "loss": 0.1442, - "reward": 0.4107142984867096, - "reward_std": 0.22040586546063423, - "rewards/accuracy_reward": 0.0825892873108387, + "grad_norm": 9.712556838989258, + "kl": 3.4453125, + "learning_rate": 1.0888261700035428e-08, + "loss": 0.2222, + "reward": 0.517857164144516, + "reward_std": 0.16814947500824928, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.4375000149011612, "step": 3063 }, { "clip_ratio": 0.0, - "completion_length": 1535.2322082519531, + "completion_length": 1722.6808776855469, "epoch": 0.9152415801657829, - "grad_norm": 11.476960182189941, - "kl": 0.2060546875, - "learning_rate": 2.1624550763559625e-09, - "loss": 0.1317, - "reward": 0.3777901977300644, - "reward_std": 0.19796273484826088, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 5.283228874206543, + "kl": 2.953125, + "learning_rate": 1.0812275381779811e-08, + "loss": 0.1658, + "reward": 0.478794664144516, + "reward_std": 0.15902943909168243, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4341517984867096, "step": 3064 }, { "clip_ratio": 0.0, - "completion_length": 1678.0000610351562, + "completion_length": 1808.7991943359375, "epoch": 0.9155402882533045, - "grad_norm": 11.16252326965332, - "kl": 0.23974609375, - "learning_rate": 2.147309855265145e-09, - "loss": 0.1289, - "reward": 0.4129464477300644, - "reward_std": 0.21499570831656456, - "rewards/accuracy_reward": 0.10937500558793545, + "grad_norm": 5.598512172698975, + "kl": 3.57421875, + "learning_rate": 1.0736549276325724e-08, + "loss": 0.202, + "reward": 0.5485491380095482, + "reward_std": 0.18903005868196487, + "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714402794838, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3065 }, { "clip_ratio": 0.0, - "completion_length": 1561.4197082519531, + "completion_length": 1769.1451721191406, "epoch": 0.9158389963408259, - "grad_norm": 13.619811058044434, - "kl": 0.238037109375, - "learning_rate": 2.1322166932111253e-09, - "loss": 0.1405, - "reward": 0.439732164144516, - "reward_std": 0.23583756387233734, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 18.885473251342773, + "kl": 2.52734375, + "learning_rate": 1.0661083466055627e-08, + "loss": 0.1556, + "reward": 0.5452009066939354, + "reward_std": 0.18969298340380192, + "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857238650322, + "rewards/tag_count_reward": 0.4425223395228386, "step": 3066 }, { "clip_ratio": 0.0, - "completion_length": 1579.825927734375, + "completion_length": 1743.0290832519531, "epoch": 0.9161377044283474, - "grad_norm": 12.677444458007812, - "kl": 0.22900390625, - "learning_rate": 2.1171756066138026e-09, - "loss": 0.1569, - "reward": 0.492745541036129, - "reward_std": 0.2158965766429901, - "rewards/accuracy_reward": 0.16071429220028222, + "grad_norm": 9.304407119750977, + "kl": 4.61328125, + "learning_rate": 1.0585878033069012e-08, + "loss": 0.2862, + "reward": 0.5987723469734192, + "reward_std": 0.1754293590784073, + "rewards/accuracy_reward": 0.17633929592557251, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.4224330559372902, "step": 3067 }, { "clip_ratio": 0.0, - "completion_length": 1571.5223999023438, + "completion_length": 1749.4197387695312, "epoch": 0.9164364125158688, - "grad_norm": 11.190728187561035, - "kl": 0.24072265625, - "learning_rate": 2.1021866118363986e-09, - "loss": 0.1258, - "reward": 0.3660714402794838, - "reward_std": 0.19820892065763474, - "rewards/accuracy_reward": 0.037946431431919336, + "grad_norm": 8.330451965332031, + "kl": 4.33203125, + "learning_rate": 1.0510933059181993e-08, + "loss": 0.2321, + "reward": 0.5016741305589676, + "reward_std": 0.1721808724105358, + "rewards/accuracy_reward": 0.0758928598370403, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250074505806, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3068 }, { "clip_ratio": 0.0, - "completion_length": 1603.8951416015625, + "completion_length": 1680.8706359863281, "epoch": 0.9167351206033904, - "grad_norm": 15.046077728271484, - "kl": 0.22705078125, - "learning_rate": 2.0872497251854635e-09, - "loss": 0.1423, - "reward": 0.385044664144516, - "reward_std": 0.17895138636231422, - "rewards/accuracy_reward": 0.05357142956927419, + "grad_norm": 12.119317054748535, + "kl": 3.59765625, + "learning_rate": 1.043624862592732e-08, + "loss": 0.2109, + "reward": 0.5251116305589676, + "reward_std": 0.1410869061946869, + "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4335937723517418, "step": 3069 }, { "clip_ratio": 0.0, - "completion_length": 1575.6340026855469, + "completion_length": 1756.1563415527344, "epoch": 0.9170338286909118, - "grad_norm": 10.95777416229248, - "kl": 0.2568359375, - "learning_rate": 2.072364962910883e-09, - "loss": 0.1161, - "reward": 0.4068080484867096, - "reward_std": 0.2146698720753193, - "rewards/accuracy_reward": 0.07812500279396772, + "grad_norm": 5.231842994689941, + "kl": 3.50390625, + "learning_rate": 1.0361824814554416e-08, + "loss": 0.2004, + "reward": 0.5078125223517418, + "reward_std": 0.14818345196545124, + "rewards/accuracy_reward": 0.07812500582076609, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830559372902, + "rewards/tag_count_reward": 0.4296875223517418, "step": 3070 }, { "clip_ratio": 0.0, - "completion_length": 1597.2857971191406, + "completion_length": 1776.3304443359375, "epoch": 0.9173325367784333, - "grad_norm": 11.291136741638184, - "kl": 0.227294921875, - "learning_rate": 2.0575323412058032e-09, - "loss": 0.1118, - "reward": 0.436383955180645, - "reward_std": 0.21407810226082802, - "rewards/accuracy_reward": 0.09375000232830644, + "grad_norm": 3.4943671226501465, + "kl": 3.2890625, + "learning_rate": 1.0287661706029016e-08, + "loss": 0.1785, + "reward": 0.5435268208384514, + "reward_std": 0.1916392520070076, + "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.4319196566939354, "step": 3071 }, { "clip_ratio": 0.0, - "completion_length": 1619.1407165527344, + "completion_length": 1795.1853637695312, "epoch": 0.9176312448659547, - "grad_norm": 10.678609848022461, - "kl": 0.220458984375, - "learning_rate": 2.042751876206672e-09, - "loss": 0.1251, - "reward": 0.3459821566939354, - "reward_std": 0.15934283286333084, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 21.077880859375, + "kl": 4.32421875, + "learning_rate": 1.021375938103336e-08, + "loss": 0.2095, + "reward": 0.4536830484867096, + "reward_std": 0.11673583835363388, + "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.420200914144516, "step": 3072 }, { "clip_ratio": 0.0, - "completion_length": 1676.9665832519531, + "completion_length": 1856.1942749023438, "epoch": 0.9179299529534762, - "grad_norm": 11.214242935180664, - "kl": 0.21630859375, - "learning_rate": 2.0280235839931826e-09, - "loss": 0.0936, - "reward": 0.4503348469734192, - "reward_std": 0.19992489367723465, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 4.496991157531738, + "kl": 3.3828125, + "learning_rate": 1.0140117919965913e-08, + "loss": 0.1775, + "reward": 0.5474330633878708, + "reward_std": 0.1377830784767866, + "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3073 }, { "clip_ratio": 0.0, - "completion_length": 1579.6607666015625, + "completion_length": 1732.1317749023438, "epoch": 0.9182286610409977, - "grad_norm": 11.312671661376953, - "kl": 0.2509765625, - "learning_rate": 2.0133474805882733e-09, - "loss": 0.1211, - "reward": 0.4430803805589676, - "reward_std": 0.17524969205260277, + "grad_norm": 9.806090354919434, + "kl": 4.6328125, + "learning_rate": 1.0066737402941367e-08, + "loss": 0.2868, + "reward": 0.5418526977300644, + "reward_std": 0.12095485255122185, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4280134066939354, "step": 3074 }, { "clip_ratio": 0.0, - "completion_length": 1639.6607971191406, + "completion_length": 1828.0491943359375, "epoch": 0.9185273691285192, - "grad_norm": 13.602657318115234, - "kl": 0.214111328125, - "learning_rate": 1.9987235819581116e-09, - "loss": 0.1221, - "reward": 0.4101562723517418, - "reward_std": 0.223474383354187, - "rewards/accuracy_reward": 0.07589286006987095, + "grad_norm": 19.60527992248535, + "kl": 4.4609375, + "learning_rate": 9.993617909790558e-09, + "loss": 0.2367, + "reward": 0.5094866305589676, + "reward_std": 0.1649342216551304, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3342634066939354, + "rewards/tag_count_reward": 0.4157366305589676, "step": 3075 }, { "clip_ratio": 0.0, - "completion_length": 1592.9130249023438, + "completion_length": 1771.8505249023438, "epoch": 0.9188260772160406, - "grad_norm": 12.126117706298828, - "kl": 0.25146484375, - "learning_rate": 1.9841519040120603e-09, - "loss": 0.1292, - "reward": 0.4681919887661934, - "reward_std": 0.2006128951907158, - "rewards/accuracy_reward": 0.13839286379516125, + "grad_norm": 14.581271171569824, + "kl": 3.98046875, + "learning_rate": 9.9207595200603e-09, + "loss": 0.2017, + "reward": 0.6121652126312256, + "reward_std": 0.18209047242999077, + "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3076 }, { "clip_ratio": 0.0, - "completion_length": 1651.1273193359375, + "completion_length": 1752.3951416015625, "epoch": 0.9191247853035621, - "grad_norm": 9.091644287109375, - "kl": 0.231689453125, - "learning_rate": 1.9696324626026772e-09, - "loss": 0.0995, - "reward": 0.3939732238650322, - "reward_std": 0.21519280970096588, - "rewards/accuracy_reward": 0.07589286006987095, + "grad_norm": 21.229734420776367, + "kl": 4.875, + "learning_rate": 9.848162313013386e-09, + "loss": 0.2846, + "reward": 0.5362723618745804, + "reward_std": 0.19161529652774334, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3180803656578064, + "rewards/tag_count_reward": 0.4202009066939354, "step": 3077 }, { "clip_ratio": 0.0, - "completion_length": 1573.4308776855469, + "completion_length": 1698.8750610351562, "epoch": 0.9194234933910835, - "grad_norm": 12.486048698425293, - "kl": 0.229248046875, - "learning_rate": 1.955165273525694e-09, - "loss": 0.1531, - "reward": 0.3621651902794838, - "reward_std": 0.15492014959454536, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 6.557315349578857, + "kl": 3.1796875, + "learning_rate": 9.77582636762847e-09, + "loss": 0.1881, + "reward": 0.506138414144516, + "reward_std": 0.1553447749465704, + "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544738650322, + "rewards/tag_count_reward": 0.428013414144516, "step": 3078 }, { "clip_ratio": 0.0, - "completion_length": 1581.6697387695312, + "completion_length": 1744.9621276855469, "epoch": 0.9197222014786051, - "grad_norm": 13.269731521606445, - "kl": 0.21630859375, - "learning_rate": 1.9407503525199988e-09, - "loss": 0.134, - "reward": 0.436383955180645, - "reward_std": 0.1909247636795044, - "rewards/accuracy_reward": 0.1093750074505806, + "grad_norm": 9.445967674255371, + "kl": 3.66796875, + "learning_rate": 9.703751762599994e-09, + "loss": 0.2223, + "reward": 0.5524553805589676, + "reward_std": 0.16282760351896286, + "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.427455373108387, "step": 3079 }, { "clip_ratio": 0.0, - "completion_length": 1590.63623046875, + "completion_length": 1771.3907165527344, "epoch": 0.9200209095661265, - "grad_norm": 11.662346839904785, - "kl": 0.231689453125, - "learning_rate": 1.9263877152675965e-09, - "loss": 0.134, - "reward": 0.4514509215950966, - "reward_std": 0.16833097860217094, - "rewards/accuracy_reward": 0.1383928656578064, + "grad_norm": 18.23233413696289, + "kl": 4.734375, + "learning_rate": 9.631938576337984e-09, + "loss": 0.2743, + "reward": 0.5479910895228386, + "reward_std": 0.14629370346665382, + "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.4051339402794838, "step": 3080 }, { "clip_ratio": 0.0, - "completion_length": 1711.2120971679688, + "completion_length": 1814.0491943359375, "epoch": 0.920319617653648, - "grad_norm": 10.366272926330566, - "kl": 0.253662109375, - "learning_rate": 1.9120773773936505e-09, - "loss": 0.1099, - "reward": 0.3125000223517418, - "reward_std": 0.172454372048378, - "rewards/accuracy_reward": 0.013392857974395156, + "grad_norm": 5.184340953826904, + "kl": 4.21875, + "learning_rate": 9.560386886968253e-09, + "loss": 0.2364, + "reward": 0.4453125223517418, + "reward_std": 0.14160324074327946, + "rewards/accuracy_reward": 0.029017858672887087, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071566939354, + "rewards/tag_count_reward": 0.4162946715950966, "step": 3081 }, { "clip_ratio": 0.0, - "completion_length": 1527.4710083007812, + "completion_length": 1717.8125915527344, "epoch": 0.9206183257411694, - "grad_norm": 12.533367156982422, - "kl": 0.2158203125, - "learning_rate": 1.89781935446639e-09, - "loss": 0.158, - "reward": 0.4224330559372902, - "reward_std": 0.22818885743618011, - "rewards/accuracy_reward": 0.0915178619325161, + "grad_norm": 6.77448034286499, + "kl": 2.98828125, + "learning_rate": 9.489096772331951e-09, + "loss": 0.1924, + "reward": 0.5340401977300644, + "reward_std": 0.1878398396074772, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3082 }, { "clip_ratio": 0.0, - "completion_length": 1633.9353637695312, + "completion_length": 1778.1250915527344, "epoch": 0.920917033828691, - "grad_norm": 12.48861026763916, - "kl": 0.2275390625, - "learning_rate": 1.8836136619971466e-09, - "loss": 0.1319, - "reward": 0.4162946566939354, - "reward_std": 0.17661945149302483, - "rewards/accuracy_reward": 0.09151786006987095, + "grad_norm": 10.25493049621582, + "kl": 2.974609375, + "learning_rate": 9.418068309985733e-09, + "loss": 0.1866, + "reward": 0.5345982387661934, + "reward_std": 0.1333506666123867, + "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.324776791036129, + "rewards/tag_count_reward": 0.4341517984867096, "step": 3083 }, { "clip_ratio": 0.0, - "completion_length": 1654.3058776855469, + "completion_length": 1834.1183776855469, "epoch": 0.9212157419162124, - "grad_norm": 13.676626205444336, - "kl": 0.2919921875, - "learning_rate": 1.8694603154403353e-09, - "loss": 0.1252, - "reward": 0.4112723469734192, - "reward_std": 0.20647930726408958, - "rewards/accuracy_reward": 0.10714286006987095, + "grad_norm": 12.673382759094238, + "kl": 4.8359375, + "learning_rate": 9.347301577201678e-09, + "loss": 0.2593, + "reward": 0.5574776977300644, + "reward_std": 0.21058500185608864, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294813156128, + "rewards/tag_count_reward": 0.4146205559372902, "step": 3084 }, { "clip_ratio": 0.0, - "completion_length": 1586.91748046875, + "completion_length": 1763.618408203125, "epoch": 0.9215144500037339, - "grad_norm": 9.859091758728027, - "kl": 0.238037109375, - "learning_rate": 1.8553593301933957e-09, - "loss": 0.1226, - "reward": 0.444754496216774, - "reward_std": 0.16552283242344856, - "rewards/accuracy_reward": 0.13169643515720963, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580484867096, + "grad_norm": 29.40384292602539, + "kl": 4.73046875, + "learning_rate": 9.276796650966979e-09, + "loss": 0.2436, + "reward": 0.5691964477300644, + "reward_std": 0.1542373187839985, + "rewards/accuracy_reward": 0.15848214738070965, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4107142984867096, "step": 3085 }, { "clip_ratio": 0.0, - "completion_length": 1551.7835693359375, + "completion_length": 1711.9040832519531, "epoch": 0.9218131580912553, - "grad_norm": 12.79185676574707, - "kl": 0.25, - "learning_rate": 1.8413107215968172e-09, - "loss": 0.1283, - "reward": 0.419642873108387, - "reward_std": 0.20478244498372078, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 9.678021430969238, + "kl": 3.7734375, + "learning_rate": 9.206553607984086e-09, + "loss": 0.2356, + "reward": 0.5323660895228386, + "reward_std": 0.15971754118800163, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3258928656578064, + "rewards/tag_count_reward": 0.4229910969734192, "step": 3086 }, { "clip_ratio": 0.0, - "completion_length": 1663.2991638183594, + "completion_length": 1828.6607971191406, "epoch": 0.9221118661787768, - "grad_norm": 13.774096488952637, - "kl": 0.258056640625, - "learning_rate": 1.8273145049341242e-09, - "loss": 0.1537, - "reward": 0.3348214477300644, - "reward_std": 0.19463975355029106, - "rewards/accuracy_reward": 0.020089285913854837, + "grad_norm": 2.939305543899536, + "kl": 3.140625, + "learning_rate": 9.13657252467062e-09, + "loss": 0.1652, + "reward": 0.474888414144516, + "reward_std": 0.180317971855402, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.423549123108387, "step": 3087 }, { "clip_ratio": 0.0, - "completion_length": 1659.0179138183594, + "completion_length": 1776.3996276855469, "epoch": 0.9224105742662982, - "grad_norm": 11.175858497619629, - "kl": 0.21435546875, - "learning_rate": 1.8133706954318152e-09, - "loss": 0.1179, - "reward": 0.3465401977300644, - "reward_std": 0.17980384081602097, + "grad_norm": 11.6106595993042, + "kl": 2.98046875, + "learning_rate": 9.066853477159076e-09, + "loss": 0.1752, + "reward": 0.4486607313156128, + "reward_std": 0.12987824715673923, "rewards/accuracy_reward": 0.015625000931322575, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151977300644, + "rewards/tag_count_reward": 0.4330357387661934, "step": 3088 }, { "clip_ratio": 0.0, - "completion_length": 1690.0067749023438, + "completion_length": 1814.6563110351562, "epoch": 0.9227092823538198, - "grad_norm": 11.620011329650879, - "kl": 0.239013671875, - "learning_rate": 1.799479308259394e-09, - "loss": 0.1312, - "reward": 0.3443080484867096, - "reward_std": 0.20515944436192513, - "rewards/accuracy_reward": 0.0379464291036129, + "grad_norm": 9.244889259338379, + "kl": 3.361328125, + "learning_rate": 8.99739654129697e-09, + "loss": 0.2032, + "reward": 0.4799107238650322, + "reward_std": 0.16903270967304707, + "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3063616156578064, + "rewards/tag_count_reward": 0.419642873108387, "step": 3089 }, { "clip_ratio": 0.0, - "completion_length": 1652.7388916015625, + "completion_length": 1766.0781860351562, "epoch": 0.9230079904413412, - "grad_norm": 12.801745414733887, - "kl": 0.292724609375, - "learning_rate": 1.7856403585293279e-09, - "loss": 0.1285, - "reward": 0.3671875149011612, - "reward_std": 0.17601115256547928, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 13.433330535888672, + "kl": 4.20703125, + "learning_rate": 8.92820179264664e-09, + "loss": 0.2559, + "reward": 0.487165205180645, + "reward_std": 0.14917483367025852, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196492433548, + "rewards/tag_count_reward": 0.4157366305589676, "step": 3090 }, { "clip_ratio": 0.0, - "completion_length": 1637.0402526855469, + "completion_length": 1827.9398193359375, "epoch": 0.9233066985288627, - "grad_norm": 11.244285583496094, - "kl": 0.222900390625, - "learning_rate": 1.7718538612970346e-09, - "loss": 0.1204, - "reward": 0.3911830559372902, - "reward_std": 0.21138368174433708, - "rewards/accuracy_reward": 0.06250000442378223, + "grad_norm": 19.02574348449707, + "kl": 3.81640625, + "learning_rate": 8.859269306485174e-09, + "loss": 0.1995, + "reward": 0.4782366305589676, + "reward_std": 0.14759855717420578, + "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830559372902, + "rewards/tag_count_reward": 0.404575914144516, "step": 3091 }, { "clip_ratio": 0.0, - "completion_length": 1631.2589721679688, + "completion_length": 1778.9264221191406, "epoch": 0.9236054066163841, - "grad_norm": 10.902469635009766, - "kl": 0.239990234375, - "learning_rate": 1.7581198315608724e-09, - "loss": 0.1215, - "reward": 0.3649553805589676, - "reward_std": 0.2080032341182232, - "rewards/accuracy_reward": 0.04910714505240321, + "grad_norm": 10.65204906463623, + "kl": 4.765625, + "learning_rate": 8.790599157804363e-09, + "loss": 0.2756, + "reward": 0.465959832072258, + "reward_std": 0.18430043384432793, + "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.4056919738650322, "step": 3092 }, { "clip_ratio": 0.0, - "completion_length": 1567.1072387695312, + "completion_length": 1741.4063415527344, "epoch": 0.9239041147039057, - "grad_norm": 12.26137638092041, - "kl": 0.251708984375, - "learning_rate": 1.744438284262123e-09, - "loss": 0.1406, - "reward": 0.4642857313156128, - "reward_std": 0.19362110272049904, - "rewards/accuracy_reward": 0.13839286682195961, + "grad_norm": 6.63496732711792, + "kl": 3.9765625, + "learning_rate": 8.722191421310614e-09, + "loss": 0.2362, + "reward": 0.5625000223517418, + "reward_std": 0.16316584311425686, + "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3093 }, { "clip_ratio": 0.0, - "completion_length": 1443.5223693847656, + "completion_length": 1609.1340026855469, "epoch": 0.9242028227914271, - "grad_norm": 11.25540542602539, - "kl": 0.269287109375, - "learning_rate": 1.7308092342849633e-09, - "loss": 0.1225, - "reward": 0.4341518059372902, - "reward_std": 0.14163235016167164, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 19.142261505126953, + "kl": 2.734375, + "learning_rate": 8.654046171424817e-09, + "loss": 0.187, + "reward": 0.5251116454601288, + "reward_std": 0.09467749111354351, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.349330373108387, + "rewards/tag_count_reward": 0.4469866305589676, "step": 3094 }, { "clip_ratio": 0.0, - "completion_length": 1565.4040832519531, + "completion_length": 1746.68310546875, "epoch": 0.9245015308789486, - "grad_norm": 11.848077774047852, - "kl": 0.24755859375, - "learning_rate": 1.7172326964564777e-09, - "loss": 0.1292, - "reward": 0.3950893059372902, - "reward_std": 0.1809428483247757, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 6.223680019378662, + "kl": 3.29296875, + "learning_rate": 8.586163482282388e-09, + "loss": 0.2091, + "reward": 0.5022321715950966, + "reward_std": 0.15067282505333424, + "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.424107164144516, "step": 3095 }, { "clip_ratio": 0.0, - "completion_length": 1655.290283203125, + "completion_length": 1801.8728332519531, "epoch": 0.92480023896647, - "grad_norm": 9.665376663208008, - "kl": 0.26220703125, - "learning_rate": 1.70370868554659e-09, - "loss": 0.0944, - "reward": 0.3325893059372902, - "reward_std": 0.16728629171848297, - "rewards/accuracy_reward": 0.02008928661234677, + "grad_norm": 20.50689125061035, + "kl": 4.59375, + "learning_rate": 8.518543427732949e-09, + "loss": 0.2264, + "reward": 0.4414062723517418, + "reward_std": 0.15418160147964954, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000149011612, + "rewards/tag_count_reward": 0.4056919813156128, "step": 3096 }, { "clip_ratio": 0.0, - "completion_length": 1618.5558776855469, + "completion_length": 1763.696533203125, "epoch": 0.9250989470539915, - "grad_norm": 54.78184509277344, - "kl": 1.0859375, - "learning_rate": 1.6902372162681033e-09, - "loss": 0.1736, - "reward": 0.377232164144516, - "reward_std": 0.17258355766534805, - "rewards/accuracy_reward": 0.04687500116415322, + "grad_norm": 14.881399154663086, + "kl": 2.71875, + "learning_rate": 8.451186081340516e-09, + "loss": 0.1606, + "reward": 0.5011160969734192, + "reward_std": 0.14176259748637676, + "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4363839477300644, "step": 3097 }, { "clip_ratio": 0.0, - "completion_length": 1651.9443054199219, + "completion_length": 1813.4331359863281, "epoch": 0.925397655141513, - "grad_norm": 8.719439506530762, - "kl": 0.26953125, - "learning_rate": 1.6768183032766726e-09, - "loss": 0.108, - "reward": 0.3253348395228386, - "reward_std": 0.19957349821925163, - "rewards/accuracy_reward": 0.026785714784637094, + "grad_norm": 4.612854480743408, + "kl": 4.1796875, + "learning_rate": 8.384091516383363e-09, + "loss": 0.2359, + "reward": 0.4531250223517418, + "reward_std": 0.16692296788096428, + "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.298549123108387, + "rewards/tag_count_reward": 0.4084821566939354, "step": 3098 }, { "clip_ratio": 0.0, - "completion_length": 1578.8393859863281, + "completion_length": 1691.0179443359375, "epoch": 0.9256963632290345, - "grad_norm": 11.866087913513184, - "kl": 0.255615234375, - "learning_rate": 1.6634519611707366e-09, - "loss": 0.1329, - "reward": 0.5167411044239998, - "reward_std": 0.17017202824354172, - "rewards/accuracy_reward": 0.18303572200238705, + "grad_norm": 14.503144264221191, + "kl": 3.51171875, + "learning_rate": 8.317259805853682e-09, + "loss": 0.2401, + "reward": 0.617745578289032, + "reward_std": 0.16978689655661583, + "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.428013414144516, "step": 3099 }, { "clip_ratio": 0.0, - "completion_length": 1618.3817749023438, + "completion_length": 1752.8773498535156, "epoch": 0.9259950713165559, - "grad_norm": 14.54488754272461, - "kl": 0.227294921875, - "learning_rate": 1.6501382044915747e-09, - "loss": 0.1442, - "reward": 0.4871652126312256, - "reward_std": 0.17964345589280128, - "rewards/accuracy_reward": 0.160714291036129, + "grad_norm": 5.220859050750732, + "kl": 3.9453125, + "learning_rate": 8.250691022457872e-09, + "loss": 0.2274, + "reward": 0.5731026977300644, + "reward_std": 0.12308023124933243, + "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4190848395228386, "step": 3100 }, { "clip_ratio": 0.0, - "completion_length": 1596.1295471191406, + "completion_length": 1750.0202026367188, "epoch": 0.9262937794040773, - "grad_norm": 13.275833129882812, - "kl": 0.259033203125, - "learning_rate": 1.6368770477232618e-09, - "loss": 0.1449, - "reward": 0.4062500223517418, - "reward_std": 0.18097169697284698, - "rewards/accuracy_reward": 0.08705357694998384, + "grad_norm": 3.6601431369781494, + "kl": 3.54296875, + "learning_rate": 8.18438523861631e-09, + "loss": 0.2011, + "reward": 0.5223214477300644, + "reward_std": 0.15881632268428802, + "rewards/accuracy_reward": 0.09598214877769351, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4263393059372902, "step": 3101 }, { "clip_ratio": 0.0, - "completion_length": 1657.2879943847656, + "completion_length": 1783.2634887695312, "epoch": 0.9265924874915988, - "grad_norm": 12.25716781616211, - "kl": 0.250732421875, - "learning_rate": 1.6236685052926136e-09, - "loss": 0.1266, - "reward": 0.4090401977300644, - "reward_std": 0.1941714771091938, - "rewards/accuracy_reward": 0.08258929220028222, + "grad_norm": 18.882341384887695, + "kl": 3.4375, + "learning_rate": 8.118342526463068e-09, + "loss": 0.2169, + "reward": 0.524553582072258, + "reward_std": 0.1694175899028778, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4263393059372902, "step": 3102 }, { "clip_ratio": 0.0, - "completion_length": 1669.19873046875, + "completion_length": 1798.60498046875, "epoch": 0.9268911955791203, - "grad_norm": 11.845788955688477, - "kl": 0.252685546875, - "learning_rate": 1.6105125915692464e-09, - "loss": 0.1214, - "reward": 0.3482143059372902, - "reward_std": 0.20293445512652397, - "rewards/accuracy_reward": 0.03348214412108064, + "grad_norm": 8.143572807312012, + "kl": 3.80859375, + "learning_rate": 8.052562957846231e-09, + "loss": 0.2106, + "reward": 0.4693080559372902, + "reward_std": 0.17779669910669327, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4246651977300644, "step": 3103 }, { "clip_ratio": 0.0, - "completion_length": 1603.93310546875, + "completion_length": 1819.1072082519531, "epoch": 0.9271899036666418, - "grad_norm": 12.88851261138916, - "kl": 0.2275390625, - "learning_rate": 1.597409320865506e-09, - "loss": 0.1338, - "reward": 0.4944196566939354, - "reward_std": 0.20019777491688728, - "rewards/accuracy_reward": 0.14955358067527413, + "grad_norm": 8.65472412109375, + "kl": 3.60546875, + "learning_rate": 7.98704660432753e-09, + "loss": 0.2099, + "reward": 0.5619419887661934, + "reward_std": 0.1586289443075657, + "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.344866082072258, + "rewards/tag_count_reward": 0.423549123108387, "step": 3104 }, { "clip_ratio": 0.0, - "completion_length": 1520.0826416015625, + "completion_length": 1720.1273193359375, "epoch": 0.9274886117541632, - "grad_norm": 13.437482833862305, - "kl": 0.234619140625, - "learning_rate": 1.5843587074364506e-09, - "loss": 0.1522, - "reward": 0.365513414144516, - "reward_std": 0.17630689032375813, - "rewards/accuracy_reward": 0.01562500069849193, + "grad_norm": 5.983371734619141, + "kl": 4.0703125, + "learning_rate": 7.921793537182253e-09, + "loss": 0.2433, + "reward": 0.4676339477300644, + "reward_std": 0.16578348353505135, + "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3498884066939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3105 }, { "clip_ratio": 0.0, - "completion_length": 1625.4777526855469, + "completion_length": 1735.9308776855469, "epoch": 0.9277873198416847, - "grad_norm": 12.324629783630371, - "kl": 0.2705078125, - "learning_rate": 1.5713607654798843e-09, - "loss": 0.1327, - "reward": 0.4257812723517418, - "reward_std": 0.19695432111620903, - "rewards/accuracy_reward": 0.11607143585570157, + "grad_norm": 6.4985761642456055, + "kl": 3.88671875, + "learning_rate": 7.856803827399422e-09, + "loss": 0.234, + "reward": 0.5491071790456772, + "reward_std": 0.17311876267194748, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "rewards/tag_count_reward": 0.424107164144516, "step": 3106 }, { "clip_ratio": 0.0, - "completion_length": 1597.57373046875, + "completion_length": 1773.8706359863281, "epoch": 0.9280860279292061, - "grad_norm": 10.61235237121582, - "kl": 0.250732421875, - "learning_rate": 1.5584155091362904e-09, - "loss": 0.1099, - "reward": 0.3967634215950966, - "reward_std": 0.14829028770327568, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 8.45417594909668, + "kl": 3.65625, + "learning_rate": 7.792077545681452e-09, + "loss": 0.2069, + "reward": 0.5100446566939354, + "reward_std": 0.12487944401800632, + "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.4252232313156128, "step": 3107 }, { "clip_ratio": 0.0, - "completion_length": 1580.6473999023438, + "completion_length": 1744.1719665527344, "epoch": 0.9283847360167277, - "grad_norm": 13.888839721679688, - "kl": 0.24462890625, - "learning_rate": 1.5455229524888313e-09, - "loss": 0.1411, - "reward": 0.387834832072258, - "reward_std": 0.1748930737376213, + "grad_norm": 19.50294303894043, + "kl": 3.54296875, + "learning_rate": 7.727614762444157e-09, + "loss": 0.2339, + "reward": 0.4715401977300644, + "reward_std": 0.12681268900632858, "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919738650322, + "rewards/tag_count_reward": 0.426897332072258, "step": 3108 }, { "clip_ratio": 0.0, - "completion_length": 1564.2590026855469, - "epoch": 0.9286834441042491, - "grad_norm": 12.996999740600586, - "kl": 0.227783203125, - "learning_rate": 1.532683109563354e-09, - "loss": 0.1194, - "reward": 0.4107143059372902, - "reward_std": 0.19186555966734886, - "rewards/accuracy_reward": 0.06250000186264515, + "completion_length": 1741.77685546875, + "epoch": 0.9286834441042491, + "grad_norm": 5.543459892272949, + "kl": 3.41015625, + "learning_rate": 7.66341554781677e-09, + "loss": 0.1953, + "reward": 0.514508955180645, + "reward_std": 0.16556296683847904, + "rewards/accuracy_reward": 0.08035714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482143059372902, + "rewards/tag_count_reward": 0.4341518133878708, "step": 3109 }, { "clip_ratio": 0.0, - "completion_length": 1633.0603332519531, + "completion_length": 1782.1875610351562, "epoch": 0.9289821521917706, - "grad_norm": 9.836950302124023, - "kl": 0.26123046875, - "learning_rate": 1.5198959943283462e-09, - "loss": 0.1034, - "reward": 0.3833705484867096, - "reward_std": 0.1748585868626833, - "rewards/accuracy_reward": 0.07366071734577417, + "grad_norm": 8.373510360717773, + "kl": 4.515625, + "learning_rate": 7.59947997164173e-09, + "loss": 0.2663, + "reward": 0.4927455559372902, + "reward_std": 0.1440216265618801, + "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "rewards/tag_count_reward": 0.4101562723517418, "step": 3110 }, { "clip_ratio": 0.0, - "completion_length": 1536.8370971679688, + "completion_length": 1699.5335693359375, "epoch": 0.929280860279292, - "grad_norm": 11.253217697143555, - "kl": 0.24609375, - "learning_rate": 1.5071616206949301e-09, - "loss": 0.1306, - "reward": 0.4280134066939354, - "reward_std": 0.20233887434005737, - "rewards/accuracy_reward": 0.0803571492433548, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3476562649011612, + "grad_norm": 9.046926498413086, + "kl": 3.68359375, + "learning_rate": 7.53580810347465e-09, + "loss": 0.2108, + "reward": 0.5178571566939354, + "reward_std": 0.14192751049995422, + "rewards/accuracy_reward": 0.08928571688011289, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3111 }, { "clip_ratio": 0.0, - "completion_length": 1616.55810546875, + "completion_length": 1775.0982971191406, "epoch": 0.9295795683668135, - "grad_norm": 11.121349334716797, - "kl": 0.2666015625, - "learning_rate": 1.494480002516868e-09, - "loss": 0.1296, - "reward": 0.4654018059372902, - "reward_std": 0.19153089448809624, - "rewards/accuracy_reward": 0.14285715157166123, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "grad_norm": 6.791934490203857, + "kl": 3.8671875, + "learning_rate": 7.47240001258434e-09, + "loss": 0.2312, + "reward": 0.5797991305589676, + "reward_std": 0.1971987523138523, + "rewards/accuracy_reward": 0.16517857648432255, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4146205559372902, "step": 3112 }, { "clip_ratio": 0.0, - "completion_length": 1661.4397888183594, + "completion_length": 1777.4620971679688, "epoch": 0.929878276454335, - "grad_norm": 12.509008407592773, - "kl": 0.26513671875, - "learning_rate": 1.4818511535905077e-09, - "loss": 0.1346, - "reward": 0.450892873108387, - "reward_std": 0.2119353674352169, - "rewards/accuracy_reward": 0.12946428917348385, + "grad_norm": 20.01893424987793, + "kl": 4.6796875, + "learning_rate": 7.409255767952538e-09, + "loss": 0.277, + "reward": 0.5295759290456772, + "reward_std": 0.16408658772706985, + "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4112723469734192, "step": 3113 }, { "clip_ratio": 0.0, - "completion_length": 1623.9732666015625, + "completion_length": 1777.0268859863281, "epoch": 0.9301769845418565, - "grad_norm": 11.804404258728027, - "kl": 0.2373046875, - "learning_rate": 1.4692750876548033e-09, - "loss": 0.1256, - "reward": 0.4386160969734192, - "reward_std": 0.2273441106081009, - "rewards/accuracy_reward": 0.10044643585570157, + "grad_norm": 10.58565616607666, + "kl": 2.490234375, + "learning_rate": 7.3463754382740165e-09, + "loss": 0.1567, + "reward": 0.5552455633878708, + "reward_std": 0.1887827143073082, + "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4369419813156128, "step": 3114 }, { "clip_ratio": 0.0, - "completion_length": 1626.1004943847656, + "completion_length": 1721.7210388183594, "epoch": 0.9304756926293779, - "grad_norm": 13.040892601013184, - "kl": 0.261962890625, - "learning_rate": 1.4567518183912886e-09, - "loss": 0.111, - "reward": 0.3666294738650322, - "reward_std": 0.1684706136584282, - "rewards/accuracy_reward": 0.046875000931322575, + "grad_norm": 5.684086322784424, + "kl": 2.814453125, + "learning_rate": 7.283759091956443e-09, + "loss": 0.1601, + "reward": 0.4827009066939354, + "reward_std": 0.13451972976326942, + "rewards/accuracy_reward": 0.0602678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.4224330484867096, "step": 3115 }, { "clip_ratio": 0.0, - "completion_length": 1552.7701416015625, + "completion_length": 1728.8438415527344, "epoch": 0.9307744007168994, - "grad_norm": 12.966466903686523, - "kl": 0.238525390625, - "learning_rate": 1.4442813594240378e-09, - "loss": 0.1255, - "reward": 0.5128348544239998, - "reward_std": 0.22202913463115692, - "rewards/accuracy_reward": 0.1741071529686451, + "grad_norm": 4.802729606628418, + "kl": 3.37890625, + "learning_rate": 7.2214067971201885e-09, + "loss": 0.1962, + "reward": 0.6389509290456772, + "reward_std": 0.21386107802391052, + "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4313616156578064, "step": 3116 }, { "clip_ratio": 0.0, - "completion_length": 1514.9308776855469, + "completion_length": 1669.9085388183594, "epoch": 0.9310731088044208, - "grad_norm": 13.690888404846191, - "kl": 0.246337890625, - "learning_rate": 1.431863724319693e-09, - "loss": 0.1295, - "reward": 0.5256696715950966, - "reward_std": 0.18974097073078156, - "rewards/accuracy_reward": 0.17410715157166123, + "grad_norm": 17.05530548095703, + "kl": 3.48046875, + "learning_rate": 7.159318621598465e-09, + "loss": 0.2257, + "reward": 0.6227678880095482, + "reward_std": 0.15726906433701515, + "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625149011612, + "rewards/tag_count_reward": 0.4375000149011612, "step": 3117 }, { "clip_ratio": 0.0, - "completion_length": 1646.2210693359375, + "completion_length": 1734.29248046875, "epoch": 0.9313718168919424, - "grad_norm": 13.258161544799805, - "kl": 0.239501953125, - "learning_rate": 1.419498926587437e-09, - "loss": 0.1289, - "reward": 0.3727678805589676, - "reward_std": 0.1847793497145176, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 13.835975646972656, + "kl": 3.3828125, + "learning_rate": 7.097494632937184e-09, + "loss": 0.2097, + "reward": 0.5128348395228386, + "reward_std": 0.13877625204622746, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169643059372902, + "rewards/tag_count_reward": 0.4414062723517418, "step": 3118 }, { "clip_ratio": 0.0, - "completion_length": 1638.1473999023438, + "completion_length": 1813.4755249023438, "epoch": 0.9316705249794638, - "grad_norm": 12.322694778442383, - "kl": 0.255126953125, - "learning_rate": 1.4071869796789426e-09, - "loss": 0.1101, - "reward": 0.443638414144516, - "reward_std": 0.20790699869394302, - "rewards/accuracy_reward": 0.11160714738070965, + "grad_norm": 4.7541327476501465, + "kl": 3.427734375, + "learning_rate": 7.035934898394713e-09, + "loss": 0.1887, + "reward": 0.5758928805589676, + "reward_std": 0.18983046524226665, + "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.4330357313156128, "step": 3119 }, { "clip_ratio": 0.0, - "completion_length": 1610.1161193847656, + "completion_length": 1725.618408203125, "epoch": 0.9319692330669853, - "grad_norm": 13.530994415283203, - "kl": 0.248291015625, - "learning_rate": 1.3949278969884126e-09, - "loss": 0.1482, - "reward": 0.412388414144516, - "reward_std": 0.17507272586226463, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 12.552999496459961, + "kl": 2.859375, + "learning_rate": 6.974639484942063e-09, + "loss": 0.1765, + "reward": 0.5189732387661934, + "reward_std": 0.10459255427122116, + "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.440848246216774, "step": 3120 }, { "clip_ratio": 0.0, - "completion_length": 1694.0514221191406, + "completion_length": 1796.5960388183594, "epoch": 0.9322679411545067, - "grad_norm": 12.057221412658691, - "kl": 0.23095703125, - "learning_rate": 1.382721691852512e-09, - "loss": 0.1311, - "reward": 0.3359375149011612, - "reward_std": 0.20287158340215683, - "rewards/accuracy_reward": 0.022321429569274187, + "grad_norm": 8.727331161499023, + "kl": 4.015625, + "learning_rate": 6.91360845926256e-09, + "loss": 0.2201, + "reward": 0.4497768059372902, + "reward_std": 0.13696794025599957, + "rewards/accuracy_reward": 0.022321430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.427455373108387, "step": 3121 }, { "clip_ratio": 0.0, - "completion_length": 1566.5603637695312, + "completion_length": 1729.83935546875, "epoch": 0.9325666492420283, - "grad_norm": 15.20626163482666, - "kl": 0.216064453125, - "learning_rate": 1.3705683775504074e-09, - "loss": 0.1593, - "reward": 0.4481026902794838, - "reward_std": 0.225236464291811, - "rewards/accuracy_reward": 0.10937500232830644, + "grad_norm": 9.591011047363281, + "kl": 4.203125, + "learning_rate": 6.852841887752037e-09, + "loss": 0.2477, + "reward": 0.5357143059372902, + "reward_std": 0.18371436186134815, + "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4196428656578064, "step": 3122 }, { "clip_ratio": 0.0, - "completion_length": 1577.0580749511719, + "completion_length": 1759.2545471191406, "epoch": 0.9328653573295497, - "grad_norm": 11.761122703552246, - "kl": 0.2314453125, - "learning_rate": 1.358467967303717e-09, - "loss": 0.1502, - "reward": 0.357142873108387, - "reward_std": 0.14143730700016022, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 22.989971160888672, + "kl": 4.265625, + "learning_rate": 6.7923398365185844e-09, + "loss": 0.229, + "reward": 0.4977678805589676, + "reward_std": 0.14584042876958847, + "rewards/accuracy_reward": 0.0669642873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4308035895228386, "step": 3123 }, { "clip_ratio": 0.0, - "completion_length": 1588.6942749023438, + "completion_length": 1756.4465026855469, "epoch": 0.9331640654170712, - "grad_norm": 13.019433975219727, - "kl": 0.236328125, - "learning_rate": 1.3464204742764884e-09, - "loss": 0.1364, - "reward": 0.3889509066939354, - "reward_std": 0.19317369908094406, - "rewards/accuracy_reward": 0.06473214738070965, + "grad_norm": 9.116214752197266, + "kl": 4.5, + "learning_rate": 6.732102371382442e-09, + "loss": 0.2603, + "reward": 0.5089285895228386, + "reward_std": 0.17124420404434204, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4218750149011612, "step": 3124 }, { "clip_ratio": 0.0, - "completion_length": 1647.3058776855469, + "completion_length": 1755.65185546875, "epoch": 0.9334627735045926, - "grad_norm": 8.598480224609375, - "kl": 0.250244140625, - "learning_rate": 1.3344259115752266e-09, - "loss": 0.1108, - "reward": 0.4324776977300644, - "reward_std": 0.19490262120962143, - "rewards/accuracy_reward": 0.14732143143191934, + "grad_norm": 22.127580642700195, + "kl": 5.46875, + "learning_rate": 6.672129557876133e-09, + "loss": 0.3201, + "reward": 0.5697544887661934, + "reward_std": 0.16058344766497612, + "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2851562649011612, + "rewards/tag_count_reward": 0.4135044813156128, "step": 3125 }, { "clip_ratio": 0.0, - "completion_length": 1654.6273193359375, + "completion_length": 1812.8594665527344, "epoch": 0.9337614815921141, - "grad_norm": 12.155628204345703, - "kl": 0.23046875, - "learning_rate": 1.3224842922488322e-09, - "loss": 0.1113, - "reward": 0.4179687723517418, - "reward_std": 0.20285895839333534, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 6.615753173828125, + "kl": 3.001953125, + "learning_rate": 6.612421461244161e-09, + "loss": 0.1564, + "reward": 0.5569196715950966, + "reward_std": 0.16876626014709473, + "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.427455373108387, "step": 3126 }, { "clip_ratio": 0.0, - "completion_length": 1625.5759582519531, + "completion_length": 1736.3505249023438, "epoch": 0.9340601896796356, - "grad_norm": 14.064413070678711, - "kl": 0.267822265625, - "learning_rate": 1.3105956292886189e-09, - "loss": 0.1462, - "reward": 0.3470982238650322, - "reward_std": 0.1715451329946518, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 12.076683044433594, + "kl": 3.9609375, + "learning_rate": 6.552978146443094e-09, + "loss": 0.2644, + "reward": 0.4631696566939354, + "reward_std": 0.1319701299071312, + "rewards/accuracy_reward": 0.03348214295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4296875149011612, "step": 3127 }, { "clip_ratio": 0.0, - "completion_length": 1504.7969665527344, + "completion_length": 1780.7813110351562, "epoch": 0.9343588977671571, - "grad_norm": 12.199499130249023, - "kl": 0.234375, - "learning_rate": 1.2987599356282852e-09, - "loss": 0.1295, - "reward": 0.554687537252903, - "reward_std": 0.19609710574150085, - "rewards/accuracy_reward": 0.2165178656578064, + "grad_norm": 35.0097770690918, + "kl": 5.09765625, + "learning_rate": 6.493799678141426e-09, + "loss": 0.2522, + "reward": 0.6395089626312256, + "reward_std": 0.15091378800570965, + "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696529686451, + "rewards/tag_count_reward": 0.4185268059372902, "step": 3128 }, { "clip_ratio": 0.0, - "completion_length": 1618.8996276855469, + "completion_length": 1768.3148193359375, "epoch": 0.9346576058546785, - "grad_norm": 11.059340476989746, - "kl": 0.21923828125, - "learning_rate": 1.2869772241439092e-09, - "loss": 0.1217, - "reward": 0.3710937723517418, - "reward_std": 0.20903342217206955, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 14.694561958312988, + "kl": 2.734375, + "learning_rate": 6.434886120719546e-09, + "loss": 0.1725, + "reward": 0.5117187649011612, + "reward_std": 0.15856457501649857, + "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830559372902, + "rewards/tag_count_reward": 0.4358259066939354, "step": 3129 }, { "clip_ratio": 0.0, - "completion_length": 1624.3215026855469, + "completion_length": 1761.3550109863281, "epoch": 0.9349563139422, - "grad_norm": 11.932198524475098, - "kl": 0.242919921875, - "learning_rate": 1.2752475076539203e-09, - "loss": 0.1336, - "reward": 0.360491082072258, - "reward_std": 0.18164215609431267, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 7.165038108825684, + "kl": 4.32421875, + "learning_rate": 6.376237538269602e-09, + "loss": 0.264, + "reward": 0.4760044887661934, + "reward_std": 0.1332947574555874, + "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4224330633878708, "step": 3130 }, { "clip_ratio": 0.0, - "completion_length": 1630.3795166015625, + "completion_length": 1797.0804443359375, "epoch": 0.9352550220297214, - "grad_norm": 10.601170539855957, - "kl": 0.267578125, - "learning_rate": 1.2635707989191058e-09, - "loss": 0.1214, - "reward": 0.3247767984867096, - "reward_std": 0.17993677034974098, - "rewards/accuracy_reward": 0.01785714365541935, + "grad_norm": 17.504234313964844, + "kl": 4.69921875, + "learning_rate": 6.317853994595529e-09, + "loss": 0.2485, + "reward": 0.431361623108387, + "reward_std": 0.13290254212915897, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196566939354, + "rewards/tag_count_reward": 0.411272332072258, "step": 3131 }, { "clip_ratio": 0.0, - "completion_length": 1654.5067749023438, + "completion_length": 1735.5134582519531, "epoch": 0.935553730117243, - "grad_norm": 9.747517585754395, - "kl": 0.243408203125, - "learning_rate": 1.2519471106425816e-09, - "loss": 0.123, - "reward": 0.3281250149011612, - "reward_std": 0.1819070652127266, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 11.88955307006836, + "kl": 4.0078125, + "learning_rate": 6.259735553212908e-09, + "loss": 0.208, + "reward": 0.4654018059372902, + "reward_std": 0.16931617259979248, + "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3013393059372902, + "rewards/tag_count_reward": 0.4274553805589676, "step": 3132 }, { "clip_ratio": 0.0, - "completion_length": 1583.7835693359375, + "completion_length": 1741.9822387695312, "epoch": 0.9358524382047644, - "grad_norm": 9.578848838806152, - "kl": 0.197998046875, - "learning_rate": 1.240376455469777e-09, - "loss": 0.0846, - "reward": 0.4765625298023224, - "reward_std": 0.20478202030062675, - "rewards/accuracy_reward": 0.12500000605359674, + "grad_norm": 3.674797296524048, + "kl": 3.75390625, + "learning_rate": 6.201882277348885e-09, + "loss": 0.1923, + "reward": 0.5580357313156128, + "reward_std": 0.1989150382578373, + "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625149011612, + "rewards/tag_count_reward": 0.4218750149011612, "step": 3133 }, { "clip_ratio": 0.0, - "completion_length": 1602.560302734375, + "completion_length": 1712.0313110351562, "epoch": 0.9361511462922859, - "grad_norm": 13.86622428894043, - "kl": 0.232177734375, - "learning_rate": 1.228858845988434e-09, - "loss": 0.1598, - "reward": 0.3950893059372902, - "reward_std": 0.15334121137857437, + "grad_norm": 21.303447723388672, + "kl": 4.01171875, + "learning_rate": 6.144294229942171e-09, + "loss": 0.261, + "reward": 0.502790205180645, + "reward_std": 0.11032152362167835, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3134 }, { "clip_ratio": 0.0, - "completion_length": 1597.2522583007812, + "completion_length": 1724.8438110351562, "epoch": 0.9364498543798073, - "grad_norm": 13.357135772705078, - "kl": 0.226318359375, - "learning_rate": 1.2173942947285742e-09, - "loss": 0.1541, - "reward": 0.4229910895228386, - "reward_std": 0.1867559365928173, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 8.405097961425781, + "kl": 3.533203125, + "learning_rate": 6.086971473642871e-09, + "loss": 0.2215, + "reward": 0.5541294813156128, + "reward_std": 0.14196583814918995, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4380580559372902, "step": 3135 }, { "clip_ratio": 0.0, - "completion_length": 1622.6920166015625, + "completion_length": 1806.8125915527344, "epoch": 0.9367485624673288, - "grad_norm": 9.727771759033203, - "kl": 0.257080078125, - "learning_rate": 1.205982814162515e-09, - "loss": 0.1072, - "reward": 0.4475446715950966, - "reward_std": 0.2392408438026905, - "rewards/accuracy_reward": 0.13169643469154835, + "grad_norm": 17.944316864013672, + "kl": 4.68359375, + "learning_rate": 6.029914070812575e-09, + "loss": 0.249, + "reward": 0.5385044813156128, + "reward_std": 0.1912555918097496, + "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.4090401977300644, "step": 3136 }, { "clip_ratio": 0.0, - "completion_length": 1549.0558471679688, + "completion_length": 1732.7255249023438, "epoch": 0.9370472705548503, - "grad_norm": 11.910747528076172, - "kl": 0.234130859375, - "learning_rate": 1.1946244167048313e-09, - "loss": 0.1374, - "reward": 0.471540205180645, - "reward_std": 0.17358188331127167, - "rewards/accuracy_reward": 0.1272321455180645, + "grad_norm": 6.294620990753174, + "kl": 3.93359375, + "learning_rate": 5.973122083524157e-09, + "loss": 0.2346, + "reward": 0.5625000223517418, + "reward_std": 0.14301216043531895, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080484867096, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3137 }, { "clip_ratio": 0.0, - "completion_length": 1580.6875610351562, + "completion_length": 1776.2366943359375, "epoch": 0.9373459786423718, - "grad_norm": 12.558515548706055, - "kl": 0.2578125, - "learning_rate": 1.1833191147123334e-09, - "loss": 0.1193, - "reward": 0.3856026977300644, - "reward_std": 0.18911318480968475, - "rewards/accuracy_reward": 0.058035718742758036, + "grad_norm": 15.142308235168457, + "kl": 3.6484375, + "learning_rate": 5.916595573561667e-09, + "loss": 0.2258, + "reward": 0.5050223469734192, + "reward_std": 0.17098475247621536, + "rewards/accuracy_reward": 0.07589285960420966, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669738650322, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3138 }, { "clip_ratio": 0.0, - "completion_length": 1589.5804138183594, + "completion_length": 1749.7255249023438, "epoch": 0.9376446867298932, - "grad_norm": 11.955155372619629, - "kl": 0.25439453125, - "learning_rate": 1.1720669204840938e-09, - "loss": 0.1306, - "reward": 0.389508955180645, - "reward_std": 0.16954592615365982, + "grad_norm": 6.567526817321777, + "kl": 3.203125, + "learning_rate": 5.8603346024204694e-09, + "loss": 0.194, + "reward": 0.4743303656578064, + "reward_std": 0.12915113940835, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3139 }, { "clip_ratio": 0.0, - "completion_length": 1703.1764221191406, + "completion_length": 1831.8572082519531, "epoch": 0.9379433948174147, - "grad_norm": 12.080501556396484, - "kl": 0.2724609375, - "learning_rate": 1.1608678462613985e-09, - "loss": 0.1225, - "reward": 0.3532366305589676, - "reward_std": 0.18714265897870064, - "rewards/accuracy_reward": 0.04910714505240321, + "grad_norm": 12.231660842895508, + "kl": 3.6015625, + "learning_rate": 5.804339231306993e-09, + "loss": 0.2138, + "reward": 0.4860491305589676, + "reward_std": 0.13884839043021202, + "rewards/accuracy_reward": 0.053571430034935474, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3041294738650322, + "rewards/tag_count_reward": 0.432477705180645, "step": 3140 }, { "clip_ratio": 0.0, - "completion_length": 1653.1027526855469, + "completion_length": 1788.8862609863281, "epoch": 0.9382421029049361, - "grad_norm": 11.485506057739258, - "kl": 0.246337890625, - "learning_rate": 1.1497219042277294e-09, - "loss": 0.1355, - "reward": 0.3973214477300644, - "reward_std": 0.18239791318774223, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 8.83573055267334, + "kl": 4.6328125, + "learning_rate": 5.748609521138648e-09, + "loss": 0.2445, + "reward": 0.5089285969734192, + "reward_std": 0.1527208685874939, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4151785895228386, "step": 3141 }, { "clip_ratio": 0.0, - "completion_length": 1559.7813110351562, + "completion_length": 1702.3371276855469, "epoch": 0.9385408109924577, - "grad_norm": 11.18380069732666, - "kl": 0.241455078125, - "learning_rate": 1.1386291065087872e-09, - "loss": 0.1131, - "reward": 0.5245535969734192, - "reward_std": 0.18465260788798332, - "rewards/accuracy_reward": 0.1785714365541935, + "grad_norm": 7.195098876953125, + "kl": 3.83984375, + "learning_rate": 5.693145532543936e-09, + "loss": 0.239, + "reward": 0.5959821715950966, + "reward_std": 0.14992650970816612, + "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345982164144516, + "rewards/tag_count_reward": 0.424107164144516, "step": 3142 }, { "clip_ratio": 0.0, - "completion_length": 1730.21435546875, + "completion_length": 1869.4532165527344, "epoch": 0.9388395190799791, - "grad_norm": 11.30674934387207, - "kl": 0.2666015625, - "learning_rate": 1.1275894651724515e-09, - "loss": 0.1132, - "reward": 0.325892873108387, - "reward_std": 0.18382380902767181, - "rewards/accuracy_reward": 0.022321430267766118, + "grad_norm": 5.760509490966797, + "kl": 3.953125, + "learning_rate": 5.637947325862258e-09, + "loss": 0.2181, + "reward": 0.4174107313156128, + "reward_std": 0.13676492683589458, + "rewards/accuracy_reward": 0.008928572060540318, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714477300644, + "rewards/tag_count_reward": 0.4084821566939354, "step": 3143 }, { "clip_ratio": 0.0, - "completion_length": 1617.8884887695312, + "completion_length": 1748.0916137695312, "epoch": 0.9391382271675005, - "grad_norm": 12.202991485595703, - "kl": 0.218505859375, - "learning_rate": 1.1166029922287657e-09, - "loss": 0.1371, - "reward": 0.4179687723517418, - "reward_std": 0.18371378257870674, - "rewards/accuracy_reward": 0.09151786169968545, + "grad_norm": 16.554542541503906, + "kl": 2.84375, + "learning_rate": 5.5830149611438286e-09, + "loss": 0.1715, + "reward": 0.5496651977300644, + "reward_std": 0.15513592213392258, + "rewards/accuracy_reward": 0.11607143701985478, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3264509066939354, + "rewards/tag_count_reward": 0.4335937649011612, "step": 3144 }, { "clip_ratio": 0.0, - "completion_length": 1680.4465026855469, + "completion_length": 1800.3772888183594, "epoch": 0.939436935255022, - "grad_norm": 12.556886672973633, - "kl": 0.267578125, - "learning_rate": 1.1056696996299298e-09, - "loss": 0.1216, - "reward": 0.325892873108387, - "reward_std": 0.19101548567414284, - "rewards/accuracy_reward": 0.01562500116415322, + "grad_norm": 19.81356430053711, + "kl": 3.03515625, + "learning_rate": 5.5283484981496484e-09, + "loss": 0.194, + "reward": 0.4687500223517418, + "reward_std": 0.14997711405158043, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.310267873108387, + "rewards/tag_count_reward": 0.4419643059372902, "step": 3145 }, { "clip_ratio": 0.0, - "completion_length": 1503.5558776855469, + "completion_length": 1613.5157165527344, "epoch": 0.9397356433425434, - "grad_norm": 11.038503646850586, - "kl": 0.270263671875, - "learning_rate": 1.0947895992703127e-09, - "loss": 0.1404, - "reward": 0.4960937723517418, - "reward_std": 0.1873667761683464, - "rewards/accuracy_reward": 0.15625000861473382, + "grad_norm": 10.664583206176758, + "kl": 2.736328125, + "learning_rate": 5.4739479963515635e-09, + "loss": 0.1909, + "reward": 0.6250000298023224, + "reward_std": 0.17824650555849075, + "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4464285895228386, "step": 3146 }, { "clip_ratio": 0.0, - "completion_length": 1633.6384582519531, + "completion_length": 1804.8906860351562, "epoch": 0.940034351430065, - "grad_norm": 9.803953170776367, - "kl": 0.27392578125, - "learning_rate": 1.0839627029863796e-09, - "loss": 0.1031, - "reward": 0.3950892984867096, - "reward_std": 0.21256079897284508, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 12.780122756958008, + "kl": 4.765625, + "learning_rate": 5.419813514931898e-09, + "loss": 0.251, + "reward": 0.5440848395228386, + "reward_std": 0.22799570858478546, + "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071566939354, + "rewards/tag_count_reward": 0.4079241305589676, "step": 3147 }, { "clip_ratio": 0.0, - "completion_length": 1567.1540832519531, + "completion_length": 1717.0223999023438, "epoch": 0.9403330595175864, - "grad_norm": 11.559075355529785, - "kl": 0.25732421875, - "learning_rate": 1.073189022556742e-09, - "loss": 0.1432, - "reward": 0.3627232238650322, - "reward_std": 0.15954610332846642, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 7.646012783050537, + "kl": 4.01171875, + "learning_rate": 5.36594511278371e-09, + "loss": 0.2488, + "reward": 0.4743303805589676, + "reward_std": 0.14721242897212505, + "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125074505806, + "rewards/tag_count_reward": 0.4207589402794838, "step": 3148 }, { "clip_ratio": 0.0, - "completion_length": 1681.8906860351562, + "completion_length": 1827.399658203125, "epoch": 0.9406317676051079, - "grad_norm": 10.65440845489502, - "kl": 0.240478515625, - "learning_rate": 1.0624685697021019e-09, - "loss": 0.1015, - "reward": 0.3387276902794838, - "reward_std": 0.214151531457901, - "rewards/accuracy_reward": 0.04017857275903225, + "grad_norm": 7.611230373382568, + "kl": 3.85546875, + "learning_rate": 5.312342848510509e-09, + "loss": 0.2185, + "reward": 0.4960937723517418, + "reward_std": 0.21288838610053062, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2985491156578064, + "rewards/tag_count_reward": 0.424665205180645, "step": 3149 }, { "clip_ratio": 0.0, - "completion_length": 1548.8147888183594, + "completion_length": 1727.5871276855469, "epoch": 0.9409304756926293, - "grad_norm": 13.796981811523438, - "kl": 0.20703125, - "learning_rate": 1.0518013560852634e-09, - "loss": 0.1351, - "reward": 0.4447544813156128, - "reward_std": 0.2262903694063425, - "rewards/accuracy_reward": 0.09375000605359674, + "grad_norm": 7.610684871673584, + "kl": 3.70703125, + "learning_rate": 5.2590067804263175e-09, + "loss": 0.2226, + "reward": 0.5641741305589676, + "reward_std": 0.22518694400787354, + "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3510044813156128, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3150 }, { "clip_ratio": 0.0, - "completion_length": 1662.8929443359375, + "completion_length": 1803.4197387695312, "epoch": 0.9412291837801509, - "grad_norm": 10.16843032836914, - "kl": 0.224853515625, - "learning_rate": 1.0411873933111159e-09, - "loss": 0.1175, - "reward": 0.344866082072258, - "reward_std": 0.1824580766260624, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 10.031593322753906, + "kl": 3.064453125, + "learning_rate": 5.20593696655558e-09, + "loss": 0.174, + "reward": 0.4609375074505806, + "reward_std": 0.1403737049549818, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482313156128, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3151 }, { "clip_ratio": 0.0, - "completion_length": 1679.7076721191406, + "completion_length": 1793.4666137695312, "epoch": 0.9415278918676723, - "grad_norm": 11.934950828552246, - "kl": 0.24462890625, - "learning_rate": 1.030626692926595e-09, - "loss": 0.1278, - "reward": 0.4090401828289032, - "reward_std": 0.1716676913201809, - "rewards/accuracy_reward": 0.11160714784637094, + "grad_norm": 6.310960292816162, + "kl": 3.25, + "learning_rate": 5.153133464632975e-09, + "loss": 0.2044, + "reward": 0.550781287252903, + "reward_std": 0.1295801568776369, + "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3152 }, { "clip_ratio": 0.0, - "completion_length": 1645.7210693359375, + "completion_length": 1776.6384582519531, "epoch": 0.9418265999551938, - "grad_norm": 14.832554817199707, - "kl": 0.21923828125, - "learning_rate": 1.020119266420727e-09, - "loss": 0.1475, - "reward": 0.3593750149011612, - "reward_std": 0.22075046598911285, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 10.475260734558105, + "kl": 3.171875, + "learning_rate": 5.100596332103635e-09, + "loss": 0.2002, + "reward": 0.458705373108387, + "reward_std": 0.16390957683324814, + "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3153 }, { "clip_ratio": 0.0, - "completion_length": 1638.2478637695312, + "completion_length": 1749.4219360351562, "epoch": 0.9421253080427152, - "grad_norm": 11.188582420349121, - "kl": 0.26220703125, - "learning_rate": 1.0096651252245514e-09, - "loss": 0.1256, - "reward": 0.3995535969734192, - "reward_std": 0.1917683742940426, - "rewards/accuracy_reward": 0.07812500302679837, + "grad_norm": 4.847557067871094, + "kl": 3.6796875, + "learning_rate": 5.048325626122757e-09, + "loss": 0.2235, + "reward": 0.5139509215950966, + "reward_std": 0.18066633492708206, + "rewards/accuracy_reward": 0.08482143422588706, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321428582072258, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3154 }, { "clip_ratio": 0.0, - "completion_length": 1690.54248046875, + "completion_length": 1818.08935546875, "epoch": 0.9424240161302367, - "grad_norm": 10.336177825927734, - "kl": 0.287109375, - "learning_rate": 9.992642807111484e-10, - "loss": 0.1156, - "reward": 0.3443080559372902, - "reward_std": 0.1850583739578724, + "grad_norm": 5.6882171630859375, + "kl": 4.109375, + "learning_rate": 4.996321403555742e-09, + "loss": 0.231, + "reward": 0.4637276977300644, + "reward_std": 0.1563696563243866, "rewards/accuracy_reward": 0.05357143236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2907366156578064, + "rewards/tag_count_reward": 0.4101562649011612, "step": 3155 }, { "clip_ratio": 0.0, - "completion_length": 1701.1697082519531, + "completion_length": 1827.3996276855469, "epoch": 0.9427227242177582, - "grad_norm": 10.774609565734863, - "kl": 0.25341796875, - "learning_rate": 9.88916744195617e-10, - "loss": 0.1289, - "reward": 0.3437500149011612, - "reward_std": 0.19651055335998535, - "rewards/accuracy_reward": 0.0558035746216774, + "grad_norm": 23.35978889465332, + "kl": 5.1953125, + "learning_rate": 4.944583720978085e-09, + "loss": 0.2714, + "reward": 0.4787946715950966, + "reward_std": 0.1769043579697609, + "rewards/accuracy_reward": 0.07142857694998384, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2879464477300644, + "rewards/tag_count_reward": 0.4073660895228386, "step": 3156 }, { "clip_ratio": 0.0, - "completion_length": 1570.6563415527344, + "completion_length": 1712.5804443359375, "epoch": 0.9430214323052797, - "grad_norm": 12.514638900756836, - "kl": 0.20947265625, - "learning_rate": 9.786225269350745e-10, - "loss": 0.137, - "reward": 0.4330357387661934, - "reward_std": 0.24612359330058098, - "rewards/accuracy_reward": 0.08482143376022577, + "grad_norm": 12.596489906311035, + "kl": 3.16796875, + "learning_rate": 4.893112634675372e-09, + "loss": 0.2033, + "reward": 0.573660746216774, + "reward_std": 0.2282893806695938, + "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3482143059372902, + "rewards/tag_count_reward": 0.4419643059372902, "step": 3157 }, { "clip_ratio": 0.0, - "completion_length": 1627.8616638183594, + "completion_length": 1818.4777526855469, "epoch": 0.9433201403928011, - "grad_norm": 12.567456245422363, - "kl": 0.216064453125, - "learning_rate": 9.683816401286015e-10, - "loss": 0.131, - "reward": 0.392299123108387, - "reward_std": 0.17724448069930077, - "rewards/accuracy_reward": 0.04910714481957257, + "grad_norm": 16.636917114257812, + "kl": 3.865234375, + "learning_rate": 4.841908200643008e-09, + "loss": 0.2199, + "reward": 0.4960937723517418, + "reward_std": 0.16121109575033188, + "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.426897332072258, "step": 3158 }, { "clip_ratio": 0.0, - "completion_length": 1538.3840026855469, + "completion_length": 1700.7120971679688, "epoch": 0.9436188484803226, - "grad_norm": 11.723496437072754, - "kl": 0.20703125, - "learning_rate": 9.581940949172918e-10, - "loss": 0.1433, - "reward": 0.4921875223517418, - "reward_std": 0.22233816981315613, + "grad_norm": 6.981161117553711, + "kl": 3.98046875, + "learning_rate": 4.790970474586459e-09, + "loss": 0.2366, + "reward": 0.5719866305589676, + "reward_std": 0.18915634788572788, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3470982313156128, + "rewards/tag_count_reward": 0.4268973469734192, "step": 3159 }, { "clip_ratio": 0.0, - "completion_length": 1609.08935546875, + "completion_length": 1741.5134887695312, "epoch": 0.943917556567844, - "grad_norm": 13.504440307617188, - "kl": 0.231201171875, - "learning_rate": 9.480599023841962e-10, - "loss": 0.1615, - "reward": 0.431919664144516, - "reward_std": 0.1564822979271412, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 7.836648464202881, + "kl": 4.546875, + "learning_rate": 4.740299511920981e-09, + "loss": 0.2752, + "reward": 0.5357143133878708, + "reward_std": 0.12292356602847576, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.322544664144516, + "rewards/tag_count_reward": 0.4196428805589676, "step": 3160 }, { "clip_ratio": 0.0, - "completion_length": 1611.852783203125, + "completion_length": 1727.7009582519531, "epoch": 0.9442162646553656, - "grad_norm": 14.327571868896484, - "kl": 0.257080078125, - "learning_rate": 9.37979073554318e-10, - "loss": 0.1488, - "reward": 0.3694196566939354, - "reward_std": 0.22289692610502243, - "rewards/accuracy_reward": 0.03794643119908869, + "grad_norm": 6.599543571472168, + "kl": 3.65625, + "learning_rate": 4.68989536777159e-09, + "loss": 0.2202, + "reward": 0.5234375223517418, + "reward_std": 0.19363130629062653, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.436383955180645, "step": 3161 }, { "clip_ratio": 0.0, - "completion_length": 1647.3795776367188, + "completion_length": 1778.6965026855469, "epoch": 0.944514972742887, - "grad_norm": 12.413259506225586, - "kl": 0.236083984375, - "learning_rate": 9.279516193946124e-10, - "loss": 0.1174, - "reward": 0.407924123108387, - "reward_std": 0.2165302336215973, - "rewards/accuracy_reward": 0.06919643259607255, + "grad_norm": 8.948225975036621, + "kl": 4.51953125, + "learning_rate": 4.639758096973062e-09, + "loss": 0.2605, + "reward": 0.518415205180645, + "reward_std": 0.1995774433016777, + "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.415736623108387, "step": 3162 }, { "clip_ratio": 0.0, - "completion_length": 1635.4665832519531, + "completion_length": 1823.5625915527344, "epoch": 0.9448136808304085, - "grad_norm": 11.658528327941895, - "kl": 0.27197265625, - "learning_rate": 9.179775508139697e-10, - "loss": 0.1252, - "reward": 0.3878348395228386, - "reward_std": 0.1899057887494564, - "rewards/accuracy_reward": 0.07812500558793545, + "grad_norm": 6.4680304527282715, + "kl": 4.5390625, + "learning_rate": 4.589887754069849e-09, + "loss": 0.2601, + "reward": 0.4732143059372902, + "reward_std": 0.14428245462477207, + "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098246216774, + "rewards/tag_count_reward": 0.4107142984867096, "step": 3163 }, { "clip_ratio": 0.0, - "completion_length": 1615.2344360351562, + "completion_length": 1729.2143859863281, "epoch": 0.9451123889179299, - "grad_norm": 11.237738609313965, - "kl": 0.254150390625, - "learning_rate": 9.080568786631936e-10, - "loss": 0.1195, - "reward": 0.3750000223517418, - "reward_std": 0.20423252508044243, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 11.300824165344238, + "kl": 4.7421875, + "learning_rate": 4.540284393315969e-09, + "loss": 0.2796, + "reward": 0.494419664144516, + "reward_std": 0.16322331316769123, + "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4252232313156128, "step": 3164 }, { "clip_ratio": 0.0, - "completion_length": 1611.6451721191406, + "completion_length": 1736.2456359863281, "epoch": 0.9454110970054515, - "grad_norm": 12.558074951171875, - "kl": 0.265869140625, - "learning_rate": 8.98189613735012e-10, - "loss": 0.1296, - "reward": 0.3643973395228386, - "reward_std": 0.20890868082642555, - "rewards/accuracy_reward": 0.040178572526201606, + "grad_norm": 5.113112449645996, + "kl": 4.37109375, + "learning_rate": 4.49094806867506e-09, + "loss": 0.2704, + "reward": 0.4698660895228386, + "reward_std": 0.19220685958862305, + "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4207589477300644, "step": 3165 }, { "clip_ratio": 0.0, - "completion_length": 1713.6339721679688, + "completion_length": 1829.5469360351562, "epoch": 0.9457098050929729, - "grad_norm": 11.613253593444824, - "kl": 0.263916015625, - "learning_rate": 8.883757667640268e-10, - "loss": 0.1187, - "reward": 0.3588169738650322, - "reward_std": 0.20241231843829155, - "rewards/accuracy_reward": 0.058035718742758036, + "grad_norm": 24.551511764526367, + "kl": 4.70703125, + "learning_rate": 4.441878833820134e-09, + "loss": 0.2674, + "reward": 0.4871651977300644, + "reward_std": 0.18432844430208206, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3007812574505806, + "rewards/tag_count_reward": 0.4045759066939354, "step": 3166 }, { "clip_ratio": 0.0, - "completion_length": 1720.7790832519531, + "completion_length": 1872.2009582519531, "epoch": 0.9460085131804944, - "grad_norm": 9.237274169921875, - "kl": 0.2587890625, - "learning_rate": 8.786153484267589e-10, - "loss": 0.1015, - "reward": 0.3197544813156128, - "reward_std": 0.18294966965913773, - "rewards/accuracy_reward": 0.02232142980210483, + "grad_norm": 34.3686408996582, + "kl": 6.0078125, + "learning_rate": 4.393076742133794e-09, + "loss": 0.3151, + "reward": 0.4386160895228386, + "reward_std": 0.18710462376475334, + "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330559372902, + "rewards/tag_count_reward": 0.3939732313156128, "step": 3167 }, { "clip_ratio": 0.0, - "completion_length": 1657.4175109863281, + "completion_length": 1805.9911193847656, "epoch": 0.9463072212680158, - "grad_norm": 12.097478866577148, - "kl": 0.244873046875, - "learning_rate": 8.689083693415755e-10, - "loss": 0.1339, - "reward": 0.3448660895228386, - "reward_std": 0.16986791044473648, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 14.207158088684082, + "kl": 5.05859375, + "learning_rate": 4.344541846707878e-09, + "loss": 0.2826, + "reward": 0.463727705180645, + "reward_std": 0.14076603762805462, + "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.407924123108387, "step": 3168 }, { "clip_ratio": 0.0, - "completion_length": 1564.1116333007812, + "completion_length": 1703.6741943359375, "epoch": 0.9466059293555373, - "grad_norm": 17.29115867614746, - "kl": 0.216552734375, - "learning_rate": 8.592548400687183e-10, - "loss": 0.1647, - "reward": 0.4508928805589676, - "reward_std": 0.21048187837004662, - "rewards/accuracy_reward": 0.11160714738070965, + "grad_norm": 4.945898532867432, + "kl": 3.65625, + "learning_rate": 4.2962742003435916e-09, + "loss": 0.238, + "reward": 0.5641741305589676, + "reward_std": 0.17153637669980526, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3392857238650322, + "rewards/tag_count_reward": 0.4391741305589676, "step": 3169 }, { "clip_ratio": 0.0, - "completion_length": 1593.7924499511719, + "completion_length": 1794.618408203125, "epoch": 0.9469046374430587, - "grad_norm": 13.63908576965332, - "kl": 0.241943359375, - "learning_rate": 8.49654771110292e-10, - "loss": 0.1466, - "reward": 0.3515625149011612, - "reward_std": 0.1902056783437729, - "rewards/accuracy_reward": 0.022321430267766118, + "grad_norm": 5.989921569824219, + "kl": 3.93359375, + "learning_rate": 4.2482738555514596e-09, + "loss": 0.21, + "reward": 0.4386160969734192, + "reward_std": 0.15121140331029892, + "rewards/accuracy_reward": 0.029017859138548374, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4095982313156128, "step": 3170 }, { "clip_ratio": 0.0, - "completion_length": 1639.4063415527344, + "completion_length": 1811.1653137207031, "epoch": 0.9472033455305803, - "grad_norm": 11.851630210876465, - "kl": 0.257080078125, - "learning_rate": 8.401081729102199e-10, - "loss": 0.1122, - "reward": 0.3895089477300644, - "reward_std": 0.18542250245809555, - "rewards/accuracy_reward": 0.06250000488944352, + "grad_norm": 5.36673641204834, + "kl": 3.89453125, + "learning_rate": 4.2005408645510995e-09, + "loss": 0.2284, + "reward": 0.490513414144516, + "reward_std": 0.16210117377340794, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3270089477300644, + "rewards/tag_count_reward": 0.4190848469734192, "step": 3171 }, { "clip_ratio": 0.0, - "completion_length": 1593.4040832519531, + "completion_length": 1720.30810546875, "epoch": 0.9475020536181017, - "grad_norm": 13.675760269165039, - "kl": 0.241943359375, - "learning_rate": 8.306150558542668e-10, - "loss": 0.1516, - "reward": 0.4252232313156128, - "reward_std": 0.18793946877121925, - "rewards/accuracy_reward": 0.08928571874275804, + "grad_norm": 7.075869083404541, + "kl": 4.4296875, + "learning_rate": 4.1530752792713345e-09, + "loss": 0.2756, + "reward": 0.550781287252903, + "reward_std": 0.17472469806671143, + "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4235491305589676, "step": 3172 }, { "clip_ratio": 0.0, - "completion_length": 1551.6340026855469, + "completion_length": 1710.9777526855469, "epoch": 0.9478007617056232, - "grad_norm": 12.579513549804688, - "kl": 0.225830078125, - "learning_rate": 8.211754302700158e-10, - "loss": 0.1206, - "reward": 0.4492187649011612, - "reward_std": 0.1918775513768196, - "rewards/accuracy_reward": 0.11160714970901608, + "grad_norm": 6.183236122131348, + "kl": 3.9375, + "learning_rate": 4.105877151350079e-09, + "loss": 0.25, + "reward": 0.5652901977300644, + "reward_std": 0.20641006901860237, + "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4268973469734192, "step": 3173 }, { "clip_ratio": 0.0, - "completion_length": 1671.96435546875, + "completion_length": 1810.16748046875, "epoch": 0.9480994697931446, - "grad_norm": 12.537674903869629, - "kl": 0.24658203125, - "learning_rate": 8.117893064268522e-10, - "loss": 0.1151, - "reward": 0.3348214402794838, - "reward_std": 0.1891937106847763, - "rewards/accuracy_reward": 0.02455357275903225, + "grad_norm": 13.622652053833008, + "kl": 3.55859375, + "learning_rate": 4.058946532134261e-09, + "loss": 0.2034, + "reward": 0.4614955633878708, + "reward_std": 0.15879039093852043, + "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3102678656578064, + "rewards/tag_count_reward": 0.4235491305589676, "step": 3174 }, { "clip_ratio": 0.0, - "completion_length": 1600.8973999023438, + "completion_length": 1698.0603332519531, "epoch": 0.9483981778806662, - "grad_norm": 11.400148391723633, - "kl": 0.21533203125, - "learning_rate": 8.024566945359635e-10, - "loss": 0.1241, - "reward": 0.4631696715950966, - "reward_std": 0.21479196846485138, - "rewards/accuracy_reward": 0.1339285746216774, + "grad_norm": 5.101284980773926, + "kl": 3.421875, + "learning_rate": 4.0122834726798175e-09, + "loss": 0.2153, + "reward": 0.5904018133878708, + "reward_std": 0.18477614410221577, + "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4386160969734192, "step": 3175 }, { "clip_ratio": 0.0, - "completion_length": 1595.1429138183594, + "completion_length": 1785.2188110351562, "epoch": 0.9486968859681876, - "grad_norm": 12.201568603515625, - "kl": 0.225341796875, - "learning_rate": 7.931776047503169e-10, - "loss": 0.1346, - "reward": 0.3660714402794838, - "reward_std": 0.2051396556198597, - "rewards/accuracy_reward": 0.029017857741564512, + "grad_norm": 9.613402366638184, + "kl": 3.35546875, + "learning_rate": 3.965888023751584e-09, + "loss": 0.2111, + "reward": 0.5011160895228386, + "reward_std": 0.19675280526280403, + "rewards/accuracy_reward": 0.07142857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.4296875149011612, "step": 3176 }, { "clip_ratio": 0.0, - "completion_length": 1635.3170471191406, + "completion_length": 1790.4621276855469, "epoch": 0.9489955940557091, - "grad_norm": 11.642928123474121, - "kl": 0.230712890625, - "learning_rate": 7.839520471646432e-10, - "loss": 0.1311, - "reward": 0.3738839402794838, - "reward_std": 0.22634172439575195, - "rewards/accuracy_reward": 0.0513392873108387, + "grad_norm": 8.176694869995117, + "kl": 4.2734375, + "learning_rate": 3.919760235823216e-09, + "loss": 0.2368, + "reward": 0.4771205633878708, + "reward_std": 0.16108850575983524, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446492433548, + "rewards/tag_count_reward": 0.4190848469734192, "step": 3177 }, { "clip_ratio": 0.0, - "completion_length": 1651.5045166015625, + "completion_length": 1786.6697082519531, "epoch": 0.9492943021432305, - "grad_norm": 10.436589241027832, - "kl": 0.226318359375, - "learning_rate": 7.747800318154529e-10, - "loss": 0.1126, - "reward": 0.364955373108387, - "reward_std": 0.15645021572709084, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 6.700039863586426, + "kl": 4.3359375, + "learning_rate": 3.873900159077264e-09, + "loss": 0.2448, + "reward": 0.4637276977300644, + "reward_std": 0.12255089730024338, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.416852705180645, "step": 3178 }, { "clip_ratio": 0.0, - "completion_length": 1572.8705749511719, + "completion_length": 1768.2344360351562, "epoch": 0.949593010230752, - "grad_norm": 11.307379722595215, - "kl": 0.2490234375, - "learning_rate": 7.656615686809975e-10, - "loss": 0.1302, - "reward": 0.3504464402794838, - "reward_std": 0.1638137623667717, - "rewards/accuracy_reward": 0.0446428582072258, + "grad_norm": 7.902632236480713, + "kl": 4.8125, + "learning_rate": 3.8283078434049875e-09, + "loss": 0.2684, + "reward": 0.4614955559372902, + "reward_std": 0.119241613894701, + "rewards/accuracy_reward": 0.0535714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.4079241305589676, "step": 3179 }, { "clip_ratio": 0.0, - "completion_length": 1583.7813415527344, + "completion_length": 1741.6384887695312, "epoch": 0.9498917183182735, - "grad_norm": 12.031296730041504, - "kl": 0.21826171875, - "learning_rate": 7.565966676812696e-10, - "loss": 0.1345, - "reward": 0.4793526977300644, - "reward_std": 0.18319914489984512, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 4.458237171173096, + "kl": 3.6015625, + "learning_rate": 3.782983338406348e-09, + "loss": 0.2103, + "reward": 0.5864955559372902, + "reward_std": 0.15262810699641705, + "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4302455559372902, "step": 3180 }, { "clip_ratio": 0.0, - "completion_length": 1583.1875915527344, + "completion_length": 1772.41748046875, "epoch": 0.950190426405795, - "grad_norm": 9.874241828918457, - "kl": 0.21435546875, - "learning_rate": 7.475853386780029e-10, - "loss": 0.0975, - "reward": 0.4676339477300644, - "reward_std": 0.2180381417274475, - "rewards/accuracy_reward": 0.12946429289877415, + "grad_norm": 5.634639263153076, + "kl": 3.90625, + "learning_rate": 3.7379266933900145e-09, + "loss": 0.2275, + "reward": 0.5848214477300644, + "reward_std": 0.22271934151649475, + "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4308035969734192, "step": 3181 }, { "clip_ratio": 0.0, - "completion_length": 1585.87060546875, + "completion_length": 1721.69873046875, "epoch": 0.9504891344933164, - "grad_norm": 14.943253517150879, - "kl": 0.230224609375, - "learning_rate": 7.386275914746221e-10, - "loss": 0.1655, - "reward": 0.4804687723517418, - "reward_std": 0.17993808910250664, - "rewards/accuracy_reward": 0.1339285783469677, + "grad_norm": 10.401625633239746, + "kl": 2.9140625, + "learning_rate": 3.6931379573731105e-09, + "loss": 0.1685, + "reward": 0.6082589477300644, + "reward_std": 0.14908975176513195, + "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3465401902794838, + "rewards/tag_count_reward": 0.4363839477300644, "step": 3182 }, { "clip_ratio": 0.0, - "completion_length": 1597.4308776855469, + "completion_length": 1759.9844665527344, "epoch": 0.9507878425808379, - "grad_norm": 12.761160850524902, - "kl": 0.269775390625, - "learning_rate": 7.297234358162929e-10, - "loss": 0.1406, - "reward": 0.407924123108387, - "reward_std": 0.18454989045858383, - "rewards/accuracy_reward": 0.10267857578583062, + "grad_norm": 13.341290473937988, + "kl": 3.6640625, + "learning_rate": 3.6486171790814647e-09, + "loss": 0.2179, + "reward": 0.5373884290456772, + "reward_std": 0.14686298184096813, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3183 }, { "clip_ratio": 0.0, - "completion_length": 1559.8014221191406, + "completion_length": 1674.2411804199219, "epoch": 0.9510865506683593, - "grad_norm": 13.738595962524414, - "kl": 0.258544921875, - "learning_rate": 7.208728813898613e-10, - "loss": 0.1638, - "reward": 0.3973214402794838, - "reward_std": 0.2049853578209877, - "rewards/accuracy_reward": 0.07366071944124997, + "grad_norm": 14.428565979003906, + "kl": 2.9921875, + "learning_rate": 3.6043644069493064e-09, + "loss": 0.1993, + "reward": 0.5239955708384514, + "reward_std": 0.18549969606101513, + "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4369419813156128, "step": 3184 }, { "clip_ratio": 0.0, - "completion_length": 1625.1629943847656, + "completion_length": 1761.3014221191406, "epoch": 0.9513852587558809, - "grad_norm": 11.290306091308594, - "kl": 0.2490234375, - "learning_rate": 7.120759378238583e-10, - "loss": 0.1062, - "reward": 0.3811384066939354, - "reward_std": 0.1844024956226349, - "rewards/accuracy_reward": 0.058035716880112886, + "grad_norm": 8.357264518737793, + "kl": 4.06640625, + "learning_rate": 3.560379689119292e-09, + "loss": 0.2379, + "reward": 0.4726562649011612, + "reward_std": 0.1429803017526865, + "rewards/accuracy_reward": 0.06473214738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026977300644, + "rewards/tag_count_reward": 0.407924123108387, "step": 3185 }, { "clip_ratio": 0.0, - "completion_length": 1615.0625610351562, + "completion_length": 1757.9643859863281, "epoch": 0.9516839668434023, - "grad_norm": 12.058358192443848, - "kl": 0.25537109375, - "learning_rate": 7.03332614688501e-10, - "loss": 0.1353, - "reward": 0.3627232313156128, - "reward_std": 0.20846598967909813, - "rewards/accuracy_reward": 0.040178571827709675, + "grad_norm": 4.255255222320557, + "kl": 3.6875, + "learning_rate": 3.5166630734425052e-09, + "loss": 0.2305, + "reward": 0.4771205633878708, + "reward_std": 0.14277459122240543, + "rewards/accuracy_reward": 0.049107146449387074, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.428013414144516, "step": 3186 }, { "clip_ratio": 0.0, - "completion_length": 1597.1384582519531, + "completion_length": 1743.0982971191406, "epoch": 0.9519826749309237, - "grad_norm": 12.82456111907959, - "kl": 0.25048828125, - "learning_rate": 6.946429214956695e-10, - "loss": 0.1343, - "reward": 0.3554687574505806, - "reward_std": 0.20150255411863327, - "rewards/accuracy_reward": 0.026785715715959668, + "grad_norm": 6.341939449310303, + "kl": 3.50390625, + "learning_rate": 3.4732146074783477e-09, + "loss": 0.2226, + "reward": 0.4776785969734192, + "reward_std": 0.17072607204318047, + "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4330357313156128, "step": 3187 }, { "clip_ratio": 0.0, - "completion_length": 1599.0625915527344, + "completion_length": 1778.85498046875, "epoch": 0.9522813830184452, - "grad_norm": 12.990900039672852, - "kl": 0.239501953125, - "learning_rate": 6.860068676988906e-10, - "loss": 0.1213, - "reward": 0.396205373108387, - "reward_std": 0.21257450059056282, - "rewards/accuracy_reward": 0.053571431431919336, + "grad_norm": 19.17293357849121, + "kl": 3.181640625, + "learning_rate": 3.430034338494453e-09, + "loss": 0.1825, + "reward": 0.5312500298023224, + "reward_std": 0.18148424476385117, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339477300644, + "rewards/tag_count_reward": 0.4375000149011612, "step": 3188 }, { "clip_ratio": 0.0, - "completion_length": 1668.8884582519531, + "completion_length": 1834.7188415527344, "epoch": 0.9525800911059666, - "grad_norm": 11.04008960723877, - "kl": 0.271484375, - "learning_rate": 6.774244626933489e-10, - "loss": 0.1242, - "reward": 0.4213169887661934, - "reward_std": 0.20087797194719315, - "rewards/accuracy_reward": 0.1160714328289032, + "grad_norm": 10.420117378234863, + "kl": 4.1796875, + "learning_rate": 3.387122313466745e-09, + "loss": 0.2216, + "reward": 0.5446428880095482, + "reward_std": 0.18158254399895668, + "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455559372902, + "rewards/tag_count_reward": 0.4040178805589676, "step": 3189 }, { "clip_ratio": 0.0, - "completion_length": 1599.8527221679688, + "completion_length": 1749.8750610351562, "epoch": 0.9528787991934882, - "grad_norm": 11.46895980834961, - "kl": 0.251220703125, - "learning_rate": 6.688957158158593e-10, - "loss": 0.1205, - "reward": 0.4386160895228386, - "reward_std": 0.1744261048734188, - "rewards/accuracy_reward": 0.1138392947614193, + "grad_norm": 6.479268550872803, + "kl": 4.19140625, + "learning_rate": 3.344478579079296e-09, + "loss": 0.2208, + "reward": 0.545758955180645, + "reward_std": 0.13298838585615158, + "rewards/accuracy_reward": 0.12500000861473382, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4207589477300644, "step": 3190 }, { "clip_ratio": 0.0, - "completion_length": 1580.9442443847656, + "completion_length": 1747.2969360351562, "epoch": 0.9531775072810096, - "grad_norm": 12.716561317443848, - "kl": 0.2470703125, - "learning_rate": 6.604206363448661e-10, - "loss": 0.1288, - "reward": 0.4693080484867096, - "reward_std": 0.17633166536688805, - "rewards/accuracy_reward": 0.12500000861473382, + "grad_norm": 13.04605770111084, + "kl": 4.15234375, + "learning_rate": 3.3021031817243305e-09, + "loss": 0.2672, + "reward": 0.5485491305589676, + "reward_std": 0.13957325369119644, + "rewards/accuracy_reward": 0.12500000419095159, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3443080484867096, + "rewards/tag_count_reward": 0.423549123108387, "step": 3191 }, { "clip_ratio": 0.0, - "completion_length": 1672.5670166015625, + "completion_length": 1803.9643249511719, "epoch": 0.9534762153685311, - "grad_norm": 10.660918235778809, - "kl": 0.272216796875, - "learning_rate": 6.519992335004221e-10, - "loss": 0.1087, - "reward": 0.388392873108387, - "reward_std": 0.1753869205713272, - "rewards/accuracy_reward": 0.08482143143191934, + "grad_norm": 8.232499122619629, + "kl": 4.0859375, + "learning_rate": 3.25999616750211e-09, + "loss": 0.2353, + "reward": 0.5133928805589676, + "reward_std": 0.14984129182994366, + "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3035714440047741, + "rewards/tag_count_reward": 0.4062500149011612, "step": 3192 }, { "clip_ratio": 0.0, - "completion_length": 1663.49560546875, + "completion_length": 1763.87060546875, "epoch": 0.9537749234560525, - "grad_norm": 13.246490478515625, - "kl": 0.2744140625, - "learning_rate": 6.436315164441875e-10, - "loss": 0.1264, - "reward": 0.4296875223517418, - "reward_std": 0.17197741195559502, - "rewards/accuracy_reward": 0.12276786309666932, + "grad_norm": 5.395274639129639, + "kl": 4.078125, + "learning_rate": 3.2181575822209373e-09, + "loss": 0.2314, + "reward": 0.5418526977300644, + "reward_std": 0.1361988317221403, + "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196492433548, + "rewards/tag_count_reward": 0.4168526977300644, "step": 3193 }, { "clip_ratio": 0.0, - "completion_length": 1589.6072082519531, + "completion_length": 1772.0290832519531, "epoch": 0.954073631543574, - "grad_norm": 11.85213565826416, - "kl": 0.235107421875, - "learning_rate": 6.353174942794138e-10, - "loss": 0.1427, - "reward": 0.4168526902794838, - "reward_std": 0.21588028222322464, - "rewards/accuracy_reward": 0.07812500558793545, + "grad_norm": 15.65932846069336, + "kl": 5.5078125, + "learning_rate": 3.1765874713970685e-09, + "loss": 0.3223, + "reward": 0.4854910895228386, + "reward_std": 0.17267966642975807, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4006696566939354, "step": 3194 }, { "clip_ratio": 0.0, - "completion_length": 1620.7701721191406, + "completion_length": 1790.774658203125, "epoch": 0.9543723396310955, - "grad_norm": 12.620087623596191, - "kl": 0.247802734375, - "learning_rate": 6.270571760509546e-10, - "loss": 0.1349, - "reward": 0.4665178954601288, - "reward_std": 0.21424733474850655, - "rewards/accuracy_reward": 0.13616072246804833, + "grad_norm": 7.965880870819092, + "kl": 3.8046875, + "learning_rate": 3.1352858802547734e-09, + "loss": 0.2394, + "reward": 0.5613839477300644, + "reward_std": 0.1668187379837036, + "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4274553805589676, "step": 3195 }, { "clip_ratio": 0.0, - "completion_length": 1662.8638916015625, + "completion_length": 1829.0246276855469, "epoch": 0.954671047718617, - "grad_norm": 13.170239448547363, - "kl": 0.260986328125, - "learning_rate": 6.188505707452163e-10, - "loss": 0.1335, - "reward": 0.3934151902794838, - "reward_std": 0.1814696490764618, - "rewards/accuracy_reward": 0.07812500232830644, + "grad_norm": 9.920350074768066, + "kl": 4.8125, + "learning_rate": 3.0942528537260816e-09, + "loss": 0.2439, + "reward": 0.4960937649011612, + "reward_std": 0.15205425955355167, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4068080559372902, "step": 3196 }, { "clip_ratio": 0.0, - "completion_length": 1695.6786499023438, + "completion_length": 1793.6808776855469, "epoch": 0.9549697558061384, - "grad_norm": 10.3033447265625, - "kl": 0.27294921875, - "learning_rate": 6.106976872901792e-10, - "loss": 0.1127, - "reward": 0.4017857313156128, - "reward_std": 0.20642118528485298, - "rewards/accuracy_reward": 0.1093750037252903, + "grad_norm": 12.731158256530762, + "kl": 3.83984375, + "learning_rate": 3.053488436450896e-09, + "loss": 0.2385, + "reward": 0.5256696566939354, + "reward_std": 0.16851279698312283, + "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2924107238650322, + "rewards/tag_count_reward": 0.411830373108387, "step": 3197 }, { "clip_ratio": 0.0, - "completion_length": 1587.9665832519531, + "completion_length": 1784.9554138183594, "epoch": 0.9552684638936599, - "grad_norm": 14.665741920471191, - "kl": 0.220458984375, - "learning_rate": 6.025985345553985e-10, - "loss": 0.1315, - "reward": 0.4174107387661934, - "reward_std": 0.18534554168581963, - "rewards/accuracy_reward": 0.08705357694998384, + "grad_norm": 7.935122489929199, + "kl": 3.79296875, + "learning_rate": 3.0129926727769926e-09, + "loss": 0.2218, + "reward": 0.5106026977300644, + "reward_std": 0.158831886947155, + "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.330357164144516, + "rewards/tag_count_reward": 0.4146205559372902, "step": 3198 }, { "clip_ratio": 0.0, - "completion_length": 1563.0916137695312, + "completion_length": 1692.1630249023438, "epoch": 0.9555671719811814, - "grad_norm": 13.6808500289917, - "kl": 0.2197265625, - "learning_rate": 5.945531213519373e-10, - "loss": 0.1389, - "reward": 0.4609375223517418, - "reward_std": 0.20476964116096497, - "rewards/accuracy_reward": 0.12276786286383867, + "grad_norm": 3.705775737762451, + "kl": 3.6328125, + "learning_rate": 2.972765606759686e-09, + "loss": 0.2013, + "reward": 0.5781250223517418, + "reward_std": 0.2159331515431404, + "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4375000223517418, "step": 3199 }, { "clip_ratio": 0.0, - "completion_length": 1684.6697387695312, + "completion_length": 1828.0156860351562, "epoch": 0.9558658800687029, - "grad_norm": 10.693188667297363, - "kl": 0.246826171875, - "learning_rate": 5.865614564324273e-10, - "loss": 0.1089, - "reward": 0.404017873108387, - "reward_std": 0.17706066742539406, - "rewards/accuracy_reward": 0.10491072130389512, + "grad_norm": 25.695594787597656, + "kl": 5.859375, + "learning_rate": 2.932807282162136e-09, + "loss": 0.3036, + "reward": 0.5184151977300644, + "reward_std": 0.14088300615549088, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071492433548, + "rewards/tag_count_reward": 0.4023437649011612, "step": 3200 }, { "clip_ratio": 0.0, - "completion_length": 1722.6094360351562, + "completion_length": 1874.7568054199219, "epoch": 0.9561645881562243, - "grad_norm": 9.741623878479004, - "kl": 0.240234375, - "learning_rate": 5.78623548491014e-10, - "loss": 0.1049, - "reward": 0.3543526902794838, - "reward_std": 0.16373609751462936, - "rewards/accuracy_reward": 0.04687500232830644, + "grad_norm": 5.909251689910889, + "kl": 4.36328125, + "learning_rate": 2.8931177424550703e-09, + "loss": 0.2182, + "reward": 0.459263414144516, + "reward_std": 0.13483469188213348, + "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.4123884066939354, "step": 3201 }, { "clip_ratio": 0.0, - "completion_length": 1658.3304443359375, + "completion_length": 1782.8795471191406, "epoch": 0.9564632962437458, - "grad_norm": 12.80587100982666, - "kl": 0.249755859375, - "learning_rate": 5.707394061633675e-10, - "loss": 0.129, - "reward": 0.3627232313156128, - "reward_std": 0.19168058782815933, - "rewards/accuracy_reward": 0.06250000232830644, + "grad_norm": 8.672348976135254, + "kl": 4.6484375, + "learning_rate": 2.8536970308168375e-09, + "loss": 0.2647, + "reward": 0.4977678805589676, + "reward_std": 0.1677652969956398, + "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3002232238650322, + "rewards/tag_count_reward": 0.4151785895228386, "step": 3202 }, { "clip_ratio": 0.0, - "completion_length": 1632.4397888183594, + "completion_length": 1720.0134887695312, "epoch": 0.9567620043312672, - "grad_norm": 7.973084926605225, - "kl": 0.23828125, - "learning_rate": 5.629090380266544e-10, - "loss": 0.1032, - "reward": 0.3900669813156128, - "reward_std": 0.1829211302101612, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 4.105799198150635, + "kl": 3.91015625, + "learning_rate": 2.814545190133272e-09, + "loss": 0.2326, + "reward": 0.5234375223517418, + "reward_std": 0.16041920334100723, + "rewards/accuracy_reward": 0.10267857275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455559372902, + "rewards/tag_count_reward": 0.4207589477300644, "step": 3203 }, { "clip_ratio": 0.0, - "completion_length": 1635.3772888183594, + "completion_length": 1759.7567749023438, "epoch": 0.9570607124187888, - "grad_norm": 9.273287773132324, - "kl": 0.218505859375, - "learning_rate": 5.551324525995604e-10, - "loss": 0.1036, - "reward": 0.3588169738650322, - "reward_std": 0.213894072920084, - "rewards/accuracy_reward": 0.03571428847499192, + "grad_norm": 5.669576168060303, + "kl": 3.69140625, + "learning_rate": 2.7756622629978023e-09, + "loss": 0.21, + "reward": 0.471540205180645, + "reward_std": 0.18292231857776642, + "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3231026902794838, + "rewards/tag_count_reward": 0.4246651977300644, "step": 3204 }, { "clip_ratio": 0.0, - "completion_length": 1585.3683776855469, + "completion_length": 1761.5603332519531, "epoch": 0.9573594205063102, - "grad_norm": 11.855996131896973, - "kl": 0.208984375, - "learning_rate": 5.474096583422349e-10, - "loss": 0.1218, - "reward": 0.407924123108387, - "reward_std": 0.20713147521018982, - "rewards/accuracy_reward": 0.0714285746216774, + "grad_norm": 8.462360382080078, + "kl": 3.076171875, + "learning_rate": 2.7370482917111747e-09, + "loss": 0.1753, + "reward": 0.533482164144516, + "reward_std": 0.17462538182735443, + "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.439732164144516, "step": 3205 }, { "clip_ratio": 0.0, - "completion_length": 1582.5313415527344, + "completion_length": 1683.3437805175781, "epoch": 0.9576581285938317, - "grad_norm": 13.021567344665527, - "kl": 0.22119140625, - "learning_rate": 5.397406636563296e-10, - "loss": 0.1667, - "reward": 0.3498883992433548, - "reward_std": 0.2151344083249569, - "rewards/accuracy_reward": 0.024553572293370962, + "grad_norm": 12.1578369140625, + "kl": 3.265625, + "learning_rate": 2.6987033182816478e-09, + "loss": 0.2162, + "reward": 0.4642857313156128, + "reward_std": 0.15591027028858662, + "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.433035746216774, "step": 3206 }, { "clip_ratio": 0.0, - "completion_length": 1723.7723693847656, + "completion_length": 1812.6607971191406, "epoch": 0.9579568366813531, - "grad_norm": 11.382416725158691, - "kl": 0.237548828125, - "learning_rate": 5.321254768849482e-10, - "loss": 0.1116, - "reward": 0.329241082072258, - "reward_std": 0.18987061828374863, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 6.98715353012085, + "kl": 4.796875, + "learning_rate": 2.6606273844247405e-09, + "loss": 0.2678, + "reward": 0.435825914144516, + "reward_std": 0.16061965562403202, + "rewards/accuracy_reward": 0.024553572991862893, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.313616082072258, + "rewards/tag_count_reward": 0.4112723395228386, "step": 3207 }, { "clip_ratio": 0.0, - "completion_length": 1565.7210388183594, + "completion_length": 1717.1898193359375, "epoch": 0.9582555447688746, - "grad_norm": 12.834395408630371, - "kl": 0.229248046875, - "learning_rate": 5.245641063126804e-10, - "loss": 0.1323, - "reward": 0.403459832072258, - "reward_std": 0.17406092584133148, - "rewards/accuracy_reward": 0.06696428917348385, + "grad_norm": 10.775981903076172, + "kl": 3.16796875, + "learning_rate": 2.6228205315634023e-09, + "loss": 0.2059, + "reward": 0.5407366380095482, + "reward_std": 0.17142374627292156, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955559372902, + "rewards/tag_count_reward": 0.440290205180645, "step": 3208 }, { "clip_ratio": 0.0, - "completion_length": 1606.7947387695312, + "completion_length": 1776.7388916015625, "epoch": 0.9585542528563961, - "grad_norm": 13.917374610900879, - "kl": 0.21484375, - "learning_rate": 5.170565601655519e-10, - "loss": 0.1363, - "reward": 0.4659598469734192, - "reward_std": 0.19197675213217735, - "rewards/accuracy_reward": 0.13392857694998384, + "grad_norm": 3.871722459793091, + "kl": 3.81640625, + "learning_rate": 2.5852828008277593e-09, + "loss": 0.2029, + "reward": 0.5507812723517418, + "reward_std": 0.14525078795850277, + "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312723517418, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3209 }, { "clip_ratio": 0.0, - "completion_length": 1612.5781860351562, + "completion_length": 1753.7166137695312, "epoch": 0.9588529609439176, - "grad_norm": 13.75051212310791, - "kl": 0.234375, - "learning_rate": 5.096028466110347e-10, - "loss": 0.1453, - "reward": 0.4386160969734192, - "reward_std": 0.17219550907611847, - "rewards/accuracy_reward": 0.12500000488944352, + "grad_norm": 16.75496482849121, + "kl": 5.109375, + "learning_rate": 2.5480142330551736e-09, + "loss": 0.2791, + "reward": 0.5340401977300644, + "reward_std": 0.14148659072816372, + "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160895228386, + "rewards/tag_count_reward": 0.4090401977300644, "step": 3210 }, { "clip_ratio": 0.0, - "completion_length": 1717.4197082519531, + "completion_length": 1855.8304443359375, "epoch": 0.959151669031439, - "grad_norm": 12.306219100952148, - "kl": 0.244873046875, - "learning_rate": 5.022029737580424e-10, - "loss": 0.1159, - "reward": 0.3515625149011612, - "reward_std": 0.196803230792284, - "rewards/accuracy_reward": 0.055803575087338686, + "grad_norm": 6.700922966003418, + "kl": 4.26171875, + "learning_rate": 2.5110148687902123e-09, + "loss": 0.2248, + "reward": 0.4609375149011612, + "reward_std": 0.14860033616423607, + "rewards/accuracy_reward": 0.051339289639145136, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2957589402794838, + "rewards/tag_count_reward": 0.4095982313156128, "step": 3211 }, { "clip_ratio": 0.0, - "completion_length": 1610.3036804199219, + "completion_length": 1764.5692443847656, "epoch": 0.9594503771189605, - "grad_norm": 12.47861099243164, - "kl": 0.269287109375, - "learning_rate": 4.948569496569078e-10, - "loss": 0.1197, - "reward": 0.3543526977300644, - "reward_std": 0.1952795647084713, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 17.808759689331055, + "kl": 3.984375, + "learning_rate": 2.474284748284539e-09, + "loss": 0.2201, + "reward": 0.5011160895228386, + "reward_std": 0.19133322685956955, + "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776977300644, + "rewards/tag_count_reward": 0.4252232313156128, "step": 3212 }, { "clip_ratio": 0.0, - "completion_length": 1678.8772888183594, + "completion_length": 1834.571533203125, "epoch": 0.9597490852064819, - "grad_norm": 12.336080551147461, - "kl": 0.232666015625, - "learning_rate": 4.875647822993822e-10, - "loss": 0.1256, - "reward": 0.3956473395228386, - "reward_std": 0.2049204297363758, - "rewards/accuracy_reward": 0.07366071827709675, + "grad_norm": 5.078474998474121, + "kl": 3.60546875, + "learning_rate": 2.437823911496911e-09, + "loss": 0.2053, + "reward": 0.4760044813156128, + "reward_std": 0.16334300860762596, + "rewards/accuracy_reward": 0.06696429010480642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4090401977300644, "step": 3213 }, { "clip_ratio": 0.0, - "completion_length": 1644.8193054199219, + "completion_length": 1768.2813110351562, "epoch": 0.9600477932940035, - "grad_norm": 13.025069236755371, - "kl": 0.239013671875, - "learning_rate": 4.803264796186368e-10, - "loss": 0.1434, - "reward": 0.3850446566939354, - "reward_std": 0.17622138187289238, - "rewards/accuracy_reward": 0.0781250037252903, + "grad_norm": 3.833346128463745, + "kl": 4.48046875, + "learning_rate": 2.401632398093184e-09, + "loss": 0.2626, + "reward": 0.5072544887661934, + "reward_std": 0.15252332389354706, + "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3069196492433548, + "rewards/tag_count_reward": 0.4157366305589676, "step": 3214 }, { "clip_ratio": 0.0, - "completion_length": 1603.13623046875, + "completion_length": 1732.7634887695312, "epoch": 0.9603465013815249, - "grad_norm": 12.041620254516602, - "kl": 0.283447265625, - "learning_rate": 4.731420494892336e-10, - "loss": 0.1287, - "reward": 0.3822544813156128, - "reward_std": 0.1804664209485054, - "rewards/accuracy_reward": 0.08482143399305642, + "grad_norm": 18.387584686279297, + "kl": 4.87890625, + "learning_rate": 2.3657102474461677e-09, + "loss": 0.2854, + "reward": 0.5195312798023224, + "reward_std": 0.1605258844792843, + "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2974330484867096, + "rewards/tag_count_reward": 0.4146205484867096, "step": 3215 }, { "clip_ratio": 0.0, - "completion_length": 1616.3504943847656, + "completion_length": 1786.2857971191406, "epoch": 0.9606452094690464, - "grad_norm": 10.591991424560547, - "kl": 0.21533203125, - "learning_rate": 4.660114997271258e-10, - "loss": 0.1209, - "reward": 0.391741082072258, - "reward_std": 0.18019946664571762, - "rewards/accuracy_reward": 0.0602678619325161, + "grad_norm": 13.01712703704834, + "kl": 3.76953125, + "learning_rate": 2.330057498635629e-09, + "loss": 0.2075, + "reward": 0.4815848395228386, + "reward_std": 0.12791107781231403, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.428013414144516, "step": 3216 }, { "clip_ratio": 0.0, - "completion_length": 1536.6361999511719, + "completion_length": 1707.8706359863281, "epoch": 0.9609439175565678, - "grad_norm": 12.314233779907227, - "kl": 0.228515625, - "learning_rate": 4.5893483808965294e-10, - "loss": 0.1285, - "reward": 0.3738839402794838, - "reward_std": 0.1572633720934391, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 3.9677698612213135, + "kl": 4.42578125, + "learning_rate": 2.2946741904482646e-09, + "loss": 0.26, + "reward": 0.4681919813156128, + "reward_std": 0.13768784515559673, + "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.4190848395228386, "step": 3217 }, { "clip_ratio": 0.0, - "completion_length": 1593.8170166015625, + "completion_length": 1757.8125610351562, "epoch": 0.9612426256440894, - "grad_norm": 11.322098731994629, - "kl": 0.245361328125, - "learning_rate": 4.519120722755343e-10, - "loss": 0.1336, - "reward": 0.3688616305589676, - "reward_std": 0.17228158935904503, - "rewards/accuracy_reward": 0.04910714412108064, + "grad_norm": 12.316123962402344, + "kl": 4.49609375, + "learning_rate": 2.2595603613776713e-09, + "loss": 0.2568, + "reward": 0.4888393133878708, + "reward_std": 0.16414214670658112, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3197544813156128, + "rewards/tag_count_reward": 0.419642873108387, "step": 3218 }, { "clip_ratio": 0.0, - "completion_length": 1615.7166137695312, + "completion_length": 1777.01123046875, "epoch": 0.9615413337316108, - "grad_norm": 9.67038631439209, - "kl": 0.227294921875, - "learning_rate": 4.449432099248418e-10, - "loss": 0.1038, - "reward": 0.3493303805589676, - "reward_std": 0.17020573280751705, - "rewards/accuracy_reward": 0.0200892873108387, + "grad_norm": 6.586704254150391, + "kl": 3.40234375, + "learning_rate": 2.224716049624209e-09, + "loss": 0.1906, + "reward": 0.4514509215950966, + "reward_std": 0.14094720594584942, + "rewards/accuracy_reward": 0.03348214365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4179687723517418, "step": 3219 }, { "clip_ratio": 0.0, - "completion_length": 1649.341552734375, + "completion_length": 1749.7009887695312, "epoch": 0.9618400418191323, - "grad_norm": 12.284724235534668, - "kl": 0.21630859375, - "learning_rate": 4.3802825861902206e-10, - "loss": 0.1173, - "reward": 0.3660714477300644, - "reward_std": 0.14984987676143646, - "rewards/accuracy_reward": 0.03794643026776612, + "grad_norm": 13.039702415466309, + "kl": 3.16015625, + "learning_rate": 2.1901412930951103e-09, + "loss": 0.204, + "reward": 0.486607164144516, + "reward_std": 0.11408872716128826, + "rewards/accuracy_reward": 0.0468750037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.439732164144516, "step": 3220 }, { "clip_ratio": 0.0, - "completion_length": 1636.0737609863281, + "completion_length": 1779.9688110351562, "epoch": 0.9621387499066537, - "grad_norm": 10.475423812866211, - "kl": 0.23388671875, - "learning_rate": 4.311672258808574e-10, - "loss": 0.1133, - "reward": 0.3649553805589676, - "reward_std": 0.1643117144703865, - "rewards/accuracy_reward": 0.046875003492459655, + "grad_norm": 13.141895294189453, + "kl": 4.2890625, + "learning_rate": 2.155836129404287e-09, + "loss": 0.2534, + "reward": 0.4972098395228386, + "reward_std": 0.1626892313361168, + "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3180803805589676, + "rewards/tag_count_reward": 0.4257812649011612, "step": 3221 }, { "clip_ratio": 0.0, - "completion_length": 1575.2009887695312, + "completion_length": 1695.446533203125, "epoch": 0.9624374579941752, - "grad_norm": 10.318424224853516, - "kl": 0.22216796875, - "learning_rate": 4.243601191744828e-10, - "loss": 0.1225, - "reward": 0.4508928805589676, - "reward_std": 0.16470800898969173, + "grad_norm": 10.435662269592285, + "kl": 3.7890625, + "learning_rate": 2.121800595872414e-09, + "loss": 0.2309, + "reward": 0.5580357313156128, + "reward_std": 0.13018103316426277, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325892873108387, + "rewards/tag_count_reward": 0.4330357313156128, "step": 3222 }, { "clip_ratio": 0.0, - "completion_length": 1538.5335388183594, + "completion_length": 1675.6697387695312, "epoch": 0.9627361660816967, - "grad_norm": 10.542007446289062, - "kl": 0.21826171875, - "learning_rate": 4.1760694590536883e-10, - "loss": 0.1282, - "reward": 0.3722098395228386, - "reward_std": 0.1697671264410019, - "rewards/accuracy_reward": 0.03348214481957257, + "grad_norm": 12.00804615020752, + "kl": 3.25390625, + "learning_rate": 2.088034729526844e-09, + "loss": 0.2002, + "reward": 0.4525669738650322, + "reward_std": 0.11714149080216885, + "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276977300644, + "rewards/tag_count_reward": 0.4324776902794838, "step": 3223 }, { "clip_ratio": 0.0, - "completion_length": 1558.6161499023438, + "completion_length": 1662.0648193359375, "epoch": 0.9630348741692182, - "grad_norm": 11.893441200256348, - "kl": 0.216064453125, - "learning_rate": 4.109077134202998e-10, - "loss": 0.1568, - "reward": 0.4687500149011612, - "reward_std": 0.23206495866179466, - "rewards/accuracy_reward": 0.12723214528523386, - "rewards/format_reward": 0.004464285913854837, - "rewards/tag_count_reward": 0.3370535895228386, + "grad_norm": 3.8778293132781982, + "kl": 3.73046875, + "learning_rate": 2.054538567101499e-09, + "loss": 0.2364, + "reward": 0.580357164144516, + "reward_std": 0.18404560908675194, + "rewards/accuracy_reward": 0.15178572479635477, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3224 }, { "clip_ratio": 0.0, - "completion_length": 1641.9866638183594, + "completion_length": 1820.1094360351562, "epoch": 0.9633335822567396, - "grad_norm": 11.310487747192383, - "kl": 0.234619140625, - "learning_rate": 4.0426242900737926e-10, - "loss": 0.1223, - "reward": 0.435267873108387, - "reward_std": 0.19178173691034317, - "rewards/accuracy_reward": 0.12053571874275804, + "grad_norm": 6.830080032348633, + "kl": 3.89453125, + "learning_rate": 2.0213121450368963e-09, + "loss": 0.2165, + "reward": 0.5792410969734192, + "reward_std": 0.19683990254998207, + "rewards/accuracy_reward": 0.1584821455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321566939354, + "rewards/tag_count_reward": 0.4207589477300644, "step": 3225 }, { "clip_ratio": 0.0, - "completion_length": 1586.5313110351562, + "completion_length": 1782.0179138183594, "epoch": 0.9636322903442611, - "grad_norm": 10.721861839294434, - "kl": 0.236328125, - "learning_rate": 3.9767109989604084e-10, - "loss": 0.1122, - "reward": 0.392299123108387, - "reward_std": 0.17152101173996925, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 4.4903693199157715, + "kl": 4.640625, + "learning_rate": 1.9883554994802044e-09, + "loss": 0.2569, + "reward": 0.482700914144516, + "reward_std": 0.15014082193374634, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3186384066939354, + "rewards/tag_count_reward": 0.4135044813156128, "step": 3226 }, { "clip_ratio": 0.0, - "completion_length": 1711.6496276855469, + "completion_length": 1838.9375915527344, "epoch": 0.9639309984317825, - "grad_norm": 11.3810396194458, - "kl": 0.2470703125, - "learning_rate": 3.911337332569875e-10, - "loss": 0.1188, - "reward": 0.3688616156578064, - "reward_std": 0.15355907753109932, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 10.182792663574219, + "kl": 4.56640625, + "learning_rate": 1.9556686662849377e-09, + "loss": 0.2401, + "reward": 0.4732143133878708, + "reward_std": 0.12268347665667534, + "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2952008992433548, + "rewards/tag_count_reward": 0.397321455180645, "step": 3227 }, { "clip_ratio": 0.0, - "completion_length": 1580.6630249023438, + "completion_length": 1753.3639221191406, "epoch": 0.9642297065193041, - "grad_norm": 12.011466979980469, - "kl": 0.226806640625, - "learning_rate": 3.846503362022413e-10, - "loss": 0.139, - "reward": 0.4469866305589676, - "reward_std": 0.20663845539093018, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 14.042912483215332, + "kl": 3.1015625, + "learning_rate": 1.9232516810112064e-09, + "loss": 0.1831, + "reward": 0.5675223469734192, + "reward_std": 0.1663905456662178, + "rewards/accuracy_reward": 0.13392857555299997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4335937723517418, "step": 3228 }, { "clip_ratio": 0.0, - "completion_length": 1599.4375610351562, + "completion_length": 1748.8616943359375, "epoch": 0.9645284146068255, - "grad_norm": 12.00692081451416, - "kl": 0.218994140625, - "learning_rate": 3.7822091578510463e-10, - "loss": 0.1368, - "reward": 0.4330357313156128, - "reward_std": 0.22763565182685852, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 8.678559303283691, + "kl": 3.93359375, + "learning_rate": 1.891104578925523e-09, + "loss": 0.2227, + "reward": 0.545200914144516, + "reward_std": 0.19857880100607872, + "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3370535895228386, + "rewards/tag_count_reward": 0.4268973469734192, "step": 3229 }, { "clip_ratio": 0.0, - "completion_length": 1508.3482666015625, + "completion_length": 1701.5693054199219, "epoch": 0.9648271226943469, - "grad_norm": 11.589564323425293, - "kl": 0.200439453125, - "learning_rate": 3.7184547900015463e-10, - "loss": 0.1383, - "reward": 0.4073660895228386, - "reward_std": 0.1760479360818863, - "rewards/accuracy_reward": 0.05580357555299997, + "grad_norm": 13.495780944824219, + "kl": 3.55859375, + "learning_rate": 1.859227395000773e-09, + "loss": 0.2209, + "reward": 0.5145089626312256, + "reward_std": 0.1673678308725357, + "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3515625149011612, + "rewards/tag_count_reward": 0.4341518059372902, "step": 3230 }, { "clip_ratio": 0.0, - "completion_length": 1601.7857971191406, + "completion_length": 1743.5157165527344, "epoch": 0.9651258307818684, - "grad_norm": 13.533483505249023, - "kl": 0.222900390625, - "learning_rate": 3.6552403278324315e-10, - "loss": 0.1541, - "reward": 0.4056919887661934, - "reward_std": 0.1740151382982731, - "rewards/accuracy_reward": 0.08035714784637094, + "grad_norm": 13.046490669250488, + "kl": 5.2890625, + "learning_rate": 1.827620163916216e-09, + "loss": 0.307, + "reward": 0.495535746216774, + "reward_std": 0.12875139899551868, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4107143133878708, "step": 3231 }, { "clip_ratio": 0.0, - "completion_length": 1557.6741638183594, + "completion_length": 1742.5380249023438, "epoch": 0.9654245388693898, - "grad_norm": 11.592642784118652, - "kl": 0.21044921875, - "learning_rate": 3.592565840114803e-10, - "loss": 0.1238, - "reward": 0.361607164144516, - "reward_std": 0.17268460988998413, - "rewards/accuracy_reward": 0.020089286845177412, + "grad_norm": 9.2525634765625, + "kl": 2.9609375, + "learning_rate": 1.7962829200574015e-09, + "loss": 0.1717, + "reward": 0.4620535895228386, + "reward_std": 0.1367003209888935, + "rewards/accuracy_reward": 0.02232142980210483, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.341517873108387, + "rewards/tag_count_reward": 0.439732164144516, "step": 3232 }, { "clip_ratio": 0.0, - "completion_length": 1561.5893859863281, + "completion_length": 1721.3460693359375, "epoch": 0.9657232469569114, - "grad_norm": 12.345383644104004, - "kl": 0.21826171875, - "learning_rate": 3.530431395032396e-10, - "loss": 0.1396, - "reward": 0.379464291036129, - "reward_std": 0.19867490604519844, - "rewards/accuracy_reward": 0.04910714668221772, + "grad_norm": 11.904914855957031, + "kl": 4.828125, + "learning_rate": 1.765215697516198e-09, + "loss": 0.2804, + "reward": 0.4849330559372902, + "reward_std": 0.1466090828180313, + "rewards/accuracy_reward": 0.07142857648432255, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4135044813156128, "step": 3233 }, { "clip_ratio": 0.0, - "completion_length": 1650.9598693847656, + "completion_length": 1771.024658203125, "epoch": 0.9660219550444328, - "grad_norm": 8.1951322555542, - "kl": 0.242431640625, - "learning_rate": 3.468837060181362e-10, - "loss": 0.0969, - "reward": 0.3750000149011612, - "reward_std": 0.19366420432925224, - "rewards/accuracy_reward": 0.0758928619325161, + "grad_norm": 5.6642961502075195, + "kl": 3.875, + "learning_rate": 1.734418530090681e-09, + "loss": 0.2206, + "reward": 0.5022321715950966, + "reward_std": 0.17550724558532238, + "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.299107164144516, + "rewards/tag_count_reward": 0.4174107313156128, "step": 3234 }, { "clip_ratio": 0.0, - "completion_length": 1632.4241638183594, + "completion_length": 1757.2009582519531, "epoch": 0.9663206631319543, - "grad_norm": 12.34991455078125, - "kl": 0.2197265625, - "learning_rate": 3.407782902570322e-10, - "loss": 0.1393, - "reward": 0.4324776977300644, - "reward_std": 0.19356803968548775, - "rewards/accuracy_reward": 0.10267857578583062, + "grad_norm": 6.1873626708984375, + "kl": 3.59765625, + "learning_rate": 1.703891451285161e-09, + "loss": 0.2164, + "reward": 0.5563616454601288, + "reward_std": 0.1640193685889244, + "rewards/accuracy_reward": 0.1316964386496693, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329799123108387, + "rewards/tag_count_reward": 0.4246651977300644, "step": 3235 }, { "clip_ratio": 0.0, - "completion_length": 1594.7813415527344, + "completion_length": 1734.9107666015625, "epoch": 0.9666193712194757, - "grad_norm": 13.079147338867188, - "kl": 0.218505859375, - "learning_rate": 3.3472689886202554e-10, - "loss": 0.1442, - "reward": 0.4095982387661934, - "reward_std": 0.15361764281988144, - "rewards/accuracy_reward": 0.0736607164144516, + "grad_norm": 11.023072242736816, + "kl": 3.54296875, + "learning_rate": 1.6736344943101277e-09, + "loss": 0.2383, + "reward": 0.510602705180645, + "reward_std": 0.12545725144445896, + "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4280134066939354, "step": 3236 }, { "clip_ratio": 0.0, - "completion_length": 1616.8951416015625, + "completion_length": 1757.58935546875, "epoch": 0.9669180793069972, - "grad_norm": 10.797182083129883, - "kl": 0.232177734375, - "learning_rate": 3.2872953841642794e-10, - "loss": 0.1299, - "reward": 0.4319196566939354, - "reward_std": 0.19638054445385933, - "rewards/accuracy_reward": 0.10937500721774995, + "grad_norm": 8.52560806274414, + "kl": 3.9765625, + "learning_rate": 1.6436476920821396e-09, + "loss": 0.2214, + "reward": 0.5535714626312256, + "reward_std": 0.18101037666201591, + "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3237 }, { "clip_ratio": 0.0, - "completion_length": 1608.7031860351562, + "completion_length": 1788.4576416015625, "epoch": 0.9672167873945187, - "grad_norm": 12.181683540344238, - "kl": 0.198974609375, - "learning_rate": 3.227862154447869e-10, - "loss": 0.1313, - "reward": 0.3856026902794838, - "reward_std": 0.1844737622886896, - "rewards/accuracy_reward": 0.03125000116415322, + "grad_norm": 5.21449613571167, + "kl": 3.515625, + "learning_rate": 1.6139310772239346e-09, + "loss": 0.1935, + "reward": 0.4827009215950966, + "reward_std": 0.1373145617544651, + "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3543526902794838, + "rewards/tag_count_reward": 0.4335937649011612, "step": 3238 }, { "clip_ratio": 0.0, - "completion_length": 1664.4264221191406, + "completion_length": 1839.9152526855469, "epoch": 0.9675154954820402, - "grad_norm": 10.420833587646484, - "kl": 0.25341796875, - "learning_rate": 3.1689693641285267e-10, - "loss": 0.1098, - "reward": 0.3710937649011612, - "reward_std": 0.1934010088443756, - "rewards/accuracy_reward": 0.06473214412108064, + "grad_norm": 3.3413240909576416, + "kl": 4.71875, + "learning_rate": 1.5844846820642632e-09, + "loss": 0.2622, + "reward": 0.510044664144516, + "reward_std": 0.20998390763998032, + "rewards/accuracy_reward": 0.09821428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3063616156578064, + "rewards/tag_count_reward": 0.4118303805589676, "step": 3239 }, { "clip_ratio": 0.0, - "completion_length": 1522.4107971191406, + "completion_length": 1726.3348999023438, "epoch": 0.9678142035695616, - "grad_norm": 14.02145004272461, - "kl": 0.20556640625, - "learning_rate": 3.11061707727589e-10, - "loss": 0.17, - "reward": 0.3666294813156128, - "reward_std": 0.1683400794863701, - "rewards/accuracy_reward": 0.011160714784637094, + "grad_norm": 11.348340034484863, + "kl": 3.1796875, + "learning_rate": 1.555308538637945e-09, + "loss": 0.1876, + "reward": 0.4391741305589676, + "reward_std": 0.11797518283128738, + "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3554687723517418, + "rewards/tag_count_reward": 0.4302455633878708, "step": 3240 }, { "clip_ratio": 0.0, - "completion_length": 1634.0157165527344, + "completion_length": 1793.8728332519531, "epoch": 0.9681129116570831, - "grad_norm": 10.391838073730469, - "kl": 0.22021484375, - "learning_rate": 3.052805357371402e-10, - "loss": 0.1217, - "reward": 0.3431919813156128, - "reward_std": 0.201731588691473, - "rewards/accuracy_reward": 0.0267857164144516, + "grad_norm": 3.342447519302368, + "kl": 3.07421875, + "learning_rate": 1.526402678685701e-09, + "loss": 0.1839, + "reward": 0.4575893059372902, + "reward_std": 0.1572511661797762, + "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4308035969734192, "step": 3241 }, { "clip_ratio": 0.0, - "completion_length": 1646.5246276855469, + "completion_length": 1768.4085388183594, "epoch": 0.9684116197446045, - "grad_norm": 10.60085678100586, - "kl": 0.22900390625, - "learning_rate": 2.995534267308697e-10, - "loss": 0.1243, - "reward": 0.408482164144516, - "reward_std": 0.17149633169174194, - "rewards/accuracy_reward": 0.09151786053553224, + "grad_norm": 16.113948822021484, + "kl": 4.01953125, + "learning_rate": 1.4977671336543484e-09, + "loss": 0.2376, + "reward": 0.5217634215950966, + "reward_std": 0.1494471412152052, + "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.316964291036129, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3242 }, { "clip_ratio": 0.0, - "completion_length": 1643.7299499511719, + "completion_length": 1802.3281860351562, "epoch": 0.9687103278321261, - "grad_norm": 13.472392082214355, - "kl": 0.226318359375, - "learning_rate": 2.938803869392992e-10, - "loss": 0.139, - "reward": 0.448102705180645, - "reward_std": 0.19761697947978973, - "rewards/accuracy_reward": 0.12053572316654027, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3275669813156128, + "grad_norm": 7.708029747009277, + "kl": 3.17578125, + "learning_rate": 1.4694019346964958e-09, + "loss": 0.1761, + "reward": 0.5552455633878708, + "reward_std": 0.15770412608981133, + "rewards/accuracy_reward": 0.12723215157166123, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4280134066939354, "step": 3243 }, { "clip_ratio": 0.0, - "completion_length": 1625.6540832519531, + "completion_length": 1787.6563415527344, "epoch": 0.9690090359196475, - "grad_norm": 10.369513511657715, - "kl": 0.218994140625, - "learning_rate": 2.8826142253414733e-10, - "loss": 0.1162, - "reward": 0.451450914144516, - "reward_std": 0.1512911282479763, - "rewards/accuracy_reward": 0.1205357201397419, + "grad_norm": 7.213177680969238, + "kl": 4.48046875, + "learning_rate": 1.4413071126707367e-09, + "loss": 0.2409, + "reward": 0.5608259215950966, + "reward_std": 0.1370740346610546, + "rewards/accuracy_reward": 0.13616071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4246651977300644, "step": 3244 }, { "clip_ratio": 0.0, - "completion_length": 1688.7255249023438, + "completion_length": 1834.4911499023438, "epoch": 0.969307744007169, - "grad_norm": 11.670913696289062, - "kl": 0.241943359375, - "learning_rate": 2.82696539628291e-10, - "loss": 0.1207, - "reward": 0.371651791036129, - "reward_std": 0.2027040608227253, - "rewards/accuracy_reward": 0.06250000325962901, + "grad_norm": 6.7094831466674805, + "kl": 4.53125, + "learning_rate": 1.413482698141455e-09, + "loss": 0.2549, + "reward": 0.4849330559372902, + "reward_std": 0.16904442012310028, + "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4068080559372902, "step": 3245 }, { "clip_ratio": 0.0, - "completion_length": 1554.6116943359375, + "completion_length": 1709.4241943359375, "epoch": 0.9696064520946904, - "grad_norm": 12.041770935058594, - "kl": 0.237060546875, - "learning_rate": 2.771857442757819e-10, - "loss": 0.1544, - "reward": 0.3738839477300644, - "reward_std": 0.19203253462910652, - "rewards/accuracy_reward": 0.05580357392318547, + "grad_norm": 11.190864562988281, + "kl": 4.2890625, + "learning_rate": 1.3859287213789095e-09, + "loss": 0.265, + "reward": 0.5083705633878708, + "reward_std": 0.14753515273332596, + "rewards/accuracy_reward": 0.07812500186264515, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.4302455633878708, "step": 3246 }, { "clip_ratio": 0.0, - "completion_length": 1591.7701416015625, + "completion_length": 1774.87060546875, "epoch": 0.969905160182212, - "grad_norm": 11.642980575561523, - "kl": 0.223388671875, - "learning_rate": 2.717290424718188e-10, - "loss": 0.118, - "reward": 0.416294664144516, - "reward_std": 0.16807594150304794, - "rewards/accuracy_reward": 0.08482143329456449, + "grad_norm": 7.2082200050354, + "kl": 4.20703125, + "learning_rate": 1.3586452123590941e-09, + "loss": 0.2546, + "reward": 0.5167410895228386, + "reward_std": 0.148310799151659, + "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4229910895228386, "step": 3247 }, { "clip_ratio": 0.0, - "completion_length": 1681.7545166015625, + "completion_length": 1837.290283203125, "epoch": 0.9702038682697334, - "grad_norm": 12.107640266418457, - "kl": 0.23046875, - "learning_rate": 2.6632644015276983e-10, - "loss": 0.1288, - "reward": 0.3638392984867096, - "reward_std": 0.16686175391077995, - "rewards/accuracy_reward": 0.04241071757860482, + "grad_norm": 5.3650288581848145, + "kl": 4.35546875, + "learning_rate": 1.3316322007638491e-09, + "loss": 0.2479, + "reward": 0.4547991305589676, + "reward_std": 0.12177349627017975, + "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3214285895228386, + "rewards/tag_count_reward": 0.4146205559372902, "step": 3248 }, { "clip_ratio": 0.0, - "completion_length": 1600.0670166015625, + "completion_length": 1763.3616943359375, "epoch": 0.9705025763572549, - "grad_norm": 12.264391899108887, - "kl": 0.219482421875, - "learning_rate": 2.6097794319612785e-10, - "loss": 0.1432, - "reward": 0.3671875149011612, - "reward_std": 0.18375548720359802, - "rewards/accuracy_reward": 0.026785715483129025, + "grad_norm": 9.203317642211914, + "kl": 3.91015625, + "learning_rate": 1.3048897159806393e-09, + "loss": 0.22, + "reward": 0.4670759066939354, + "reward_std": 0.14849507808685303, + "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3249 }, { "clip_ratio": 0.0, - "completion_length": 1648.7121276855469, + "completion_length": 1804.180908203125, "epoch": 0.9708012844447763, - "grad_norm": 11.688170433044434, - "kl": 0.23046875, - "learning_rate": 2.5568355742053293e-10, - "loss": 0.131, - "reward": 0.3398437723517418, - "reward_std": 0.1798744574189186, - "rewards/accuracy_reward": 0.017857143422588706, + "grad_norm": 8.387083053588867, + "kl": 3.52734375, + "learning_rate": 1.2784177871026648e-09, + "loss": 0.1944, + "reward": 0.4486607387661934, + "reward_std": 0.1336724292486906, + "rewards/accuracy_reward": 0.026785715715959668, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.321986623108387, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3250 }, { "clip_ratio": 0.0, - "completion_length": 1638.3192749023438, + "completion_length": 1791.4822387695312, "epoch": 0.9710999925322978, - "grad_norm": 12.369890213012695, - "kl": 0.240966796875, - "learning_rate": 2.50443288585761e-10, - "loss": 0.1342, - "reward": 0.4352678805589676, - "reward_std": 0.2131337784230709, + "grad_norm": 14.715662002563477, + "kl": 2.8046875, + "learning_rate": 1.2522164429288051e-09, + "loss": 0.1831, + "reward": 0.5546875223517418, + "reward_std": 0.1625835169106722, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3147321492433548, + "rewards/tag_count_reward": 0.4341518059372902, "step": 3251 }, { "clip_ratio": 0.0, - "completion_length": 1631.66748046875, + "completion_length": 1733.4777526855469, "epoch": 0.9713987006198193, - "grad_norm": 13.72048568725586, - "kl": 0.24658203125, - "learning_rate": 2.45257142392713e-10, - "loss": 0.1468, - "reward": 0.428013414144516, - "reward_std": 0.15669255144894123, - "rewards/accuracy_reward": 0.113839291036129, + "grad_norm": 7.860199451446533, + "kl": 4.01171875, + "learning_rate": 1.226285711963565e-09, + "loss": 0.2234, + "reward": 0.5401785969734192, + "reward_std": 0.11176765523850918, + "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.424107164144516, "step": 3252 }, { "clip_ratio": 0.0, - "completion_length": 1569.9353332519531, + "completion_length": 1676.5447387695312, "epoch": 0.9716974087073408, - "grad_norm": 13.321434020996094, - "kl": 0.232421875, - "learning_rate": 2.401251244834035e-10, - "loss": 0.1452, - "reward": 0.431361623108387, - "reward_std": 0.18067064136266708, - "rewards/accuracy_reward": 0.09151786426082253, + "grad_norm": 12.388923645019531, + "kl": 2.9140625, + "learning_rate": 1.2006256224170176e-09, + "loss": 0.212, + "reward": 0.558035746216774, + "reward_std": 0.16197515465319157, + "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3398437649011612, + "rewards/tag_count_reward": 0.4419643059372902, "step": 3253 }, { "clip_ratio": 0.0, - "completion_length": 1497.4666137695312, + "completion_length": 1658.4554443359375, "epoch": 0.9719961167948622, - "grad_norm": 11.315459251403809, - "kl": 0.208251953125, - "learning_rate": 2.3504724044097204e-10, - "loss": 0.1221, - "reward": 0.4994419887661934, - "reward_std": 0.21361489966511726, - "rewards/accuracy_reward": 0.1406250074505806, + "grad_norm": 17.441303253173828, + "kl": 2.482421875, + "learning_rate": 1.17523620220486e-09, + "loss": 0.1706, + "reward": 0.5887276977300644, + "reward_std": 0.19740157201886177, + "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3588169813156128, + "rewards/tag_count_reward": 0.4414062723517418, "step": 3254 }, { "clip_ratio": 0.0, - "completion_length": 1539.6429443359375, + "completion_length": 1738.1853332519531, "epoch": 0.9722948248823837, - "grad_norm": 13.851570129394531, - "kl": 0.2177734375, - "learning_rate": 2.300234957896441e-10, - "loss": 0.1533, - "reward": 0.3588169813156128, - "reward_std": 0.1773918904364109, - "rewards/accuracy_reward": 0.015625000931322575, + "grad_norm": 7.259647369384766, + "kl": 3.14453125, + "learning_rate": 1.1501174789482205e-09, + "loss": 0.1827, + "reward": 0.4542410969734192, + "reward_std": 0.1370771937072277, + "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3431919813156128, + "rewards/tag_count_reward": 0.4296875149011612, "step": 3255 }, { "clip_ratio": 0.0, - "completion_length": 1653.3215026855469, + "completion_length": 1804.2188415527344, "epoch": 0.9725935329699051, - "grad_norm": 11.77859878540039, - "kl": 0.282958984375, - "learning_rate": 2.2505389599477564e-10, - "loss": 0.1077, - "reward": 0.3800223395228386, - "reward_std": 0.17775210738182068, + "grad_norm": 4.185131072998047, + "kl": 4.08984375, + "learning_rate": 1.1252694799738782e-09, + "loss": 0.2213, + "reward": 0.4815848544239998, + "reward_std": 0.1548773217946291, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901977300644, + "rewards/tag_count_reward": 0.4168526977300644, "step": 3256 }, { "clip_ratio": 0.0, - "completion_length": 1546.2992248535156, + "completion_length": 1722.8170471191406, "epoch": 0.9728922410574267, - "grad_norm": 13.355142593383789, - "kl": 0.232421875, - "learning_rate": 2.201384464628031e-10, - "loss": 0.1241, - "reward": 0.3828125149011612, - "reward_std": 0.1916753128170967, - "rewards/accuracy_reward": 0.06250000349245965, + "grad_norm": 10.445076942443848, + "kl": 3.421875, + "learning_rate": 1.1006922323140155e-09, + "loss": 0.2177, + "reward": 0.5133928880095482, + "reward_std": 0.15792641043663025, + "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3203125149011612, + "rewards/tag_count_reward": 0.4375000223517418, "step": 3257 }, { "clip_ratio": 0.0, - "completion_length": 1603.6719360351562, + "completion_length": 1811.1853637695312, "epoch": 0.9731909491449481, - "grad_norm": 11.322084426879883, - "kl": 0.21630859375, - "learning_rate": 2.1527715254124334e-10, - "loss": 0.0888, - "reward": 0.387276791036129, - "reward_std": 0.20675116032361984, - "rewards/accuracy_reward": 0.051339289639145136, + "grad_norm": 3.947209358215332, + "kl": 3.19140625, + "learning_rate": 1.0763857627062167e-09, + "loss": 0.1849, + "reward": 0.4827009215950966, + "reward_std": 0.163326783105731, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3359375149011612, + "rewards/tag_count_reward": 0.4202009066939354, "step": 3258 }, { "clip_ratio": 0.0, - "completion_length": 1625.1183471679688, + "completion_length": 1758.7500610351562, "epoch": 0.9734896572324696, - "grad_norm": 12.890602111816406, - "kl": 0.24072265625, - "learning_rate": 2.104700195187159e-10, - "loss": 0.1198, - "reward": 0.392299123108387, - "reward_std": 0.18966740369796753, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 8.148619651794434, + "kl": 4.84375, + "learning_rate": 1.0523500975935794e-09, + "loss": 0.2744, + "reward": 0.4916294813156128, + "reward_std": 0.16704277321696281, + "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.411272332072258, "step": 3259 }, { "clip_ratio": 0.0, - "completion_length": 1614.60498046875, + "completion_length": 1727.977783203125, "epoch": 0.973788365319991, - "grad_norm": 11.785706520080566, - "kl": 0.24169921875, - "learning_rate": 2.057170526249097e-10, - "loss": 0.1422, - "reward": 0.4123884066939354, - "reward_std": 0.20211618021130562, - "rewards/accuracy_reward": 0.10044643003493547, + "grad_norm": 8.718232154846191, + "kl": 4.31640625, + "learning_rate": 1.0285852631245485e-09, + "loss": 0.2689, + "reward": 0.5429687723517418, + "reward_std": 0.1785835474729538, + "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3119419813156128, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3260 }, { "clip_ratio": 0.0, - "completion_length": 1600.1920471191406, + "completion_length": 1756.68310546875, "epoch": 0.9740870734075125, - "grad_norm": 14.605752944946289, - "kl": 0.241455078125, - "learning_rate": 2.010182570305885e-10, - "loss": 0.1375, - "reward": 0.3861607313156128, - "reward_std": 0.20084348693490028, - "rewards/accuracy_reward": 0.06696428917348385, + "grad_norm": 18.217281341552734, + "kl": 4.58203125, + "learning_rate": 1.0050912851529425e-09, + "loss": 0.2554, + "reward": 0.5089285895228386, + "reward_std": 0.16127764247357845, + "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.419642873108387, "step": 3261 }, { "clip_ratio": 0.0, - "completion_length": 1558.8750610351562, + "completion_length": 1702.5291137695312, "epoch": 0.974385781495034, - "grad_norm": 12.736122131347656, - "kl": 0.2138671875, - "learning_rate": 1.9637363784757443e-10, - "loss": 0.1238, - "reward": 0.3967634215950966, - "reward_std": 0.19983938708901405, - "rewards/accuracy_reward": 0.05133928917348385, + "grad_norm": 11.334710121154785, + "kl": 3.5234375, + "learning_rate": 9.818681892378722e-10, + "loss": 0.2235, + "reward": 0.502790205180645, + "reward_std": 0.18495706096291542, + "rewards/accuracy_reward": 0.06696429010480642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.345424123108387, + "rewards/tag_count_reward": 0.4358259066939354, "step": 3262 }, { "clip_ratio": 0.0, - "completion_length": 1559.4241638183594, + "completion_length": 1719.0022888183594, "epoch": 0.9746844895825555, - "grad_norm": 11.205911636352539, - "kl": 0.197509765625, - "learning_rate": 1.9178320012876449e-10, - "loss": 0.1153, - "reward": 0.3867187649011612, - "reward_std": 0.17993245273828506, - "rewards/accuracy_reward": 0.029017858440056443, + "grad_norm": 4.499971389770508, + "kl": 2.546875, + "learning_rate": 9.589160006438224e-10, + "loss": 0.1475, + "reward": 0.4737723469734192, + "reward_std": 0.13281539268791676, + "rewards/accuracy_reward": 0.029017859371379018, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.357700914144516, + "rewards/tag_count_reward": 0.4447544813156128, "step": 3263 }, { "clip_ratio": 0.0, - "completion_length": 1555.7500610351562, + "completion_length": 1704.1920166015625, "epoch": 0.9749831976700769, - "grad_norm": 14.799580574035645, - "kl": 0.211669921875, - "learning_rate": 1.8724694886810278e-10, - "loss": 0.1373, - "reward": 0.482142873108387, - "reward_std": 0.20838763564825058, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 24.08409881591797, + "kl": 2.90234375, + "learning_rate": 9.362347443405138e-10, + "loss": 0.1919, + "reward": 0.5814732387661934, + "reward_std": 0.17124444991350174, + "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3549107313156128, + "rewards/tag_count_reward": 0.4430803805589676, "step": 3264 }, { "clip_ratio": 0.0, - "completion_length": 1561.8906860351562, + "completion_length": 1644.2366943359375, "epoch": 0.9752819057575984, - "grad_norm": 13.357328414916992, - "kl": 0.225830078125, - "learning_rate": 1.8276488900058618e-10, - "loss": 0.1499, - "reward": 0.424107164144516, - "reward_std": 0.19198548048734665, - "rewards/accuracy_reward": 0.09375000186264515, + "grad_norm": 4.518803119659424, + "kl": 4.01953125, + "learning_rate": 9.138244450029309e-10, + "loss": 0.2466, + "reward": 0.5580357313156128, + "reward_std": 0.1352438535541296, + "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.435267873108387, "step": 3265 }, { "clip_ratio": 0.0, - "completion_length": 1606.12060546875, + "completion_length": 1769.8527526855469, "epoch": 0.9755806138451198, - "grad_norm": 12.159141540527344, - "kl": 0.2373046875, - "learning_rate": 1.783370254022587e-10, - "loss": 0.1327, - "reward": 0.353794664144516, - "reward_std": 0.17780140414834023, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 6.444426536560059, + "kl": 3.298828125, + "learning_rate": 8.916851270112935e-10, + "loss": 0.1823, + "reward": 0.491071455180645, + "reward_std": 0.16471440717577934, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3266 }, { "clip_ratio": 0.0, - "completion_length": 1532.7322082519531, + "completion_length": 1677.5960693359375, "epoch": 0.9758793219326414, - "grad_norm": 14.411086082458496, - "kl": 0.220703125, - "learning_rate": 1.7396336289019487e-10, - "loss": 0.1504, - "reward": 0.3833705484867096, - "reward_std": 0.1683763898909092, - "rewards/accuracy_reward": 0.044642857974395156, + "grad_norm": 14.600465774536133, + "kl": 2.501953125, + "learning_rate": 8.698168144509744e-10, + "loss": 0.1546, + "reward": 0.5295759215950966, + "reward_std": 0.1610003411769867, + "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3387276902794838, + "rewards/tag_count_reward": 0.4514509215950966, "step": 3267 }, { "clip_ratio": 0.0, - "completion_length": 1588.6853332519531, + "completion_length": 1720.274658203125, "epoch": 0.9761780300201628, - "grad_norm": 14.073223114013672, - "kl": 0.228271484375, - "learning_rate": 1.6964390622252189e-10, - "loss": 0.1521, - "reward": 0.369419664144516, - "reward_std": 0.19755128026008606, - "rewards/accuracy_reward": 0.0290178582072258, + "grad_norm": 40.18429946899414, + "kl": 3.697265625, + "learning_rate": 8.482195311126095e-10, + "loss": 0.2371, + "reward": 0.4726562798023224, + "reward_std": 0.14589036256074905, + "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404017984867096, + "rewards/tag_count_reward": 0.4369419887661934, "step": 3268 }, { "clip_ratio": 0.0, - "completion_length": 1575.3929138183594, + "completion_length": 1773.0379943847656, "epoch": 0.9764767381076843, - "grad_norm": 11.700677871704102, - "kl": 0.22021484375, - "learning_rate": 1.653786600983753e-10, - "loss": 0.1215, - "reward": 0.4017857387661934, - "reward_std": 0.19829922541975975, - "rewards/accuracy_reward": 0.0714285729918629, + "grad_norm": 6.269368648529053, + "kl": 3.94140625, + "learning_rate": 8.268933004918766e-10, + "loss": 0.2166, + "reward": 0.5066964626312256, + "reward_std": 0.16532974503934383, + "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.4218750223517418, "step": 3269 }, { "clip_ratio": 0.0, - "completion_length": 1582.6652526855469, + "completion_length": 1740.6563110351562, "epoch": 0.9767754461952057, - "grad_norm": 14.14217472076416, - "kl": 0.2353515625, - "learning_rate": 1.6116762915793224e-10, - "loss": 0.1507, - "reward": 0.5117187649011612, - "reward_std": 0.17905160412192345, - "rewards/accuracy_reward": 0.1808035746216774, + "grad_norm": 7.653167247772217, + "kl": 5.02734375, + "learning_rate": 8.058381457896612e-10, + "loss": 0.31, + "reward": 0.604910746216774, + "reward_std": 0.15586625412106514, + "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3309151902794838, + "rewards/tag_count_reward": 0.4129464477300644, "step": 3270 }, { "clip_ratio": 0.0, - "completion_length": 1574.8951721191406, + "completion_length": 1786.3683776855469, "epoch": 0.9770741542827273, - "grad_norm": 12.606785774230957, - "kl": 0.234375, - "learning_rate": 1.5701081798237258e-10, - "loss": 0.11, - "reward": 0.4676339477300644, - "reward_std": 0.1558278501033783, - "rewards/accuracy_reward": 0.1361607201397419, + "grad_norm": 9.851397514343262, + "kl": 4.03125, + "learning_rate": 7.85054089911863e-10, + "loss": 0.2315, + "reward": 0.5541294813156128, + "reward_std": 0.1262793205678463, + "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732238650322, + "rewards/tag_count_reward": 0.4090401977300644, "step": 3271 }, { "clip_ratio": 0.0, - "completion_length": 1598.3817749023438, + "completion_length": 1776.1429443359375, "epoch": 0.9773728623702487, - "grad_norm": 11.295988082885742, - "kl": 0.22900390625, - "learning_rate": 1.5290823109390672e-10, - "loss": 0.1139, - "reward": 0.407366082072258, - "reward_std": 0.24611788988113403, - "rewards/accuracy_reward": 0.08258929220028222, + "grad_norm": 10.898795127868652, + "kl": 3.76953125, + "learning_rate": 7.645411554695336e-10, + "loss": 0.2023, + "reward": 0.5290178805589676, + "reward_std": 0.18615956231951714, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.428571455180645, "step": 3272 }, { "clip_ratio": 0.0, - "completion_length": 1573.2880249023438, + "completion_length": 1752.6719360351562, "epoch": 0.9776715704577701, - "grad_norm": 12.336309432983398, - "kl": 0.227783203125, - "learning_rate": 1.4885987295574776e-10, - "loss": 0.1269, - "reward": 0.4135044813156128, - "reward_std": 0.23573904857039452, - "rewards/accuracy_reward": 0.07589286286383867, + "grad_norm": 3.232449531555176, + "kl": 3.80078125, + "learning_rate": 7.442993647787388e-10, + "loss": 0.2239, + "reward": 0.5440848395228386, + "reward_std": 0.21797686256468296, + "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3273 }, { "clip_ratio": 0.0, - "completion_length": 1617.22998046875, + "completion_length": 1763.4866943359375, "epoch": 0.9779702785452916, - "grad_norm": 12.003737449645996, - "kl": 0.23193359375, - "learning_rate": 1.448657479721116e-10, - "loss": 0.1166, - "reward": 0.3604910895228386, - "reward_std": 0.21915758401155472, - "rewards/accuracy_reward": 0.046875002793967724, + "grad_norm": 24.500986099243164, + "kl": 4.16015625, + "learning_rate": 7.24328739860558e-10, + "loss": 0.2141, + "reward": 0.4860491305589676, + "reward_std": 0.18895114213228226, + "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3136160969734192, + "rewards/tag_count_reward": 0.4190848469734192, "step": 3274 }, { "clip_ratio": 0.0, - "completion_length": 1588.6563110351562, + "completion_length": 1713.9264221191406, "epoch": 0.978268986632813, - "grad_norm": 13.404067039489746, - "kl": 0.218017578125, - "learning_rate": 1.4092586048820576e-10, - "loss": 0.1577, - "reward": 0.4229910895228386, - "reward_std": 0.20141853764653206, - "rewards/accuracy_reward": 0.0825892873108387, + "grad_norm": 7.536870002746582, + "kl": 2.78125, + "learning_rate": 7.046293024410288e-10, + "loss": 0.1921, + "reward": 0.5329241454601288, + "reward_std": 0.18951557576656342, + "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3404018059372902, + "rewards/tag_count_reward": 0.4347098395228386, "step": 3275 }, { "clip_ratio": 0.0, - "completion_length": 1694.4041137695312, + "completion_length": 1813.7880249023438, "epoch": 0.9785676947203346, - "grad_norm": 11.727992057800293, - "kl": 0.26708984375, - "learning_rate": 1.370402147902572e-10, - "loss": 0.1164, - "reward": 0.3582589477300644, - "reward_std": 0.17148517444729805, - "rewards/accuracy_reward": 0.058035717345774174, + "grad_norm": 16.642459869384766, + "kl": 4.640625, + "learning_rate": 6.852010739512859e-10, + "loss": 0.2588, + "reward": 0.4821428805589676, + "reward_std": 0.14625650271773338, + "rewards/accuracy_reward": 0.07142857578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3002232238650322, + "rewards/tag_count_reward": 0.4107143059372902, "step": 3276 }, { "clip_ratio": 0.0, - "completion_length": 1653.13623046875, + "completion_length": 1771.930908203125, "epoch": 0.978866402807856, - "grad_norm": 9.484187126159668, - "kl": 0.25341796875, - "learning_rate": 1.3320881510545668e-10, - "loss": 0.1196, - "reward": 0.4101562723517418, - "reward_std": 0.19490478560328484, - "rewards/accuracy_reward": 0.10937500675208867, + "grad_norm": 14.33617115020752, + "kl": 5.1875, + "learning_rate": 6.660440755272833e-10, + "loss": 0.2658, + "reward": 0.5329241380095482, + "reward_std": 0.17029426246881485, + "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3007812649011612, + "rewards/tag_count_reward": 0.4079241305589676, "step": 3277 }, { "clip_ratio": 0.0, - "completion_length": 1638.5313110351562, + "completion_length": 1792.7188110351562, "epoch": 0.9791651108953775, - "grad_norm": 11.867908477783203, - "kl": 0.226318359375, - "learning_rate": 1.2943166560199226e-10, - "loss": 0.1255, - "reward": 0.4107143059372902, - "reward_std": 0.1960003674030304, - "rewards/accuracy_reward": 0.08035714668221772, + "grad_norm": 8.5479154586792, + "kl": 4.796875, + "learning_rate": 6.471583280099613e-10, + "loss": 0.2599, + "reward": 0.5055803880095482, + "reward_std": 0.16871783696115017, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3303571566939354, + "rewards/tag_count_reward": 0.416294664144516, "step": 3278 }, { "clip_ratio": 0.0, - "completion_length": 1608.6943054199219, + "completion_length": 1733.5558776855469, "epoch": 0.9794638189828989, - "grad_norm": 13.8421630859375, - "kl": 0.23095703125, - "learning_rate": 1.2570877038903803e-10, - "loss": 0.1415, - "reward": 0.3861607387661934, - "reward_std": 0.18740993738174438, - "rewards/accuracy_reward": 0.05803571757860482, + "grad_norm": 7.3055644035339355, + "kl": 3.4765625, + "learning_rate": 6.285438519451902e-10, + "loss": 0.2024, + "reward": 0.4966518133878708, + "reward_std": 0.15309485793113708, + "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4274553805589676, "step": 3279 }, { "clip_ratio": 0.0, - "completion_length": 1626.1652526855469, + "completion_length": 1765.7701721191406, "epoch": 0.9797625270704204, - "grad_norm": 12.902143478393555, - "kl": 0.2412109375, - "learning_rate": 1.2204013351673204e-10, - "loss": 0.1325, - "reward": 0.3900669813156128, - "reward_std": 0.16092448309063911, - "rewards/accuracy_reward": 0.07366071757860482, + "grad_norm": 7.271821022033691, + "kl": 4.45703125, + "learning_rate": 6.102006675836602e-10, + "loss": 0.2843, + "reward": 0.5000000298023224, + "reward_std": 0.13319792784750462, + "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3164062649011612, + "rewards/tag_count_reward": 0.4174107313156128, "step": 3280 }, { "clip_ratio": 0.0, - "completion_length": 1588.0960388183594, + "completion_length": 1744.1920776367188, "epoch": 0.9800612351579419, - "grad_norm": 13.242924690246582, - "kl": 0.2236328125, - "learning_rate": 1.1842575897619833e-10, - "loss": 0.1454, - "reward": 0.3476562649011612, - "reward_std": 0.1698557399213314, - "rewards/accuracy_reward": 0.022321429569274187, + "grad_norm": 8.650383949279785, + "kl": 3.265625, + "learning_rate": 5.921287948809917e-10, + "loss": 0.194, + "reward": 0.4536830633878708, + "reward_std": 0.14453030563890934, + "rewards/accuracy_reward": 0.02008928661234677, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4335937723517418, "step": 3281 }, { "clip_ratio": 0.0, - "completion_length": 1643.3750610351562, + "completion_length": 1774.321533203125, "epoch": 0.9803599432454634, - "grad_norm": 11.2639799118042, - "kl": 0.2158203125, - "learning_rate": 1.1486565069951937e-10, - "loss": 0.1162, - "reward": 0.4151785895228386, - "reward_std": 0.19793709740042686, - "rewards/accuracy_reward": 0.08035714644938707, + "grad_norm": 13.32384204864502, + "kl": 3.7890625, + "learning_rate": 5.743282534975968e-10, + "loss": 0.22, + "reward": 0.5106026902794838, + "reward_std": 0.17397739738225937, + "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214402794838, + "rewards/tag_count_reward": 0.4168526977300644, "step": 3282 }, { "clip_ratio": 0.0, - "completion_length": 1577.1942443847656, + "completion_length": 1688.8438110351562, "epoch": 0.9806586513329848, - "grad_norm": 12.249119758605957, - "kl": 0.217529296875, - "learning_rate": 1.1135981255974147e-10, - "loss": 0.1388, - "reward": 0.4520089477300644, - "reward_std": 0.17988598719239235, - "rewards/accuracy_reward": 0.1138392947614193, + "grad_norm": 5.329275608062744, + "kl": 3.921875, + "learning_rate": 5.567990627987073e-10, + "loss": 0.2316, + "reward": 0.549107164144516, + "reward_std": 0.1544457357376814, + "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696566939354, + "rewards/tag_count_reward": 0.4263393059372902, "step": 3283 }, { "clip_ratio": 0.0, - "completion_length": 1519.4308776855469, + "completion_length": 1686.9777526855469, "epoch": 0.9809573594205063, - "grad_norm": 12.387312889099121, - "kl": 0.20263671875, - "learning_rate": 1.0790824837088041e-10, - "loss": 0.1368, - "reward": 0.3828125149011612, - "reward_std": 0.1765216737985611, - "rewards/accuracy_reward": 0.029017859371379018, + "grad_norm": 7.508930683135986, + "kl": 3.4296875, + "learning_rate": 5.395412418544021e-10, + "loss": 0.2039, + "reward": 0.4559151977300644, + "reward_std": 0.13626448065042496, + "rewards/accuracy_reward": 0.022321429569274187, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3537946566939354, + "rewards/tag_count_reward": 0.4335937723517418, "step": 3284 }, { "clip_ratio": 0.0, - "completion_length": 1596.8371276855469, + "completion_length": 1733.7277221679688, "epoch": 0.9812560675080277, - "grad_norm": 10.857015609741211, - "kl": 0.222412109375, - "learning_rate": 1.0451096188788811e-10, - "loss": 0.1231, - "reward": 0.4101562723517418, - "reward_std": 0.1750124804675579, - "rewards/accuracy_reward": 0.0781250037252903, - "rewards/format_reward": 0.0022321429569274187, - "rewards/tag_count_reward": 0.329799123108387, + "grad_norm": 4.931296348571777, + "kl": 3.90625, + "learning_rate": 5.225548094394405e-10, + "loss": 0.2465, + "reward": 0.5122768059372902, + "reward_std": 0.13777787052094936, + "rewards/accuracy_reward": 0.082589291036129, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4296875298023224, "step": 3285 }, { "clip_ratio": 0.0, - "completion_length": 1606.6629943847656, + "completion_length": 1775.9554138183594, "epoch": 0.9815547755955493, - "grad_norm": 11.261082649230957, - "kl": 0.236328125, - "learning_rate": 1.0116795680669143e-10, - "loss": 0.118, - "reward": 0.3331473395228386, - "reward_std": 0.19656484201550484, - "rewards/accuracy_reward": 0.024553571827709675, + "grad_norm": 58.704505920410156, + "kl": 5.3203125, + "learning_rate": 5.058397840334572e-10, + "loss": 0.3105, + "reward": 0.4386160969734192, + "reward_std": 0.16509902477264404, + "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3085937574505806, + "rewards/tag_count_reward": 0.4073660969734192, "step": 3286 }, { "clip_ratio": 0.0, - "completion_length": 1675.8817749023438, + "completion_length": 1801.3147888183594, "epoch": 0.9818534836830707, - "grad_norm": 12.854827880859375, - "kl": 0.228515625, - "learning_rate": 9.787923676414234e-11, - "loss": 0.1276, - "reward": 0.3398437649011612, - "reward_std": 0.17275283485651016, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 13.329440116882324, + "kl": 3.25, + "learning_rate": 4.893961838207117e-10, + "loss": 0.1886, + "reward": 0.4693080633878708, + "reward_std": 0.1451030671596527, + "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187649011612, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3287 }, { "clip_ratio": 0.0, - "completion_length": 1608.0692443847656, + "completion_length": 1753.1340026855469, "epoch": 0.9821521917705922, - "grad_norm": 11.483778953552246, - "kl": 0.230712890625, - "learning_rate": 9.464480533805108e-11, - "loss": 0.131, - "reward": 0.381138414144516, - "reward_std": 0.19126270338892937, - "rewards/accuracy_reward": 0.0602678582072258, + "grad_norm": 13.64964771270752, + "kl": 4.89453125, + "learning_rate": 4.732240266902554e-10, + "loss": 0.2835, + "reward": 0.4988839402794838, + "reward_std": 0.16150518134236336, + "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3208705484867096, + "rewards/tag_count_reward": 0.4185267984867096, "step": 3288 }, { "clip_ratio": 0.0, - "completion_length": 1603.2746276855469, + "completion_length": 1773.3906860351562, "epoch": 0.9824508998581136, - "grad_norm": 12.769636154174805, - "kl": 0.216796875, - "learning_rate": 9.146466604716407e-11, - "loss": 0.1388, - "reward": 0.3750000223517418, - "reward_std": 0.160092044621706, - "rewards/accuracy_reward": 0.04241071501746774, + "grad_norm": 7.367542266845703, + "kl": 4.0234375, + "learning_rate": 4.573233302358204e-10, + "loss": 0.229, + "reward": 0.4771205559372902, + "reward_std": 0.11112837120890617, + "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4324776977300644, "step": 3289 }, { "clip_ratio": 0.0, - "completion_length": 1554.0425109863281, + "completion_length": 1728.6005249023438, "epoch": 0.9827496079456352, - "grad_norm": 11.113096237182617, - "kl": 0.221923828125, - "learning_rate": 8.833882235115276e-11, - "loss": 0.1519, - "reward": 0.3833705484867096, - "reward_std": 0.20608589425683022, - "rewards/accuracy_reward": 0.05803571664728224, + "grad_norm": 6.219101428985596, + "kl": 4.890625, + "learning_rate": 4.4169411175576375e-10, + "loss": 0.3024, + "reward": 0.5050223469734192, + "reward_std": 0.18626913614571095, + "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.325334832072258, + "rewards/tag_count_reward": 0.4179687649011612, "step": 3290 }, { "clip_ratio": 0.0, - "completion_length": 1683.8482971191406, + "completion_length": 1831.977783203125, "epoch": 0.9830483160331566, - "grad_norm": 12.416014671325684, - "kl": 0.22705078125, - "learning_rate": 8.52672776506358e-11, - "loss": 0.1179, - "reward": 0.4291294813156128, - "reward_std": 0.19017718732357025, - "rewards/accuracy_reward": 0.10044643143191934, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "grad_norm": 8.061516761779785, + "kl": 3.2265625, + "learning_rate": 4.2633638825317895e-10, + "loss": 0.1925, + "reward": 0.5301339402794838, + "reward_std": 0.14275600761175156, + "rewards/accuracy_reward": 0.10491071944124997, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4252232387661934, "step": 3291 }, { "clip_ratio": 0.0, - "completion_length": 1601.4777526855469, + "completion_length": 1737.0670471191406, "epoch": 0.9833470241206781, - "grad_norm": 11.95669174194336, - "kl": 0.215576171875, - "learning_rate": 8.22500352871569e-11, - "loss": 0.136, - "reward": 0.4229910969734192, - "reward_std": 0.19187147915363312, - "rewards/accuracy_reward": 0.09375000488944352, + "grad_norm": 8.11727523803711, + "kl": 4.359375, + "learning_rate": 4.1125017643578454e-10, + "loss": 0.2643, + "reward": 0.5189732238650322, + "reward_std": 0.1531633771955967, + "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4185268059372902, "step": 3292 }, { "clip_ratio": 0.0, - "completion_length": 1577.9777526855469, + "completion_length": 1755.993408203125, "epoch": 0.9836457322081995, - "grad_norm": 12.01956558227539, - "kl": 0.237060546875, - "learning_rate": 7.928709854316817e-11, - "loss": 0.1434, - "reward": 0.3978794738650322, - "reward_std": 0.19059866294264793, - "rewards/accuracy_reward": 0.09151786309666932, + "grad_norm": 9.283466339111328, + "kl": 5.078125, + "learning_rate": 3.9643549271584085e-10, + "loss": 0.2901, + "reward": 0.5279018133878708, + "reward_std": 0.18255575746297836, + "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3063616156578064, + "rewards/tag_count_reward": 0.416294664144516, "step": 3293 }, { "clip_ratio": 0.0, - "completion_length": 1643.5647888183594, + "completion_length": 1793.6250915527344, "epoch": 0.983944440295721, - "grad_norm": 11.089889526367188, - "kl": 0.244873046875, - "learning_rate": 7.637847064206338e-11, - "loss": 0.1227, - "reward": 0.3398437574505806, - "reward_std": 0.19196967780590057, - "rewards/accuracy_reward": 0.029017858672887087, + "grad_norm": 12.233380317687988, + "kl": 5.1484375, + "learning_rate": 3.818923532103169e-10, + "loss": 0.2952, + "reward": 0.4570312723517418, + "reward_std": 0.16420544311404228, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3108259066939354, + "rewards/tag_count_reward": 0.4123884215950966, "step": 3294 }, { "clip_ratio": 0.0, - "completion_length": 1557.8572082519531, + "completion_length": 1703.9107666015625, "epoch": 0.9842431483832424, - "grad_norm": 9.272443771362305, - "kl": 0.222412109375, - "learning_rate": 7.352415474813911e-11, - "loss": 0.1061, - "reward": 0.3738839477300644, - "reward_std": 0.13912712037563324, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 15.064461708068848, + "kl": 4.37890625, + "learning_rate": 3.6762077374069554e-10, + "loss": 0.2533, + "reward": 0.4771205633878708, + "reward_std": 0.1393560878932476, + "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3314732313156128, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3295 }, { "clip_ratio": 0.0, - "completion_length": 1620.0201721191406, + "completion_length": 1794.0045166015625, "epoch": 0.984541856470764, - "grad_norm": 10.319615364074707, - "kl": 0.225341796875, - "learning_rate": 7.072415396661702e-11, - "loss": 0.1077, - "reward": 0.3777901977300644, - "reward_std": 0.19506684318184853, - "rewards/accuracy_reward": 0.060267860535532236, + "grad_norm": 9.326604843139648, + "kl": 3.88671875, + "learning_rate": 3.5362076983308507e-10, + "loss": 0.207, + "reward": 0.4776785895228386, + "reward_std": 0.12650376372039318, + "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.317522332072258, + "rewards/tag_count_reward": 0.4218750149011612, "step": 3296 }, { "clip_ratio": 0.0, - "completion_length": 1575.4152526855469, + "completion_length": 1744.7701721191406, "epoch": 0.9848405645582854, - "grad_norm": 10.739787101745605, - "kl": 0.216064453125, - "learning_rate": 6.7978471343616e-11, - "loss": 0.1037, - "reward": 0.4475446566939354, - "reward_std": 0.20618730783462524, - "rewards/accuracy_reward": 0.11830357764847577, + "grad_norm": 6.801052570343018, + "kl": 3.234375, + "learning_rate": 3.3989235671808003e-10, + "loss": 0.1779, + "reward": 0.553571455180645, + "reward_std": 0.151376748457551, + "rewards/accuracy_reward": 0.12276786169968545, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3292410895228386, + "rewards/tag_count_reward": 0.4308035895228386, "step": 3297 }, { "clip_ratio": 0.0, - "completion_length": 1583.915283203125, + "completion_length": 1767.1875915527344, "epoch": 0.9851392726458069, - "grad_norm": 12.713809967041016, - "kl": 0.2294921875, - "learning_rate": 6.528710986618003e-11, - "loss": 0.1472, - "reward": 0.3978794813156128, - "reward_std": 0.18574070930480957, - "rewards/accuracy_reward": 0.0625000037252903, + "grad_norm": 19.659704208374023, + "kl": 4.140625, + "learning_rate": 3.264355493309001e-10, + "loss": 0.2255, + "reward": 0.5039062798023224, + "reward_std": 0.14806809090077877, + "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.423549123108387, "step": 3298 }, { "clip_ratio": 0.0, - "completion_length": 1667.7879943847656, + "completion_length": 1774.8259582519531, "epoch": 0.9854379807333283, - "grad_norm": 10.34865665435791, - "kl": 0.243408203125, - "learning_rate": 6.265007246223363e-11, - "loss": 0.1107, - "reward": 0.3459821492433548, - "reward_std": 0.15629084035754204, - "rewards/accuracy_reward": 0.04017857322469354, + "grad_norm": 4.726123332977295, + "kl": 3.734375, + "learning_rate": 3.132503623111682e-10, + "loss": 0.2153, + "reward": 0.4693080559372902, + "reward_std": 0.1282098963856697, + "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.305803582072258, + "rewards/tag_count_reward": 0.420200914144516, "step": 3299 }, { "clip_ratio": 0.0, - "completion_length": 1608.294677734375, + "completion_length": 1733.2947387695312, "epoch": 0.9857366888208499, - "grad_norm": 10.366968154907227, - "kl": 0.21533203125, - "learning_rate": 6.006736200062645e-11, - "loss": 0.1213, - "reward": 0.372767873108387, - "reward_std": 0.1929560899734497, - "rewards/accuracy_reward": 0.037946428870782256, + "grad_norm": 7.275485038757324, + "kl": 3.72265625, + "learning_rate": 3.0033681000313225e-10, + "loss": 0.2229, + "reward": 0.487165205180645, + "reward_std": 0.1699267439544201, + "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3348214477300644, + "rewards/tag_count_reward": 0.4335937798023224, "step": 3300 }, { "clip_ratio": 0.0, - "completion_length": 1591.58935546875, + "completion_length": 1714.2277526855469, "epoch": 0.9860353969083713, - "grad_norm": 11.7572021484375, - "kl": 0.228759765625, - "learning_rate": 5.753898129109424e-11, - "loss": 0.1369, - "reward": 0.4464285969734192, - "reward_std": 0.18479575216770172, - "rewards/accuracy_reward": 0.1272321492433548, + "grad_norm": 8.473512649536133, + "kl": 3.55859375, + "learning_rate": 2.8769490645547123e-10, + "loss": 0.2159, + "reward": 0.5770089626312256, + "reward_std": 0.14893153123557568, + "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3191964402794838, + "rewards/tag_count_reward": 0.4341518059372902, "step": 3301 }, { "clip_ratio": 0.0, - "completion_length": 1631.0759887695312, + "completion_length": 1810.8192749023438, "epoch": 0.9863341049958928, - "grad_norm": 11.382895469665527, - "kl": 0.216796875, - "learning_rate": 5.506493308425342e-11, - "loss": 0.1289, - "reward": 0.3353794813156128, - "reward_std": 0.17385680601000786, - "rewards/accuracy_reward": 0.013392857741564512, + "grad_norm": 7.660743713378906, + "kl": 4.27734375, + "learning_rate": 2.753246654212671e-10, + "loss": 0.238, + "reward": 0.4481026902794838, + "reward_std": 0.16958937607705593, + "rewards/accuracy_reward": 0.026785716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3219866156578064, + "rewards/tag_count_reward": 0.4213169813156128, "step": 3302 }, { "clip_ratio": 0.0, - "completion_length": 1650.9777526855469, + "completion_length": 1778.71435546875, "epoch": 0.9866328130834142, - "grad_norm": 8.197102546691895, - "kl": 0.23583984375, - "learning_rate": 5.26452200716454e-11, - "loss": 0.1104, - "reward": 0.367745541036129, - "reward_std": 0.1726953163743019, - "rewards/accuracy_reward": 0.06026786006987095, + "grad_norm": 16.448728561401367, + "kl": 6.02734375, + "learning_rate": 2.6322610035822703e-10, + "loss": 0.3524, + "reward": 0.4492187723517418, + "reward_std": 0.13968057744204998, + "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3074776902794838, + "rewards/tag_count_reward": 0.395647332072258, "step": 3303 }, { "clip_ratio": 0.0, - "completion_length": 1573.9777526855469, + "completion_length": 1746.3929138183594, "epoch": 0.9869315211709357, - "grad_norm": 11.534990310668945, - "kl": 0.203125, - "learning_rate": 5.02798448856645e-11, - "loss": 0.1244, - "reward": 0.4386160895228386, - "reward_std": 0.16030015796422958, - "rewards/accuracy_reward": 0.1004464328289032, + "grad_norm": 13.308065414428711, + "kl": 2.57421875, + "learning_rate": 2.513992244283225e-10, + "loss": 0.1552, + "reward": 0.550223246216774, + "reward_std": 0.1146874176338315, + "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.338169664144516, + "rewards/tag_count_reward": 0.4453125149011612, "step": 3304 }, { "clip_ratio": 0.0, - "completion_length": 1685.0513916015625, + "completion_length": 1791.4577026367188, "epoch": 0.9872302292584572, - "grad_norm": 11.230953216552734, - "kl": 0.244873046875, - "learning_rate": 4.79688100996134e-11, - "loss": 0.1209, - "reward": 0.3738839477300644, - "reward_std": 0.19338925182819366, - "rewards/accuracy_reward": 0.07142857555299997, + "grad_norm": 8.713001251220703, + "kl": 4.21484375, + "learning_rate": 2.3984405049806697e-10, + "loss": 0.2302, + "reward": 0.5016741156578064, + "reward_std": 0.1467285230755806, + "rewards/accuracy_reward": 0.08258929196745157, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3024553693830967, + "rewards/tag_count_reward": 0.4190848469734192, "step": 3305 }, { "clip_ratio": 0.0, - "completion_length": 1670.3482666015625, + "completion_length": 1801.8348999023438, "epoch": 0.9875289373459787, - "grad_norm": 11.430513381958008, - "kl": 0.23486328125, - "learning_rate": 4.57121182276754e-11, - "loss": 0.1187, - "reward": 0.356026791036129, - "reward_std": 0.17320964485406876, - "rewards/accuracy_reward": 0.04687500186264515, + "grad_norm": 11.928108215332031, + "kl": 3.708984375, + "learning_rate": 2.28560591138377e-10, + "loss": 0.2027, + "reward": 0.4765625149011612, + "reward_std": 0.13067300617694855, + "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3091517984867096, + "rewards/tag_count_reward": 0.4252232313156128, "step": 3306 }, { "clip_ratio": 0.0, - "completion_length": 1631.3973999023438, + "completion_length": 1772.0982971191406, "epoch": 0.9878276454335001, - "grad_norm": 11.062445640563965, - "kl": 0.240966796875, - "learning_rate": 4.350977172490333e-11, - "loss": 0.1217, - "reward": 0.3699776902794838, - "reward_std": 0.1967807374894619, + "grad_norm": 4.269120693206787, + "kl": 4.5546875, + "learning_rate": 2.1754885862451667e-10, + "loss": 0.2662, + "reward": 0.4743303880095482, + "reward_std": 0.14944340288639069, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3097098395228386, + "rewards/tag_count_reward": 0.4140625223517418, "step": 3307 }, { "clip_ratio": 0.0, - "completion_length": 1586.6563110351562, + "completion_length": 1707.4554138183594, "epoch": 0.9881263535210216, - "grad_norm": 13.057943344116211, - "kl": 0.211669921875, - "learning_rate": 4.136177298724175e-11, - "loss": 0.1713, - "reward": 0.4335937798023224, - "reward_std": 0.20741034299135208, - "rewards/accuracy_reward": 0.09151786379516125, + "grad_norm": 10.0681734085083, + "kl": 3.6796875, + "learning_rate": 2.0680886493620875e-10, + "loss": 0.2325, + "reward": 0.5295759290456772, + "reward_std": 0.16445429995656013, + "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.342075914144516, + "rewards/tag_count_reward": 0.431361623108387, "step": 3308 }, { "clip_ratio": 0.0, - "completion_length": 1560.6875610351562, + "completion_length": 1698.4219665527344, "epoch": 0.988425061608543, - "grad_norm": 10.373737335205078, - "kl": 0.21826171875, - "learning_rate": 3.9268124351493625e-11, - "loss": 0.1285, - "reward": 0.4743303805589676, - "reward_std": 0.1819387599825859, - "rewards/accuracy_reward": 0.14062500558793545, + "grad_norm": 9.034782409667969, + "kl": 3.931640625, + "learning_rate": 1.9634062175746814e-10, + "loss": 0.2395, + "reward": 0.5770089626312256, + "reward_std": 0.1477005798369646, + "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.333705373108387, + "rewards/tag_count_reward": 0.4296875223517418, "step": 3309 }, { "clip_ratio": 0.0, - "completion_length": 1580.22998046875, + "completion_length": 1788.0826721191406, "epoch": 0.9887237696960646, - "grad_norm": 11.565031051635742, - "kl": 0.213623046875, - "learning_rate": 3.722882809535366e-11, - "loss": 0.1318, - "reward": 0.4302455559372902, - "reward_std": 0.2261097989976406, - "rewards/accuracy_reward": 0.08928572246804833, + "grad_norm": 13.084781646728516, + "kl": 4.1953125, + "learning_rate": 1.861441404767683e-10, + "loss": 0.2236, + "reward": 0.5156250223517418, + "reward_std": 0.18328437954187393, + "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598246216774, + "rewards/tag_count_reward": 0.4196428805589676, "step": 3310 }, { "clip_ratio": 0.0, - "completion_length": 1561.2500915527344, + "completion_length": 1739.72998046875, "epoch": 0.989022477783586, - "grad_norm": 11.088973999023438, - "kl": 0.2314453125, - "learning_rate": 3.5243886437363865e-11, - "loss": 0.1338, - "reward": 0.3593750149011612, - "reward_std": 0.18723535537719727, - "rewards/accuracy_reward": 0.0357142873108387, + "grad_norm": 17.625808715820312, + "kl": 4.48046875, + "learning_rate": 1.7621943218681933e-10, + "loss": 0.2524, + "reward": 0.4614955559372902, + "reward_std": 0.1440868340432644, + "rewards/accuracy_reward": 0.042410714784637094, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607238650322, + "rewards/tag_count_reward": 0.4190848395228386, "step": 3311 }, { "clip_ratio": 0.0, - "completion_length": 1688.2679443359375, + "completion_length": 1820.7478637695312, "epoch": 0.9893211858711075, - "grad_norm": 11.70609188079834, - "kl": 0.231201171875, - "learning_rate": 3.331330153695799e-11, - "loss": 0.1339, - "reward": 0.3175223395228386, - "reward_std": 0.1614343449473381, + "grad_norm": 17.064064025878906, + "kl": 5.22265625, + "learning_rate": 1.6656650768478998e-10, + "loss": 0.2759, + "reward": 0.412388414144516, + "reward_std": 0.11719540879130363, "rewards/accuracy_reward": 0.004464285913854837, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.407924123108387, "step": 3312 }, { "clip_ratio": 0.0, - "completion_length": 1611.4866638183594, + "completion_length": 1775.1473999023438, "epoch": 0.9896198939586289, - "grad_norm": 10.969061851501465, - "kl": 0.218994140625, - "learning_rate": 3.14370754944171e-11, - "loss": 0.1391, - "reward": 0.3989955559372902, - "reward_std": 0.21643897891044617, - "rewards/accuracy_reward": 0.06696428963914514, + "grad_norm": 7.499997138977051, + "kl": 3.52734375, + "learning_rate": 1.5718537747208548e-10, + "loss": 0.1934, + "reward": 0.5089285969734192, + "reward_std": 0.1658635027706623, + "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312649011612, + "rewards/tag_count_reward": 0.424107164144516, "step": 3313 }, { "clip_ratio": 0.0, - "completion_length": 1672.5491638183594, + "completion_length": 1788.7701721191406, "epoch": 0.9899186020461505, - "grad_norm": 10.869104385375977, - "kl": 0.237548828125, - "learning_rate": 2.9615210350891765e-11, - "loss": 0.1101, - "reward": 0.345982164144516, - "reward_std": 0.17672763392329216, - "rewards/accuracy_reward": 0.0468750037252903, + "grad_norm": 7.739022254943848, + "kl": 4.26953125, + "learning_rate": 1.4807605175445882e-10, + "loss": 0.2513, + "reward": 0.491071455180645, + "reward_std": 0.17251384630799294, + "rewards/accuracy_reward": 0.0714285729918629, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2991071566939354, + "rewards/tag_count_reward": 0.4196428805589676, "step": 3314 }, { "clip_ratio": 0.0, - "completion_length": 1643.7188720703125, + "completion_length": 1814.1875915527344, "epoch": 0.9902173101336719, - "grad_norm": 11.394458770751953, - "kl": 0.231689453125, - "learning_rate": 2.7847708088396538e-11, - "loss": 0.1267, - "reward": 0.3470982238650322, - "reward_std": 0.17947103083133698, - "rewards/accuracy_reward": 0.044642859138548374, + "grad_norm": 7.714453220367432, + "kl": 4.052734375, + "learning_rate": 1.392385404419827e-10, + "loss": 0.2182, + "reward": 0.4654018133878708, + "reward_std": 0.1209937110543251, + "rewards/accuracy_reward": 0.044642860535532236, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.302455373108387, + "rewards/tag_count_reward": 0.4207589402794838, "step": 3315 }, { "clip_ratio": 0.0, - "completion_length": 1649.5625915527344, + "completion_length": 1833.8505249023438, "epoch": 0.9905160182211933, - "grad_norm": 11.121983528137207, - "kl": 0.23095703125, - "learning_rate": 2.6134570629793294e-11, - "loss": 0.1206, - "reward": 0.3984375223517418, - "reward_std": 0.20894617214798927, - "rewards/accuracy_reward": 0.07589286053553224, + "grad_norm": 9.077740669250488, + "kl": 4.37109375, + "learning_rate": 1.3067285314896648e-10, + "loss": 0.2411, + "reward": 0.4849330559372902, + "reward_std": 0.16769181191921234, + "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3225446566939354, + "rewards/tag_count_reward": 0.4112723395228386, "step": 3316 }, { "clip_ratio": 0.0, - "completion_length": 1612.2612609863281, + "completion_length": 1734.7701721191406, "epoch": 0.9908147263087148, - "grad_norm": 11.104179382324219, - "kl": 0.21435546875, - "learning_rate": 2.4475799838813425e-11, - "loss": 0.1196, - "reward": 0.3800223469734192, - "reward_std": 0.16959594562649727, - "rewards/accuracy_reward": 0.05133928661234677, + "grad_norm": 4.371685981750488, + "kl": 3.955078125, + "learning_rate": 1.2237899919406714e-10, + "loss": 0.2241, + "reward": 0.4815848544239998, + "reward_std": 0.13078913651406765, + "rewards/accuracy_reward": 0.06473214412108064, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.4168526977300644, "step": 3317 }, { "clip_ratio": 0.0, - "completion_length": 1669.3460388183594, + "completion_length": 1858.1429443359375, "epoch": 0.9911134343962362, - "grad_norm": 11.419222831726074, - "kl": 0.232666015625, - "learning_rate": 2.287139752003564e-11, - "loss": 0.1231, - "reward": 0.3878348395228386, - "reward_std": 0.16751063242554665, - "rewards/accuracy_reward": 0.08258928917348385, + "grad_norm": 31.608108520507812, + "kl": 5.953125, + "learning_rate": 1.143569876001782e-10, + "loss": 0.3017, + "reward": 0.4921875298023224, + "reward_std": 0.15660439990460873, + "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3052455484867096, + "rewards/tag_count_reward": 0.3917410895228386, "step": 3318 }, { "clip_ratio": 0.0, - "completion_length": 1647.52685546875, + "completion_length": 1727.3728637695312, "epoch": 0.9914121424837578, - "grad_norm": 11.314994812011719, - "kl": 0.2255859375, - "learning_rate": 2.1321365418891513e-11, - "loss": 0.1472, - "reward": 0.3666294738650322, - "reward_std": 0.19409587979316711, - "rewards/accuracy_reward": 0.05133928661234677, + "grad_norm": 11.799995422363281, + "kl": 3.189453125, + "learning_rate": 1.0660682709445757e-10, + "loss": 0.204, + "reward": 0.4944196715950966, + "reward_std": 0.13499074801802635, + "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901902794838, + "rewards/tag_count_reward": 0.4363839477300644, "step": 3319 }, { "clip_ratio": 0.0, - "completion_length": 1594.8080749511719, + "completion_length": 1747.4130249023438, "epoch": 0.9917108505712792, - "grad_norm": 11.076885223388672, - "kl": 0.224853515625, - "learning_rate": 1.982570522166549e-11, - "loss": 0.1164, - "reward": 0.4419642984867096, - "reward_std": 0.17097217962145805, - "rewards/accuracy_reward": 0.1183035746216774, + "grad_norm": 3.338597536087036, + "kl": 3.98046875, + "learning_rate": 9.912852610832744e-11, + "loss": 0.2451, + "reward": 0.576450914144516, + "reward_std": 0.14545562490820885, + "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3236607313156128, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3320 }, { "clip_ratio": 0.0, - "completion_length": 1602.40185546875, + "completion_length": 1754.5670471191406, "epoch": 0.9920095586588007, - "grad_norm": 10.715211868286133, - "kl": 0.2236328125, - "learning_rate": 1.8384418555489334e-11, - "loss": 0.1173, - "reward": 0.4252232387661934, - "reward_std": 0.21134080737829208, - "rewards/accuracy_reward": 0.0959821492433548, + "grad_norm": 7.347782611846924, + "kl": 3.78125, + "learning_rate": 9.192209277744667e-11, + "loss": 0.2276, + "reward": 0.5401785895228386, + "reward_std": 0.18300871923565865, + "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.329241082072258, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3321 }, { "clip_ratio": 0.0, - "completion_length": 1565.4554443359375, + "completion_length": 1747.68310546875, "epoch": 0.9923082667463221, - "grad_norm": 10.489834785461426, - "kl": 0.227294921875, - "learning_rate": 1.6997506988342125e-11, - "loss": 0.1238, - "reward": 0.3973214477300644, - "reward_std": 0.19635630398988724, - "rewards/accuracy_reward": 0.06919643259607255, + "grad_norm": 8.304889678955078, + "kl": 3.779296875, + "learning_rate": 8.498753494171063e-11, + "loss": 0.2293, + "reward": 0.512276828289032, + "reward_std": 0.18688325583934784, + "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3281250149011612, + "rewards/tag_count_reward": 0.4229910969734192, "step": 3322 }, { "clip_ratio": 0.0, - "completion_length": 1551.2031860351562, + "completion_length": 1716.9688415527344, "epoch": 0.9926069748338436, - "grad_norm": 11.922907829284668, - "kl": 0.224853515625, - "learning_rate": 1.566497202904471e-11, - "loss": 0.1333, - "reward": 0.4659598395228386, - "reward_std": 0.23576608300209045, - "rewards/accuracy_reward": 0.1250000037252903, + "grad_norm": 6.801143169403076, + "kl": 3.220703125, + "learning_rate": 7.832486014522355e-11, + "loss": 0.1794, + "reward": 0.573660746216774, + "reward_std": 0.21418777480721474, + "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3409598395228386, + "rewards/tag_count_reward": 0.435267873108387, "step": 3323 }, { "clip_ratio": 0.0, - "completion_length": 1644.3951721191406, + "completion_length": 1798.6965026855469, "epoch": 0.992905682921365, - "grad_norm": 11.513749122619629, - "kl": 0.234130859375, - "learning_rate": 1.4386815127270802e-11, - "loss": 0.1021, - "reward": 0.349330373108387, - "reward_std": 0.20854488387703896, - "rewards/accuracy_reward": 0.024553573224693537, + "grad_norm": 9.317425727844238, + "kl": 4.25, + "learning_rate": 7.193407563635401e-11, + "loss": 0.2238, + "reward": 0.4698661044239998, + "reward_std": 0.16084278095513582, + "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3247767984867096, + "rewards/tag_count_reward": 0.4274553880095482, "step": 3324 }, { "clip_ratio": 0.0, - "completion_length": 1589.8438110351562, + "completion_length": 1687.2745971679688, "epoch": 0.9932043910088866, - "grad_norm": 12.373759269714355, - "kl": 0.23193359375, - "learning_rate": 1.3163037673519228e-11, - "loss": 0.1468, - "reward": 0.329799123108387, - "reward_std": 0.17190862074494362, - "rewards/accuracy_reward": 0.015625000465661287, + "grad_norm": 13.472347259521484, + "kl": 3.583984375, + "learning_rate": 6.581518836759614e-11, + "loss": 0.2217, + "reward": 0.4553571566939354, + "reward_std": 0.12054107896983624, + "rewards/accuracy_reward": 0.020089286379516125, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.314174123108387, + "rewards/tag_count_reward": 0.4352678805589676, "step": 3325 }, { "clip_ratio": 0.0, - "completion_length": 1609.6630249023438, + "completion_length": 1762.6607971191406, "epoch": 0.993503099096408, - "grad_norm": 12.38908576965332, - "kl": 0.2275390625, - "learning_rate": 1.1993640999147237e-11, - "loss": 0.1512, - "reward": 0.4224330484867096, - "reward_std": 0.18874871358275414, - "rewards/accuracy_reward": 0.09598214738070965, + "grad_norm": 8.46340560913086, + "kl": 3.80859375, + "learning_rate": 5.996820499573619e-11, + "loss": 0.2379, + "reward": 0.5357143059372902, + "reward_std": 0.1387111134827137, + "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.326450914144516, + "rewards/tag_count_reward": 0.4308035895228386, "step": 3326 }, { "clip_ratio": 0.0, - "completion_length": 1595.3951416015625, + "completion_length": 1825.2032165527344, "epoch": 0.9938018071839295, - "grad_norm": 10.983471870422363, - "kl": 0.233154296875, - "learning_rate": 1.0878626376342737e-11, - "loss": 0.119, - "reward": 0.3755580559372902, - "reward_std": 0.19597885757684708, - "rewards/accuracy_reward": 0.06026786123402417, + "grad_norm": 14.501029968261719, + "kl": 4.38671875, + "learning_rate": 5.439313188171368e-11, + "loss": 0.2311, + "reward": 0.479910746216774, + "reward_std": 0.1539971437305212, + "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3152901977300644, + "rewards/tag_count_reward": 0.4151785895228386, "step": 3327 }, { "clip_ratio": 0.0, - "completion_length": 1550.4911193847656, + "completion_length": 1724.6920471191406, "epoch": 0.9941005152714509, - "grad_norm": 12.416254997253418, - "kl": 0.216552734375, - "learning_rate": 9.817995018135406e-12, - "loss": 0.1541, - "reward": 0.3989955484867096, - "reward_std": 0.21979640424251556, - "rewards/accuracy_reward": 0.06250000139698386, + "grad_norm": 8.068860054016113, + "kl": 4.01953125, + "learning_rate": 4.9089975090677025e-11, + "loss": 0.2372, + "reward": 0.4988839402794838, + "reward_std": 0.19038104265928268, + "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3364955484867096, + "rewards/tag_count_reward": 0.4207589477300644, "step": 3328 }, { "clip_ratio": 0.0, - "completion_length": 1622.77685546875, + "completion_length": 1769.8862609863281, "epoch": 0.9943992233589725, - "grad_norm": 11.287613868713379, - "kl": 0.222900390625, - "learning_rate": 8.811748078385584e-12, - "loss": 0.1186, - "reward": 0.4246651902794838, - "reward_std": 0.19767218455672264, - "rewards/accuracy_reward": 0.0959821455180645, + "grad_norm": 5.389657497406006, + "kl": 3.8828125, + "learning_rate": 4.4058740391927917e-11, + "loss": 0.2158, + "reward": 0.5496651977300644, + "reward_std": 0.1852264553308487, + "rewards/accuracy_reward": 0.12946429522708058, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830484867096, + "rewards/tag_count_reward": 0.420200914144516, "step": 3329 }, { "clip_ratio": 0.0, - "completion_length": 1637.58935546875, + "completion_length": 1758.8215026855469, "epoch": 0.9946979314464939, - "grad_norm": 11.991843223571777, - "kl": 0.220947265625, - "learning_rate": 7.859886651784275e-12, - "loss": 0.1299, - "reward": 0.3443080484867096, - "reward_std": 0.19517416507005692, - "rewards/accuracy_reward": 0.03125000046566129, + "grad_norm": 16.15484046936035, + "kl": 3.390625, + "learning_rate": 3.929943325892138e-11, + "loss": 0.225, + "reward": 0.4648437723517418, + "reward_std": 0.14931855909526348, + "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3130580559372902, + "rewards/tag_count_reward": 0.4224330633878708, "step": 3330 }, { "clip_ratio": 0.0, - "completion_length": 1683.6719360351562, + "completion_length": 1794.384033203125, "epoch": 0.9949966395340154, - "grad_norm": 11.595287322998047, - "kl": 0.2275390625, - "learning_rate": 6.962411773875354e-12, - "loss": 0.1116, - "reward": 0.3582589402794838, - "reward_std": 0.19626830890774727, - "rewards/accuracy_reward": 0.0424107164144516, + "grad_norm": 12.177657127380371, + "kl": 3.935546875, + "learning_rate": 3.481205886937677e-11, + "loss": 0.2182, + "reward": 0.4687500298023224, + "reward_std": 0.1693696491420269, + "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3158482238650322, + "rewards/tag_count_reward": 0.4151785895228386, "step": 3331 }, { "clip_ratio": 0.0, - "completion_length": 1685.0223999023438, + "completion_length": 1795.6005249023438, "epoch": 0.9952953476215368, - "grad_norm": 10.348727226257324, - "kl": 0.241455078125, - "learning_rate": 6.119324421016703e-12, - "loss": 0.1155, - "reward": 0.3677455559372902, - "reward_std": 0.19751853868365288, - "rewards/accuracy_reward": 0.06473214598372579, + "grad_norm": 15.174638748168945, + "kl": 4.70703125, + "learning_rate": 3.059662210508351e-11, + "loss": 0.2515, + "reward": 0.4838169887661934, + "reward_std": 0.1500650830566883, + "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3030134066939354, + "rewards/tag_count_reward": 0.4123884066939354, "step": 3332 }, { "clip_ratio": 0.0, - "completion_length": 1674.3058776855469, + "completion_length": 1783.1764526367188, "epoch": 0.9955940557090583, - "grad_norm": 11.795738220214844, - "kl": 0.233642578125, - "learning_rate": 5.330625510407971e-12, - "loss": 0.133, - "reward": 0.404017873108387, - "reward_std": 0.18059901148080826, - "rewards/accuracy_reward": 0.08705357182770967, + "grad_norm": 26.510358810424805, + "kl": 4.046875, + "learning_rate": 2.6653127552039857e-11, + "loss": 0.2588, + "reward": 0.5329241305589676, + "reward_std": 0.14724184572696686, + "rewards/accuracy_reward": 0.1071428582072258, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3169642984867096, + "rewards/tag_count_reward": 0.4257812723517418, "step": 3333 }, { "clip_ratio": 0.0, - "completion_length": 1539.0804443359375, + "completion_length": 1705.9889526367188, "epoch": 0.9958927637965798, - "grad_norm": 12.194656372070312, - "kl": 0.21484375, - "learning_rate": 4.596315900073921e-12, - "loss": 0.1429, - "reward": 0.4207589477300644, - "reward_std": 0.1747603341937065, - "rewards/accuracy_reward": 0.082589291036129, + "grad_norm": 19.114683151245117, + "kl": 3.1484375, + "learning_rate": 2.298157950036961e-11, + "loss": 0.1983, + "reward": 0.5290178805589676, + "reward_std": 0.12315829284489155, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3381696492433548, + "rewards/tag_count_reward": 0.4419642984867096, "step": 3334 }, { "clip_ratio": 0.0, - "completion_length": 1661.4576721191406, + "completion_length": 1832.0670166015625, "epoch": 0.9961914718841013, - "grad_norm": 10.356828689575195, - "kl": 0.232177734375, - "learning_rate": 3.9163963888699804e-12, - "loss": 0.1075, - "reward": 0.3738839477300644, - "reward_std": 0.19179831445217133, - "rewards/accuracy_reward": 0.055803573690354824, + "grad_norm": 18.255840301513672, + "kl": 4.87890625, + "learning_rate": 1.95819819443499e-11, + "loss": 0.2573, + "reward": 0.4793526902794838, + "reward_std": 0.151786956936121, + "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.318080373108387, + "rewards/tag_count_reward": 0.4168526977300644, "step": 3335 }, { "clip_ratio": 0.0, - "completion_length": 1605.0871276855469, + "completion_length": 1745.6697387695312, "epoch": 0.9964901799716227, - "grad_norm": 11.246313095092773, - "kl": 0.255126953125, - "learning_rate": 3.2908677164822373e-12, - "loss": 0.12, - "reward": 0.4503348395228386, - "reward_std": 0.1859860084950924, - "rewards/accuracy_reward": 0.11830357694998384, + "grad_norm": 16.174894332885742, + "kl": 3.26953125, + "learning_rate": 1.6454338582411186e-11, + "loss": 0.2027, + "reward": 0.5518973469734192, + "reward_std": 0.16958784870803356, + "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3320312574505806, + "rewards/tag_count_reward": 0.435825914144516, "step": 3336 }, { "clip_ratio": 0.0, - "completion_length": 1591.2120971679688, + "completion_length": 1775.1875915527344, "epoch": 0.9967888880591442, - "grad_norm": 9.969381332397461, - "kl": 0.2197265625, - "learning_rate": 2.719730563427447e-12, - "loss": 0.1175, - "reward": 0.4257812723517418, - "reward_std": 0.19077502563595772, - "rewards/accuracy_reward": 0.10044643143191934, + "grad_norm": 13.705008506774902, + "kl": 4.1171875, + "learning_rate": 1.3598652817137235e-11, + "loss": 0.2314, + "reward": 0.5279018059372902, + "reward_std": 0.13746609166264534, + "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3253348395228386, + "rewards/tag_count_reward": 0.4252232387661934, "step": 3337 }, { "clip_ratio": 0.0, - "completion_length": 1599.2969360351562, + "completion_length": 1734.1540832519531, "epoch": 0.9970875961466656, - "grad_norm": 11.492878913879395, - "kl": 0.2119140625, - "learning_rate": 2.202985551047476e-12, - "loss": 0.127, - "reward": 0.4068080484867096, - "reward_std": 0.1798994019627571, - "rewards/accuracy_reward": 0.07142857206054032, + "grad_norm": 9.357471466064453, + "kl": 3.31640625, + "learning_rate": 1.101492775523738e-11, + "loss": 0.2, + "reward": 0.5206473469734192, + "reward_std": 0.15125216729938984, + "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3353794813156128, + "rewards/tag_count_reward": 0.4380580484867096, "step": 3338 }, { "clip_ratio": 0.0, - "completion_length": 1637.3416137695312, + "completion_length": 1756.18310546875, "epoch": 0.9973863042341872, - "grad_norm": 11.557194709777832, - "kl": 0.224853515625, - "learning_rate": 1.740633241509304e-12, - "loss": 0.124, - "reward": 0.4179687649011612, - "reward_std": 0.16737685352563858, - "rewards/accuracy_reward": 0.0937500037252903, + "grad_norm": 6.2276153564453125, + "kl": 3.11328125, + "learning_rate": 8.703166207546519e-12, + "loss": 0.164, + "reward": 0.5379464626312256, + "reward_std": 0.1417186800390482, + "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3242187723517418, + "rewards/tag_count_reward": 0.4285714477300644, "step": 3339 }, { "clip_ratio": 0.0, - "completion_length": 1565.6697082519531, + "completion_length": 1666.4844360351562, "epoch": 0.9976850123217086, - "grad_norm": 11.757747650146484, - "kl": 0.220458984375, - "learning_rate": 1.3326741378050232e-12, - "loss": 0.1365, - "reward": 0.4129464402794838, - "reward_std": 0.17291375994682312, - "rewards/accuracy_reward": 0.0803571455180645, + "grad_norm": 3.647441864013672, + "kl": 3.408203125, + "learning_rate": 6.663370689025116e-12, + "loss": 0.2087, + "reward": 0.5295759215950966, + "reward_std": 0.1530846431851387, + "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3325893059372902, + "rewards/tag_count_reward": 0.4402901902794838, "step": 3340 }, { "clip_ratio": 0.0, - "completion_length": 1593.6763916015625, + "completion_length": 1797.1630554199219, "epoch": 0.9979837204092301, - "grad_norm": 11.72088623046875, - "kl": 0.211669921875, - "learning_rate": 9.791086837573903e-13, - "loss": 0.1295, - "reward": 0.4023437798023224, - "reward_std": 0.19247733429074287, - "rewards/accuracy_reward": 0.06473214458674192, + "grad_norm": 7.0721611976623535, + "kl": 4.078125, + "learning_rate": 4.895543418786952e-12, + "loss": 0.2267, + "reward": 0.5005580633878708, + "reward_std": 0.14737574383616447, + "rewards/accuracy_reward": 0.07366071944124997, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.337611623108387, + "rewards/tag_count_reward": 0.4268973395228386, "step": 3341 }, { "clip_ratio": 0.0, - "completion_length": 1617.2634582519531, + "completion_length": 1782.3706359863281, "epoch": 0.9982824284967515, - "grad_norm": 12.278242111206055, - "kl": 0.218994140625, - "learning_rate": 6.79937264003172e-13, - "loss": 0.1268, - "reward": 0.427455373108387, - "reward_std": 0.20042739808559418, - "rewards/accuracy_reward": 0.09375000582076609, + "grad_norm": 8.780208587646484, + "kl": 3.83203125, + "learning_rate": 3.39968632001586e-12, + "loss": 0.2099, + "reward": 0.5429687649011612, + "reward_std": 0.1996011659502983, + "rewards/accuracy_reward": 0.1250000107102096, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3337053656578064, + "rewards/tag_count_reward": 0.4179687723517418, "step": 3342 }, { "clip_ratio": 0.0, - "completion_length": 1676.13623046875, + "completion_length": 1744.7188415527344, "epoch": 0.998581136584273, - "grad_norm": 11.229790687561035, - "kl": 0.24658203125, - "learning_rate": 4.351602040209013e-13, - "loss": 0.1394, - "reward": 0.3381696566939354, - "reward_std": 0.1587700955569744, - "rewards/accuracy_reward": 0.04017857206054032, + "grad_norm": 10.397480010986328, + "kl": 3.7578125, + "learning_rate": 2.1758010201045063e-12, + "loss": 0.2329, + "reward": 0.4983259215950966, + "reward_std": 0.16362856328487396, + "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.297991082072258, + "rewards/tag_count_reward": 0.4291294887661934, "step": 3343 }, { "clip_ratio": 0.0, - "completion_length": 1599.5045471191406, + "completion_length": 1725.8215026855469, "epoch": 0.9988798446717945, - "grad_norm": 11.10936164855957, - "kl": 0.217529296875, - "learning_rate": 2.4477777010312174e-13, - "loss": 0.1311, - "reward": 0.4090401902794838, - "reward_std": 0.1931113488972187, - "rewards/accuracy_reward": 0.08035714738070965, + "grad_norm": 4.162888050079346, + "kl": 3.62890625, + "learning_rate": 1.2238888505156087e-12, + "loss": 0.2243, + "reward": 0.5161830559372902, + "reward_std": 0.16230366006493568, + "rewards/accuracy_reward": 0.08705357206054032, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3286830559372902, + "rewards/tag_count_reward": 0.4291294813156128, "step": 3344 }, { "clip_ratio": 0.0, - "completion_length": 1744.1362609863281, + "completion_length": 1858.4331359863281, "epoch": 0.999178552759316, - "grad_norm": 9.91911792755127, - "kl": 0.25439453125, - "learning_rate": 1.0879016935638751e-13, - "loss": 0.1071, - "reward": 0.3264509066939354, - "reward_std": 0.17118894308805466, - "rewards/accuracy_reward": 0.042410716181620955, + "grad_norm": 9.17155647277832, + "kl": 4.8828125, + "learning_rate": 5.439508467819375e-13, + "loss": 0.2617, + "reward": 0.462611623108387, + "reward_std": 0.16037741117179394, + "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2840401902794838, + "rewards/tag_count_reward": 0.4068080484867096, "step": 3345 }, { "clip_ratio": 0.0, - "completion_length": 1571.3215026855469, + "completion_length": 1686.8817749023438, "epoch": 0.9994772608468374, - "grad_norm": 13.584514617919922, - "kl": 0.21728515625, - "learning_rate": 2.7197549734570002e-14, - "loss": 0.1676, - "reward": 0.4765625074505806, - "reward_std": 0.21880872920155525, - "rewards/accuracy_reward": 0.1339285783469677, + "grad_norm": 23.49078941345215, + "kl": 2.82421875, + "learning_rate": 1.3598774867285002e-13, + "loss": 0.1918, + "reward": 0.611049123108387, + "reward_std": 0.17189978808164597, + "rewards/accuracy_reward": 0.16071429592557251, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3426339402794838, + "rewards/tag_count_reward": 0.4503348395228386, "step": 3346 }, { "clip_ratio": 0.0, - "completion_length": 1663.9063110351562, + "completion_length": 1807.5380249023438, "epoch": 0.9997759689343589, - "grad_norm": 9.57776927947998, - "kl": 0.2373046875, + "grad_norm": 17.386194229125977, + "kl": 4.5703125, "learning_rate": 0.0, - "loss": 0.1105, - "reward": 0.3794642984867096, - "reward_std": 0.20539238303899765, - "rewards/accuracy_reward": 0.06696428847499192, + "loss": 0.241, + "reward": 0.4966518133878708, + "reward_std": 0.16918558441102505, + "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.3125000074505806, + "rewards/tag_count_reward": 0.4095982387661934, "step": 3347 }, { "epoch": 0.9997759689343589, "step": 3347, "total_flos": 0.0, - "train_loss": 0.04007197510112119, - "train_runtime": 111902.1384, - "train_samples_per_second": 0.838, - "train_steps_per_second": 0.03 + "train_loss": 0.08076191042202922, + "train_runtime": 127083.2818, + "train_samples_per_second": 0.738, + "train_steps_per_second": 0.026 } ], "logging_steps": 1,