Dfyd-R1-Distill-Qwen-1.5B-GRPO / trainer_state.json
wnj13's picture
Model save
d216d09 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.002,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.004,
"grad_norm": 0.6894801259040833,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0,
"reward": 0.5625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1845.75,
"epoch": 0.006,
"grad_norm": 0.0036829786840826273,
"kl": 4.696846008300781e-05,
"learning_rate": 6e-08,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.008,
"grad_norm": 0.0038558689411729574,
"kl": 9.775161743164062e-05,
"learning_rate": 8e-08,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.01,
"grad_norm": 0.7485816478729248,
"kl": 0.00010919570922851562,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1985.5,
"epoch": 0.012,
"grad_norm": 0.7458791732788086,
"kl": 7.796287536621094e-05,
"learning_rate": 1.2e-07,
"loss": -0.023,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.014,
"grad_norm": 0.003527791704982519,
"kl": 6.628036499023438e-05,
"learning_rate": 1.4e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.016,
"grad_norm": 0.8426324129104614,
"kl": 7.104873657226562e-05,
"learning_rate": 1.6e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.018,
"grad_norm": 0.7430940270423889,
"kl": 9.5367431640625e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.02,
"grad_norm": 0.003660305170342326,
"kl": 5.626678466796875e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.022,
"grad_norm": 0.9972493648529053,
"kl": 0.0001544952392578125,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.024,
"grad_norm": 0.003556522075086832,
"kl": 6.723403930664062e-05,
"learning_rate": 2.4e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1682.25,
"epoch": 0.026,
"grad_norm": 0.0050394581630826,
"kl": 8.845329284667969e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.028,
"grad_norm": 0.004075606819242239,
"kl": 0.0001049041748046875,
"learning_rate": 2.8e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.03,
"grad_norm": 0.6532822251319885,
"kl": 7.2479248046875e-05,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.032,
"grad_norm": 0.7762453556060791,
"kl": 0.00011777877807617188,
"learning_rate": 3.2e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.034,
"grad_norm": 0.7456417679786682,
"kl": 8.58306884765625e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1770.5,
"epoch": 0.036,
"grad_norm": 0.0034950117114931345,
"kl": 3.5762786865234375e-05,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.038,
"grad_norm": 0.003600472817197442,
"kl": 6.866455078125e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1856.25,
"epoch": 0.04,
"grad_norm": 0.8460555076599121,
"kl": 8.106231689453125e-05,
"learning_rate": 4e-07,
"loss": 0.0814,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.042,
"grad_norm": 0.0038911281153559685,
"kl": 0.0001068115234375,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.044,
"grad_norm": 0.7139887809753418,
"kl": 0.0001125335693359375,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1855.25,
"epoch": 0.046,
"grad_norm": 0.005301118828356266,
"kl": 5.6862831115722656e-05,
"learning_rate": 4.6e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.048,
"grad_norm": 0.0036839963868260384,
"kl": 7.62939453125e-05,
"learning_rate": 4.8e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.05,
"grad_norm": 0.003639479400590062,
"kl": 8.678436279296875e-05,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.052,
"grad_norm": 0.8071563243865967,
"kl": 8.535385131835938e-05,
"learning_rate": 5.2e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.054,
"grad_norm": 0.0037776094395667315,
"kl": 0.00011014938354492188,
"learning_rate": 5.4e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1608.75,
"epoch": 0.056,
"grad_norm": 0.6625211834907532,
"kl": 4.398822784423828e-05,
"learning_rate": 5.6e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1841.25,
"epoch": 0.058,
"grad_norm": 0.0036761562805622816,
"kl": 8.678436279296875e-05,
"learning_rate": 5.8e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.06,
"grad_norm": 0.0033802345860749483,
"kl": 5.7697296142578125e-05,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.062,
"grad_norm": 0.003240807680413127,
"kl": 6.29425048828125e-05,
"learning_rate": 6.2e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.064,
"grad_norm": 0.0036673492286354303,
"kl": 7.152557373046875e-05,
"learning_rate": 6.4e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1735.75,
"epoch": 0.066,
"grad_norm": 0.8675172328948975,
"kl": 2.682209014892578e-05,
"learning_rate": 6.6e-07,
"loss": -0.0631,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.068,
"grad_norm": 0.8184774518013,
"kl": 9.775161743164062e-05,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.07,
"grad_norm": 0.003660478862002492,
"kl": 0.00010251998901367188,
"learning_rate": 7e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.072,
"grad_norm": 0.004034126177430153,
"kl": 6.4849853515625e-05,
"learning_rate": 7.2e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.074,
"grad_norm": 0.7589001655578613,
"kl": 0.00010824203491210938,
"learning_rate": 7.4e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.076,
"grad_norm": 0.6517212986946106,
"kl": 8.20159912109375e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.078,
"grad_norm": 0.8392653465270996,
"kl": 0.0001430511474609375,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.08,
"grad_norm": 0.990591824054718,
"kl": 0.0001125335693359375,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.082,
"grad_norm": 0.7600575685501099,
"kl": 0.00014066696166992188,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.084,
"grad_norm": 0.004982170183211565,
"kl": 0.00013589859008789062,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.086,
"grad_norm": 0.6588786840438843,
"kl": 0.00019502639770507812,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.088,
"grad_norm": 0.876471996307373,
"kl": 0.00020933151245117188,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 1857.0,
"epoch": 0.09,
"grad_norm": 0.007592031732201576,
"kl": 0.00015783309936523438,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.092,
"grad_norm": 0.649493932723999,
"kl": 0.00015401840209960938,
"learning_rate": 9.2e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.094,
"grad_norm": 0.0060058352537453175,
"kl": 0.00024318695068359375,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1703.5,
"epoch": 0.096,
"grad_norm": 0.7367803454399109,
"kl": 0.00028061866760253906,
"learning_rate": 9.6e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.098,
"grad_norm": 0.7102321982383728,
"kl": 0.0003223419189453125,
"learning_rate": 9.8e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.1,
"grad_norm": 0.5775962471961975,
"kl": 0.00010824203491210938,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.102,
"grad_norm": 0.0037765211891382933,
"kl": 7.62939453125e-05,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1991.5,
"epoch": 0.104,
"grad_norm": 0.8791236281394958,
"kl": 0.00046634674072265625,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.106,
"grad_norm": 0.006120009813457727,
"kl": 0.0004935264587402344,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.108,
"grad_norm": 0.006084068212658167,
"kl": 0.0002951622009277344,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1732.5,
"epoch": 0.11,
"grad_norm": 0.008206584490835667,
"kl": 0.00015926361083984375,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2031.5,
"epoch": 0.112,
"grad_norm": 0.06737767159938812,
"kl": 0.0014257431030273438,
"learning_rate": 9.996052735444862e-07,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.114,
"grad_norm": 0.6304325461387634,
"kl": 0.0003185272216796875,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.116,
"grad_norm": 0.6972232460975647,
"kl": 0.000560760498046875,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.118,
"grad_norm": 0.026448842138051987,
"kl": 0.0009822845458984375,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.12,
"grad_norm": 0.006011964753270149,
"kl": 0.0007162094116210938,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.122,
"grad_norm": 0.6680987477302551,
"kl": 0.0005612373352050781,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1933.75,
"epoch": 0.124,
"grad_norm": 0.9241335988044739,
"kl": 0.0006771087646484375,
"learning_rate": 9.98421786662277e-07,
"loss": -0.0443,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.126,
"grad_norm": 0.009985378012061119,
"kl": 0.0005092620849609375,
"learning_rate": 9.981479793771866e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.128,
"grad_norm": 0.03178563341498375,
"kl": 0.0007648468017578125,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1685.75,
"epoch": 0.13,
"grad_norm": 0.9064852595329285,
"kl": 0.00341796875,
"learning_rate": 9.975348529157229e-07,
"loss": -0.0454,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.132,
"grad_norm": 0.7919860482215881,
"kl": 0.001010894775390625,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1887.25,
"epoch": 0.134,
"grad_norm": 0.8760141134262085,
"kl": 0.000766754150390625,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0659,
"reward": 0.375,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.136,
"grad_norm": 0.004815725143998861,
"kl": 0.0005044937133789062,
"learning_rate": 9.964516155915151e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1797.25,
"epoch": 0.138,
"grad_norm": 0.8277981877326965,
"kl": 0.0012836456298828125,
"learning_rate": 9.960469931131936e-07,
"loss": 0.1147,
"reward": 0.9375,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2000.25,
"epoch": 0.14,
"grad_norm": 0.8641435503959656,
"kl": 0.0011048316955566406,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0173,
"reward": 0.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.142,
"grad_norm": 0.009465116076171398,
"kl": 0.00019884109497070312,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1618.25,
"epoch": 0.144,
"grad_norm": 0.8773428797721863,
"kl": 0.0106658935546875,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0216,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.146,
"grad_norm": 0.0040911422111094,
"kl": 0.0006923675537109375,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.148,
"grad_norm": 0.0059630973264575005,
"kl": 0.000446319580078125,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1836.75,
"epoch": 0.15,
"grad_norm": 1.146507978439331,
"kl": 0.0004787445068359375,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0918,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.152,
"grad_norm": 0.7911372184753418,
"kl": 0.00030231475830078125,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.154,
"grad_norm": 0.8156778216362,
"kl": 0.0006389617919921875,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1915.25,
"epoch": 0.156,
"grad_norm": 0.005794746335595846,
"kl": 0.0005588531494140625,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1918.0,
"epoch": 0.158,
"grad_norm": 1.0309467315673828,
"kl": 0.0005645751953125,
"learning_rate": 9.908088623197048e-07,
"loss": -0.0514,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.16,
"grad_norm": 0.9030879735946655,
"kl": 0.0043182373046875,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0002,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.162,
"grad_norm": 0.8063321113586426,
"kl": 0.00075531005859375,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.164,
"grad_norm": 0.01648077741265297,
"kl": 0.00039196014404296875,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1855.0,
"epoch": 0.166,
"grad_norm": 0.7015650272369385,
"kl": 0.0013475418090820312,
"learning_rate": 9.881105062929221e-07,
"loss": 0.0001,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.168,
"grad_norm": 0.006011520978063345,
"kl": 0.00021600723266601562,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1855.5,
"epoch": 0.17,
"grad_norm": 0.9796985983848572,
"kl": 0.00054168701171875,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0819,
"reward": 0.5,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.172,
"grad_norm": 0.006532273255288601,
"kl": 0.0004496574401855469,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.174,
"grad_norm": 0.007916338741779327,
"kl": 0.0005373954772949219,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.176,
"grad_norm": 0.006668766029179096,
"kl": 0.00024175643920898438,
"learning_rate": 9.8425742251254e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1533.5,
"epoch": 0.178,
"grad_norm": 0.01681283488869667,
"kl": 0.0006351470947265625,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.18,
"grad_norm": 0.6811097860336304,
"kl": 0.0013413429260253906,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.182,
"grad_norm": 0.7697494626045227,
"kl": 0.0030584335327148438,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 1913.25,
"epoch": 0.184,
"grad_norm": 1.1320937871932983,
"kl": 0.001331329345703125,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0536,
"reward": 0.4375,
"reward_std": 0.4419417306780815,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1742.75,
"epoch": 0.186,
"grad_norm": 0.6808719635009766,
"kl": 0.0010509490966796875,
"learning_rate": 9.798752629550546e-07,
"loss": -0.1501,
"reward": 0.3125,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.188,
"grad_norm": 0.007471662946045399,
"kl": 0.0004634857177734375,
"learning_rate": 9.78935800506826e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.19,
"grad_norm": 0.010327517054975033,
"kl": 0.000568389892578125,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.192,
"grad_norm": 0.6696183681488037,
"kl": 0.0009899139404296875,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.194,
"grad_norm": 0.8246662020683289,
"kl": 0.0017538070678710938,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1928.5,
"epoch": 0.196,
"grad_norm": 0.955489456653595,
"kl": 0.0009098052978515625,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0467,
"reward": 0.375,
"reward_std": 0.3535533770918846,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.198,
"grad_norm": 0.03343038633465767,
"kl": 0.0007419586181640625,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.2,
"grad_norm": 0.006429341156035662,
"kl": 0.001178741455078125,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.202,
"grad_norm": 0.00873623974621296,
"kl": 0.0005130767822265625,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1959.0,
"epoch": 0.204,
"grad_norm": 0.017306441441178322,
"kl": 0.000946044921875,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.206,
"grad_norm": 0.7247556447982788,
"kl": 0.0008764266967773438,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.208,
"grad_norm": 0.008871670812368393,
"kl": 0.00033664703369140625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.21,
"grad_norm": 0.007749219890683889,
"kl": 0.00040531158447265625,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.212,
"grad_norm": 0.010708320885896683,
"kl": 0.0010166168212890625,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.214,
"grad_norm": 0.007295151706784964,
"kl": 0.0003333091735839844,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.216,
"grad_norm": 0.7922310829162598,
"kl": 0.000408172607421875,
"learning_rate": 9.636109026648554e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.218,
"grad_norm": 0.007899758405983448,
"kl": 0.0006399154663085938,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.22,
"grad_norm": 0.9048980474472046,
"kl": 0.001056671142578125,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.222,
"grad_norm": 0.010189698077738285,
"kl": 0.00031948089599609375,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 1794.5,
"epoch": 0.224,
"grad_norm": 1.0683528184890747,
"kl": 0.0034933090209960938,
"learning_rate": 9.58499865339809e-07,
"loss": -0.1162,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.226,
"grad_norm": 0.011858138255774975,
"kl": 0.000347137451171875,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.228,
"grad_norm": 0.012356019578874111,
"kl": 0.000885009765625,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 1752.5,
"epoch": 0.23,
"grad_norm": 1.0531798601150513,
"kl": 0.001102447509765625,
"learning_rate": 9.54457320834625e-07,
"loss": 0.1434,
"reward": 0.1875,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.232,
"grad_norm": 0.012715999968349934,
"kl": 0.00096893310546875,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 2033.5,
"epoch": 0.234,
"grad_norm": 0.7506431937217712,
"kl": 0.0050945281982421875,
"learning_rate": 9.516636183034564e-07,
"loss": 0.0002,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.236,
"grad_norm": 0.020277904346585274,
"kl": 0.00078582763671875,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 1624.5,
"epoch": 0.238,
"grad_norm": 0.9856612086296082,
"kl": 0.0022125244140625,
"learning_rate": 9.487916106540465e-07,
"loss": -0.0067,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.24,
"grad_norm": 0.007632279768586159,
"kl": 0.000583648681640625,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.242,
"grad_norm": 0.007833893410861492,
"kl": 0.0008401870727539062,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.244,
"grad_norm": 0.00713867275044322,
"kl": 0.0011892318725585938,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.246,
"grad_norm": 0.011064039543271065,
"kl": 0.0003185272216796875,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.248,
"grad_norm": 0.009095462039113045,
"kl": 0.0006237030029296875,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.25,
"grad_norm": 0.007876625284552574,
"kl": 0.00144195556640625,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.252,
"grad_norm": 0.05358020216226578,
"kl": 0.0010623931884765625,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0,
"reward": 0.625,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.254,
"grad_norm": 0.030433854088187218,
"kl": 0.0003848075866699219,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.256,
"grad_norm": 0.852528989315033,
"kl": 0.0011425018310546875,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.258,
"grad_norm": 0.013770471327006817,
"kl": 0.0007495880126953125,
"learning_rate": 9.332771203643714e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.26,
"grad_norm": 0.7055062055587769,
"kl": 0.0012664794921875,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.262,
"grad_norm": 0.7149041295051575,
"kl": 0.002803802490234375,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0001,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 1480.25,
"epoch": 0.264,
"grad_norm": 0.014864159747958183,
"kl": 0.031280517578125,
"learning_rate": 9.282549715730579e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.266,
"grad_norm": 0.8071376085281372,
"kl": 0.00067901611328125,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2047.25,
"epoch": 0.268,
"grad_norm": 0.009836402721703053,
"kl": 0.0012359619140625,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2012.25,
"epoch": 0.27,
"grad_norm": 0.007896827533841133,
"kl": 0.0009002685546875,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.272,
"grad_norm": 0.0065347570925951,
"kl": 0.000980377197265625,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.274,
"grad_norm": 0.8659334778785706,
"kl": 0.00147247314453125,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.276,
"grad_norm": 0.009184672497212887,
"kl": 0.001373291015625,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.278,
"grad_norm": 0.00881196279078722,
"kl": 0.001476287841796875,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1876.0,
"epoch": 0.28,
"grad_norm": 0.7441470623016357,
"kl": 0.0006885528564453125,
"learning_rate": 9.140576474687263e-07,
"loss": 0.06,
"reward": 0.3125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.282,
"grad_norm": 0.008356408216059208,
"kl": 0.00101470947265625,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1991.5,
"epoch": 0.284,
"grad_norm": 0.9580811262130737,
"kl": 0.006805419921875,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0003,
"reward": 0.5625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.286,
"grad_norm": 0.008105803281068802,
"kl": 0.002086639404296875,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.288,
"grad_norm": 0.007048532832413912,
"kl": 0.001430511474609375,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 1714.5,
"epoch": 0.29,
"grad_norm": 0.881592869758606,
"kl": 0.001071929931640625,
"learning_rate": 9.046048391230247e-07,
"loss": 0.1707,
"reward": 0.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.292,
"grad_norm": 0.008095495402812958,
"kl": 0.00165557861328125,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.294,
"grad_norm": 0.019286898896098137,
"kl": 0.00104522705078125,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.296,
"grad_norm": 0.007973677478730679,
"kl": 0.0015087127685546875,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.298,
"grad_norm": 0.007949120365083218,
"kl": 0.0004978179931640625,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.3,
"grad_norm": 0.007726718205958605,
"kl": 0.001689910888671875,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.302,
"grad_norm": 0.00826709158718586,
"kl": 0.0013580322265625,
"learning_rate": 8.926922383915315e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.304,
"grad_norm": 0.007963276468217373,
"kl": 0.00203704833984375,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.306,
"grad_norm": 0.008207273669540882,
"kl": 0.0008563995361328125,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.308,
"grad_norm": 0.010204545222222805,
"kl": 0.0006160736083984375,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.31,
"grad_norm": 0.00880539882928133,
"kl": 0.001453399658203125,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.312,
"grad_norm": 0.010449129156768322,
"kl": 0.0010318756103515625,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.314,
"grad_norm": 0.010188284330070019,
"kl": 0.000789642333984375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1963.5,
"epoch": 0.316,
"grad_norm": 0.7694103717803955,
"kl": 0.000640869140625,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0318,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 1807.0,
"epoch": 0.318,
"grad_norm": 0.8925904631614685,
"kl": 0.001445770263671875,
"learning_rate": 8.758773376468604e-07,
"loss": -0.1087,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.32,
"grad_norm": 0.7490972280502319,
"kl": 0.01116180419921875,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0004,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.322,
"grad_norm": 0.008408155292272568,
"kl": 0.0014629364013671875,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.324,
"grad_norm": 0.8539191484451294,
"kl": 0.0008831024169921875,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.326,
"grad_norm": 0.009831397794187069,
"kl": 0.000682830810546875,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.328,
"grad_norm": 0.011773304082453251,
"kl": 0.001129150390625,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.33,
"grad_norm": 0.7276328206062317,
"kl": 0.001377105712890625,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1512.75,
"epoch": 0.332,
"grad_norm": 0.008593901991844177,
"kl": 0.0006933212280273438,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0,
"reward": 0.375,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.334,
"grad_norm": 0.011681273579597473,
"kl": 0.000759124755859375,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.336,
"grad_norm": 0.9719880819320679,
"kl": 0.00112152099609375,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1850.75,
"epoch": 0.338,
"grad_norm": 0.017566794529557228,
"kl": 0.002292633056640625,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2016.0,
"epoch": 0.34,
"grad_norm": 1.1285712718963623,
"kl": 0.00118255615234375,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0114,
"reward": 0.3125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.342,
"grad_norm": 0.009710059501230717,
"kl": 0.001407623291015625,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.344,
"grad_norm": 0.009776381775736809,
"kl": 0.0014629364013671875,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.346,
"grad_norm": 0.00871388241648674,
"kl": 0.000507354736328125,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.348,
"grad_norm": 0.00912429578602314,
"kl": 0.000762939453125,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.35,
"grad_norm": 0.011816666461527348,
"kl": 0.0008449554443359375,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.352,
"grad_norm": 0.00985003262758255,
"kl": 0.001529693603515625,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.354,
"grad_norm": 0.00919476430863142,
"kl": 0.00167083740234375,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.356,
"grad_norm": 0.01162977609783411,
"kl": 0.00091552734375,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 1970.5,
"epoch": 0.358,
"grad_norm": 0.01332745049148798,
"kl": 0.001888275146484375,
"learning_rate": 8.295165011252396e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 1730.75,
"epoch": 0.36,
"grad_norm": 0.013342260383069515,
"kl": 0.00089263916015625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.362,
"grad_norm": 0.012930563651025295,
"kl": 0.001483917236328125,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0001,
"reward": 0.625,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.364,
"grad_norm": 0.03247583284974098,
"kl": 0.00113677978515625,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.366,
"grad_norm": 0.8804360032081604,
"kl": 0.0019130706787109375,
"learning_rate": 8.195606193320136e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.368,
"grad_norm": 0.007631482556462288,
"kl": 0.001468658447265625,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.37,
"grad_norm": 0.008286840282380581,
"kl": 0.001613616943359375,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.372,
"grad_norm": 0.016586236655712128,
"kl": 0.0007076263427734375,
"learning_rate": 8.119553365707802e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.374,
"grad_norm": 0.010034182108938694,
"kl": 0.0008754730224609375,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.376,
"grad_norm": 0.7020014524459839,
"kl": 0.0012836456298828125,
"learning_rate": 8.068211054579943e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.378,
"grad_norm": 1.163500189781189,
"kl": 0.0016021728515625,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.38,
"grad_norm": 0.8533800840377808,
"kl": 0.0010223388671875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.382,
"grad_norm": 0.8227788805961609,
"kl": 0.0030975341796875,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.384,
"grad_norm": 0.011001263745129108,
"kl": 0.0008344650268554688,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.386,
"grad_norm": 0.009392702020704746,
"kl": 0.001972198486328125,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1788.25,
"epoch": 0.388,
"grad_norm": 1.1597681045532227,
"kl": 0.01373291015625,
"learning_rate": 7.911220577405484e-07,
"loss": 0.1207,
"reward": 0.1875,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.39,
"grad_norm": 0.00962373148649931,
"kl": 0.0009326934814453125,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2025.0,
"epoch": 0.392,
"grad_norm": 0.009538036771118641,
"kl": 0.00116729736328125,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.394,
"grad_norm": 0.01050383411347866,
"kl": 0.0007953643798828125,
"learning_rate": 7.831121542179086e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.396,
"grad_norm": 0.009053800255060196,
"kl": 0.001438140869140625,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.398,
"grad_norm": 0.00897100381553173,
"kl": 0.001312255859375,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.4,
"grad_norm": 0.01025310903787613,
"kl": 0.001468658447265625,
"learning_rate": 7.75e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.402,
"grad_norm": 0.054522059857845306,
"kl": 0.0010223388671875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 1953.5,
"epoch": 0.404,
"grad_norm": 0.037722665816545486,
"kl": 0.0011157989501953125,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.406,
"grad_norm": 0.008431609719991684,
"kl": 0.0020599365234375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 1929.0,
"epoch": 0.408,
"grad_norm": 0.9533175826072693,
"kl": 0.001682281494140625,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0001,
"reward": 0.5625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2036.0,
"epoch": 0.41,
"grad_norm": 0.013962327502667904,
"kl": 0.0011119842529296875,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.412,
"grad_norm": 0.010255957953631878,
"kl": 0.0009479522705078125,
"learning_rate": 7.584832158039378e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.414,
"grad_norm": 0.01276308298110962,
"kl": 0.0014495849609375,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 1809.5,
"epoch": 0.416,
"grad_norm": 0.01625184714794159,
"kl": 0.00093841552734375,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.418,
"grad_norm": 0.010438801720738411,
"kl": 0.0012836456298828125,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.42,
"grad_norm": 0.011556737124919891,
"kl": 0.00215911865234375,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.422,
"grad_norm": 0.009882017970085144,
"kl": 0.0005950927734375,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 1786.0,
"epoch": 0.424,
"grad_norm": 0.8873146176338196,
"kl": 0.002838134765625,
"learning_rate": 7.416006812042827e-07,
"loss": -0.0358,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.426,
"grad_norm": 0.01222989521920681,
"kl": 0.0014190673828125,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.428,
"grad_norm": 0.009303831495344639,
"kl": 0.00127410888671875,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.43,
"grad_norm": 0.0105022257193923,
"kl": 0.0008087158203125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.432,
"grad_norm": 0.763167679309845,
"kl": 0.001544952392578125,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0001,
"reward": 0.3125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.434,
"grad_norm": 0.010868730954825878,
"kl": 0.001415252685546875,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.436,
"grad_norm": 0.016456812620162964,
"kl": 0.0006694793701171875,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 1947.75,
"epoch": 0.438,
"grad_norm": 0.5713726282119751,
"kl": 0.0039825439453125,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0002,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.44,
"grad_norm": 0.008825725875794888,
"kl": 0.000873565673828125,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.442,
"grad_norm": 0.015775861218571663,
"kl": 0.0016326904296875,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1892.75,
"epoch": 0.444,
"grad_norm": 0.011794094927608967,
"kl": 0.002288818359375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.446,
"grad_norm": 0.8902475833892822,
"kl": 0.0021953582763671875,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 1976.5,
"epoch": 0.448,
"grad_norm": 0.01060924306511879,
"kl": 0.00159454345703125,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 1884.0,
"epoch": 0.45,
"grad_norm": 0.7874477505683899,
"kl": 0.00165557861328125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0675,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.452,
"grad_norm": 0.009480384178459644,
"kl": 0.001316070556640625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.454,
"grad_norm": 0.8971606492996216,
"kl": 0.00217437744140625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1950.75,
"epoch": 0.456,
"grad_norm": 0.013516876846551895,
"kl": 0.001598358154296875,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.458,
"grad_norm": 0.012519976124167442,
"kl": 0.0012359619140625,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 1604.75,
"epoch": 0.46,
"grad_norm": 1.0533803701400757,
"kl": 0.0024871826171875,
"learning_rate": 6.890576474687263e-07,
"loss": 0.1645,
"reward": 0.75,
"reward_std": 0.7071067541837692,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.462,
"grad_norm": 0.04883728548884392,
"kl": 0.0012359619140625,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.464,
"grad_norm": 0.01617475040256977,
"kl": 0.00106048583984375,
"learning_rate": 6.83068622519821e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.466,
"grad_norm": 0.01098883431404829,
"kl": 0.00160980224609375,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.468,
"grad_norm": 0.010283930227160454,
"kl": 0.001750946044921875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.47,
"grad_norm": 0.7373048663139343,
"kl": 0.00131988525390625,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.472,
"grad_norm": 0.01158232893794775,
"kl": 0.001461029052734375,
"learning_rate": 6.710139192768694e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.474,
"grad_norm": 0.8555125594139099,
"kl": 0.001068115234375,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.476,
"grad_norm": 0.8464781641960144,
"kl": 0.006557464599609375,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0003,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.478,
"grad_norm": 0.14680787920951843,
"kl": 0.003086090087890625,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.48,
"grad_norm": 0.8916466236114502,
"kl": 0.010036468505859375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0004,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.482,
"grad_norm": 0.012613446451723576,
"kl": 0.001827239990234375,
"learning_rate": 6.558139508961654e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.484,
"grad_norm": 0.012667362578213215,
"kl": 0.001438140869140625,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.486,
"grad_norm": 0.011633522808551788,
"kl": 0.0007953643798828125,
"learning_rate": 6.496968239287603e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.488,
"grad_norm": 0.010322234593331814,
"kl": 0.00118255615234375,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.49,
"grad_norm": 0.013764915056526661,
"kl": 0.001338958740234375,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.492,
"grad_norm": 0.016423719003796577,
"kl": 0.0011043548583984375,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.494,
"grad_norm": 0.011886836029589176,
"kl": 0.0008907318115234375,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.496,
"grad_norm": 0.013753964565694332,
"kl": 0.0009822845458984375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.498,
"grad_norm": 0.012257426045835018,
"kl": 0.00170135498046875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.5,
"grad_norm": 0.7262430191040039,
"kl": 0.00194549560546875,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.502,
"grad_norm": 0.7270606756210327,
"kl": 0.0007076263427734375,
"learning_rate": 6.25045936022246e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.504,
"grad_norm": 0.017829036340117455,
"kl": 0.0005779266357421875,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.506,
"grad_norm": 0.0317312628030777,
"kl": 0.0013294219970703125,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 1972.25,
"epoch": 0.508,
"grad_norm": 0.026640823110938072,
"kl": 0.000835418701171875,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.51,
"grad_norm": 0.012969509698450565,
"kl": 0.001567840576171875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.512,
"grad_norm": 0.012548292055726051,
"kl": 0.0010986328125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.514,
"grad_norm": 0.008928977884352207,
"kl": 0.000873565673828125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.516,
"grad_norm": 0.8101487755775452,
"kl": 0.0009307861328125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.518,
"grad_norm": 0.8978201746940613,
"kl": 0.0007915496826171875,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.52,
"grad_norm": 1.092624306678772,
"kl": 0.00101470947265625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.522,
"grad_norm": 0.7928449511528015,
"kl": 0.0011920928955078125,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 1534.75,
"epoch": 0.524,
"grad_norm": 1.0032625198364258,
"kl": 0.0021257400512695312,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0615,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.526,
"grad_norm": 0.011720544658601284,
"kl": 0.000644683837890625,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.528,
"grad_norm": 0.011771513149142265,
"kl": 0.0007781982421875,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.53,
"grad_norm": 0.013503954745829105,
"kl": 0.001155853271484375,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.532,
"grad_norm": 0.09234623610973358,
"kl": 0.0018482208251953125,
"learning_rate": 5.78255733788191e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.534,
"grad_norm": 0.011625738814473152,
"kl": 0.0007114410400390625,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 1589.75,
"epoch": 0.536,
"grad_norm": 0.9924006462097168,
"kl": 0.0024566650390625,
"learning_rate": 5.71982396408026e-07,
"loss": -0.0413,
"reward": 0.875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.538,
"grad_norm": 0.726823627948761,
"kl": 0.001861572265625,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.54,
"grad_norm": 0.011368845589458942,
"kl": 0.001186370849609375,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.542,
"grad_norm": 0.014150974340736866,
"kl": 0.0012617111206054688,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.544,
"grad_norm": 0.03309360519051552,
"kl": 0.00139617919921875,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.546,
"grad_norm": 0.027406711131334305,
"kl": 0.002048492431640625,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 1862.75,
"epoch": 0.548,
"grad_norm": 0.872232973575592,
"kl": 0.001857757568359375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.0781,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1893.5,
"epoch": 0.55,
"grad_norm": 0.013754754327237606,
"kl": 0.00106048583984375,
"learning_rate": 5.5e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.552,
"grad_norm": 0.8218861222267151,
"kl": 0.0015106201171875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.554,
"grad_norm": 0.01286914199590683,
"kl": 0.001323699951171875,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.556,
"grad_norm": 0.8357925415039062,
"kl": 0.00146484375,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0001,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.558,
"grad_norm": 0.011115595698356628,
"kl": 0.001163482666015625,
"learning_rate": 5.37435262574394e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.56,
"grad_norm": 0.7368820309638977,
"kl": 0.0016918182373046875,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1845.5,
"epoch": 0.562,
"grad_norm": 0.8080930709838867,
"kl": 0.002285003662109375,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0872,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.564,
"grad_norm": 0.6212303638458252,
"kl": 0.0017833709716796875,
"learning_rate": 5.28017603591974e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.566,
"grad_norm": 0.8698393106460571,
"kl": 0.001556396484375,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.568,
"grad_norm": 0.6195830702781677,
"kl": 0.001125335693359375,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.57,
"grad_norm": 0.014679288491606712,
"kl": 0.00112152099609375,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.572,
"grad_norm": 0.011439023539423943,
"kl": 0.000751495361328125,
"learning_rate": 5.154764373429315e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.574,
"grad_norm": 0.013943897560238838,
"kl": 0.00086212158203125,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.576,
"grad_norm": 0.011966060847043991,
"kl": 0.001087188720703125,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.578,
"grad_norm": 0.7932072877883911,
"kl": 0.002170562744140625,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.58,
"grad_norm": 0.01169038936495781,
"kl": 0.001129150390625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.582,
"grad_norm": 0.85643470287323,
"kl": 0.002471923828125,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.584,
"grad_norm": 1.001603364944458,
"kl": 0.01709747314453125,
"learning_rate": 4.967182142620745e-07,
"loss": 0.0007,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1886.25,
"epoch": 0.586,
"grad_norm": 0.7774127721786499,
"kl": 0.001667022705078125,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0664,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.588,
"grad_norm": 1.0563451051712036,
"kl": 0.00324249267578125,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.59,
"grad_norm": 0.012082475237548351,
"kl": 0.00081634521484375,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2003.5,
"epoch": 0.592,
"grad_norm": 0.590258002281189,
"kl": 0.001506805419921875,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0161,
"reward": 0.625,
"reward_std": 0.5303300619125366,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.594,
"grad_norm": 0.7182537913322449,
"kl": 0.00550079345703125,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0002,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1947.25,
"epoch": 0.596,
"grad_norm": 1.0634351968765259,
"kl": 0.0025787353515625,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0387,
"reward": 0.6875,
"reward_std": 0.6187184080481529,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.598,
"grad_norm": 0.054920367896556854,
"kl": 0.00203704833984375,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.6,
"grad_norm": 0.012694926001131535,
"kl": 0.000698089599609375,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 1795.5,
"epoch": 0.602,
"grad_norm": 0.759120523929596,
"kl": 0.0024261474609375,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.604,
"grad_norm": 0.010465751402080059,
"kl": 0.001453399658203125,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1853.0,
"epoch": 0.606,
"grad_norm": 0.8468216061592102,
"kl": 0.00121307373046875,
"learning_rate": 4.6259454195101267e-07,
"loss": -0.0831,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.608,
"grad_norm": 0.9815247654914856,
"kl": 0.001689910888671875,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 1868.75,
"epoch": 0.61,
"grad_norm": 0.8037129044532776,
"kl": 0.000972747802734375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.075,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.612,
"grad_norm": 0.011851584538817406,
"kl": 0.001220703125,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.614,
"grad_norm": 0.012452667579054832,
"kl": 0.000675201416015625,
"learning_rate": 4.503031760712397e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.616,
"grad_norm": 0.8061856627464294,
"kl": 0.001941680908203125,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.618,
"grad_norm": 0.011598404496908188,
"kl": 0.0009021759033203125,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.62,
"grad_norm": 0.013049165718257427,
"kl": 0.0008697509765625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.622,
"grad_norm": 0.01000931765884161,
"kl": 0.0010738372802734375,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.624,
"grad_norm": 0.017103025689721107,
"kl": 0.00090789794921875,
"learning_rate": 4.350494089288943e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.626,
"grad_norm": 0.015600494109094143,
"kl": 0.001026153564453125,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.628,
"grad_norm": 0.68843674659729,
"kl": 0.0009326934814453125,
"learning_rate": 4.2898608072313045e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.63,
"grad_norm": 0.02028859592974186,
"kl": 0.001399993896484375,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.632,
"grad_norm": 0.010004539042711258,
"kl": 0.0006055831909179688,
"learning_rate": 4.2294634442070553e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.634,
"grad_norm": 0.012407040223479271,
"kl": 0.001323699951171875,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.636,
"grad_norm": 0.7615489363670349,
"kl": 0.001617431640625,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.638,
"grad_norm": 0.7873152494430542,
"kl": 0.00064849853515625,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.64,
"grad_norm": 0.012965940870344639,
"kl": 0.000850677490234375,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.642,
"grad_norm": 0.013525367714464664,
"kl": 0.000804901123046875,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.644,
"grad_norm": 0.8479946255683899,
"kl": 0.0014972686767578125,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.646,
"grad_norm": 0.7696375250816345,
"kl": 0.002063751220703125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.648,
"grad_norm": 0.012891444377601147,
"kl": 0.00150299072265625,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.65,
"grad_norm": 1.1341594457626343,
"kl": 0.002437591552734375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0001,
"reward": 0.375,
"reward_std": 0.5303300842642784,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.652,
"grad_norm": 0.015124933794140816,
"kl": 0.0007419586181640625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.654,
"grad_norm": 0.7730603814125061,
"kl": 0.002941131591796875,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.656,
"grad_norm": 0.02122451364994049,
"kl": 0.000957489013671875,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.658,
"grad_norm": 0.01871700957417488,
"kl": 0.0010528564453125,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.66,
"grad_norm": 0.0117810582742095,
"kl": 0.0014801025390625,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1353.5,
"epoch": 0.662,
"grad_norm": 0.017292601987719536,
"kl": 0.0030651092529296875,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0001,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1955.25,
"epoch": 0.664,
"grad_norm": 0.7610009908676147,
"kl": 0.001689910888671875,
"learning_rate": 3.7561798609655373e-07,
"loss": 0.0353,
"reward": 0.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.666,
"grad_norm": 0.013269172981381416,
"kl": 0.0008087158203125,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.668,
"grad_norm": 0.01594419591128826,
"kl": 0.000812530517578125,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.67,
"grad_norm": 0.010347824543714523,
"kl": 0.000751495361328125,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.672,
"grad_norm": 0.014122740365564823,
"kl": 0.0012054443359375,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.674,
"grad_norm": 0.013101037591695786,
"kl": 0.001033782958984375,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.676,
"grad_norm": 0.666260838508606,
"kl": 0.00324249267578125,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 2022.75,
"epoch": 0.678,
"grad_norm": 0.7250146269798279,
"kl": 0.00144195556640625,
"learning_rate": 3.555614130391079e-07,
"loss": 0.009,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 2003.25,
"epoch": 0.68,
"grad_norm": 0.7383328676223755,
"kl": 0.00299072265625,
"learning_rate": 3.5273298394491515e-07,
"loss": -0.016,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.682,
"grad_norm": 0.011795282363891602,
"kl": 0.000850677490234375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.684,
"grad_norm": 0.9384401440620422,
"kl": 0.002079010009765625,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.686,
"grad_norm": 0.010377887636423111,
"kl": 0.0008144378662109375,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.688,
"grad_norm": 0.7026439309120178,
"kl": 0.001007080078125,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.69,
"grad_norm": 0.011138451285660267,
"kl": 0.0008373260498046875,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.692,
"grad_norm": 0.7246440649032593,
"kl": 0.0035762786865234375,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.694,
"grad_norm": 0.01353926956653595,
"kl": 0.001087188720703125,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1851.25,
"epoch": 0.696,
"grad_norm": 1.2388075590133667,
"kl": 0.00121307373046875,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0841,
"reward": 0.1875,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 2005.75,
"epoch": 0.698,
"grad_norm": 0.7063978314399719,
"kl": 0.002201080322265625,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.0153,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.7,
"grad_norm": 0.009749515913426876,
"kl": 0.001163482666015625,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.702,
"grad_norm": 0.01226428709924221,
"kl": 0.00107574462890625,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.704,
"grad_norm": 0.01303025335073471,
"kl": 0.001399993896484375,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.706,
"grad_norm": 0.028528396040201187,
"kl": 0.001117706298828125,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 1785.25,
"epoch": 0.708,
"grad_norm": 0.9730527997016907,
"kl": 0.001430511474609375,
"learning_rate": 3.142063423134644e-07,
"loss": 0.122,
"reward": 0.875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.71,
"grad_norm": 0.013550005853176117,
"kl": 0.001338958740234375,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.712,
"grad_norm": 0.010767250321805477,
"kl": 0.00067138671875,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.714,
"grad_norm": 0.012552580796182156,
"kl": 0.00140380859375,
"learning_rate": 3.062313053727671e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.716,
"grad_norm": 0.6516157984733582,
"kl": 0.00159454345703125,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.718,
"grad_norm": 0.012717018835246563,
"kl": 0.0008392333984375,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.72,
"grad_norm": 0.014254845678806305,
"kl": 0.001018524169921875,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.722,
"grad_norm": 0.017017841339111328,
"kl": 0.0009784698486328125,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 1976.75,
"epoch": 0.724,
"grad_norm": 0.8839628100395203,
"kl": 0.002727508544921875,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0265,
"reward": 0.3125,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.726,
"grad_norm": 0.010562002658843994,
"kl": 0.00103759765625,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.728,
"grad_norm": 0.013268781825900078,
"kl": 0.0009174346923828125,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 1927.75,
"epoch": 0.73,
"grad_norm": 0.8109666705131531,
"kl": 0.0020313262939453125,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0471,
"reward": 0.875,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.732,
"grad_norm": 0.012277526780962944,
"kl": 0.000827789306640625,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.734,
"grad_norm": 0.7054362893104553,
"kl": 0.0069103240966796875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0003,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.736,
"grad_norm": 0.012917861342430115,
"kl": 0.0012969970703125,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.738,
"grad_norm": 0.011064048856496811,
"kl": 0.0007419586181640625,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.74,
"grad_norm": 0.014153026975691319,
"kl": 0.00079345703125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 1957.5,
"epoch": 0.742,
"grad_norm": 0.02183438278734684,
"kl": 0.002010345458984375,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 1967.0,
"epoch": 0.744,
"grad_norm": 0.6841965317726135,
"kl": 0.00157928466796875,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0304,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.746,
"grad_norm": 0.010570400394499302,
"kl": 0.00075531005859375,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.748,
"grad_norm": 0.013351581990718842,
"kl": 0.000873565673828125,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 1565.75,
"epoch": 0.75,
"grad_norm": 0.9974377751350403,
"kl": 0.008136749267578125,
"learning_rate": 2.6074557564105724e-07,
"loss": -0.3145,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 1881.75,
"epoch": 0.752,
"grad_norm": 1.1940348148345947,
"kl": 0.0038909912109375,
"learning_rate": 2.583460445215911e-07,
"loss": 0.0687,
"reward": 0.3125,
"reward_std": 0.2651650384068489,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.754,
"grad_norm": 0.01335156336426735,
"kl": 0.0006580352783203125,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.756,
"grad_norm": 0.011012405157089233,
"kl": 0.00142669677734375,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.758,
"grad_norm": 0.16783788800239563,
"kl": 0.00443267822265625,
"learning_rate": 2.512332043064913e-07,
"loss": 0.0002,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.76,
"grad_norm": 0.011893996968865395,
"kl": 0.001003265380859375,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.762,
"grad_norm": 0.7219942212104797,
"kl": 0.00760650634765625,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0003,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.764,
"grad_norm": 0.04678433761000633,
"kl": 0.0009021759033203125,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.766,
"grad_norm": 0.011990766040980816,
"kl": 0.0006542205810546875,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.768,
"grad_norm": 0.011609113775193691,
"kl": 0.0009002685546875,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.77,
"grad_norm": 0.029996510595083237,
"kl": 0.001007080078125,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.772,
"grad_norm": 0.7205003499984741,
"kl": 0.001007080078125,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.774,
"grad_norm": 0.01278019044548273,
"kl": 0.000644683837890625,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.776,
"grad_norm": 0.050757694989442825,
"kl": 0.0013141632080078125,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.778,
"grad_norm": 0.8608619570732117,
"kl": 0.00222015380859375,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0001,
"reward": 0.625,
"reward_std": 0.5303300768136978,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.78,
"grad_norm": 0.01890842616558075,
"kl": 0.0016078948974609375,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 1757.5,
"epoch": 0.782,
"grad_norm": 0.9615358710289001,
"kl": 0.00257110595703125,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.1401,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 2038.25,
"epoch": 0.784,
"grad_norm": 0.7946398854255676,
"kl": 0.0009212493896484375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0034,
"reward": 0.8125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3125,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.786,
"grad_norm": 0.01277601532638073,
"kl": 0.001102447509765625,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 1800.5,
"epoch": 0.788,
"grad_norm": 0.9031627774238586,
"kl": 0.00704193115234375,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.1129,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.79,
"grad_norm": 0.017206581309437752,
"kl": 0.001468658447265625,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 1820.75,
"epoch": 0.792,
"grad_norm": 0.9008931517601013,
"kl": 0.0047149658203125,
"learning_rate": 2.134908592756607e-07,
"loss": 0.0002,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 2010.25,
"epoch": 0.794,
"grad_norm": 0.8031813502311707,
"kl": 0.00140380859375,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0135,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.796,
"grad_norm": 0.015710551291704178,
"kl": 0.0016117095947265625,
"learning_rate": 2.0935222495670968e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.798,
"grad_norm": 0.7256066203117371,
"kl": 0.00640869140625,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0003,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.8,
"grad_norm": 0.011895825155079365,
"kl": 0.00131988525390625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.802,
"grad_norm": 0.01262570358812809,
"kl": 0.00177001953125,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.804,
"grad_norm": 0.014729364775121212,
"kl": 0.00090789794921875,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.806,
"grad_norm": 0.012951449491083622,
"kl": 0.00146484375,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.808,
"grad_norm": 0.6974371671676636,
"kl": 0.0015716552734375,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0001,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.81,
"grad_norm": 0.012584330514073372,
"kl": 0.001598358154296875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.812,
"grad_norm": 0.011754573322832584,
"kl": 0.00107574462890625,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.814,
"grad_norm": 0.02457410842180252,
"kl": 0.0015869140625,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.816,
"grad_norm": 0.7841270565986633,
"kl": 0.00121307373046875,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.818,
"grad_norm": 0.01260992232710123,
"kl": 0.0014495849609375,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.82,
"grad_norm": 0.011464129202067852,
"kl": 0.0010471343994140625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 1915.0,
"epoch": 0.822,
"grad_norm": 0.8047151565551758,
"kl": 0.00250244140625,
"learning_rate": 1.8410465752883758e-07,
"loss": -0.0527,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.824,
"grad_norm": 0.0124558350071311,
"kl": 0.00130462646484375,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.826,
"grad_norm": 0.01380992028862238,
"kl": 0.0008144378662109375,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.828,
"grad_norm": 0.9551144242286682,
"kl": 0.011430740356445312,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0005,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.83,
"grad_norm": 0.012345471419394016,
"kl": 0.001438140869140625,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 1821.0,
"epoch": 0.832,
"grad_norm": 0.7977136969566345,
"kl": 0.001773834228515625,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.0981,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.834,
"grad_norm": 0.011019429191946983,
"kl": 0.0006771087646484375,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.836,
"grad_norm": 0.015206689946353436,
"kl": 0.0016021728515625,
"learning_rate": 1.7174502842694212e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.838,
"grad_norm": 0.04012497141957283,
"kl": 0.000911712646484375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.84,
"grad_norm": 0.01337823923677206,
"kl": 0.0009021759033203125,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.842,
"grad_norm": 0.010002349503338337,
"kl": 0.0014495849609375,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.844,
"grad_norm": 0.716344952583313,
"kl": 0.00653076171875,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0003,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.846,
"grad_norm": 0.037377193570137024,
"kl": 0.001346588134765625,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.848,
"grad_norm": 0.011378524824976921,
"kl": 0.0007991790771484375,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.85,
"grad_norm": 0.012089181691408157,
"kl": 0.00154876708984375,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.852,
"grad_norm": 0.7250344753265381,
"kl": 0.002223968505859375,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 1880.5,
"epoch": 0.854,
"grad_norm": 0.8333255648612976,
"kl": 0.0015106201171875,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0692,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1326.75,
"epoch": 0.856,
"grad_norm": 1.4510780572891235,
"kl": 0.0024261474609375,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0319,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.858,
"grad_norm": 0.01227201521396637,
"kl": 0.0006866455078125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.86,
"grad_norm": 0.012068121694028378,
"kl": 0.0009098052978515625,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.862,
"grad_norm": 0.012486455962061882,
"kl": 0.000980377197265625,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.864,
"grad_norm": 0.013689450919628143,
"kl": 0.00124359130859375,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.866,
"grad_norm": 0.5863283276557922,
"kl": 0.00223541259765625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.868,
"grad_norm": 0.01356459315866232,
"kl": 0.00078582763671875,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.87,
"grad_norm": 0.009759028442203999,
"kl": 0.0011749267578125,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.872,
"grad_norm": 0.02292313612997532,
"kl": 0.0009002685546875,
"learning_rate": 1.4417536311769885e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.874,
"grad_norm": 0.013289421796798706,
"kl": 0.00077056884765625,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.876,
"grad_norm": 0.0265584085136652,
"kl": 0.00119781494140625,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 2008.25,
"epoch": 0.878,
"grad_norm": 0.013609832152724266,
"kl": 0.000782012939453125,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.88,
"grad_norm": 0.010987906716763973,
"kl": 0.00115203857421875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.882,
"grad_norm": 0.01149643212556839,
"kl": 0.0007152557373046875,
"learning_rate": 1.3763677169699217e-07,
"loss": 0.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.884,
"grad_norm": 0.012063896283507347,
"kl": 0.000858306884765625,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.886,
"grad_norm": 0.6539027094841003,
"kl": 0.0033817291259765625,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.888,
"grad_norm": 0.683651864528656,
"kl": 0.001430511474609375,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.89,
"grad_norm": 0.014244834892451763,
"kl": 0.001583099365234375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 1551.25,
"epoch": 0.892,
"grad_norm": 0.04096902534365654,
"kl": 0.003017425537109375,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0001,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.894,
"grad_norm": 0.013455760665237904,
"kl": 0.00115203857421875,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 1822.0,
"epoch": 0.896,
"grad_norm": 1.1271547079086304,
"kl": 0.0069122314453125,
"learning_rate": 1.2932844562179352e-07,
"loss": -0.0998,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.898,
"grad_norm": 0.6208535432815552,
"kl": 0.0009765625,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0,
"reward": 0.5,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.9,
"grad_norm": 0.009892701171338558,
"kl": 0.0006608963012695312,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.902,
"grad_norm": 0.0119470888748765,
"kl": 0.000751495361328125,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 1717.0,
"epoch": 0.904,
"grad_norm": 0.015732314437627792,
"kl": 0.001445770263671875,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.0,
"reward": 0.125,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.906,
"grad_norm": 0.7161591649055481,
"kl": 0.001621246337890625,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.908,
"grad_norm": 0.009790318086743355,
"kl": 0.0006418228149414062,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.91,
"grad_norm": 0.01178540289402008,
"kl": 0.0012798309326171875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 1967.75,
"epoch": 0.912,
"grad_norm": 0.918179452419281,
"kl": 0.00170135498046875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0301,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.914,
"grad_norm": 0.7712357640266418,
"kl": 0.00215911865234375,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0001,
"reward": 0.6875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1705.5,
"epoch": 0.916,
"grad_norm": 0.8978808522224426,
"kl": 0.00180816650390625,
"learning_rate": 1.1920622611056974e-07,
"loss": 0.0779,
"reward": 0.25,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.918,
"grad_norm": 0.8227681517601013,
"kl": 0.00136566162109375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.92,
"grad_norm": 0.009116187691688538,
"kl": 0.00128936767578125,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.922,
"grad_norm": 0.014834447763860226,
"kl": 0.00095367431640625,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.924,
"grad_norm": 0.013181531801819801,
"kl": 0.0008907318115234375,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.926,
"grad_norm": 0.010957694612443447,
"kl": 0.0006504058837890625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.928,
"grad_norm": 0.8815539479255676,
"kl": 0.0010204315185546875,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.93,
"grad_norm": 0.010624675080180168,
"kl": 0.0008087158203125,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.932,
"grad_norm": 0.013844382017850876,
"kl": 0.001033782958984375,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.934,
"grad_norm": 0.012338408268988132,
"kl": 0.000690460205078125,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 1994.0,
"epoch": 0.936,
"grad_norm": 0.7254545092582703,
"kl": 0.0010471343994140625,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0197,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.938,
"grad_norm": 0.01179241482168436,
"kl": 0.0006885528564453125,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.94,
"grad_norm": 0.6890900135040283,
"kl": 0.001373291015625,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.942,
"grad_norm": 0.5639128684997559,
"kl": 0.00131988525390625,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.944,
"grad_norm": 0.012823720462620258,
"kl": 0.001026153564453125,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.946,
"grad_norm": 0.009964537806808949,
"kl": 0.0007228851318359375,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.948,
"grad_norm": 0.7185282707214355,
"kl": 0.0028629302978515625,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.95,
"grad_norm": 0.012274986132979393,
"kl": 0.00087738037109375,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 1744.25,
"epoch": 0.952,
"grad_norm": 0.8758648633956909,
"kl": 0.002716064453125,
"learning_rate": 1.063017833182728e-07,
"loss": 0.1492,
"reward": 0.1875,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.954,
"grad_norm": 0.014645918272435665,
"kl": 0.000720977783203125,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.956,
"grad_norm": 0.013319121673703194,
"kl": 0.0009250640869140625,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.958,
"grad_norm": 0.009549077600240707,
"kl": 0.0014190673828125,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.96,
"grad_norm": 0.015292295254766941,
"kl": 0.0009307861328125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.962,
"grad_norm": 1.031322717666626,
"kl": 0.003509521484375,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0001,
"reward": 0.125,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.125,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.964,
"grad_norm": 0.011882193386554718,
"kl": 0.001026153564453125,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 1847.0,
"epoch": 0.966,
"grad_norm": 0.6594410538673401,
"kl": 0.002811431884765625,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0864,
"reward": 0.4375,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.968,
"grad_norm": 0.013277575373649597,
"kl": 0.000789642333984375,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.97,
"grad_norm": 0.7425467371940613,
"kl": 0.00323486328125,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0001,
"reward": 0.0625,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0625,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.972,
"grad_norm": 0.01078079268336296,
"kl": 0.00154876708984375,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.974,
"grad_norm": 0.012354187667369843,
"kl": 0.0005855560302734375,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.976,
"grad_norm": 0.7250139117240906,
"kl": 0.0037212371826171875,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.978,
"grad_norm": 0.014553495682775974,
"kl": 0.00141143798828125,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0001,
"reward": 0.75,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.98,
"grad_norm": 0.01307358592748642,
"kl": 0.000835418701171875,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 1908.5,
"epoch": 0.982,
"grad_norm": 0.7941016554832458,
"kl": 0.0010204315185546875,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0558,
"reward": 0.375,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.375,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.984,
"grad_norm": 0.008943392895162106,
"kl": 0.000537872314453125,
"learning_rate": 1.0070165611810855e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.986,
"grad_norm": 0.012137016281485558,
"kl": 0.001377105712890625,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0001,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.988,
"grad_norm": 0.01392819918692112,
"kl": 0.0009784698486328125,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.99,
"grad_norm": 0.010369054041802883,
"kl": 0.000942230224609375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.992,
"grad_norm": 0.011974328197538853,
"kl": 0.0006275177001953125,
"learning_rate": 1.0017544823184055e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.994,
"grad_norm": 0.01267942413687706,
"kl": 0.00104522705078125,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.996,
"grad_norm": 0.876571536064148,
"kl": 0.003925323486328125,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0002,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 0.998,
"grad_norm": 0.6709615588188171,
"kl": 0.001743316650390625,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0001,
"reward": 0.1875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1875,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 2048.0,
"epoch": 1.0,
"grad_norm": 0.014002146199345589,
"kl": 0.0007781982421875,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.25,
"step": 500
},
{
"epoch": 1.0,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.0033100851627775683,
"train_runtime": 14646.384,
"train_samples_per_second": 0.068,
"train_steps_per_second": 0.034
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}