|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.002, |
|
"grad_norm": 0.0, |
|
"kl": 0.0, |
|
"learning_rate": 2e-08, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.004, |
|
"grad_norm": 0.6894801259040833, |
|
"kl": 0.0, |
|
"learning_rate": 4e-08, |
|
"loss": 0.0, |
|
"reward": 0.5625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1845.75, |
|
"epoch": 0.006, |
|
"grad_norm": 0.0036829786840826273, |
|
"kl": 4.696846008300781e-05, |
|
"learning_rate": 6e-08, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.008, |
|
"grad_norm": 0.0038558689411729574, |
|
"kl": 9.775161743164062e-05, |
|
"learning_rate": 8e-08, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.01, |
|
"grad_norm": 0.7485816478729248, |
|
"kl": 0.00010919570922851562, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1985.5, |
|
"epoch": 0.012, |
|
"grad_norm": 0.7458791732788086, |
|
"kl": 7.796287536621094e-05, |
|
"learning_rate": 1.2e-07, |
|
"loss": -0.023, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.014, |
|
"grad_norm": 0.003527791704982519, |
|
"kl": 6.628036499023438e-05, |
|
"learning_rate": 1.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.016, |
|
"grad_norm": 0.8426324129104614, |
|
"kl": 7.104873657226562e-05, |
|
"learning_rate": 1.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.018, |
|
"grad_norm": 0.7430940270423889, |
|
"kl": 9.5367431640625e-05, |
|
"learning_rate": 1.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.02, |
|
"grad_norm": 0.003660305170342326, |
|
"kl": 5.626678466796875e-05, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.022, |
|
"grad_norm": 0.9972493648529053, |
|
"kl": 0.0001544952392578125, |
|
"learning_rate": 2.1999999999999998e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.024, |
|
"grad_norm": 0.003556522075086832, |
|
"kl": 6.723403930664062e-05, |
|
"learning_rate": 2.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1682.25, |
|
"epoch": 0.026, |
|
"grad_norm": 0.0050394581630826, |
|
"kl": 8.845329284667969e-05, |
|
"learning_rate": 2.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.028, |
|
"grad_norm": 0.004075606819242239, |
|
"kl": 0.0001049041748046875, |
|
"learning_rate": 2.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.03, |
|
"grad_norm": 0.6532822251319885, |
|
"kl": 7.2479248046875e-05, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.032, |
|
"grad_norm": 0.7762453556060791, |
|
"kl": 0.00011777877807617188, |
|
"learning_rate": 3.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.034, |
|
"grad_norm": 0.7456417679786682, |
|
"kl": 8.58306884765625e-05, |
|
"learning_rate": 3.4000000000000003e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1770.5, |
|
"epoch": 0.036, |
|
"grad_norm": 0.0034950117114931345, |
|
"kl": 3.5762786865234375e-05, |
|
"learning_rate": 3.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.038, |
|
"grad_norm": 0.003600472817197442, |
|
"kl": 6.866455078125e-05, |
|
"learning_rate": 3.7999999999999996e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1856.25, |
|
"epoch": 0.04, |
|
"grad_norm": 0.8460555076599121, |
|
"kl": 8.106231689453125e-05, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0814, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.042, |
|
"grad_norm": 0.0038911281153559685, |
|
"kl": 0.0001068115234375, |
|
"learning_rate": 4.1999999999999995e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.044, |
|
"grad_norm": 0.7139887809753418, |
|
"kl": 0.0001125335693359375, |
|
"learning_rate": 4.3999999999999997e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1855.25, |
|
"epoch": 0.046, |
|
"grad_norm": 0.005301118828356266, |
|
"kl": 5.6862831115722656e-05, |
|
"learning_rate": 4.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.048, |
|
"grad_norm": 0.0036839963868260384, |
|
"kl": 7.62939453125e-05, |
|
"learning_rate": 4.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.05, |
|
"grad_norm": 0.003639479400590062, |
|
"kl": 8.678436279296875e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.052, |
|
"grad_norm": 0.8071563243865967, |
|
"kl": 8.535385131835938e-05, |
|
"learning_rate": 5.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.054, |
|
"grad_norm": 0.0037776094395667315, |
|
"kl": 0.00011014938354492188, |
|
"learning_rate": 5.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1608.75, |
|
"epoch": 0.056, |
|
"grad_norm": 0.6625211834907532, |
|
"kl": 4.398822784423828e-05, |
|
"learning_rate": 5.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1841.25, |
|
"epoch": 0.058, |
|
"grad_norm": 0.0036761562805622816, |
|
"kl": 8.678436279296875e-05, |
|
"learning_rate": 5.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.06, |
|
"grad_norm": 0.0033802345860749483, |
|
"kl": 5.7697296142578125e-05, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.062, |
|
"grad_norm": 0.003240807680413127, |
|
"kl": 6.29425048828125e-05, |
|
"learning_rate": 6.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.064, |
|
"grad_norm": 0.0036673492286354303, |
|
"kl": 7.152557373046875e-05, |
|
"learning_rate": 6.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1735.75, |
|
"epoch": 0.066, |
|
"grad_norm": 0.8675172328948975, |
|
"kl": 2.682209014892578e-05, |
|
"learning_rate": 6.6e-07, |
|
"loss": -0.0631, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.068, |
|
"grad_norm": 0.8184774518013, |
|
"kl": 9.775161743164062e-05, |
|
"learning_rate": 6.800000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.07, |
|
"grad_norm": 0.003660478862002492, |
|
"kl": 0.00010251998901367188, |
|
"learning_rate": 7e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.072, |
|
"grad_norm": 0.004034126177430153, |
|
"kl": 6.4849853515625e-05, |
|
"learning_rate": 7.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.074, |
|
"grad_norm": 0.7589001655578613, |
|
"kl": 0.00010824203491210938, |
|
"learning_rate": 7.4e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.076, |
|
"grad_norm": 0.6517212986946106, |
|
"kl": 8.20159912109375e-05, |
|
"learning_rate": 7.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.078, |
|
"grad_norm": 0.8392653465270996, |
|
"kl": 0.0001430511474609375, |
|
"learning_rate": 7.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.08, |
|
"grad_norm": 0.990591824054718, |
|
"kl": 0.0001125335693359375, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.082, |
|
"grad_norm": 0.7600575685501099, |
|
"kl": 0.00014066696166992188, |
|
"learning_rate": 8.199999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.084, |
|
"grad_norm": 0.004982170183211565, |
|
"kl": 0.00013589859008789062, |
|
"learning_rate": 8.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.086, |
|
"grad_norm": 0.6588786840438843, |
|
"kl": 0.00019502639770507812, |
|
"learning_rate": 8.599999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.088, |
|
"grad_norm": 0.876471996307373, |
|
"kl": 0.00020933151245117188, |
|
"learning_rate": 8.799999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1857.0, |
|
"epoch": 0.09, |
|
"grad_norm": 0.007592031732201576, |
|
"kl": 0.00015783309936523438, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.092, |
|
"grad_norm": 0.649493932723999, |
|
"kl": 0.00015401840209960938, |
|
"learning_rate": 9.2e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.094, |
|
"grad_norm": 0.0060058352537453175, |
|
"kl": 0.00024318695068359375, |
|
"learning_rate": 9.399999999999999e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1703.5, |
|
"epoch": 0.096, |
|
"grad_norm": 0.7367803454399109, |
|
"kl": 0.00028061866760253906, |
|
"learning_rate": 9.6e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.098, |
|
"grad_norm": 0.7102321982383728, |
|
"kl": 0.0003223419189453125, |
|
"learning_rate": 9.8e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.1, |
|
"grad_norm": 0.5775962471961975, |
|
"kl": 0.00010824203491210938, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.102, |
|
"grad_norm": 0.0037765211891382933, |
|
"kl": 7.62939453125e-05, |
|
"learning_rate": 9.999890338174275e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1991.5, |
|
"epoch": 0.104, |
|
"grad_norm": 0.8791236281394958, |
|
"kl": 0.00046634674072265625, |
|
"learning_rate": 9.999561358041868e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.106, |
|
"grad_norm": 0.006120009813457727, |
|
"kl": 0.0004935264587402344, |
|
"learning_rate": 9.999013075636804e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.108, |
|
"grad_norm": 0.006084068212658167, |
|
"kl": 0.0002951622009277344, |
|
"learning_rate": 9.998245517681593e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1732.5, |
|
"epoch": 0.11, |
|
"grad_norm": 0.008206584490835667, |
|
"kl": 0.00015926361083984375, |
|
"learning_rate": 9.997258721585931e-07, |
|
"loss": 0.0, |
|
"reward": 0.5, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2031.5, |
|
"epoch": 0.112, |
|
"grad_norm": 0.06737767159938812, |
|
"kl": 0.0014257431030273438, |
|
"learning_rate": 9.996052735444862e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.114, |
|
"grad_norm": 0.6304325461387634, |
|
"kl": 0.0003185272216796875, |
|
"learning_rate": 9.994627618036452e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.116, |
|
"grad_norm": 0.6972232460975647, |
|
"kl": 0.000560760498046875, |
|
"learning_rate": 9.992983438818915e-07, |
|
"loss": 0.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.118, |
|
"grad_norm": 0.026448842138051987, |
|
"kl": 0.0009822845458984375, |
|
"learning_rate": 9.991120277927223e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.12, |
|
"grad_norm": 0.006011964753270149, |
|
"kl": 0.0007162094116210938, |
|
"learning_rate": 9.989038226169207e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.122, |
|
"grad_norm": 0.6680987477302551, |
|
"kl": 0.0005612373352050781, |
|
"learning_rate": 9.98673738502114e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1933.75, |
|
"epoch": 0.124, |
|
"grad_norm": 0.9241335988044739, |
|
"kl": 0.0006771087646484375, |
|
"learning_rate": 9.98421786662277e-07, |
|
"loss": -0.0443, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.126, |
|
"grad_norm": 0.009985378012061119, |
|
"kl": 0.0005092620849609375, |
|
"learning_rate": 9.981479793771866e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.128, |
|
"grad_norm": 0.03178563341498375, |
|
"kl": 0.0007648468017578125, |
|
"learning_rate": 9.97852329991824e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1685.75, |
|
"epoch": 0.13, |
|
"grad_norm": 0.9064852595329285, |
|
"kl": 0.00341796875, |
|
"learning_rate": 9.975348529157229e-07, |
|
"loss": -0.0454, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.132, |
|
"grad_norm": 0.7919860482215881, |
|
"kl": 0.001010894775390625, |
|
"learning_rate": 9.971955636222684e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1887.25, |
|
"epoch": 0.134, |
|
"grad_norm": 0.8760141134262085, |
|
"kl": 0.000766754150390625, |
|
"learning_rate": 9.968344786479415e-07, |
|
"loss": 0.0659, |
|
"reward": 0.375, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.136, |
|
"grad_norm": 0.004815725143998861, |
|
"kl": 0.0005044937133789062, |
|
"learning_rate": 9.964516155915151e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1797.25, |
|
"epoch": 0.138, |
|
"grad_norm": 0.8277981877326965, |
|
"kl": 0.0012836456298828125, |
|
"learning_rate": 9.960469931131936e-07, |
|
"loss": 0.1147, |
|
"reward": 0.9375, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2000.25, |
|
"epoch": 0.14, |
|
"grad_norm": 0.8641435503959656, |
|
"kl": 0.0011048316955566406, |
|
"learning_rate": 9.956206309337066e-07, |
|
"loss": 0.0173, |
|
"reward": 0.25, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.142, |
|
"grad_norm": 0.009465116076171398, |
|
"kl": 0.00019884109497070312, |
|
"learning_rate": 9.951725498333448e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1618.25, |
|
"epoch": 0.144, |
|
"grad_norm": 0.8773428797721863, |
|
"kl": 0.0106658935546875, |
|
"learning_rate": 9.947027716509488e-07, |
|
"loss": 0.0216, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.146, |
|
"grad_norm": 0.0040911422111094, |
|
"kl": 0.0006923675537109375, |
|
"learning_rate": 9.942113192828444e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.148, |
|
"grad_norm": 0.0059630973264575005, |
|
"kl": 0.000446319580078125, |
|
"learning_rate": 9.93698216681727e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1836.75, |
|
"epoch": 0.15, |
|
"grad_norm": 1.146507978439331, |
|
"kl": 0.0004787445068359375, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": -0.0918, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.152, |
|
"grad_norm": 0.7911372184753418, |
|
"kl": 0.00030231475830078125, |
|
"learning_rate": 9.926071618660237e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.154, |
|
"grad_norm": 0.8156778216362, |
|
"kl": 0.0006389617919921875, |
|
"learning_rate": 9.9202926282791e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1915.25, |
|
"epoch": 0.156, |
|
"grad_norm": 0.005794746335595846, |
|
"kl": 0.0005588531494140625, |
|
"learning_rate": 9.91429819907136e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1918.0, |
|
"epoch": 0.158, |
|
"grad_norm": 1.0309467315673828, |
|
"kl": 0.0005645751953125, |
|
"learning_rate": 9.908088623197048e-07, |
|
"loss": -0.0514, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.16, |
|
"grad_norm": 0.9030879735946655, |
|
"kl": 0.0043182373046875, |
|
"learning_rate": 9.901664203302124e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.162, |
|
"grad_norm": 0.8063321113586426, |
|
"kl": 0.00075531005859375, |
|
"learning_rate": 9.895025252503755e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.164, |
|
"grad_norm": 0.01648077741265297, |
|
"kl": 0.00039196014404296875, |
|
"learning_rate": 9.888172094375033e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1855.0, |
|
"epoch": 0.166, |
|
"grad_norm": 0.7015650272369385, |
|
"kl": 0.0013475418090820312, |
|
"learning_rate": 9.881105062929221e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.168, |
|
"grad_norm": 0.006011520978063345, |
|
"kl": 0.00021600723266601562, |
|
"learning_rate": 9.873824502603459e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1855.5, |
|
"epoch": 0.17, |
|
"grad_norm": 0.9796985983848572, |
|
"kl": 0.00054168701171875, |
|
"learning_rate": 9.866330768241983e-07, |
|
"loss": 0.0819, |
|
"reward": 0.5, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.172, |
|
"grad_norm": 0.006532273255288601, |
|
"kl": 0.0004496574401855469, |
|
"learning_rate": 9.85862422507884e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.174, |
|
"grad_norm": 0.007916338741779327, |
|
"kl": 0.0005373954772949219, |
|
"learning_rate": 9.850705248720068e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.176, |
|
"grad_norm": 0.006668766029179096, |
|
"kl": 0.00024175643920898438, |
|
"learning_rate": 9.8425742251254e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1533.5, |
|
"epoch": 0.178, |
|
"grad_norm": 0.01681283488869667, |
|
"kl": 0.0006351470947265625, |
|
"learning_rate": 9.83423155058946e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.18, |
|
"grad_norm": 0.6811097860336304, |
|
"kl": 0.0013413429260253906, |
|
"learning_rate": 9.825677631722435e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.182, |
|
"grad_norm": 0.7697494626045227, |
|
"kl": 0.0030584335327148438, |
|
"learning_rate": 9.816912885430258e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1913.25, |
|
"epoch": 0.184, |
|
"grad_norm": 1.1320937871932983, |
|
"kl": 0.001331329345703125, |
|
"learning_rate": 9.807937738894303e-07, |
|
"loss": 0.0536, |
|
"reward": 0.4375, |
|
"reward_std": 0.4419417306780815, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1742.75, |
|
"epoch": 0.186, |
|
"grad_norm": 0.6808719635009766, |
|
"kl": 0.0010509490966796875, |
|
"learning_rate": 9.798752629550546e-07, |
|
"loss": -0.1501, |
|
"reward": 0.3125, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.188, |
|
"grad_norm": 0.007471662946045399, |
|
"kl": 0.0004634857177734375, |
|
"learning_rate": 9.78935800506826e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.19, |
|
"grad_norm": 0.010327517054975033, |
|
"kl": 0.000568389892578125, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.192, |
|
"grad_norm": 0.6696183681488037, |
|
"kl": 0.0009899139404296875, |
|
"learning_rate": 9.769942052400235e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.194, |
|
"grad_norm": 0.8246662020683289, |
|
"kl": 0.0017538070678710938, |
|
"learning_rate": 9.759921670520634e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1928.5, |
|
"epoch": 0.196, |
|
"grad_norm": 0.955489456653595, |
|
"kl": 0.0009098052978515625, |
|
"learning_rate": 9.749693666068663e-07, |
|
"loss": 0.0467, |
|
"reward": 0.375, |
|
"reward_std": 0.3535533770918846, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.198, |
|
"grad_norm": 0.03343038633465767, |
|
"kl": 0.0007419586181640625, |
|
"learning_rate": 9.739258537542835e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.2, |
|
"grad_norm": 0.006429341156035662, |
|
"kl": 0.001178741455078125, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.202, |
|
"grad_norm": 0.00873623974621296, |
|
"kl": 0.0005130767822265625, |
|
"learning_rate": 9.717768952713511e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1959.0, |
|
"epoch": 0.204, |
|
"grad_norm": 0.017306441441178322, |
|
"kl": 0.000946044921875, |
|
"learning_rate": 9.706715543782064e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.206, |
|
"grad_norm": 0.7247556447982788, |
|
"kl": 0.0008764266967773438, |
|
"learning_rate": 9.695457105469804e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.208, |
|
"grad_norm": 0.008871670812368393, |
|
"kl": 0.00033664703369140625, |
|
"learning_rate": 9.683994186497132e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.21, |
|
"grad_norm": 0.007749219890683889, |
|
"kl": 0.00040531158447265625, |
|
"learning_rate": 9.672327345550543e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.212, |
|
"grad_norm": 0.010708320885896683, |
|
"kl": 0.0010166168212890625, |
|
"learning_rate": 9.66045715125541e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.214, |
|
"grad_norm": 0.007295151706784964, |
|
"kl": 0.0003333091735839844, |
|
"learning_rate": 9.648384182148252e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.216, |
|
"grad_norm": 0.7922310829162598, |
|
"kl": 0.000408172607421875, |
|
"learning_rate": 9.636109026648554e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.218, |
|
"grad_norm": 0.007899758405983448, |
|
"kl": 0.0006399154663085938, |
|
"learning_rate": 9.623632283030077e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.22, |
|
"grad_norm": 0.9048980474472046, |
|
"kl": 0.001056671142578125, |
|
"learning_rate": 9.610954559391704e-07, |
|
"loss": 0.0, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.222, |
|
"grad_norm": 0.010189698077738285, |
|
"kl": 0.00031948089599609375, |
|
"learning_rate": 9.598076473627796e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1794.5, |
|
"epoch": 0.224, |
|
"grad_norm": 1.0683528184890747, |
|
"kl": 0.0034933090209960938, |
|
"learning_rate": 9.58499865339809e-07, |
|
"loss": -0.1162, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.226, |
|
"grad_norm": 0.011858138255774975, |
|
"kl": 0.000347137451171875, |
|
"learning_rate": 9.571721736097088e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.228, |
|
"grad_norm": 0.012356019578874111, |
|
"kl": 0.000885009765625, |
|
"learning_rate": 9.55824636882301e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1752.5, |
|
"epoch": 0.23, |
|
"grad_norm": 1.0531798601150513, |
|
"kl": 0.001102447509765625, |
|
"learning_rate": 9.54457320834625e-07, |
|
"loss": 0.1434, |
|
"reward": 0.1875, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.232, |
|
"grad_norm": 0.012715999968349934, |
|
"kl": 0.00096893310546875, |
|
"learning_rate": 9.530702921077358e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2033.5, |
|
"epoch": 0.234, |
|
"grad_norm": 0.7506431937217712, |
|
"kl": 0.0050945281982421875, |
|
"learning_rate": 9.516636183034564e-07, |
|
"loss": 0.0002, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.236, |
|
"grad_norm": 0.020277904346585274, |
|
"kl": 0.00078582763671875, |
|
"learning_rate": 9.502373679810839e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1624.5, |
|
"epoch": 0.238, |
|
"grad_norm": 0.9856612086296082, |
|
"kl": 0.0022125244140625, |
|
"learning_rate": 9.487916106540465e-07, |
|
"loss": -0.0067, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.24, |
|
"grad_norm": 0.007632279768586159, |
|
"kl": 0.000583648681640625, |
|
"learning_rate": 9.473264167865171e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.242, |
|
"grad_norm": 0.007833893410861492, |
|
"kl": 0.0008401870727539062, |
|
"learning_rate": 9.458418577899774e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.244, |
|
"grad_norm": 0.00713867275044322, |
|
"kl": 0.0011892318725585938, |
|
"learning_rate": 9.443380060197385e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.246, |
|
"grad_norm": 0.011064039543271065, |
|
"kl": 0.0003185272216796875, |
|
"learning_rate": 9.428149347714143e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.248, |
|
"grad_norm": 0.009095462039113045, |
|
"kl": 0.0006237030029296875, |
|
"learning_rate": 9.412727182773486e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.25, |
|
"grad_norm": 0.007876625284552574, |
|
"kl": 0.00144195556640625, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.252, |
|
"grad_norm": 0.05358020216226578, |
|
"kl": 0.0010623931884765625, |
|
"learning_rate": 9.381311511432658e-07, |
|
"loss": 0.0, |
|
"reward": 0.625, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.254, |
|
"grad_norm": 0.030433854088187218, |
|
"kl": 0.0003848075866699219, |
|
"learning_rate": 9.36531953618799e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.256, |
|
"grad_norm": 0.852528989315033, |
|
"kl": 0.0011425018310546875, |
|
"learning_rate": 9.34913917072228e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.258, |
|
"grad_norm": 0.013770471327006817, |
|
"kl": 0.0007495880126953125, |
|
"learning_rate": 9.332771203643714e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.26, |
|
"grad_norm": 0.7055062055587769, |
|
"kl": 0.0012664794921875, |
|
"learning_rate": 9.316216432703916e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.262, |
|
"grad_norm": 0.7149041295051575, |
|
"kl": 0.002803802490234375, |
|
"learning_rate": 9.299475664759068e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1480.25, |
|
"epoch": 0.264, |
|
"grad_norm": 0.014864159747958183, |
|
"kl": 0.031280517578125, |
|
"learning_rate": 9.282549715730579e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.266, |
|
"grad_norm": 0.8071376085281372, |
|
"kl": 0.00067901611328125, |
|
"learning_rate": 9.265439410565328e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2047.25, |
|
"epoch": 0.268, |
|
"grad_norm": 0.009836402721703053, |
|
"kl": 0.0012359619140625, |
|
"learning_rate": 9.248145583195447e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2012.25, |
|
"epoch": 0.27, |
|
"grad_norm": 0.007896827533841133, |
|
"kl": 0.0009002685546875, |
|
"learning_rate": 9.230669076497687e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.272, |
|
"grad_norm": 0.0065347570925951, |
|
"kl": 0.000980377197265625, |
|
"learning_rate": 9.213010742252327e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.274, |
|
"grad_norm": 0.8659334778785706, |
|
"kl": 0.00147247314453125, |
|
"learning_rate": 9.195171441101668e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.276, |
|
"grad_norm": 0.009184672497212887, |
|
"kl": 0.001373291015625, |
|
"learning_rate": 9.177152042508077e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.278, |
|
"grad_norm": 0.00881196279078722, |
|
"kl": 0.001476287841796875, |
|
"learning_rate": 9.158953424711624e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1876.0, |
|
"epoch": 0.28, |
|
"grad_norm": 0.7441470623016357, |
|
"kl": 0.0006885528564453125, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.06, |
|
"reward": 0.3125, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.282, |
|
"grad_norm": 0.008356408216059208, |
|
"kl": 0.00101470947265625, |
|
"learning_rate": 9.122022088101613e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1991.5, |
|
"epoch": 0.284, |
|
"grad_norm": 0.9580811262130737, |
|
"kl": 0.006805419921875, |
|
"learning_rate": 9.103291169269299e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.286, |
|
"grad_norm": 0.008105803281068802, |
|
"kl": 0.002086639404296875, |
|
"learning_rate": 9.084384631108882e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.288, |
|
"grad_norm": 0.007048532832413912, |
|
"kl": 0.001430511474609375, |
|
"learning_rate": 9.065303395098358e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1714.5, |
|
"epoch": 0.29, |
|
"grad_norm": 0.881592869758606, |
|
"kl": 0.001071929931640625, |
|
"learning_rate": 9.046048391230247e-07, |
|
"loss": 0.1707, |
|
"reward": 0.25, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.292, |
|
"grad_norm": 0.008095495402812958, |
|
"kl": 0.00165557861328125, |
|
"learning_rate": 9.026620557966279e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.294, |
|
"grad_norm": 0.019286898896098137, |
|
"kl": 0.00104522705078125, |
|
"learning_rate": 9.007020842191634e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.296, |
|
"grad_norm": 0.007973677478730679, |
|
"kl": 0.0015087127685546875, |
|
"learning_rate": 8.987250199168808e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.298, |
|
"grad_norm": 0.007949120365083218, |
|
"kl": 0.0004978179931640625, |
|
"learning_rate": 8.967309592491052e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.3, |
|
"grad_norm": 0.007726718205958605, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.302, |
|
"grad_norm": 0.00826709158718586, |
|
"kl": 0.0013580322265625, |
|
"learning_rate": 8.926922383915315e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.304, |
|
"grad_norm": 0.007963276468217373, |
|
"kl": 0.00203704833984375, |
|
"learning_rate": 8.906477750432903e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.306, |
|
"grad_norm": 0.008207273669540882, |
|
"kl": 0.0008563995361328125, |
|
"learning_rate": 8.88586709003076e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.308, |
|
"grad_norm": 0.010204545222222805, |
|
"kl": 0.0006160736083984375, |
|
"learning_rate": 8.865091407243394e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.31, |
|
"grad_norm": 0.00880539882928133, |
|
"kl": 0.001453399658203125, |
|
"learning_rate": 8.844151714648274e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.312, |
|
"grad_norm": 0.010449129156768322, |
|
"kl": 0.0010318756103515625, |
|
"learning_rate": 8.823049032816478e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.314, |
|
"grad_norm": 0.010188284330070019, |
|
"kl": 0.000789642333984375, |
|
"learning_rate": 8.801784390262943e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1963.5, |
|
"epoch": 0.316, |
|
"grad_norm": 0.7694103717803955, |
|
"kl": 0.000640869140625, |
|
"learning_rate": 8.780358823396352e-07, |
|
"loss": 0.0318, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1807.0, |
|
"epoch": 0.318, |
|
"grad_norm": 0.8925904631614685, |
|
"kl": 0.001445770263671875, |
|
"learning_rate": 8.758773376468604e-07, |
|
"loss": -0.1087, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.32, |
|
"grad_norm": 0.7490972280502319, |
|
"kl": 0.01116180419921875, |
|
"learning_rate": 8.737029101523929e-07, |
|
"loss": 0.0004, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.322, |
|
"grad_norm": 0.008408155292272568, |
|
"kl": 0.0014629364013671875, |
|
"learning_rate": 8.715127058347614e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.324, |
|
"grad_norm": 0.8539191484451294, |
|
"kl": 0.0008831024169921875, |
|
"learning_rate": 8.693068314414344e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.326, |
|
"grad_norm": 0.009831397794187069, |
|
"kl": 0.000682830810546875, |
|
"learning_rate": 8.670853944836176e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.328, |
|
"grad_norm": 0.011773304082453251, |
|
"kl": 0.001129150390625, |
|
"learning_rate": 8.648485032310144e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.33, |
|
"grad_norm": 0.7276328206062317, |
|
"kl": 0.001377105712890625, |
|
"learning_rate": 8.625962667065487e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1512.75, |
|
"epoch": 0.332, |
|
"grad_norm": 0.008593901991844177, |
|
"kl": 0.0006933212280273438, |
|
"learning_rate": 8.603287946810513e-07, |
|
"loss": 0.0, |
|
"reward": 0.375, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.334, |
|
"grad_norm": 0.011681273579597473, |
|
"kl": 0.000759124755859375, |
|
"learning_rate": 8.580461976679099e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.336, |
|
"grad_norm": 0.9719880819320679, |
|
"kl": 0.00112152099609375, |
|
"learning_rate": 8.557485869176825e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1850.75, |
|
"epoch": 0.338, |
|
"grad_norm": 0.017566794529557228, |
|
"kl": 0.002292633056640625, |
|
"learning_rate": 8.534360744126753e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2016.0, |
|
"epoch": 0.34, |
|
"grad_norm": 1.1285712718963623, |
|
"kl": 0.00118255615234375, |
|
"learning_rate": 8.511087728614862e-07, |
|
"loss": 0.0114, |
|
"reward": 0.3125, |
|
"reward_std": 0.2651650384068489, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.342, |
|
"grad_norm": 0.009710059501230717, |
|
"kl": 0.001407623291015625, |
|
"learning_rate": 8.487667956935087e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.344, |
|
"grad_norm": 0.009776381775736809, |
|
"kl": 0.0014629364013671875, |
|
"learning_rate": 8.464102570534061e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.346, |
|
"grad_norm": 0.00871388241648674, |
|
"kl": 0.000507354736328125, |
|
"learning_rate": 8.440392717955475e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.348, |
|
"grad_norm": 0.00912429578602314, |
|
"kl": 0.000762939453125, |
|
"learning_rate": 8.416539554784089e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.35, |
|
"grad_norm": 0.011816666461527348, |
|
"kl": 0.0008449554443359375, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.352, |
|
"grad_norm": 0.00985003262758255, |
|
"kl": 0.001529693603515625, |
|
"learning_rate": 8.368407953869103e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.354, |
|
"grad_norm": 0.00919476430863142, |
|
"kl": 0.00167083740234375, |
|
"learning_rate": 8.344131861991828e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.356, |
|
"grad_norm": 0.01162977609783411, |
|
"kl": 0.00091552734375, |
|
"learning_rate": 8.319717151140072e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1970.5, |
|
"epoch": 0.358, |
|
"grad_norm": 0.01332745049148798, |
|
"kl": 0.001888275146484375, |
|
"learning_rate": 8.295165011252396e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1730.75, |
|
"epoch": 0.36, |
|
"grad_norm": 0.013342260383069515, |
|
"kl": 0.00089263916015625, |
|
"learning_rate": 8.270476638965461e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.362, |
|
"grad_norm": 0.012930563651025295, |
|
"kl": 0.001483917236328125, |
|
"learning_rate": 8.245653237555705e-07, |
|
"loss": 0.0001, |
|
"reward": 0.625, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.364, |
|
"grad_norm": 0.03247583284974098, |
|
"kl": 0.00113677978515625, |
|
"learning_rate": 8.220696016880687e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.366, |
|
"grad_norm": 0.8804360032081604, |
|
"kl": 0.0019130706787109375, |
|
"learning_rate": 8.195606193320136e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.368, |
|
"grad_norm": 0.007631482556462288, |
|
"kl": 0.001468658447265625, |
|
"learning_rate": 8.170384989716657e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.37, |
|
"grad_norm": 0.008286840282380581, |
|
"kl": 0.001613616943359375, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.372, |
|
"grad_norm": 0.016586236655712128, |
|
"kl": 0.0007076263427734375, |
|
"learning_rate": 8.119553365707802e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.374, |
|
"grad_norm": 0.010034182108938694, |
|
"kl": 0.0008754730224609375, |
|
"learning_rate": 8.093945422764069e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.376, |
|
"grad_norm": 0.7020014524459839, |
|
"kl": 0.0012836456298828125, |
|
"learning_rate": 8.068211054579943e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.378, |
|
"grad_norm": 1.163500189781189, |
|
"kl": 0.0016021728515625, |
|
"learning_rate": 8.04235151541222e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.38, |
|
"grad_norm": 0.8533800840377808, |
|
"kl": 0.0010223388671875, |
|
"learning_rate": 8.01636806561836e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.382, |
|
"grad_norm": 0.8227788805961609, |
|
"kl": 0.0030975341796875, |
|
"learning_rate": 7.990261971595048e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.384, |
|
"grad_norm": 0.011001263745129108, |
|
"kl": 0.0008344650268554688, |
|
"learning_rate": 7.964034505716476e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.386, |
|
"grad_norm": 0.009392702020704746, |
|
"kl": 0.001972198486328125, |
|
"learning_rate": 7.93768694627233e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1788.25, |
|
"epoch": 0.388, |
|
"grad_norm": 1.1597681045532227, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 7.911220577405484e-07, |
|
"loss": 0.1207, |
|
"reward": 0.1875, |
|
"reward_std": 0.2651650384068489, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.39, |
|
"grad_norm": 0.00962373148649931, |
|
"kl": 0.0009326934814453125, |
|
"learning_rate": 7.884636689049422e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2025.0, |
|
"epoch": 0.392, |
|
"grad_norm": 0.009538036771118641, |
|
"kl": 0.00116729736328125, |
|
"learning_rate": 7.857936576865356e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.394, |
|
"grad_norm": 0.01050383411347866, |
|
"kl": 0.0007953643798828125, |
|
"learning_rate": 7.831121542179086e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.396, |
|
"grad_norm": 0.009053800255060196, |
|
"kl": 0.001438140869140625, |
|
"learning_rate": 7.804192891917571e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.398, |
|
"grad_norm": 0.00897100381553173, |
|
"kl": 0.001312255859375, |
|
"learning_rate": 7.777151938545235e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.4, |
|
"grad_norm": 0.01025310903787613, |
|
"kl": 0.001468658447265625, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.402, |
|
"grad_norm": 0.054522059857845306, |
|
"kl": 0.0010223388671875, |
|
"learning_rate": 7.72273839962904e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1953.5, |
|
"epoch": 0.404, |
|
"grad_norm": 0.037722665816545486, |
|
"kl": 0.0011157989501953125, |
|
"learning_rate": 7.695368466124296e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.406, |
|
"grad_norm": 0.008431609719991684, |
|
"kl": 0.0020599365234375, |
|
"learning_rate": 7.667891533457718e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1929.0, |
|
"epoch": 0.408, |
|
"grad_norm": 0.9533175826072693, |
|
"kl": 0.001682281494140625, |
|
"learning_rate": 7.640308940816239e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2036.0, |
|
"epoch": 0.41, |
|
"grad_norm": 0.013962327502667904, |
|
"kl": 0.0011119842529296875, |
|
"learning_rate": 7.612622032536507e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.412, |
|
"grad_norm": 0.010255957953631878, |
|
"kl": 0.0009479522705078125, |
|
"learning_rate": 7.584832158039378e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.414, |
|
"grad_norm": 0.01276308298110962, |
|
"kl": 0.0014495849609375, |
|
"learning_rate": 7.556940671764124e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1809.5, |
|
"epoch": 0.416, |
|
"grad_norm": 0.01625184714794159, |
|
"kl": 0.00093841552734375, |
|
"learning_rate": 7.528948933102438e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.418, |
|
"grad_norm": 0.010438801720738411, |
|
"kl": 0.0012836456298828125, |
|
"learning_rate": 7.500858306332172e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.42, |
|
"grad_norm": 0.011556737124919891, |
|
"kl": 0.00215911865234375, |
|
"learning_rate": 7.472670160550848e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.422, |
|
"grad_norm": 0.009882017970085144, |
|
"kl": 0.0005950927734375, |
|
"learning_rate": 7.444385869608921e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1786.0, |
|
"epoch": 0.424, |
|
"grad_norm": 0.8873146176338196, |
|
"kl": 0.002838134765625, |
|
"learning_rate": 7.416006812042827e-07, |
|
"loss": -0.0358, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.426, |
|
"grad_norm": 0.01222989521920681, |
|
"kl": 0.0014190673828125, |
|
"learning_rate": 7.387534371007797e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.428, |
|
"grad_norm": 0.009303831495344639, |
|
"kl": 0.00127410888671875, |
|
"learning_rate": 7.358969934210438e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.43, |
|
"grad_norm": 0.0105022257193923, |
|
"kl": 0.0008087158203125, |
|
"learning_rate": 7.330314893841101e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.432, |
|
"grad_norm": 0.763167679309845, |
|
"kl": 0.001544952392578125, |
|
"learning_rate": 7.301570646506027e-07, |
|
"loss": 0.0001, |
|
"reward": 0.3125, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.434, |
|
"grad_norm": 0.010868730954825878, |
|
"kl": 0.001415252685546875, |
|
"learning_rate": 7.27273859315928e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.436, |
|
"grad_norm": 0.016456812620162964, |
|
"kl": 0.0006694793701171875, |
|
"learning_rate": 7.243820139034464e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1947.75, |
|
"epoch": 0.438, |
|
"grad_norm": 0.5713726282119751, |
|
"kl": 0.0039825439453125, |
|
"learning_rate": 7.214816693576234e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.44, |
|
"grad_norm": 0.008825725875794888, |
|
"kl": 0.000873565673828125, |
|
"learning_rate": 7.185729670371604e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.442, |
|
"grad_norm": 0.015775861218571663, |
|
"kl": 0.0016326904296875, |
|
"learning_rate": 7.156560487081051e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1892.75, |
|
"epoch": 0.444, |
|
"grad_norm": 0.011794094927608967, |
|
"kl": 0.002288818359375, |
|
"learning_rate": 7.127310565369415e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.446, |
|
"grad_norm": 0.8902475833892822, |
|
"kl": 0.0021953582763671875, |
|
"learning_rate": 7.097981330836616e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1976.5, |
|
"epoch": 0.448, |
|
"grad_norm": 0.01060924306511879, |
|
"kl": 0.00159454345703125, |
|
"learning_rate": 7.068574212948169e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1884.0, |
|
"epoch": 0.45, |
|
"grad_norm": 0.7874477505683899, |
|
"kl": 0.00165557861328125, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0675, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.452, |
|
"grad_norm": 0.009480384178459644, |
|
"kl": 0.001316070556640625, |
|
"learning_rate": 7.009532063876148e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.454, |
|
"grad_norm": 0.8971606492996216, |
|
"kl": 0.00217437744140625, |
|
"learning_rate": 6.979899910323624e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1950.75, |
|
"epoch": 0.456, |
|
"grad_norm": 0.013516876846551895, |
|
"kl": 0.001598358154296875, |
|
"learning_rate": 6.950195628537299e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.458, |
|
"grad_norm": 0.012519976124167442, |
|
"kl": 0.0012359619140625, |
|
"learning_rate": 6.920420666261961e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1604.75, |
|
"epoch": 0.46, |
|
"grad_norm": 1.0533803701400757, |
|
"kl": 0.0024871826171875, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.1645, |
|
"reward": 0.75, |
|
"reward_std": 0.7071067541837692, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.462, |
|
"grad_norm": 0.04883728548884392, |
|
"kl": 0.0012359619140625, |
|
"learning_rate": 6.860664508377001e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.464, |
|
"grad_norm": 0.01617475040256977, |
|
"kl": 0.00106048583984375, |
|
"learning_rate": 6.83068622519821e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.466, |
|
"grad_norm": 0.01098883431404829, |
|
"kl": 0.00160980224609375, |
|
"learning_rate": 6.800643086250121e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.468, |
|
"grad_norm": 0.010283930227160454, |
|
"kl": 0.001750946044921875, |
|
"learning_rate": 6.770536555792944e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.47, |
|
"grad_norm": 0.7373048663139343, |
|
"kl": 0.00131988525390625, |
|
"learning_rate": 6.740368101176495e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.472, |
|
"grad_norm": 0.01158232893794775, |
|
"kl": 0.001461029052734375, |
|
"learning_rate": 6.710139192768694e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.474, |
|
"grad_norm": 0.8555125594139099, |
|
"kl": 0.001068115234375, |
|
"learning_rate": 6.679851303883891e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.476, |
|
"grad_norm": 0.8464781641960144, |
|
"kl": 0.006557464599609375, |
|
"learning_rate": 6.649505910711058e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.478, |
|
"grad_norm": 0.14680787920951843, |
|
"kl": 0.003086090087890625, |
|
"learning_rate": 6.619104492241847e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.48, |
|
"grad_norm": 0.8916466236114502, |
|
"kl": 0.010036468505859375, |
|
"learning_rate": 6.588648530198504e-07, |
|
"loss": 0.0004, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.482, |
|
"grad_norm": 0.012613446451723576, |
|
"kl": 0.001827239990234375, |
|
"learning_rate": 6.558139508961654e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.484, |
|
"grad_norm": 0.012667362578213215, |
|
"kl": 0.001438140869140625, |
|
"learning_rate": 6.527578915497951e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.486, |
|
"grad_norm": 0.011633522808551788, |
|
"kl": 0.0007953643798828125, |
|
"learning_rate": 6.496968239287603e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.488, |
|
"grad_norm": 0.010322234593331814, |
|
"kl": 0.00118255615234375, |
|
"learning_rate": 6.466308972251785e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.49, |
|
"grad_norm": 0.013764915056526661, |
|
"kl": 0.001338958740234375, |
|
"learning_rate": 6.435602608679916e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.492, |
|
"grad_norm": 0.016423719003796577, |
|
"kl": 0.0011043548583984375, |
|
"learning_rate": 6.404850645156841e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.494, |
|
"grad_norm": 0.011886836029589176, |
|
"kl": 0.0008907318115234375, |
|
"learning_rate": 6.374054580489873e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.496, |
|
"grad_norm": 0.013753964565694332, |
|
"kl": 0.0009822845458984375, |
|
"learning_rate": 6.343215915635761e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.498, |
|
"grad_norm": 0.012257426045835018, |
|
"kl": 0.00170135498046875, |
|
"learning_rate": 6.31233615362752e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.5, |
|
"grad_norm": 0.7262430191040039, |
|
"kl": 0.00194549560546875, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.502, |
|
"grad_norm": 0.7270606756210327, |
|
"kl": 0.0007076263427734375, |
|
"learning_rate": 6.25045936022246e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.504, |
|
"grad_norm": 0.017829036340117455, |
|
"kl": 0.0005779266357421875, |
|
"learning_rate": 6.219465344613258e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.506, |
|
"grad_norm": 0.0317312628030777, |
|
"kl": 0.0013294219970703125, |
|
"learning_rate": 6.188436263278172e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1972.25, |
|
"epoch": 0.508, |
|
"grad_norm": 0.026640823110938072, |
|
"kl": 0.000835418701171875, |
|
"learning_rate": 6.157373628530852e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.51, |
|
"grad_norm": 0.012969509698450565, |
|
"kl": 0.001567840576171875, |
|
"learning_rate": 6.126278954320294e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.512, |
|
"grad_norm": 0.012548292055726051, |
|
"kl": 0.0010986328125, |
|
"learning_rate": 6.095153756157051e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.514, |
|
"grad_norm": 0.008928977884352207, |
|
"kl": 0.000873565673828125, |
|
"learning_rate": 6.06399955103937e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.516, |
|
"grad_norm": 0.8101487755775452, |
|
"kl": 0.0009307861328125, |
|
"learning_rate": 6.032817857379256e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.518, |
|
"grad_norm": 0.8978201746940613, |
|
"kl": 0.0007915496826171875, |
|
"learning_rate": 6.001610194928464e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.52, |
|
"grad_norm": 1.092624306678772, |
|
"kl": 0.00101470947265625, |
|
"learning_rate": 5.97037808470444e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.522, |
|
"grad_norm": 0.7928449511528015, |
|
"kl": 0.0011920928955078125, |
|
"learning_rate": 5.939123048916173e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1534.75, |
|
"epoch": 0.524, |
|
"grad_norm": 1.0032625198364258, |
|
"kl": 0.0021257400512695312, |
|
"learning_rate": 5.907846610890011e-07, |
|
"loss": 0.0615, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.526, |
|
"grad_norm": 0.011720544658601284, |
|
"kl": 0.000644683837890625, |
|
"learning_rate": 5.87655029499542e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.528, |
|
"grad_norm": 0.011771513149142265, |
|
"kl": 0.0007781982421875, |
|
"learning_rate": 5.845235626570683e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.53, |
|
"grad_norm": 0.013503954745829105, |
|
"kl": 0.001155853271484375, |
|
"learning_rate": 5.813904131848564e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.532, |
|
"grad_norm": 0.09234623610973358, |
|
"kl": 0.0018482208251953125, |
|
"learning_rate": 5.78255733788191e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.534, |
|
"grad_norm": 0.011625738814473152, |
|
"kl": 0.0007114410400390625, |
|
"learning_rate": 5.751196772469237e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1589.75, |
|
"epoch": 0.536, |
|
"grad_norm": 0.9924006462097168, |
|
"kl": 0.0024566650390625, |
|
"learning_rate": 5.71982396408026e-07, |
|
"loss": -0.0413, |
|
"reward": 0.875, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.538, |
|
"grad_norm": 0.726823627948761, |
|
"kl": 0.001861572265625, |
|
"learning_rate": 5.688440441781398e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.54, |
|
"grad_norm": 0.011368845589458942, |
|
"kl": 0.001186370849609375, |
|
"learning_rate": 5.657047735161255e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.542, |
|
"grad_norm": 0.014150974340736866, |
|
"kl": 0.0012617111206054688, |
|
"learning_rate": 5.625647374256061e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.544, |
|
"grad_norm": 0.03309360519051552, |
|
"kl": 0.00139617919921875, |
|
"learning_rate": 5.594240889475106e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.546, |
|
"grad_norm": 0.027406711131334305, |
|
"kl": 0.002048492431640625, |
|
"learning_rate": 5.562829811526154e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1862.75, |
|
"epoch": 0.548, |
|
"grad_norm": 0.872232973575592, |
|
"kl": 0.001857757568359375, |
|
"learning_rate": 5.531415671340826e-07, |
|
"loss": 0.0781, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1893.5, |
|
"epoch": 0.55, |
|
"grad_norm": 0.013754754327237606, |
|
"kl": 0.00106048583984375, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.552, |
|
"grad_norm": 0.8218861222267151, |
|
"kl": 0.0015106201171875, |
|
"learning_rate": 5.468584328659172e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.554, |
|
"grad_norm": 0.01286914199590683, |
|
"kl": 0.001323699951171875, |
|
"learning_rate": 5.437170188473847e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.556, |
|
"grad_norm": 0.8357925415039062, |
|
"kl": 0.00146484375, |
|
"learning_rate": 5.405759110524894e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.558, |
|
"grad_norm": 0.011115595698356628, |
|
"kl": 0.001163482666015625, |
|
"learning_rate": 5.37435262574394e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.56, |
|
"grad_norm": 0.7368820309638977, |
|
"kl": 0.0016918182373046875, |
|
"learning_rate": 5.342952264838747e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1845.5, |
|
"epoch": 0.562, |
|
"grad_norm": 0.8080930709838867, |
|
"kl": 0.002285003662109375, |
|
"learning_rate": 5.311559558218603e-07, |
|
"loss": 0.0872, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.564, |
|
"grad_norm": 0.6212303638458252, |
|
"kl": 0.0017833709716796875, |
|
"learning_rate": 5.28017603591974e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.566, |
|
"grad_norm": 0.8698393106460571, |
|
"kl": 0.001556396484375, |
|
"learning_rate": 5.248803227530763e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.568, |
|
"grad_norm": 0.6195830702781677, |
|
"kl": 0.001125335693359375, |
|
"learning_rate": 5.21744266211809e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.57, |
|
"grad_norm": 0.014679288491606712, |
|
"kl": 0.00112152099609375, |
|
"learning_rate": 5.186095868151436e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.572, |
|
"grad_norm": 0.011439023539423943, |
|
"kl": 0.000751495361328125, |
|
"learning_rate": 5.154764373429315e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.574, |
|
"grad_norm": 0.013943897560238838, |
|
"kl": 0.00086212158203125, |
|
"learning_rate": 5.123449705004581e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.576, |
|
"grad_norm": 0.011966060847043991, |
|
"kl": 0.001087188720703125, |
|
"learning_rate": 5.09215338910999e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.578, |
|
"grad_norm": 0.7932072877883911, |
|
"kl": 0.002170562744140625, |
|
"learning_rate": 5.060876951083828e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.58, |
|
"grad_norm": 0.01169038936495781, |
|
"kl": 0.001129150390625, |
|
"learning_rate": 5.02962191529556e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.582, |
|
"grad_norm": 0.85643470287323, |
|
"kl": 0.002471923828125, |
|
"learning_rate": 4.998389805071536e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.584, |
|
"grad_norm": 1.001603364944458, |
|
"kl": 0.01709747314453125, |
|
"learning_rate": 4.967182142620745e-07, |
|
"loss": 0.0007, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1886.25, |
|
"epoch": 0.586, |
|
"grad_norm": 0.7774127721786499, |
|
"kl": 0.001667022705078125, |
|
"learning_rate": 4.93600044896063e-07, |
|
"loss": 0.0664, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.588, |
|
"grad_norm": 1.0563451051712036, |
|
"kl": 0.00324249267578125, |
|
"learning_rate": 4.904846243842949e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.59, |
|
"grad_norm": 0.012082475237548351, |
|
"kl": 0.00081634521484375, |
|
"learning_rate": 4.873721045679706e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2003.5, |
|
"epoch": 0.592, |
|
"grad_norm": 0.590258002281189, |
|
"kl": 0.001506805419921875, |
|
"learning_rate": 4.842626371469149e-07, |
|
"loss": 0.0161, |
|
"reward": 0.625, |
|
"reward_std": 0.5303300619125366, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.594, |
|
"grad_norm": 0.7182537913322449, |
|
"kl": 0.00550079345703125, |
|
"learning_rate": 4.811563736721829e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1947.25, |
|
"epoch": 0.596, |
|
"grad_norm": 1.0634351968765259, |
|
"kl": 0.0025787353515625, |
|
"learning_rate": 4.780534655386743e-07, |
|
"loss": 0.0387, |
|
"reward": 0.6875, |
|
"reward_std": 0.6187184080481529, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.598, |
|
"grad_norm": 0.054920367896556854, |
|
"kl": 0.00203704833984375, |
|
"learning_rate": 4.749540639777539e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.6, |
|
"grad_norm": 0.012694926001131535, |
|
"kl": 0.000698089599609375, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1795.5, |
|
"epoch": 0.602, |
|
"grad_norm": 0.759120523929596, |
|
"kl": 0.0024261474609375, |
|
"learning_rate": 4.68766384637248e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.604, |
|
"grad_norm": 0.010465751402080059, |
|
"kl": 0.001453399658203125, |
|
"learning_rate": 4.656784084364238e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1853.0, |
|
"epoch": 0.606, |
|
"grad_norm": 0.8468216061592102, |
|
"kl": 0.00121307373046875, |
|
"learning_rate": 4.6259454195101267e-07, |
|
"loss": -0.0831, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.608, |
|
"grad_norm": 0.9815247654914856, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 4.59514935484316e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1868.75, |
|
"epoch": 0.61, |
|
"grad_norm": 0.8037129044532776, |
|
"kl": 0.000972747802734375, |
|
"learning_rate": 4.5643973913200837e-07, |
|
"loss": 0.075, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.612, |
|
"grad_norm": 0.011851584538817406, |
|
"kl": 0.001220703125, |
|
"learning_rate": 4.5336910277482155e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.614, |
|
"grad_norm": 0.012452667579054832, |
|
"kl": 0.000675201416015625, |
|
"learning_rate": 4.503031760712397e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.616, |
|
"grad_norm": 0.8061856627464294, |
|
"kl": 0.001941680908203125, |
|
"learning_rate": 4.4724210845020494e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.618, |
|
"grad_norm": 0.011598404496908188, |
|
"kl": 0.0009021759033203125, |
|
"learning_rate": 4.441860491038345e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.62, |
|
"grad_norm": 0.013049165718257427, |
|
"kl": 0.0008697509765625, |
|
"learning_rate": 4.4113514698014953e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.622, |
|
"grad_norm": 0.01000931765884161, |
|
"kl": 0.0010738372802734375, |
|
"learning_rate": 4.3808955077581546e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.624, |
|
"grad_norm": 0.017103025689721107, |
|
"kl": 0.00090789794921875, |
|
"learning_rate": 4.350494089288943e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.626, |
|
"grad_norm": 0.015600494109094143, |
|
"kl": 0.001026153564453125, |
|
"learning_rate": 4.3201486961161093e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.628, |
|
"grad_norm": 0.68843674659729, |
|
"kl": 0.0009326934814453125, |
|
"learning_rate": 4.2898608072313045e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.63, |
|
"grad_norm": 0.02028859592974186, |
|
"kl": 0.001399993896484375, |
|
"learning_rate": 4.2596318988235037e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.632, |
|
"grad_norm": 0.010004539042711258, |
|
"kl": 0.0006055831909179688, |
|
"learning_rate": 4.2294634442070553e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.634, |
|
"grad_norm": 0.012407040223479271, |
|
"kl": 0.001323699951171875, |
|
"learning_rate": 4.1993569137498776e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.636, |
|
"grad_norm": 0.7615489363670349, |
|
"kl": 0.001617431640625, |
|
"learning_rate": 4.1693137748017915e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.638, |
|
"grad_norm": 0.7873152494430542, |
|
"kl": 0.00064849853515625, |
|
"learning_rate": 4.1393354916230005e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.64, |
|
"grad_norm": 0.012965940870344639, |
|
"kl": 0.000850677490234375, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.642, |
|
"grad_norm": 0.013525367714464664, |
|
"kl": 0.000804901123046875, |
|
"learning_rate": 4.079579333738039e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.644, |
|
"grad_norm": 0.8479946255683899, |
|
"kl": 0.0014972686767578125, |
|
"learning_rate": 4.0498043714627006e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.646, |
|
"grad_norm": 0.7696375250816345, |
|
"kl": 0.002063751220703125, |
|
"learning_rate": 4.020100089676376e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.648, |
|
"grad_norm": 0.012891444377601147, |
|
"kl": 0.00150299072265625, |
|
"learning_rate": 3.9904679361238526e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.65, |
|
"grad_norm": 1.1341594457626343, |
|
"kl": 0.002437591552734375, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0001, |
|
"reward": 0.375, |
|
"reward_std": 0.5303300842642784, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.652, |
|
"grad_norm": 0.015124933794140816, |
|
"kl": 0.0007419586181640625, |
|
"learning_rate": 3.931425787051832e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.654, |
|
"grad_norm": 0.7730603814125061, |
|
"kl": 0.002941131591796875, |
|
"learning_rate": 3.902018669163384e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.656, |
|
"grad_norm": 0.02122451364994049, |
|
"kl": 0.000957489013671875, |
|
"learning_rate": 3.872689434630585e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.658, |
|
"grad_norm": 0.01871700957417488, |
|
"kl": 0.0010528564453125, |
|
"learning_rate": 3.843439512918949e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.66, |
|
"grad_norm": 0.0117810582742095, |
|
"kl": 0.0014801025390625, |
|
"learning_rate": 3.8142703296283953e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1353.5, |
|
"epoch": 0.662, |
|
"grad_norm": 0.017292601987719536, |
|
"kl": 0.0030651092529296875, |
|
"learning_rate": 3.785183306423767e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1955.25, |
|
"epoch": 0.664, |
|
"grad_norm": 0.7610009908676147, |
|
"kl": 0.001689910888671875, |
|
"learning_rate": 3.7561798609655373e-07, |
|
"loss": 0.0353, |
|
"reward": 0.25, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.666, |
|
"grad_norm": 0.013269172981381416, |
|
"kl": 0.0008087158203125, |
|
"learning_rate": 3.72726140684072e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.668, |
|
"grad_norm": 0.01594419591128826, |
|
"kl": 0.000812530517578125, |
|
"learning_rate": 3.6984293534939737e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.67, |
|
"grad_norm": 0.010347824543714523, |
|
"kl": 0.000751495361328125, |
|
"learning_rate": 3.6696851061588994e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.672, |
|
"grad_norm": 0.014122740365564823, |
|
"kl": 0.0012054443359375, |
|
"learning_rate": 3.641030065789562e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.674, |
|
"grad_norm": 0.013101037591695786, |
|
"kl": 0.001033782958984375, |
|
"learning_rate": 3.612465628992203e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.676, |
|
"grad_norm": 0.666260838508606, |
|
"kl": 0.00324249267578125, |
|
"learning_rate": 3.5839931879571725e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2022.75, |
|
"epoch": 0.678, |
|
"grad_norm": 0.7250146269798279, |
|
"kl": 0.00144195556640625, |
|
"learning_rate": 3.555614130391079e-07, |
|
"loss": 0.009, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2003.25, |
|
"epoch": 0.68, |
|
"grad_norm": 0.7383328676223755, |
|
"kl": 0.00299072265625, |
|
"learning_rate": 3.5273298394491515e-07, |
|
"loss": -0.016, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.682, |
|
"grad_norm": 0.011795282363891602, |
|
"kl": 0.000850677490234375, |
|
"learning_rate": 3.4991416936678276e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.684, |
|
"grad_norm": 0.9384401440620422, |
|
"kl": 0.002079010009765625, |
|
"learning_rate": 3.471051066897562e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.686, |
|
"grad_norm": 0.010377887636423111, |
|
"kl": 0.0008144378662109375, |
|
"learning_rate": 3.4430593282358777e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.688, |
|
"grad_norm": 0.7026439309120178, |
|
"kl": 0.001007080078125, |
|
"learning_rate": 3.4151678419606233e-07, |
|
"loss": 0.0, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.69, |
|
"grad_norm": 0.011138451285660267, |
|
"kl": 0.0008373260498046875, |
|
"learning_rate": 3.387377967463493e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.692, |
|
"grad_norm": 0.7246440649032593, |
|
"kl": 0.0035762786865234375, |
|
"learning_rate": 3.359691059183761e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.694, |
|
"grad_norm": 0.01353926956653595, |
|
"kl": 0.001087188720703125, |
|
"learning_rate": 3.3321084665422803e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1851.25, |
|
"epoch": 0.696, |
|
"grad_norm": 1.2388075590133667, |
|
"kl": 0.00121307373046875, |
|
"learning_rate": 3.3046315338757026e-07, |
|
"loss": 0.0841, |
|
"reward": 0.1875, |
|
"reward_std": 0.2651650384068489, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2005.75, |
|
"epoch": 0.698, |
|
"grad_norm": 0.7063978314399719, |
|
"kl": 0.002201080322265625, |
|
"learning_rate": 3.2772616003709616e-07, |
|
"loss": 0.0153, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.7, |
|
"grad_norm": 0.009749515913426876, |
|
"kl": 0.001163482666015625, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.702, |
|
"grad_norm": 0.01226428709924221, |
|
"kl": 0.00107574462890625, |
|
"learning_rate": 3.222848061454764e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.704, |
|
"grad_norm": 0.01303025335073471, |
|
"kl": 0.001399993896484375, |
|
"learning_rate": 3.195807108082429e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.706, |
|
"grad_norm": 0.028528396040201187, |
|
"kl": 0.001117706298828125, |
|
"learning_rate": 3.168878457820915e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1785.25, |
|
"epoch": 0.708, |
|
"grad_norm": 0.9730527997016907, |
|
"kl": 0.001430511474609375, |
|
"learning_rate": 3.142063423134644e-07, |
|
"loss": 0.122, |
|
"reward": 0.875, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.71, |
|
"grad_norm": 0.013550005853176117, |
|
"kl": 0.001338958740234375, |
|
"learning_rate": 3.115363310950578e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.712, |
|
"grad_norm": 0.010767250321805477, |
|
"kl": 0.00067138671875, |
|
"learning_rate": 3.0887794225945143e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.714, |
|
"grad_norm": 0.012552580796182156, |
|
"kl": 0.00140380859375, |
|
"learning_rate": 3.062313053727671e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.716, |
|
"grad_norm": 0.6516157984733582, |
|
"kl": 0.00159454345703125, |
|
"learning_rate": 3.0359654942835247e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.718, |
|
"grad_norm": 0.012717018835246563, |
|
"kl": 0.0008392333984375, |
|
"learning_rate": 3.0097380284049523e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.72, |
|
"grad_norm": 0.014254845678806305, |
|
"kl": 0.001018524169921875, |
|
"learning_rate": 2.9836319343816397e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.722, |
|
"grad_norm": 0.017017841339111328, |
|
"kl": 0.0009784698486328125, |
|
"learning_rate": 2.9576484845877793e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1976.75, |
|
"epoch": 0.724, |
|
"grad_norm": 0.8839628100395203, |
|
"kl": 0.002727508544921875, |
|
"learning_rate": 2.931788945420058e-07, |
|
"loss": 0.0265, |
|
"reward": 0.3125, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.726, |
|
"grad_norm": 0.010562002658843994, |
|
"kl": 0.00103759765625, |
|
"learning_rate": 2.9060545772359305e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.728, |
|
"grad_norm": 0.013268781825900078, |
|
"kl": 0.0009174346923828125, |
|
"learning_rate": 2.8804466342921987e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1927.75, |
|
"epoch": 0.73, |
|
"grad_norm": 0.8109666705131531, |
|
"kl": 0.0020313262939453125, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.0471, |
|
"reward": 0.875, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.732, |
|
"grad_norm": 0.012277526780962944, |
|
"kl": 0.000827789306640625, |
|
"learning_rate": 2.829615010283344e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.734, |
|
"grad_norm": 0.7054362893104553, |
|
"kl": 0.0069103240966796875, |
|
"learning_rate": 2.8043938066798645e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.736, |
|
"grad_norm": 0.012917861342430115, |
|
"kl": 0.0012969970703125, |
|
"learning_rate": 2.7793039831193133e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.738, |
|
"grad_norm": 0.011064048856496811, |
|
"kl": 0.0007419586181640625, |
|
"learning_rate": 2.7543467624442956e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.74, |
|
"grad_norm": 0.014153026975691319, |
|
"kl": 0.00079345703125, |
|
"learning_rate": 2.729523361034538e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1957.5, |
|
"epoch": 0.742, |
|
"grad_norm": 0.02183438278734684, |
|
"kl": 0.002010345458984375, |
|
"learning_rate": 2.7048349887476037e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1967.0, |
|
"epoch": 0.744, |
|
"grad_norm": 0.6841965317726135, |
|
"kl": 0.00157928466796875, |
|
"learning_rate": 2.6802828488599294e-07, |
|
"loss": 0.0304, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.746, |
|
"grad_norm": 0.010570400394499302, |
|
"kl": 0.00075531005859375, |
|
"learning_rate": 2.655868138008171e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.748, |
|
"grad_norm": 0.013351581990718842, |
|
"kl": 0.000873565673828125, |
|
"learning_rate": 2.631592046130896e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1565.75, |
|
"epoch": 0.75, |
|
"grad_norm": 0.9974377751350403, |
|
"kl": 0.008136749267578125, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": -0.3145, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1881.75, |
|
"epoch": 0.752, |
|
"grad_norm": 1.1940348148345947, |
|
"kl": 0.0038909912109375, |
|
"learning_rate": 2.583460445215911e-07, |
|
"loss": 0.0687, |
|
"reward": 0.3125, |
|
"reward_std": 0.2651650384068489, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.754, |
|
"grad_norm": 0.01335156336426735, |
|
"kl": 0.0006580352783203125, |
|
"learning_rate": 2.5596072820445254e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.756, |
|
"grad_norm": 0.011012405157089233, |
|
"kl": 0.00142669677734375, |
|
"learning_rate": 2.5358974294659373e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 378 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.758, |
|
"grad_norm": 0.16783788800239563, |
|
"kl": 0.00443267822265625, |
|
"learning_rate": 2.512332043064913e-07, |
|
"loss": 0.0002, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 379 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.76, |
|
"grad_norm": 0.011893996968865395, |
|
"kl": 0.001003265380859375, |
|
"learning_rate": 2.488912271385139e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.762, |
|
"grad_norm": 0.7219942212104797, |
|
"kl": 0.00760650634765625, |
|
"learning_rate": 2.465639255873246e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 381 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.764, |
|
"grad_norm": 0.04678433761000633, |
|
"kl": 0.0009021759033203125, |
|
"learning_rate": 2.4425141308231765e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 382 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.766, |
|
"grad_norm": 0.011990766040980816, |
|
"kl": 0.0006542205810546875, |
|
"learning_rate": 2.4195380233209006e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 383 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.768, |
|
"grad_norm": 0.011609113775193691, |
|
"kl": 0.0009002685546875, |
|
"learning_rate": 2.3967120531894857e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 384 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.77, |
|
"grad_norm": 0.029996510595083237, |
|
"kl": 0.001007080078125, |
|
"learning_rate": 2.374037332934512e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.772, |
|
"grad_norm": 0.7205003499984741, |
|
"kl": 0.001007080078125, |
|
"learning_rate": 2.3515149676898552e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 386 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.774, |
|
"grad_norm": 0.01278019044548273, |
|
"kl": 0.000644683837890625, |
|
"learning_rate": 2.3291460551638237e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 387 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.776, |
|
"grad_norm": 0.050757694989442825, |
|
"kl": 0.0013141632080078125, |
|
"learning_rate": 2.306931685585657e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 388 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.778, |
|
"grad_norm": 0.8608619570732117, |
|
"kl": 0.00222015380859375, |
|
"learning_rate": 2.2848729416523859e-07, |
|
"loss": 0.0001, |
|
"reward": 0.625, |
|
"reward_std": 0.5303300768136978, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 389 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.78, |
|
"grad_norm": 0.01890842616558075, |
|
"kl": 0.0016078948974609375, |
|
"learning_rate": 2.2629708984760706e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1757.5, |
|
"epoch": 0.782, |
|
"grad_norm": 0.9615358710289001, |
|
"kl": 0.00257110595703125, |
|
"learning_rate": 2.2412266235313973e-07, |
|
"loss": 0.1401, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 391 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2038.25, |
|
"epoch": 0.784, |
|
"grad_norm": 0.7946398854255676, |
|
"kl": 0.0009212493896484375, |
|
"learning_rate": 2.2196411766036487e-07, |
|
"loss": 0.0034, |
|
"reward": 0.8125, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3125, |
|
"step": 392 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.786, |
|
"grad_norm": 0.01277601532638073, |
|
"kl": 0.001102447509765625, |
|
"learning_rate": 2.1982156097370557e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 393 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1800.5, |
|
"epoch": 0.788, |
|
"grad_norm": 0.9031627774238586, |
|
"kl": 0.00704193115234375, |
|
"learning_rate": 2.1769509671835223e-07, |
|
"loss": 0.1129, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 394 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.79, |
|
"grad_norm": 0.017206581309437752, |
|
"kl": 0.001468658447265625, |
|
"learning_rate": 2.1558482853517253e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 395 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1820.75, |
|
"epoch": 0.792, |
|
"grad_norm": 0.9008931517601013, |
|
"kl": 0.0047149658203125, |
|
"learning_rate": 2.134908592756607e-07, |
|
"loss": 0.0002, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 396 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2010.25, |
|
"epoch": 0.794, |
|
"grad_norm": 0.8031813502311707, |
|
"kl": 0.00140380859375, |
|
"learning_rate": 2.1141329099692406e-07, |
|
"loss": -0.0135, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 397 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.796, |
|
"grad_norm": 0.015710551291704178, |
|
"kl": 0.0016117095947265625, |
|
"learning_rate": 2.0935222495670968e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 398 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.798, |
|
"grad_norm": 0.7256066203117371, |
|
"kl": 0.00640869140625, |
|
"learning_rate": 2.0730776160846853e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 399 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.8, |
|
"grad_norm": 0.011895825155079365, |
|
"kl": 0.00131988525390625, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 400 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.802, |
|
"grad_norm": 0.01262570358812809, |
|
"kl": 0.00177001953125, |
|
"learning_rate": 2.032690407508949e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 401 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.804, |
|
"grad_norm": 0.014729364775121212, |
|
"kl": 0.00090789794921875, |
|
"learning_rate": 2.0127498008311922e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 402 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.806, |
|
"grad_norm": 0.012951449491083622, |
|
"kl": 0.00146484375, |
|
"learning_rate": 1.9929791578083655e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 403 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.808, |
|
"grad_norm": 0.6974371671676636, |
|
"kl": 0.0015716552734375, |
|
"learning_rate": 1.9733794420337213e-07, |
|
"loss": 0.0001, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 404 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.81, |
|
"grad_norm": 0.012584330514073372, |
|
"kl": 0.001598358154296875, |
|
"learning_rate": 1.9539516087697517e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 405 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.812, |
|
"grad_norm": 0.011754573322832584, |
|
"kl": 0.00107574462890625, |
|
"learning_rate": 1.934696604901642e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 406 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.814, |
|
"grad_norm": 0.02457410842180252, |
|
"kl": 0.0015869140625, |
|
"learning_rate": 1.915615368891117e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 407 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.816, |
|
"grad_norm": 0.7841270565986633, |
|
"kl": 0.00121307373046875, |
|
"learning_rate": 1.8967088307307e-07, |
|
"loss": 0.0, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 408 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.818, |
|
"grad_norm": 0.01260992232710123, |
|
"kl": 0.0014495849609375, |
|
"learning_rate": 1.8779779118983867e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 409 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.82, |
|
"grad_norm": 0.011464129202067852, |
|
"kl": 0.0010471343994140625, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 410 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1915.0, |
|
"epoch": 0.822, |
|
"grad_norm": 0.8047151565551758, |
|
"kl": 0.00250244140625, |
|
"learning_rate": 1.8410465752883758e-07, |
|
"loss": -0.0527, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 411 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.824, |
|
"grad_norm": 0.0124558350071311, |
|
"kl": 0.00130462646484375, |
|
"learning_rate": 1.822847957491922e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 412 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.826, |
|
"grad_norm": 0.01380992028862238, |
|
"kl": 0.0008144378662109375, |
|
"learning_rate": 1.804828558898332e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 413 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.828, |
|
"grad_norm": 0.9551144242286682, |
|
"kl": 0.011430740356445312, |
|
"learning_rate": 1.7869892577476722e-07, |
|
"loss": 0.0005, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 414 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.83, |
|
"grad_norm": 0.012345471419394016, |
|
"kl": 0.001438140869140625, |
|
"learning_rate": 1.7693309235023127e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 415 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1821.0, |
|
"epoch": 0.832, |
|
"grad_norm": 0.7977136969566345, |
|
"kl": 0.001773834228515625, |
|
"learning_rate": 1.7518544168045524e-07, |
|
"loss": 0.0981, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 416 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.834, |
|
"grad_norm": 0.011019429191946983, |
|
"kl": 0.0006771087646484375, |
|
"learning_rate": 1.7345605894346726e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 417 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.836, |
|
"grad_norm": 0.015206689946353436, |
|
"kl": 0.0016021728515625, |
|
"learning_rate": 1.7174502842694212e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 418 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.838, |
|
"grad_norm": 0.04012497141957283, |
|
"kl": 0.000911712646484375, |
|
"learning_rate": 1.7005243352409333e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 419 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.84, |
|
"grad_norm": 0.01337823923677206, |
|
"kl": 0.0009021759033203125, |
|
"learning_rate": 1.6837835672960831e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 420 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.842, |
|
"grad_norm": 0.010002349503338337, |
|
"kl": 0.0014495849609375, |
|
"learning_rate": 1.6672287963562852e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 421 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.844, |
|
"grad_norm": 0.716344952583313, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 1.6508608292777203e-07, |
|
"loss": 0.0003, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 422 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.846, |
|
"grad_norm": 0.037377193570137024, |
|
"kl": 0.001346588134765625, |
|
"learning_rate": 1.6346804638120098e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 423 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.848, |
|
"grad_norm": 0.011378524824976921, |
|
"kl": 0.0007991790771484375, |
|
"learning_rate": 1.6186884885673413e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 424 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.85, |
|
"grad_norm": 0.012089181691408157, |
|
"kl": 0.00154876708984375, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 425 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.852, |
|
"grad_norm": 0.7250344753265381, |
|
"kl": 0.002223968505859375, |
|
"learning_rate": 1.5872728172265146e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 426 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1880.5, |
|
"epoch": 0.854, |
|
"grad_norm": 0.8333255648612976, |
|
"kl": 0.0015106201171875, |
|
"learning_rate": 1.5718506522858572e-07, |
|
"loss": 0.0692, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 427 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1326.75, |
|
"epoch": 0.856, |
|
"grad_norm": 1.4510780572891235, |
|
"kl": 0.0024261474609375, |
|
"learning_rate": 1.5566199398026147e-07, |
|
"loss": 0.0319, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 428 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.858, |
|
"grad_norm": 0.01227201521396637, |
|
"kl": 0.0006866455078125, |
|
"learning_rate": 1.5415814221002265e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 429 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.86, |
|
"grad_norm": 0.012068121694028378, |
|
"kl": 0.0009098052978515625, |
|
"learning_rate": 1.5267358321348285e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 430 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.862, |
|
"grad_norm": 0.012486455962061882, |
|
"kl": 0.000980377197265625, |
|
"learning_rate": 1.5120838934595337e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 431 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.864, |
|
"grad_norm": 0.013689450919628143, |
|
"kl": 0.00124359130859375, |
|
"learning_rate": 1.4976263201891613e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 432 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.866, |
|
"grad_norm": 0.5863283276557922, |
|
"kl": 0.00223541259765625, |
|
"learning_rate": 1.483363816965435e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 433 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.868, |
|
"grad_norm": 0.01356459315866232, |
|
"kl": 0.00078582763671875, |
|
"learning_rate": 1.469297078922642e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 434 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.87, |
|
"grad_norm": 0.009759028442203999, |
|
"kl": 0.0011749267578125, |
|
"learning_rate": 1.4554267916537495e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 435 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.872, |
|
"grad_norm": 0.02292313612997532, |
|
"kl": 0.0009002685546875, |
|
"learning_rate": 1.4417536311769885e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 436 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.874, |
|
"grad_norm": 0.013289421796798706, |
|
"kl": 0.00077056884765625, |
|
"learning_rate": 1.4282782639029128e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 437 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.876, |
|
"grad_norm": 0.0265584085136652, |
|
"kl": 0.00119781494140625, |
|
"learning_rate": 1.4150013466019114e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 438 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2008.25, |
|
"epoch": 0.878, |
|
"grad_norm": 0.013609832152724266, |
|
"kl": 0.000782012939453125, |
|
"learning_rate": 1.4019235263722034e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 439 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.88, |
|
"grad_norm": 0.010987906716763973, |
|
"kl": 0.00115203857421875, |
|
"learning_rate": 1.3890454406082956e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 440 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.882, |
|
"grad_norm": 0.01149643212556839, |
|
"kl": 0.0007152557373046875, |
|
"learning_rate": 1.3763677169699217e-07, |
|
"loss": 0.0, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 441 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.884, |
|
"grad_norm": 0.012063896283507347, |
|
"kl": 0.000858306884765625, |
|
"learning_rate": 1.3638909733514452e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 442 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.886, |
|
"grad_norm": 0.6539027094841003, |
|
"kl": 0.0033817291259765625, |
|
"learning_rate": 1.351615817851748e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 443 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.888, |
|
"grad_norm": 0.683651864528656, |
|
"kl": 0.001430511474609375, |
|
"learning_rate": 1.3395428487445914e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 444 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.89, |
|
"grad_norm": 0.014244834892451763, |
|
"kl": 0.001583099365234375, |
|
"learning_rate": 1.3276726544494571e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 445 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1551.25, |
|
"epoch": 0.892, |
|
"grad_norm": 0.04096902534365654, |
|
"kl": 0.003017425537109375, |
|
"learning_rate": 1.316005813502869e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5, |
|
"step": 446 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.894, |
|
"grad_norm": 0.013455760665237904, |
|
"kl": 0.00115203857421875, |
|
"learning_rate": 1.3045428945301953e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 447 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1822.0, |
|
"epoch": 0.896, |
|
"grad_norm": 1.1271547079086304, |
|
"kl": 0.0069122314453125, |
|
"learning_rate": 1.2932844562179352e-07, |
|
"loss": -0.0998, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 448 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.898, |
|
"grad_norm": 0.6208535432815552, |
|
"kl": 0.0009765625, |
|
"learning_rate": 1.2822310472864885e-07, |
|
"loss": 0.0, |
|
"reward": 0.5, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 449 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.9, |
|
"grad_norm": 0.009892701171338558, |
|
"kl": 0.0006608963012695312, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 450 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.902, |
|
"grad_norm": 0.0119470888748765, |
|
"kl": 0.000751495361328125, |
|
"learning_rate": 1.260741462457165e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 451 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1717.0, |
|
"epoch": 0.904, |
|
"grad_norm": 0.015732314437627792, |
|
"kl": 0.001445770263671875, |
|
"learning_rate": 1.2503063339313356e-07, |
|
"loss": 0.0, |
|
"reward": 0.125, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 452 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.906, |
|
"grad_norm": 0.7161591649055481, |
|
"kl": 0.001621246337890625, |
|
"learning_rate": 1.2400783294793668e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 453 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.908, |
|
"grad_norm": 0.009790318086743355, |
|
"kl": 0.0006418228149414062, |
|
"learning_rate": 1.2300579475997657e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 454 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.91, |
|
"grad_norm": 0.01178540289402008, |
|
"kl": 0.0012798309326171875, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 455 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1967.75, |
|
"epoch": 0.912, |
|
"grad_norm": 0.918179452419281, |
|
"kl": 0.00170135498046875, |
|
"learning_rate": 1.2106419949317388e-07, |
|
"loss": 0.0301, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 456 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.914, |
|
"grad_norm": 0.7712357640266418, |
|
"kl": 0.00215911865234375, |
|
"learning_rate": 1.2012473704494537e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 457 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1705.5, |
|
"epoch": 0.916, |
|
"grad_norm": 0.8978808522224426, |
|
"kl": 0.00180816650390625, |
|
"learning_rate": 1.1920622611056974e-07, |
|
"loss": 0.0779, |
|
"reward": 0.25, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 458 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.918, |
|
"grad_norm": 0.8227681517601013, |
|
"kl": 0.00136566162109375, |
|
"learning_rate": 1.1830871145697412e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 459 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.92, |
|
"grad_norm": 0.009116187691688538, |
|
"kl": 0.00128936767578125, |
|
"learning_rate": 1.1743223682775649e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 460 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.922, |
|
"grad_norm": 0.014834447763860226, |
|
"kl": 0.00095367431640625, |
|
"learning_rate": 1.1657684494105386e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 461 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.924, |
|
"grad_norm": 0.013181531801819801, |
|
"kl": 0.0008907318115234375, |
|
"learning_rate": 1.1574257748745986e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 462 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.926, |
|
"grad_norm": 0.010957694612443447, |
|
"kl": 0.0006504058837890625, |
|
"learning_rate": 1.1492947512799328e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 463 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.928, |
|
"grad_norm": 0.8815539479255676, |
|
"kl": 0.0010204315185546875, |
|
"learning_rate": 1.1413757749211602e-07, |
|
"loss": 0.0, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 464 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.93, |
|
"grad_norm": 0.010624675080180168, |
|
"kl": 0.0008087158203125, |
|
"learning_rate": 1.1336692317580158e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 465 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.932, |
|
"grad_norm": 0.013844382017850876, |
|
"kl": 0.001033782958984375, |
|
"learning_rate": 1.1261754973965422e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 466 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.934, |
|
"grad_norm": 0.012338408268988132, |
|
"kl": 0.000690460205078125, |
|
"learning_rate": 1.1188949370707787e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 467 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1994.0, |
|
"epoch": 0.936, |
|
"grad_norm": 0.7254545092582703, |
|
"kl": 0.0010471343994140625, |
|
"learning_rate": 1.1118279056249653e-07, |
|
"loss": 0.0197, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 468 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.938, |
|
"grad_norm": 0.01179241482168436, |
|
"kl": 0.0006885528564453125, |
|
"learning_rate": 1.1049747474962444e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 469 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.94, |
|
"grad_norm": 0.6890900135040283, |
|
"kl": 0.001373291015625, |
|
"learning_rate": 1.0983357966978745e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 470 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.942, |
|
"grad_norm": 0.5639128684997559, |
|
"kl": 0.00131988525390625, |
|
"learning_rate": 1.0919113768029517e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 471 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.944, |
|
"grad_norm": 0.012823720462620258, |
|
"kl": 0.001026153564453125, |
|
"learning_rate": 1.0857018009286381e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 472 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.946, |
|
"grad_norm": 0.009964537806808949, |
|
"kl": 0.0007228851318359375, |
|
"learning_rate": 1.0797073717209013e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 473 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.948, |
|
"grad_norm": 0.7185282707214355, |
|
"kl": 0.0028629302978515625, |
|
"learning_rate": 1.0739283813397639e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 474 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.95, |
|
"grad_norm": 0.012274986132979393, |
|
"kl": 0.00087738037109375, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 475 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1744.25, |
|
"epoch": 0.952, |
|
"grad_norm": 0.8758648633956909, |
|
"kl": 0.002716064453125, |
|
"learning_rate": 1.063017833182728e-07, |
|
"loss": 0.1492, |
|
"reward": 0.1875, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 476 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.954, |
|
"grad_norm": 0.014645918272435665, |
|
"kl": 0.000720977783203125, |
|
"learning_rate": 1.0578868071715544e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 477 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.956, |
|
"grad_norm": 0.013319121673703194, |
|
"kl": 0.0009250640869140625, |
|
"learning_rate": 1.0529722834905125e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 478 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.958, |
|
"grad_norm": 0.009549077600240707, |
|
"kl": 0.0014190673828125, |
|
"learning_rate": 1.0482745016665526e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 479 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.96, |
|
"grad_norm": 0.015292295254766941, |
|
"kl": 0.0009307861328125, |
|
"learning_rate": 1.0437936906629334e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 480 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.962, |
|
"grad_norm": 1.031322717666626, |
|
"kl": 0.003509521484375, |
|
"learning_rate": 1.0395300688680625e-07, |
|
"loss": 0.0001, |
|
"reward": 0.125, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.125, |
|
"step": 481 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.964, |
|
"grad_norm": 0.011882193386554718, |
|
"kl": 0.001026153564453125, |
|
"learning_rate": 1.0354838440848501e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 482 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1847.0, |
|
"epoch": 0.966, |
|
"grad_norm": 0.6594410538673401, |
|
"kl": 0.002811431884765625, |
|
"learning_rate": 1.0316552135205837e-07, |
|
"loss": 0.0864, |
|
"reward": 0.4375, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 483 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.968, |
|
"grad_norm": 0.013277575373649597, |
|
"kl": 0.000789642333984375, |
|
"learning_rate": 1.0280443637773163e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 484 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.97, |
|
"grad_norm": 0.7425467371940613, |
|
"kl": 0.00323486328125, |
|
"learning_rate": 1.0246514708427701e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.0625, |
|
"step": 485 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.972, |
|
"grad_norm": 0.01078079268336296, |
|
"kl": 0.00154876708984375, |
|
"learning_rate": 1.0214767000817596e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 486 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.974, |
|
"grad_norm": 0.012354187667369843, |
|
"kl": 0.0005855560302734375, |
|
"learning_rate": 1.0185202062281336e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 487 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.976, |
|
"grad_norm": 0.7250139117240906, |
|
"kl": 0.0037212371826171875, |
|
"learning_rate": 1.0157821333772304e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 488 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.978, |
|
"grad_norm": 0.014553495682775974, |
|
"kl": 0.00141143798828125, |
|
"learning_rate": 1.013262614978859e-07, |
|
"loss": 0.0001, |
|
"reward": 0.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 489 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.98, |
|
"grad_norm": 0.01307358592748642, |
|
"kl": 0.000835418701171875, |
|
"learning_rate": 1.0109617738307911e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 490 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1908.5, |
|
"epoch": 0.982, |
|
"grad_norm": 0.7941016554832458, |
|
"kl": 0.0010204315185546875, |
|
"learning_rate": 1.0088797220727779e-07, |
|
"loss": 0.0558, |
|
"reward": 0.375, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.375, |
|
"step": 491 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.984, |
|
"grad_norm": 0.008943392895162106, |
|
"kl": 0.000537872314453125, |
|
"learning_rate": 1.0070165611810855e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 492 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.986, |
|
"grad_norm": 0.012137016281485558, |
|
"kl": 0.001377105712890625, |
|
"learning_rate": 1.005372381963547e-07, |
|
"loss": 0.0001, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 493 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.988, |
|
"grad_norm": 0.01392819918692112, |
|
"kl": 0.0009784698486328125, |
|
"learning_rate": 1.0039472645551372e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 494 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.99, |
|
"grad_norm": 0.010369054041802883, |
|
"kl": 0.000942230224609375, |
|
"learning_rate": 1.002741278414069e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 495 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.992, |
|
"grad_norm": 0.011974328197538853, |
|
"kl": 0.0006275177001953125, |
|
"learning_rate": 1.0017544823184055e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 496 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.994, |
|
"grad_norm": 0.01267942413687706, |
|
"kl": 0.00104522705078125, |
|
"learning_rate": 1.0009869243631952e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 497 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.996, |
|
"grad_norm": 0.876571536064148, |
|
"kl": 0.003925323486328125, |
|
"learning_rate": 1.000438641958131e-07, |
|
"loss": 0.0002, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 498 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 0.998, |
|
"grad_norm": 0.6709615588188171, |
|
"kl": 0.001743316650390625, |
|
"learning_rate": 1.0001096618257236e-07, |
|
"loss": 0.0001, |
|
"reward": 0.1875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.1875, |
|
"step": 499 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2048.0, |
|
"epoch": 1.0, |
|
"grad_norm": 0.014002146199345589, |
|
"kl": 0.0007781982421875, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"reward": 0.25, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.25, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0033100851627775683, |
|
"train_runtime": 14646.384, |
|
"train_samples_per_second": 0.068, |
|
"train_steps_per_second": 0.034 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|