Model save

Browse files

Files changed (5) hide show

README.md +67 -0
all_results.json +8 -0
generation_config.json +6 -0
train_results.json +8 -0
trainer_state.json +2292 -0

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+library_name: transformers
+model_name: Qwen2.5-7B-Open-R1-GRPO-Connected-50000-easy-3
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for Qwen2.5-7B-Open-R1-GRPO-Connected-50000-easy-3
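+This model is a fine-tuned version of a base checkpoint that was not recorded when this card was generated (the template rendered it as `None`); the model name suggests a Qwen2.5-7B base, but this is unconfirmed.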
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="AdAstraAbyssoque/Qwen2.5-7B-Open-R1-GRPO-Connected-50000-easy-3", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/bowen_liu-hkust/huggingface/runs/7aewc3l7)
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.16.0.dev0
+- Transformers: 4.49.0
+- Pytorch: 2.5.1
+- Datasets: 3.3.2
+- Tokenizers: 0.21.0
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {2402.03300},
+    archivePrefix = {arXiv},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.31898806129537965,
+    "train_runtime": 4133.207,
+    "train_samples": 50000,
+    "train_samples_per_second": 2.032,
+    "train_steps_per_second": 0.036
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.49.0"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.31898806129537965,
+    "train_runtime": 4133.207,
+    "train_samples": 50000,
+    "train_samples_per_second": 2.032,
+    "train_steps_per_second": 0.036
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2292 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.024,
+  "eval_steps": 500,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 423.1785888671875,
+      "epoch": 0.00016,
+      "grad_norm": 1.5622539520263672,
+      "kl": 0.0,
+      "learning_rate": 4.999451708687114e-06,
+      "loss": 0.084,
+      "reward": 0.3575893044471741,
+      "reward_std": 0.4995622932910919,
+      "rewards/connected_reward": 0.1919642984867096,
+      "rewards/format_reward": 0.125,
+      "rewards/tag_count_reward": 0.3437500298023224,
+      "step": 1
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 328.01788330078125,
+      "epoch": 0.00032,
+      "grad_norm": 1.7536635398864746,
+      "kl": 0.00445556640625,
+      "learning_rate": 4.997807075247147e-06,
+      "loss": 0.0909,
+      "reward": 0.8333333134651184,
+      "reward_std": 0.7626916766166687,
+      "rewards/connected_reward": 0.5029762387275696,
+      "rewards/format_reward": 0.2857142984867096,
+      "rewards/tag_count_reward": 0.625,
+      "step": 2
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 365.0714416503906,
+      "epoch": 0.00048,
+      "grad_norm": 1.7543888092041016,
+      "kl": 0.0247802734375,
+      "learning_rate": 4.9950668210706795e-06,
+      "loss": 0.1728,
+      "reward": 1.1412203311920166,
+      "reward_std": 0.7671074271202087,
+      "rewards/connected_reward": 0.6309523582458496,
+      "rewards/format_reward": 0.535714328289032,
+      "rewards/tag_count_reward": 0.8080357313156128,
+      "step": 3
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 359.2500305175781,
+      "epoch": 0.00064,
+      "grad_norm": 1.619277000427246,
+      "kl": 0.03369140625,
+      "learning_rate": 4.9912321481237616e-06,
+      "loss": 0.1173,
+      "reward": 1.1927084922790527,
+      "reward_std": 0.7748870253562927,
+      "rewards/connected_reward": 0.7127975821495056,
+      "rewards/format_reward": 0.5178571939468384,
+      "rewards/tag_count_reward": 0.7366071939468384,
+      "step": 4
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 246.6785888671875,
+      "epoch": 0.0008,
+      "grad_norm": 2.3274552822113037,
+      "kl": 0.0732421875,
+      "learning_rate": 4.986304738420684e-06,
+      "loss": 0.088,
+      "reward": 1.4244047403335571,
+      "reward_std": 0.7808605432510376,
+      "rewards/connected_reward": 0.8422619700431824,
+      "rewards/format_reward": 0.660714328289032,
+      "rewards/tag_count_reward": 0.8392857313156128,
+      "step": 5
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 256.3214416503906,
+      "epoch": 0.00096,
+      "grad_norm": 5.211669921875,
+      "kl": 0.1904296875,
+      "learning_rate": 4.980286753286196e-06,
+      "loss": 0.0374,
+      "reward": 1.3794643878936768,
+      "reward_std": 0.6322147846221924,
+      "rewards/connected_reward": 0.7660714983940125,
+      "rewards/format_reward": 0.6964285969734192,
+      "rewards/tag_count_reward": 0.8839285969734192,
+      "step": 6
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 234.5178680419922,
+      "epoch": 0.00112,
+      "grad_norm": 7.583590984344482,
+      "kl": 0.2197265625,
+      "learning_rate": 4.973180832407471e-06,
+      "loss": 0.178,
+      "reward": 1.4800595045089722,
+      "reward_std": 0.7939525246620178,
+      "rewards/connected_reward": 0.867559552192688,
+      "rewards/format_reward": 0.6785714626312256,
+      "rewards/tag_count_reward": 0.910714328289032,
+      "step": 7
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 250.96429443359375,
+      "epoch": 0.00128,
+      "grad_norm": 2.215085029602051,
+      "kl": 0.13671875,
+      "learning_rate": 4.964990092676263e-06,
+      "loss": 0.1382,
+      "reward": 1.7797619104385376,
+      "reward_std": 0.8585469722747803,
+      "rewards/connected_reward": 1.1279761791229248,
+      "rewards/format_reward": 0.7678571939468384,
+      "rewards/tag_count_reward": 0.8928571939468384,
+      "step": 8
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 282.08929443359375,
+      "epoch": 0.00144,
+      "grad_norm": 2.3200507164001465,
+      "kl": 0.1650390625,
+      "learning_rate": 4.9557181268217225e-06,
+      "loss": 0.2382,
+      "reward": 1.753422737121582,
+      "reward_std": 0.8391357660293579,
+      "rewards/connected_reward": 1.0386905670166016,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 0.9241071939468384,
+      "step": 9
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 306.64288330078125,
+      "epoch": 0.0016,
+      "grad_norm": 2.2042906284332275,
+      "kl": 0.123046875,
+      "learning_rate": 4.9453690018345144e-06,
+      "loss": 0.0315,
+      "reward": 1.4244047403335571,
+      "reward_std": 0.8331058025360107,
+      "rewards/connected_reward": 0.7226191163063049,
+      "rewards/format_reward": 0.8571429252624512,
+      "rewards/tag_count_reward": 0.910714328289032,
+      "step": 10
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 361.2500305175781,
+      "epoch": 0.00176,
+      "grad_norm": 1.6821162700653076,
+      "kl": 0.138671875,
+      "learning_rate": 4.933947257182901e-06,
+      "loss": 0.0162,
+      "reward": 1.227083444595337,
+      "reward_std": 0.550583004951477,
+      "rewards/connected_reward": 0.4940476715564728,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9553571939468384,
+      "step": 11
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 392.6785888671875,
+      "epoch": 0.00192,
+      "grad_norm": 1.718521237373352,
+      "kl": 0.12158203125,
+      "learning_rate": 4.921457902821578e-06,
+      "loss": -0.0295,
+      "reward": 1.6671130657196045,
+      "reward_std": 0.7108415961265564,
+      "rewards/connected_reward": 0.9630953073501587,
+      "rewards/format_reward": 0.8214285969734192,
+      "rewards/tag_count_reward": 0.9776785969734192,
+      "step": 12
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 373.5000305175781,
+      "epoch": 0.00208,
+      "grad_norm": 1.7214635610580444,
+      "kl": 0.1328125,
+      "learning_rate": 4.907906416994146e-06,
+      "loss": 0.0016,
+      "reward": 1.7629464864730835,
+      "reward_std": 0.7587335705757141,
+      "rewards/connected_reward": 1.0848214626312256,
+      "rewards/format_reward": 0.785714328289032,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 13
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 434.39288330078125,
+      "epoch": 0.00224,
+      "grad_norm": 1.6192271709442139,
+      "kl": 0.1416015625,
+      "learning_rate": 4.893298743830168e-06,
+      "loss": 0.08,
+      "reward": 1.7056547403335571,
+      "reward_std": 0.5545923709869385,
+      "rewards/connected_reward": 1.0029762983322144,
+      "rewards/format_reward": 0.8214285969734192,
+      "rewards/tag_count_reward": 0.973214328289032,
+      "step": 14
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 426.3750305175781,
+      "epoch": 0.0024,
+      "grad_norm": 1.4105689525604248,
+      "kl": 0.1298828125,
+      "learning_rate": 4.8776412907378845e-06,
+      "loss": -0.0215,
+      "reward": 1.6488096714019775,
+      "reward_std": 0.5783611536026001,
+      "rewards/connected_reward": 0.9666666984558105,
+      "rewards/format_reward": 0.785714328289032,
+      "rewards/tag_count_reward": 0.9642857313156128,
+      "step": 15
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 499.39288330078125,
+      "epoch": 0.00256,
+      "grad_norm": 1.6216644048690796,
+      "kl": 0.126953125,
+      "learning_rate": 4.860940925593703e-06,
+      "loss": -0.0328,
+      "reward": 1.387202501296997,
+      "reward_std": 0.5975491404533386,
+      "rewards/connected_reward": 0.6934524178504944,
+      "rewards/format_reward": 0.8035714626312256,
+      "rewards/tag_count_reward": 0.973214328289032,
+      "step": 16
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 491.76788330078125,
+      "epoch": 0.00272,
+      "grad_norm": 1.512536883354187,
+      "kl": 0.130859375,
+      "learning_rate": 4.84320497372973e-06,
+      "loss": 0.0206,
+      "reward": 1.6599701642990112,
+      "reward_std": 0.5700646638870239,
+      "rewards/connected_reward": 0.9175595045089722,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9866071939468384,
+      "step": 17
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 455.5714416503906,
+      "epoch": 0.00288,
+      "grad_norm": 233.13291931152344,
+      "kl": 6.40625,
+      "learning_rate": 4.824441214720629e-06,
+      "loss": 0.2602,
+      "reward": 1.8773809671401978,
+      "reward_std": 0.8444122076034546,
+      "rewards/connected_reward": 1.1398810148239136,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 1.0,
+      "step": 18
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 523.2857666015625,
+      "epoch": 0.00304,
+      "grad_norm": 1.4070404767990112,
+      "kl": 0.1328125,
+      "learning_rate": 4.804657878971252e-06,
+      "loss": -0.006,
+      "reward": 1.7507867813110352,
+      "reward_std": 0.6262676119804382,
+      "rewards/connected_reward": 1.0378402471542358,
+      "rewards/format_reward": 0.8392857313156128,
+      "rewards/tag_count_reward": 0.9776785969734192,
+      "step": 19
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 505.01788330078125,
+      "epoch": 0.0032,
+      "grad_norm": 2.4147706031799316,
+      "kl": 0.1533203125,
+      "learning_rate": 4.783863644106502e-06,
+      "loss": 0.0332,
+      "reward": 1.725595235824585,
+      "reward_std": 0.7785912156105042,
+      "rewards/connected_reward": 1.0193452835083008,
+      "rewards/format_reward": 0.8392857313156128,
+      "rewards/tag_count_reward": 0.9553571939468384,
+      "step": 20
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 438.2321472167969,
+      "epoch": 0.00336,
+      "grad_norm": 1.6368823051452637,
+      "kl": 0.1630859375,
+      "learning_rate": 4.762067631165049e-06,
+      "loss": 0.0295,
+      "reward": 1.9540605545043945,
+      "reward_std": 0.8223637938499451,
+      "rewards/connected_reward": 1.2402212619781494,
+      "rewards/format_reward": 0.8571429252624512,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 21
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 453.8750305175781,
+      "epoch": 0.00352,
+      "grad_norm": 1.7482272386550903,
+      "kl": 0.1572265625,
+      "learning_rate": 4.7392794005985324e-06,
+      "loss": 0.0902,
+      "reward": 1.8142857551574707,
+      "reward_std": 0.8429319858551025,
+      "rewards/connected_reward": 1.133928656578064,
+      "rewards/format_reward": 0.8035714626312256,
+      "rewards/tag_count_reward": 0.9285714626312256,
+      "step": 22
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 466.607177734375,
+      "epoch": 0.00368,
+      "grad_norm": 1.5820465087890625,
+      "kl": 0.1435546875,
+      "learning_rate": 4.715508948078037e-06,
+      "loss": 0.0366,
+      "reward": 1.6919643878936768,
+      "reward_std": 0.7337676882743835,
+      "rewards/connected_reward": 0.9741071462631226,
+      "rewards/format_reward": 0.8571429252624512,
+      "rewards/tag_count_reward": 0.9642857313156128,
+      "step": 23
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 436.9821472167969,
+      "epoch": 0.00384,
+      "grad_norm": 1.3989168405532837,
+      "kl": 0.1650390625,
+      "learning_rate": 4.690766700109659e-06,
+      "loss": -0.0232,
+      "reward": 2.0504465103149414,
+      "reward_std": 0.7145797610282898,
+      "rewards/connected_reward": 1.3071428537368774,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9598214626312256,
+      "step": 24
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 507.6964416503906,
+      "epoch": 0.004,
+      "grad_norm": 1.4435826539993286,
+      "kl": 0.1484375,
+      "learning_rate": 4.665063509461098e-06,
+      "loss": 0.0336,
+      "reward": 1.3611607551574707,
+      "reward_std": 0.5452148914337158,
+      "rewards/connected_reward": 0.5625,
+      "rewards/format_reward": 1.0,
+      "rewards/tag_count_reward": 0.9955357313156128,
+      "step": 25
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 416.6607360839844,
+      "epoch": 0.00416,
+      "grad_norm": 1.6018980741500854,
+      "kl": 0.1513671875,
+      "learning_rate": 4.638410650401267e-06,
+      "loss": -0.0475,
+      "reward": 1.834970235824585,
+      "reward_std": 0.934838056564331,
+      "rewards/connected_reward": 1.086309552192688,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9776785969734192,
+      "step": 26
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 433.6250305175781,
+      "epoch": 0.00432,
+      "grad_norm": 1.635309100151062,
+      "kl": 0.15625,
+      "learning_rate": 4.610819813755038e-06,
+      "loss": 0.0671,
+      "reward": 1.7008929252624512,
+      "reward_std": 0.8552356362342834,
+      "rewards/connected_reward": 0.9508929252624512,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9821429252624512,
+      "step": 27
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 446.3214416503906,
+      "epoch": 0.00448,
+      "grad_norm": 1.4390822649002075,
+      "kl": 0.15234375,
+      "learning_rate": 4.582303101775249e-06,
+      "loss": 0.0515,
+      "reward": 1.7136904001235962,
+      "reward_std": 0.6487642526626587,
+      "rewards/connected_reward": 0.9315477609634399,
+      "rewards/format_reward": 0.9642857313156128,
+      "rewards/tag_count_reward": 1.0,
+      "step": 28
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 428.0000305175781,
+      "epoch": 0.00464,
+      "grad_norm": 1.5933189392089844,
+      "kl": 0.1513671875,
+      "learning_rate": 4.55287302283426e-06,
+      "loss": 0.0457,
+      "reward": 1.5874149799346924,
+      "reward_std": 0.7879763245582581,
+      "rewards/connected_reward": 0.7963436245918274,
+      "rewards/format_reward": 0.9821429252624512,
+      "rewards/tag_count_reward": 1.0,
+      "step": 29
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 478.607177734375,
+      "epoch": 0.0048,
+      "grad_norm": 1.511976718902588,
+      "kl": 0.1435546875,
+      "learning_rate": 4.522542485937369e-06,
+      "loss": 0.0491,
+      "reward": 1.3787201642990112,
+      "reward_std": 0.7991003394126892,
+      "rewards/connected_reward": 0.644345223903656,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9598214626312256,
+      "step": 30
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 430.71429443359375,
+      "epoch": 0.00496,
+      "grad_norm": 1.6736035346984863,
+      "kl": 0.16015625,
+      "learning_rate": 4.491324795060491e-06,
+      "loss": -0.0175,
+      "reward": 1.6544642448425293,
+      "reward_std": 0.6817477941513062,
+      "rewards/connected_reward": 0.9151785969734192,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9464285969734192,
+      "step": 31
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 396.0000305175781,
+      "epoch": 0.00512,
+      "grad_norm": 1.8634084463119507,
+      "kl": 0.1513671875,
+      "learning_rate": 4.4592336433146e-06,
+      "loss": 0.1726,
+      "reward": 1.756250023841858,
+      "reward_std": 0.9827747941017151,
+      "rewards/connected_reward": 1.0178571939468384,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.973214328289032,
+      "step": 32
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 371.26788330078125,
+      "epoch": 0.00528,
+      "grad_norm": 1.7837437391281128,
+      "kl": 0.1484375,
+      "learning_rate": 4.426283106939474e-06,
+      "loss": -0.023,
+      "reward": 1.8660714626312256,
+      "reward_std": 0.8486083149909973,
+      "rewards/connected_reward": 1.0982143878936768,
+      "rewards/format_reward": 0.9464285969734192,
+      "rewards/tag_count_reward": 0.9821429252624512,
+      "step": 33
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 427.0535888671875,
+      "epoch": 0.00544,
+      "grad_norm": 5.339620590209961,
+      "kl": 0.1552734375,
+      "learning_rate": 4.3924876391293915e-06,
+      "loss": 0.0789,
+      "reward": 1.8375002145767212,
+      "reward_std": 0.7801174521446228,
+      "rewards/connected_reward": 1.1482142210006714,
+      "rewards/format_reward": 0.8214285969734192,
+      "rewards/tag_count_reward": 0.9285714626312256,
+      "step": 34
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 405.0000305175781,
+      "epoch": 0.0056,
+      "grad_norm": 3.4218180179595947,
+      "kl": 0.1923828125,
+      "learning_rate": 4.357862063693486e-06,
+      "loss": 0.026,
+      "reward": 1.5308035612106323,
+      "reward_std": 0.8105348944664001,
+      "rewards/connected_reward": 0.8080357313156128,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 35
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 439.5000305175781,
+      "epoch": 0.00576,
+      "grad_norm": 2.0338456630706787,
+      "kl": 0.173828125,
+      "learning_rate": 4.322421568553529e-06,
+      "loss": 0.1697,
+      "reward": 1.2683035135269165,
+      "reward_std": 0.7643690705299377,
+      "rewards/connected_reward": 0.6214286684989929,
+      "rewards/format_reward": 0.7500000596046448,
+      "rewards/tag_count_reward": 0.9062500596046448,
+      "step": 36
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 401.2321472167969,
+      "epoch": 0.00592,
+      "grad_norm": 1.8888318538665771,
+      "kl": 0.1787109375,
+      "learning_rate": 4.286181699082008e-06,
+      "loss": 0.1236,
+      "reward": 1.7342263460159302,
+      "reward_std": 0.9654321670532227,
+      "rewards/connected_reward": 1.0208333730697632,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 0.9196429252624512,
+      "step": 37
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 473.96429443359375,
+      "epoch": 0.00608,
+      "grad_norm": 1.792199969291687,
+      "kl": 0.1982421875,
+      "learning_rate": 4.249158351283414e-06,
+      "loss": 0.154,
+      "reward": 1.37901771068573,
+      "reward_std": 0.8872238397598267,
+      "rewards/connected_reward": 0.7366071939468384,
+      "rewards/format_reward": 0.7678571939468384,
+      "rewards/tag_count_reward": 0.8616071939468384,
+      "step": 38
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 512.232177734375,
+      "epoch": 0.00624,
+      "grad_norm": 2.252385377883911,
+      "kl": 0.1826171875,
+      "learning_rate": 4.211367764821722e-06,
+      "loss": 0.3264,
+      "reward": 1.412500023841858,
+      "reward_std": 0.7886937856674194,
+      "rewards/connected_reward": 0.754464328289032,
+      "rewards/format_reward": 0.785714328289032,
+      "rewards/tag_count_reward": 0.8839285969734192,
+      "step": 39
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 517.7678833007812,
+      "epoch": 0.0064,
+      "grad_norm": 2.793456554412842,
+      "kl": 0.203125,
+      "learning_rate": 4.172826515897146e-06,
+      "loss": 0.264,
+      "reward": 1.2334821224212646,
+      "reward_std": 0.624849796295166,
+      "rewards/connected_reward": 0.5687500238418579,
+      "rewards/format_reward": 0.785714328289032,
+      "rewards/tag_count_reward": 0.9062500596046448,
+      "step": 40
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 460.5535888671875,
+      "epoch": 0.00656,
+      "grad_norm": 2.782038688659668,
+      "kl": 0.1826171875,
+      "learning_rate": 4.133551509975264e-06,
+      "loss": 0.3976,
+      "reward": 1.6736607551574707,
+      "reward_std": 0.9631600379943848,
+      "rewards/connected_reward": 1.0625,
+      "rewards/format_reward": 0.6785714626312256,
+      "rewards/tag_count_reward": 0.9062500596046448,
+      "step": 41
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 591.5714721679688,
+      "epoch": 0.00672,
+      "grad_norm": 1.9981597661972046,
+      "kl": 0.2255859375,
+      "learning_rate": 4.093559974371725e-06,
+      "loss": 0.359,
+      "reward": 1.3278273344039917,
+      "reward_std": 0.9624171257019043,
+      "rewards/connected_reward": 0.7380953431129456,
+      "rewards/format_reward": 0.6785714626312256,
+      "rewards/tag_count_reward": 0.8348214626312256,
+      "step": 42
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 728.2678833007812,
+      "epoch": 0.00688,
+      "grad_norm": 2.215883731842041,
+      "kl": 0.2138671875,
+      "learning_rate": 4.052869450695776e-06,
+      "loss": 0.4698,
+      "reward": 1.158928632736206,
+      "reward_std": 0.7686693072319031,
+      "rewards/connected_reward": 0.5973214507102966,
+      "rewards/format_reward": 0.625,
+      "rewards/tag_count_reward": 0.8303571939468384,
+      "step": 43
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 780.5357666015625,
+      "epoch": 0.00704,
+      "grad_norm": 2.267688035964966,
+      "kl": 0.2734375,
+      "learning_rate": 4.011497787155938e-06,
+      "loss": 0.578,
+      "reward": 1.3976190090179443,
+      "reward_std": 1.026655673980713,
+      "rewards/connected_reward": 0.8404762148857117,
+      "rewards/format_reward": 0.6428571939468384,
+      "rewards/tag_count_reward": 0.785714328289032,
+      "step": 44
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 870.9464721679688,
+      "epoch": 0.0072,
+      "grad_norm": 2.2561347484588623,
+      "kl": 0.32421875,
+      "learning_rate": 3.969463130731183e-06,
+      "loss": 0.5883,
+      "reward": 1.2087798118591309,
+      "reward_std": 0.9154091477394104,
+      "rewards/connected_reward": 0.6610119938850403,
+      "rewards/format_reward": 0.6428571939468384,
+      "rewards/tag_count_reward": 0.754464328289032,
+      "step": 45
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 786.7678833007812,
+      "epoch": 0.00736,
+      "grad_norm": 1.954925298690796,
+      "kl": 0.38671875,
+      "learning_rate": 3.92678391921108e-06,
+      "loss": 0.671,
+      "reward": 1.5040180683135986,
+      "reward_std": 1.171386957168579,
+      "rewards/connected_reward": 0.9571430087089539,
+      "rewards/format_reward": 0.625,
+      "rewards/tag_count_reward": 0.7812500596046448,
+      "step": 46
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 843.0892944335938,
+      "epoch": 0.00752,
+      "grad_norm": 1.6565457582473755,
+      "kl": 0.375,
+      "learning_rate": 3.88347887310836e-06,
+      "loss": 0.7635,
+      "reward": 1.400744080543518,
+      "reward_std": 1.0551815032958984,
+      "rewards/connected_reward": 0.8717262148857117,
+      "rewards/format_reward": 0.5892857313156128,
+      "rewards/tag_count_reward": 0.7812500596046448,
+      "step": 47
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 819.2142944335938,
+      "epoch": 0.00768,
+      "grad_norm": 1.5234383344650269,
+      "kl": 0.35546875,
+      "learning_rate": 3.839566987447492e-06,
+      "loss": 0.5719,
+      "reward": 1.422767996788025,
+      "reward_std": 1.039860725402832,
+      "rewards/connected_reward": 0.9017857313156128,
+      "rewards/format_reward": 0.5892857313156128,
+      "rewards/tag_count_reward": 0.754464328289032,
+      "step": 48
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 1017.5178833007812,
+      "epoch": 0.00784,
+      "grad_norm": 1.8639246225357056,
+      "kl": 0.470703125,
+      "learning_rate": 3.795067523432826e-06,
+      "loss": 0.4938,
+      "reward": 1.012946605682373,
+      "reward_std": 1.0206222534179688,
+      "rewards/connected_reward": 0.5464286208152771,
+      "rewards/format_reward": 0.5178571939468384,
+      "rewards/tag_count_reward": 0.691964328289032,
+      "step": 49
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 789.6428833007812,
+      "epoch": 0.008,
+      "grad_norm": 7.7003021240234375,
+      "kl": 0.431640625,
+      "learning_rate": 3.7500000000000005e-06,
+      "loss": 0.6166,
+      "reward": 1.5044643878936768,
+      "reward_std": 1.1621365547180176,
+      "rewards/connected_reward": 0.9642857313156128,
+      "rewards/format_reward": 0.625,
+      "rewards/tag_count_reward": 0.7589285969734192,
+      "step": 50
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 767.1428833007812,
+      "epoch": 0.00816,
+      "grad_norm": 17.987171173095703,
+      "kl": 0.482421875,
+      "learning_rate": 3.7043841852542884e-06,
+      "loss": 0.6032,
+      "reward": 1.5376489162445068,
+      "reward_std": 0.8768205642700195,
+      "rewards/connected_reward": 0.9755952954292297,
+      "rewards/format_reward": 0.660714328289032,
+      "rewards/tag_count_reward": 0.7723214626312256,
+      "step": 51
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 1012.2500610351562,
+      "epoch": 0.00832,
+      "grad_norm": 62.031070709228516,
+      "kl": 0.90625,
+      "learning_rate": 3.658240087799655e-06,
+      "loss": 0.7438,
+      "reward": 1.2361607551574707,
+      "reward_std": 1.0895289182662964,
+      "rewards/connected_reward": 0.7866071462631226,
+      "rewards/format_reward": 0.5,
+      "rewards/tag_count_reward": 0.6651785969734192,
+      "step": 52
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 790.0714721679688,
+      "epoch": 0.00848,
+      "grad_norm": 54.658687591552734,
+      "kl": 1.1015625,
+      "learning_rate": 3.611587947962319e-06,
+      "loss": 0.6992,
+      "reward": 1.5057541131973267,
+      "reward_std": 1.0789897441864014,
+      "rewards/connected_reward": 0.9771825671195984,
+      "rewards/format_reward": 0.6071428656578064,
+      "rewards/tag_count_reward": 0.7500000596046448,
+      "step": 53
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 955.4285888671875,
+      "epoch": 0.00864,
+      "grad_norm": 5.784551620483398,
+      "kl": 0.322265625,
+      "learning_rate": 3.564448228912682e-06,
+      "loss": 0.5783,
+      "reward": 1.15814208984375,
+      "reward_std": 0.956221878528595,
+      "rewards/connected_reward": 0.6451956033706665,
+      "rewards/format_reward": 0.5892857313156128,
+      "rewards/tag_count_reward": 0.7276785969734192,
+      "step": 54
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 829.9285888671875,
+      "epoch": 0.0088,
+      "grad_norm": 3.2477169036865234,
+      "kl": 0.146484375,
+      "learning_rate": 3.516841607689501e-06,
+      "loss": 0.5911,
+      "reward": 1.6458333730697632,
+      "reward_std": 1.0081928968429565,
+      "rewards/connected_reward": 1.0967262983322144,
+      "rewards/format_reward": 0.6428571939468384,
+      "rewards/tag_count_reward": 0.7589285969734192,
+      "step": 55
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 627.4642944335938,
+      "epoch": 0.00896,
+      "grad_norm": 14.7655611038208,
+      "kl": 0.1455078125,
+      "learning_rate": 3.4687889661302577e-06,
+      "loss": 0.831,
+      "reward": 2.1366071701049805,
+      "reward_std": 0.9424195289611816,
+      "rewards/connected_reward": 1.4866071939468384,
+      "rewards/format_reward": 0.785714328289032,
+      "rewards/tag_count_reward": 0.8571429252624512,
+      "step": 56
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 1031.732177734375,
+      "epoch": 0.00912,
+      "grad_norm": 123.79815673828125,
+      "kl": 0.216796875,
+      "learning_rate": 3.4203113817116955e-06,
+      "loss": 0.7199,
+      "reward": 1.3994048833847046,
+      "reward_std": 1.1809272766113281,
+      "rewards/connected_reward": 0.9047619700431824,
+      "rewards/format_reward": 0.5714285969734192,
+      "rewards/tag_count_reward": 0.6964285969734192,
+      "step": 57
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 1176.5357666015625,
+      "epoch": 0.00928,
+      "grad_norm": 49.686466217041016,
+      "kl": 0.4140625,
+      "learning_rate": 3.3714301183045382e-06,
+      "loss": 0.573,
+      "reward": 1.180803656578064,
+      "reward_std": 1.069697380065918,
+      "rewards/connected_reward": 0.7767857313156128,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.6026785969734192,
+      "step": 58
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 1143.0179443359375,
+      "epoch": 0.00944,
+      "grad_norm": 12.297547340393066,
+      "kl": 0.1728515625,
+      "learning_rate": 3.3221666168464584e-06,
+      "loss": 0.43,
+      "reward": 1.1979166269302368,
+      "reward_std": 0.9061146378517151,
+      "rewards/connected_reward": 0.7523809671401978,
+      "rewards/format_reward": 0.5,
+      "rewards/tag_count_reward": 0.6517857313156128,
+      "step": 59
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 744.482177734375,
+      "epoch": 0.0096,
+      "grad_norm": 1.3184255361557007,
+      "kl": 0.13671875,
+      "learning_rate": 3.272542485937369e-06,
+      "loss": 0.7022,
+      "reward": 1.6336311101913452,
+      "reward_std": 1.0127315521240234,
+      "rewards/connected_reward": 1.0148810148239136,
+      "rewards/format_reward": 0.7500000596046448,
+      "rewards/tag_count_reward": 0.8125000596046448,
+      "step": 60
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 775.5892944335938,
+      "epoch": 0.00976,
+      "grad_norm": 0.7490354776382446,
+      "kl": 0.12109375,
+      "learning_rate": 3.222579492361179e-06,
+      "loss": 0.5858,
+      "reward": 1.6793580055236816,
+      "reward_std": 1.2150492668151855,
+      "rewards/connected_reward": 1.073554515838623,
+      "rewards/format_reward": 0.7321428656578064,
+      "rewards/tag_count_reward": 0.7991071939468384,
+      "step": 61
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 409.3571472167969,
+      "epoch": 0.00992,
+      "grad_norm": 1.7477023601531982,
+      "kl": 0.1572265625,
+      "learning_rate": 3.1722995515381644e-06,
+      "loss": 0.3222,
+      "reward": 1.969642996788025,
+      "reward_std": 0.9241483807563782,
+      "rewards/connected_reward": 1.21875,
+      "rewards/format_reward": 0.9285714626312256,
+      "rewards/tag_count_reward": 0.9553571939468384,
+      "step": 62
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 625.5357666015625,
+      "epoch": 0.01008,
+      "grad_norm": 11.000500679016113,
+      "kl": 0.1787109375,
+      "learning_rate": 3.121724717912138e-06,
+      "loss": 0.5757,
+      "reward": 1.6025298833847046,
+      "reward_std": 0.8033274412155151,
+      "rewards/connected_reward": 0.944940447807312,
+      "rewards/format_reward": 0.8035714626312256,
+      "rewards/tag_count_reward": 0.8526785969734192,
+      "step": 63
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 288.9107360839844,
+      "epoch": 0.01024,
+      "grad_norm": 47.38264083862305,
+      "kl": 0.6328125,
+      "learning_rate": 3.0708771752766397e-06,
+      "loss": 0.0216,
+      "reward": 1.6721726655960083,
+      "reward_std": 0.7864125967025757,
+      "rewards/connected_reward": 0.9494048357009888,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 64
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 254.85714721679688,
+      "epoch": 0.0104,
+      "grad_norm": 13.367755889892578,
+      "kl": 0.412109375,
+      "learning_rate": 3.019779227044398e-06,
+      "loss": -0.0184,
+      "reward": 2.1937499046325684,
+      "reward_std": 0.6596731543540955,
+      "rewards/connected_reward": 1.4285714626312256,
+      "rewards/format_reward": 0.9464285969734192,
+      "rewards/tag_count_reward": 0.973214328289032,
+      "step": 65
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 268.46429443359375,
+      "epoch": 0.01056,
+      "grad_norm": 31.148229598999023,
+      "kl": 0.6484375,
+      "learning_rate": 2.9684532864643123e-06,
+      "loss": 0.0553,
+      "reward": 1.8163691759109497,
+      "reward_std": 0.6786539554595947,
+      "rewards/connected_reward": 1.0627976655960083,
+      "rewards/format_reward": 0.9285714626312256,
+      "rewards/tag_count_reward": 0.9642857313156128,
+      "step": 66
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 436.2857360839844,
+      "epoch": 0.01072,
+      "grad_norm": 69.53382110595703,
+      "kl": 0.2431640625,
+      "learning_rate": 2.9169218667902562e-06,
+      "loss": 0.4608,
+      "reward": 2.0997023582458496,
+      "reward_std": 0.7058864235877991,
+      "rewards/connected_reward": 1.3720238208770752,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9375000596046448,
+      "step": 67
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 387.9285888671875,
+      "epoch": 0.01088,
+      "grad_norm": 93.46610260009766,
+      "kl": 0.43359375,
+      "learning_rate": 2.8652075714060296e-06,
+      "loss": 0.1717,
+      "reward": 1.9021683931350708,
+      "reward_std": 0.8603525757789612,
+      "rewards/connected_reward": 1.1977040767669678,
+      "rewards/format_reward": 0.8571429252624512,
+      "rewards/tag_count_reward": 0.9196429252624512,
+      "step": 68
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 297.6607360839844,
+      "epoch": 0.01104,
+      "grad_norm": 102.03744506835938,
+      "kl": 0.88671875,
+      "learning_rate": 2.813333083910761e-06,
+      "loss": 0.2127,
+      "reward": 2.1446430683135986,
+      "reward_std": 0.40179580450057983,
+      "rewards/connected_reward": 1.3964285850524902,
+      "rewards/format_reward": 0.9285714626312256,
+      "rewards/tag_count_reward": 0.9464285969734192,
+      "step": 69
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 405.4285888671875,
+      "epoch": 0.0112,
+      "grad_norm": 617.213623046875,
+      "kl": 3.0625,
+      "learning_rate": 2.761321158169134e-06,
+      "loss": 0.5334,
+      "reward": 1.8612104654312134,
+      "reward_std": 0.869892418384552,
+      "rewards/connected_reward": 1.1478174924850464,
+      "rewards/format_reward": 0.8750000596046448,
+      "rewards/tag_count_reward": 0.9196429252624512,
+      "step": 70
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 276.8571472167969,
+      "epoch": 0.01136,
+      "grad_norm": 107.47020721435547,
+      "kl": 0.97265625,
+      "learning_rate": 2.70919460833079e-06,
+      "loss": 0.1194,
+      "reward": 2.099404811859131,
+      "reward_std": 0.595194935798645,
+      "rewards/connected_reward": 1.3690476417541504,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9464285969734192,
+      "step": 71
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 269.4821472167969,
+      "epoch": 0.01152,
+      "grad_norm": 60.09501266479492,
+      "kl": 0.77734375,
+      "learning_rate": 2.6569762988232838e-06,
+      "loss": 0.044,
+      "reward": 1.8062500953674316,
+      "reward_std": 0.6972793340682983,
+      "rewards/connected_reward": 1.0642857551574707,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9553571939468384,
+      "step": 72
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 262.46429443359375,
+      "epoch": 0.01168,
+      "grad_norm": 9.094280242919922,
+      "kl": 0.3046875,
+      "learning_rate": 2.604689134322999e-06,
+      "loss": -0.0869,
+      "reward": 1.8462797403335571,
+      "reward_std": 0.6357587575912476,
+      "rewards/connected_reward": 1.1056549549102783,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 73
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 253.87501525878906,
+      "epoch": 0.01184,
+      "grad_norm": 481.7274475097656,
+      "kl": 0.953125,
+      "learning_rate": 2.5523560497083927e-06,
+      "loss": 0.0369,
+      "reward": 2.012946605682373,
+      "reward_std": 0.9039466381072998,
+      "rewards/connected_reward": 1.2723214626312256,
+      "rewards/format_reward": 0.910714328289032,
+      "rewards/tag_count_reward": 0.9508929252624512,
+      "step": 74
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 237.75001525878906,
+      "epoch": 0.012,
+      "grad_norm": 213.04286193847656,
+      "kl": 0.390625,
+      "learning_rate": 2.5e-06,
+      "loss": -0.0586,
+      "reward": 2.0683035850524902,
+      "reward_std": 0.8664366006851196,
+      "rewards/connected_reward": 1.3571429252624512,
+      "rewards/format_reward": 0.8571429252624512,
+      "rewards/tag_count_reward": 0.941964328289032,
+      "step": 75
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 270.3571472167969,
+      "epoch": 0.01216,
+      "grad_norm": 52.30439376831055,
+      "kl": 0.419921875,
+      "learning_rate": 2.447643950291608e-06,
+      "loss": -0.061,
+      "reward": 1.9406251907348633,
+      "reward_std": 0.8179541826248169,
+      "rewards/connected_reward": 1.2116072177886963,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.941964328289032,
+      "step": 76
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 258.71429443359375,
+      "epoch": 0.01232,
+      "grad_norm": 346.89080810546875,
+      "kl": 0.39453125,
+      "learning_rate": 2.3953108656770018e-06,
+      "loss": -0.0469,
+      "reward": 2.028571367263794,
+      "reward_std": 0.7005811333656311,
+      "rewards/connected_reward": 1.298214316368103,
+      "rewards/format_reward": 0.8928571939468384,
+      "rewards/tag_count_reward": 0.9464285969734192,
+      "step": 77
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 303.1964416503906,
+      "epoch": 0.01248,
+      "grad_norm": 469.85699462890625,
+      "kl": 1.015625,
+      "learning_rate": 2.3430237011767166e-06,
+      "loss": 0.149,
+      "reward": 1.832887053489685,
+      "reward_std": 0.8886847496032715,
+      "rewards/connected_reward": 1.1413692235946655,
+      "rewards/format_reward": 0.8392857313156128,
+      "rewards/tag_count_reward": 0.9062500596046448,
+      "step": 78
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 303.08929443359375,
+      "epoch": 0.01264,
+      "grad_norm": 2587.5205078125,
+      "kl": 2.0625,
+      "learning_rate": 2.290805391669212e-06,
+      "loss": 0.1151,
+      "reward": 1.914987325668335,
+      "reward_std": 0.6788126230239868,
+      "rewards/connected_reward": 1.31632661819458,
+      "rewards/format_reward": 0.6964285969734192,
+      "rewards/tag_count_reward": 0.8348214626312256,
+      "step": 79
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 290.64288330078125,
+      "epoch": 0.0128,
+      "grad_norm": 576.2128295898438,
+      "kl": 0.44921875,
+      "learning_rate": 2.238678841830867e-06,
+      "loss": 0.0522,
+      "reward": 1.7459821701049805,
+      "reward_std": 0.8939936757087708,
+      "rewards/connected_reward": 1.2142857313156128,
+      "rewards/format_reward": 0.5892857313156128,
+      "rewards/tag_count_reward": 0.7901785969734192,
+      "step": 80
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 262.1785888671875,
+      "epoch": 0.01296,
+      "grad_norm": 227.37603759765625,
+      "kl": 0.6640625,
+      "learning_rate": 2.186666916089239e-06,
+      "loss": 0.0319,
+      "reward": 1.686011791229248,
+      "reward_std": 0.7109549641609192,
+      "rewards/connected_reward": 1.2485120296478271,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.7142857313156128,
+      "step": 81
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 321.1071472167969,
+      "epoch": 0.01312,
+      "grad_norm": 1797.874755859375,
+      "kl": 1.859375,
+      "learning_rate": 2.134792428593971e-06,
+      "loss": 0.2395,
+      "reward": 1.648958444595337,
+      "reward_std": 0.9454951286315918,
+      "rewards/connected_reward": 1.2708333730697632,
+      "rewards/format_reward": 0.3571428656578064,
+      "rewards/tag_count_reward": 0.6651785969734192,
+      "step": 82
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 318.3035888671875,
+      "epoch": 0.01328,
+      "grad_norm": 911.9273681640625,
+      "kl": 2.53125,
+      "learning_rate": 2.0830781332097446e-06,
+      "loss": 0.1478,
+      "reward": 1.5819941759109497,
+      "reward_std": 0.9505893588066101,
+      "rewards/connected_reward": 1.2172619104385376,
+      "rewards/format_reward": 0.3571428656578064,
+      "rewards/tag_count_reward": 0.6205357313156128,
+      "step": 83
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 304.875,
+      "epoch": 0.01344,
+      "grad_norm": 2088.1962890625,
+      "kl": 5.3125,
+      "learning_rate": 2.031546713535688e-06,
+      "loss": 0.3074,
+      "reward": 1.2762542963027954,
+      "reward_std": 0.9391399025917053,
+      "rewards/connected_reward": 0.9124150276184082,
+      "rewards/format_reward": 0.3392857313156128,
+      "rewards/tag_count_reward": 0.6473214626312256,
+      "step": 84
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 316.4464416503906,
+      "epoch": 0.0136,
+      "grad_norm": 549.3711547851562,
+      "kl": 1.71875,
+      "learning_rate": 1.9802207729556023e-06,
+      "loss": 0.172,
+      "reward": 1.3764880895614624,
+      "reward_std": 0.7936123609542847,
+      "rewards/connected_reward": 0.9791667461395264,
+      "rewards/format_reward": 0.392857164144516,
+      "rewards/tag_count_reward": 0.6696428656578064,
+      "step": 85
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 285.26788330078125,
+      "epoch": 0.01376,
+      "grad_norm": 483.4754943847656,
+      "kl": 0.6953125,
+      "learning_rate": 1.9291228247233607e-06,
+      "loss": 0.0362,
+      "reward": 1.3629463911056519,
+      "reward_std": 0.7181650400161743,
+      "rewards/connected_reward": 1.0598214864730835,
+      "rewards/format_reward": 0.25,
+      "rewards/tag_count_reward": 0.59375,
+      "step": 86
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 289.4821472167969,
+      "epoch": 0.01392,
+      "grad_norm": 431.3416442871094,
+      "kl": 0.71875,
+      "learning_rate": 1.8782752820878636e-06,
+      "loss": 0.1296,
+      "reward": 1.6370537281036377,
+      "reward_std": 0.7323834896087646,
+      "rewards/connected_reward": 1.3392857313156128,
+      "rewards/format_reward": 0.25,
+      "rewards/tag_count_reward": 0.5758928656578064,
+      "step": 87
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 272.3035888671875,
+      "epoch": 0.01408,
+      "grad_norm": 178.10662841796875,
+      "kl": 0.88671875,
+      "learning_rate": 1.827700448461836e-06,
+      "loss": 0.0167,
+      "reward": 1.5172619819641113,
+      "reward_std": 0.9202746152877808,
+      "rewards/connected_reward": 1.202529788017273,
+      "rewards/format_reward": 0.267857164144516,
+      "rewards/tag_count_reward": 0.6026785969734192,
+      "step": 88
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 253.57144165039062,
+      "epoch": 0.01424,
+      "grad_norm": 58.177215576171875,
+      "kl": 2.171875,
+      "learning_rate": 1.7774205076388207e-06,
+      "loss": 0.0,
+      "reward": 0.9139881730079651,
+      "reward_std": 0.7583447098731995,
+      "rewards/connected_reward": 0.77827388048172,
+      "rewards/format_reward": 0.0357142873108387,
+      "rewards/tag_count_reward": 0.392857164144516,
+      "step": 89
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 276.6071472167969,
+      "epoch": 0.0144,
+      "grad_norm": 205.79637145996094,
+      "kl": 1.6484375,
+      "learning_rate": 1.7274575140626318e-06,
+      "loss": 0.0985,
+      "reward": 1.4693453311920166,
+      "reward_std": 1.1510217189788818,
+      "rewards/connected_reward": 1.0863096714019775,
+      "rewards/format_reward": 0.3750000298023224,
+      "rewards/tag_count_reward": 0.6517857313156128,
+      "step": 90
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 303.625,
+      "epoch": 0.01456,
+      "grad_norm": 187.00259399414062,
+      "kl": 3.203125,
+      "learning_rate": 1.677833383153542e-06,
+      "loss": 0.1504,
+      "reward": 0.8815476894378662,
+      "reward_std": 0.8259068131446838,
+      "rewards/connected_reward": 0.7217261791229248,
+      "rewards/format_reward": 0.0892857164144516,
+      "rewards/tag_count_reward": 0.3839285969734192,
+      "step": 91
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 280.625,
+      "epoch": 0.01472,
+      "grad_norm": 47.36444854736328,
+      "kl": 1.5078125,
+      "learning_rate": 1.6285698816954626e-06,
+      "loss": -0.0548,
+      "reward": 1.3680061101913452,
+      "reward_std": 0.9850864410400391,
+      "rewards/connected_reward": 1.0461310148239136,
+      "rewards/format_reward": 0.3035714328289032,
+      "rewards/tag_count_reward": 0.566964328289032,
+      "step": 92
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 322.5357360839844,
+      "epoch": 0.01488,
+      "grad_norm": 84.1014633178711,
+      "kl": 1.859375,
+      "learning_rate": 1.5796886182883053e-06,
+      "loss": -0.08,
+      "reward": 1.18556547164917,
+      "reward_std": 0.9476414918899536,
+      "rewards/connected_reward": 0.9136905670166016,
+      "rewards/format_reward": 0.2142857313156128,
+      "rewards/tag_count_reward": 0.5491071939468384,
+      "step": 93
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 296.89288330078125,
+      "epoch": 0.01504,
+      "grad_norm": 2105.967041015625,
+      "kl": 14.9375,
+      "learning_rate": 1.5312110338697427e-06,
+      "loss": 0.4129,
+      "reward": 1.0491071939468384,
+      "reward_std": 0.9280386567115784,
+      "rewards/connected_reward": 0.8214285969734192,
+      "rewards/format_reward": 0.1607142984867096,
+      "rewards/tag_count_reward": 0.4910714626312256,
+      "step": 94
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 268.8214416503906,
+      "epoch": 0.0152,
+      "grad_norm": 181.77145385742188,
+      "kl": 3.625,
+      "learning_rate": 1.4831583923105e-06,
+      "loss": 0.1499,
+      "reward": 1.3781250715255737,
+      "reward_std": 0.7101474404335022,
+      "rewards/connected_reward": 1.1294643878936768,
+      "rewards/format_reward": 0.1785714328289032,
+      "rewards/tag_count_reward": 0.53125,
+      "step": 95
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 282.96429443359375,
+      "epoch": 0.01536,
+      "grad_norm": 159.0343017578125,
+      "kl": 2.625,
+      "learning_rate": 1.4355517710873184e-06,
+      "loss": 0.0932,
+      "reward": 1.0690478086471558,
+      "reward_std": 0.6700408458709717,
+      "rewards/connected_reward": 0.8779762387275696,
+      "rewards/format_reward": 0.125,
+      "rewards/tag_count_reward": 0.4285714626312256,
+      "step": 96
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 273.39288330078125,
+      "epoch": 0.01552,
+      "grad_norm": 4359.7138671875,
+      "kl": 6.34375,
+      "learning_rate": 1.388412052037682e-06,
+      "loss": 0.348,
+      "reward": 1.5040180683135986,
+      "reward_std": 0.8597317337989807,
+      "rewards/connected_reward": 1.3125,
+      "rewards/format_reward": 0.1071428656578064,
+      "rewards/tag_count_reward": 0.4598214626312256,
+      "step": 97
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 411.46429443359375,
+      "epoch": 0.01568,
+      "grad_norm": 464.1128845214844,
+      "kl": 2.40625,
+      "learning_rate": 1.3417599122003464e-06,
+      "loss": 0.2797,
+      "reward": 1.3879464864730835,
+      "reward_std": 0.7324258089065552,
+      "rewards/connected_reward": 1.1473214626312256,
+      "rewards/format_reward": 0.1785714328289032,
+      "rewards/tag_count_reward": 0.504464328289032,
+      "step": 98
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 336.4464416503906,
+      "epoch": 0.01584,
+      "grad_norm": 185.98046875,
+      "kl": 1.4921875,
+      "learning_rate": 1.2956158147457116e-06,
+      "loss": 0.2559,
+      "reward": 1.360267996788025,
+      "reward_std": 0.7969329357147217,
+      "rewards/connected_reward": 1.2008929252624512,
+      "rewards/format_reward": 0.0535714328289032,
+      "rewards/tag_count_reward": 0.4419642984867096,
+      "step": 99
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 463.232177734375,
+      "epoch": 0.016,
+      "grad_norm": 105.13380432128906,
+      "kl": 2.3125,
+      "learning_rate": 1.2500000000000007e-06,
+      "loss": 0.3902,
+      "reward": 1.274553656578064,
+      "reward_std": 0.7874523997306824,
+      "rewards/connected_reward": 1.0973215103149414,
+      "rewards/format_reward": 0.0892857164144516,
+      "rewards/tag_count_reward": 0.4419642984867096,
+      "step": 100
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 479.0535888671875,
+      "epoch": 0.01616,
+      "grad_norm": 685.9603881835938,
+      "kl": 1.3515625,
+      "learning_rate": 1.204932476567175e-06,
+      "loss": 0.287,
+      "reward": 1.4331845045089722,
+      "reward_std": 0.7297948002815247,
+      "rewards/connected_reward": 1.2380951642990112,
+      "rewards/format_reward": 0.125,
+      "rewards/tag_count_reward": 0.4419642984867096,
+      "step": 101
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 527.8928833007812,
+      "epoch": 0.01632,
+      "grad_norm": 436.9136657714844,
+      "kl": 1.4609375,
+      "learning_rate": 1.160433012552508e-06,
+      "loss": 0.3846,
+      "reward": 1.3385417461395264,
+      "reward_std": 1.1538331508636475,
+      "rewards/connected_reward": 1.0684523582458496,
+      "rewards/format_reward": 0.2321428656578064,
+      "rewards/tag_count_reward": 0.5133928656578064,
+      "step": 102
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 310.4821472167969,
+      "epoch": 0.01648,
+      "grad_norm": 5630.6962890625,
+      "kl": 12.8125,
+      "learning_rate": 1.11652112689164e-06,
+      "loss": 1.1389,
+      "reward": 1.5440478324890137,
+      "reward_std": 0.8701499700546265,
+      "rewards/connected_reward": 1.351190447807312,
+      "rewards/format_reward": 0.1071428656578064,
+      "rewards/tag_count_reward": 0.4642857313156128,
+      "step": 103
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 379.3571472167969,
+      "epoch": 0.01664,
+      "grad_norm": 410.12548828125,
+      "kl": 1.9296875,
+      "learning_rate": 1.073216080788921e-06,
+      "loss": 0.3541,
+      "reward": 1.1357142925262451,
+      "reward_std": 0.719650149345398,
+      "rewards/connected_reward": 1.0,
+      "rewards/format_reward": 0.0357142873108387,
+      "rewards/tag_count_reward": 0.392857164144516,
+      "step": 104
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 333.51788330078125,
+      "epoch": 0.0168,
+      "grad_norm": 348.0798034667969,
+      "kl": 2.28125,
+      "learning_rate": 1.0305368692688175e-06,
+      "loss": 0.0754,
+      "reward": 1.1610119342803955,
+      "reward_std": 0.7080978751182556,
+      "rewards/connected_reward": 0.8940476179122925,
+      "rewards/format_reward": 0.196428582072258,
+      "rewards/tag_count_reward": 0.5625,
+      "step": 105
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 329.4464416503906,
+      "epoch": 0.01696,
+      "grad_norm": 1205.54443359375,
+      "kl": 0.62109375,
+      "learning_rate": 9.88502212844063e-07,
+      "loss": 0.0577,
+      "reward": 1.4229167699813843,
+      "reward_std": 0.8394676446914673,
+      "rewards/connected_reward": 1.159523844718933,
+      "rewards/format_reward": 0.1785714328289032,
+      "rewards/tag_count_reward": 0.5803571939468384,
+      "step": 106
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 376.3750305175781,
+      "epoch": 0.01712,
+      "grad_norm": 774.3217163085938,
+      "kl": 2.46875,
+      "learning_rate": 9.471305493042243e-07,
+      "loss": 0.3784,
+      "reward": 1.0742347240447998,
+      "reward_std": 0.9993730783462524,
+      "rewards/connected_reward": 0.8813775181770325,
+      "rewards/format_reward": 0.1071428656578064,
+      "rewards/tag_count_reward": 0.4642857313156128,
+      "step": 107
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 333.4821472167969,
+      "epoch": 0.01728,
+      "grad_norm": 157.8820343017578,
+      "kl": 1.671875,
+      "learning_rate": 9.064400256282757e-07,
+      "loss": 0.1336,
+      "reward": 1.3309524059295654,
+      "reward_std": 0.7281343340873718,
+      "rewards/connected_reward": 1.1220239400863647,
+      "rewards/format_reward": 0.1071428656578064,
+      "rewards/tag_count_reward": 0.5178571939468384,
+      "step": 108
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 389.08929443359375,
+      "epoch": 0.01744,
+      "grad_norm": 95.9675521850586,
+      "kl": 2.6875,
+      "learning_rate": 8.664484900247363e-07,
+      "loss": 0.269,
+      "reward": 1.4202382564544678,
+      "reward_std": 0.5553953051567078,
+      "rewards/connected_reward": 1.1175596714019775,
+      "rewards/format_reward": 0.267857164144516,
+      "rewards/tag_count_reward": 0.5625,
+      "step": 109
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 341.21429443359375,
+      "epoch": 0.0176,
+      "grad_norm": 478.20684814453125,
+      "kl": 3.71875,
+      "learning_rate": 8.271734841028553e-07,
+      "loss": 0.4583,
+      "reward": 1.6011906862258911,
+      "reward_std": 0.7328930497169495,
+      "rewards/connected_reward": 1.3565478324890137,
+      "rewards/format_reward": 0.1785714328289032,
+      "rewards/tag_count_reward": 0.5178571939468384,
+      "step": 110
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 362.71429443359375,
+      "epoch": 0.01776,
+      "grad_norm": 570.6087036132812,
+      "kl": 1.578125,
+      "learning_rate": 7.886322351782782e-07,
+      "loss": 0.2897,
+      "reward": 1.381250023841858,
+      "reward_std": 0.9994818568229675,
+      "rewards/connected_reward": 1.1053571701049805,
+      "rewards/format_reward": 0.2142857313156128,
+      "rewards/tag_count_reward": 0.5625,
+      "step": 111
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 411.64288330078125,
+      "epoch": 0.01792,
+      "grad_norm": 227.14622497558594,
+      "kl": 0.8984375,
+      "learning_rate": 7.508416487165862e-07,
+      "loss": 0.2723,
+      "reward": 1.3443453311920166,
+      "reward_std": 0.8701754808425903,
+      "rewards/connected_reward": 1.108630895614624,
+      "rewards/format_reward": 0.1607142984867096,
+      "rewards/tag_count_reward": 0.5178571939468384,
+      "step": 112
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 404.9464416503906,
+      "epoch": 0.01808,
+      "grad_norm": 133.9763641357422,
+      "kl": 0.7109375,
+      "learning_rate": 7.138183009179922e-07,
+      "loss": 0.1851,
+      "reward": 1.5876487493515015,
+      "reward_std": 0.7359150648117065,
+      "rewards/connected_reward": 1.194345235824585,
+      "rewards/format_reward": 0.392857164144516,
+      "rewards/tag_count_reward": 0.65625,
+      "step": 113
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 373.89288330078125,
+      "epoch": 0.01824,
+      "grad_norm": 73.8996810913086,
+      "kl": 1.4765625,
+      "learning_rate": 6.775784314464717e-07,
+      "loss": 0.3297,
+      "reward": 1.5834821462631226,
+      "reward_std": 0.7138813138008118,
+      "rewards/connected_reward": 1.3071428537368774,
+      "rewards/format_reward": 0.196428582072258,
+      "rewards/tag_count_reward": 0.59375,
+      "step": 114
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 355.51788330078125,
+      "epoch": 0.0184,
+      "grad_norm": 283.4354553222656,
+      "kl": 1.5078125,
+      "learning_rate": 6.421379363065142e-07,
+      "loss": 0.3945,
+      "reward": 1.6325894594192505,
+      "reward_std": 0.7952902913093567,
+      "rewards/connected_reward": 1.2625001668930054,
+      "rewards/format_reward": 0.3571428656578064,
+      "rewards/tag_count_reward": 0.6383928656578064,
+      "step": 115
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 580.7857666015625,
+      "epoch": 0.01856,
+      "grad_norm": 158.34906005859375,
+      "kl": 2.71875,
+      "learning_rate": 6.075123608706093e-07,
+      "loss": 0.432,
+      "reward": 1.3404762744903564,
+      "reward_std": 0.909478485584259,
+      "rewards/connected_reward": 1.0833333730697632,
+      "rewards/format_reward": 0.2142857313156128,
+      "rewards/tag_count_reward": 0.5,
+      "step": 116
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 327.875,
+      "epoch": 0.01872,
+      "grad_norm": 87.71063995361328,
+      "kl": 0.51953125,
+      "learning_rate": 5.737168930605272e-07,
+      "loss": 0.0656,
+      "reward": 1.6141369342803955,
+      "reward_std": 0.8700290322303772,
+      "rewards/connected_reward": 1.1404762268066406,
+      "rewards/format_reward": 0.5,
+      "rewards/tag_count_reward": 0.7455357313156128,
+      "step": 117
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 376.0000305175781,
+      "epoch": 0.01888,
+      "grad_norm": 83.34387969970703,
+      "kl": 2.625,
+      "learning_rate": 5.407663566854008e-07,
+      "loss": 0.4935,
+      "reward": 1.892113208770752,
+      "reward_std": 0.8559380769729614,
+      "rewards/connected_reward": 1.3943451642990112,
+      "rewards/format_reward": 0.5535714626312256,
+      "rewards/tag_count_reward": 0.7366071939468384,
+      "step": 118
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 435.9285888671875,
+      "epoch": 0.01904,
+      "grad_norm": 2181.889892578125,
+      "kl": 6.625,
+      "learning_rate": 5.086752049395094e-07,
+      "loss": 1.0453,
+      "reward": 1.2653273344039917,
+      "reward_std": 0.9918228387832642,
+      "rewards/connected_reward": 0.8970238566398621,
+      "rewards/format_reward": 0.3750000298023224,
+      "rewards/tag_count_reward": 0.6026785969734192,
+      "step": 119
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 430.7500305175781,
+      "epoch": 0.0192,
+      "grad_norm": 7002.36767578125,
+      "kl": 7.375,
+      "learning_rate": 4.774575140626317e-07,
+      "loss": 0.9658,
+      "reward": 1.498660683631897,
+      "reward_std": 0.9669275879859924,
+      "rewards/connected_reward": 1.0910714864730835,
+      "rewards/format_reward": 0.4107142984867096,
+      "rewards/tag_count_reward": 0.6741071939468384,
+      "step": 120
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 466.33929443359375,
+      "epoch": 0.01936,
+      "grad_norm": 2396.701416015625,
+      "kl": 6.78125,
+      "learning_rate": 4.4712697716573994e-07,
+      "loss": 0.6327,
+      "reward": 1.1019346714019775,
+      "reward_std": 0.9098219871520996,
+      "rewards/connected_reward": 0.7291667461395264,
+      "rewards/format_reward": 0.3571428656578064,
+      "rewards/tag_count_reward": 0.6473214626312256,
+      "step": 121
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 441.9464416503906,
+      "epoch": 0.01952,
+      "grad_norm": 18626.265625,
+      "kl": 37.75,
+      "learning_rate": 4.1769689822475147e-07,
+      "loss": 3.2228,
+      "reward": 1.485119104385376,
+      "reward_std": 1.0364267826080322,
+      "rewards/connected_reward": 1.0386905670166016,
+      "rewards/format_reward": 0.5178571939468384,
+      "rewards/tag_count_reward": 0.625,
+      "step": 122
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 389.2321472167969,
+      "epoch": 0.01968,
+      "grad_norm": 198.896728515625,
+      "kl": 2.34375,
+      "learning_rate": 3.891801862449629e-07,
+      "loss": 0.3945,
+      "reward": 1.3038691282272339,
+      "reward_std": 1.0375421047210693,
+      "rewards/connected_reward": 0.8851190805435181,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.6517857313156128,
+      "step": 123
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 429.5714416503906,
+      "epoch": 0.01984,
+      "grad_norm": 180.44760131835938,
+      "kl": 2.265625,
+      "learning_rate": 3.615893495987335e-07,
+      "loss": 0.4959,
+      "reward": 1.2294642925262451,
+      "reward_std": 0.82194584608078,
+      "rewards/connected_reward": 0.7562500238418579,
+      "rewards/format_reward": 0.5178571939468384,
+      "rewards/tag_count_reward": 0.7142857313156128,
+      "step": 124
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 418.4821472167969,
+      "epoch": 0.02,
+      "grad_norm": 222.1766815185547,
+      "kl": 1.484375,
+      "learning_rate": 3.3493649053890325e-07,
+      "loss": 0.4571,
+      "reward": 1.794196605682373,
+      "reward_std": 0.9758167862892151,
+      "rewards/connected_reward": 1.227678656578064,
+      "rewards/format_reward": 0.6428571939468384,
+      "rewards/tag_count_reward": 0.816964328289032,
+      "step": 125
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 437.33929443359375,
+      "epoch": 0.02016,
+      "grad_norm": 233.66514587402344,
+      "kl": 1.8828125,
+      "learning_rate": 3.092332998903416e-07,
+      "loss": 0.4417,
+      "reward": 1.347916841506958,
+      "reward_std": 0.7624654173851013,
+      "rewards/connected_reward": 0.8398810625076294,
+      "rewards/format_reward": 0.5714285969734192,
+      "rewards/tag_count_reward": 0.7410714626312256,
+      "step": 126
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 311.4285888671875,
+      "epoch": 0.02032,
+      "grad_norm": 183.92347717285156,
+      "kl": 2.0625,
+      "learning_rate": 2.844910519219632e-07,
+      "loss": 0.347,
+      "reward": 1.6866072416305542,
+      "reward_std": 1.0179650783538818,
+      "rewards/connected_reward": 1.1875,
+      "rewards/format_reward": 0.5535714626312256,
+      "rewards/tag_count_reward": 0.7410714626312256,
+      "step": 127
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 340.01788330078125,
+      "epoch": 0.02048,
+      "grad_norm": 557.9811401367188,
+      "kl": 8.25,
+      "learning_rate": 2.6072059940146775e-07,
+      "loss": 0.497,
+      "reward": 1.3771826028823853,
+      "reward_std": 1.0474642515182495,
+      "rewards/connected_reward": 0.8343254327774048,
+      "rewards/format_reward": 0.625,
+      "rewards/tag_count_reward": 0.7678571939468384,
+      "step": 128
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 359.33929443359375,
+      "epoch": 0.02064,
+      "grad_norm": 96.67167663574219,
+      "kl": 3.03125,
+      "learning_rate": 2.3793236883495164e-07,
+      "loss": 0.378,
+      "reward": 1.5348213911056519,
+      "reward_std": 0.820845365524292,
+      "rewards/connected_reward": 1.0187500715255737,
+      "rewards/format_reward": 0.5714285969734192,
+      "rewards/tag_count_reward": 0.7678571939468384,
+      "step": 129
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 404.6785888671875,
+      "epoch": 0.0208,
+      "grad_norm": 1207.408935546875,
+      "kl": 15.1875,
+      "learning_rate": 2.1613635589349756e-07,
+      "loss": 1.1967,
+      "reward": 1.7483631372451782,
+      "reward_std": 0.8727819323539734,
+      "rewards/connected_reward": 1.251488208770752,
+      "rewards/format_reward": 0.535714328289032,
+      "rewards/tag_count_reward": 0.7633928656578064,
+      "step": 130
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 550.1785888671875,
+      "epoch": 0.02096,
+      "grad_norm": 146.79945373535156,
+      "kl": 3.203125,
+      "learning_rate": 1.95342121028749e-07,
+      "loss": 0.4891,
+      "reward": 1.643898844718933,
+      "reward_std": 1.03229558467865,
+      "rewards/connected_reward": 1.2470238208770752,
+      "rewards/format_reward": 0.4107142984867096,
+      "rewards/tag_count_reward": 0.6383928656578064,
+      "step": 131
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 448.8214416503906,
+      "epoch": 0.02112,
+      "grad_norm": 201.84288024902344,
+      "kl": 2.734375,
+      "learning_rate": 1.7555878527937164e-07,
+      "loss": 0.437,
+      "reward": 1.5223214626312256,
+      "reward_std": 0.8164080381393433,
+      "rewards/connected_reward": 1.0410715341567993,
+      "rewards/format_reward": 0.5178571939468384,
+      "rewards/tag_count_reward": 0.7410714626312256,
+      "step": 132
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 445.96429443359375,
+      "epoch": 0.02128,
+      "grad_norm": 271.8717956542969,
+      "kl": 2.765625,
+      "learning_rate": 1.567950262702714e-07,
+      "loss": 0.7114,
+      "reward": 1.504166841506958,
+      "reward_std": 0.8897882103919983,
+      "rewards/connected_reward": 1.0604166984558105,
+      "rewards/format_reward": 0.4642857313156128,
+      "rewards/tag_count_reward": 0.7053571939468384,
+      "step": 133
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 549.607177734375,
+      "epoch": 0.02144,
+      "grad_norm": 262.3206787109375,
+      "kl": 3.6875,
+      "learning_rate": 1.3905907440629752e-07,
+      "loss": 0.6148,
+      "reward": 1.5785714387893677,
+      "reward_std": 0.9023001194000244,
+      "rewards/connected_reward": 1.133928656578064,
+      "rewards/format_reward": 0.4821428656578064,
+      "rewards/tag_count_reward": 0.6785714626312256,
+      "step": 134
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 370.26788330078125,
+      "epoch": 0.0216,
+      "grad_norm": 199.73208618164062,
+      "kl": 3.75,
+      "learning_rate": 1.223587092621162e-07,
+      "loss": 0.5675,
+      "reward": 1.5458333492279053,
+      "reward_std": 0.9446250796318054,
+      "rewards/connected_reward": 1.1110119819641113,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.7053571939468384,
+      "step": 135
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 455.6785888671875,
+      "epoch": 0.02176,
+      "grad_norm": 119.2493667602539,
+      "kl": 1.4765625,
+      "learning_rate": 1.067012561698319e-07,
+      "loss": 0.4643,
+      "reward": 1.4904762506484985,
+      "reward_std": 0.9661902189254761,
+      "rewards/connected_reward": 0.985119104385376,
+      "rewards/format_reward": 0.5714285969734192,
+      "rewards/tag_count_reward": 0.7321428656578064,
+      "step": 136
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 413.1607360839844,
+      "epoch": 0.02192,
+      "grad_norm": 189.31283569335938,
+      "kl": 1.5390625,
+      "learning_rate": 9.209358300585474e-08,
+      "loss": 0.5992,
+      "reward": 1.70119047164917,
+      "reward_std": 1.0350253582000732,
+      "rewards/connected_reward": 1.2422618865966797,
+      "rewards/format_reward": 0.5,
+      "rewards/tag_count_reward": 0.6964285969734192,
+      "step": 137
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 396.64288330078125,
+      "epoch": 0.02208,
+      "grad_norm": 312.1037292480469,
+      "kl": 3.28125,
+      "learning_rate": 7.854209717842231e-08,
+      "loss": 0.2851,
+      "reward": 1.2653274536132812,
+      "reward_std": 0.9041686654090881,
+      "rewards/connected_reward": 0.9273810386657715,
+      "rewards/format_reward": 0.3035714328289032,
+      "rewards/tag_count_reward": 0.6205357313156128,
+      "step": 138
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 428.6607360839844,
+      "epoch": 0.02224,
+      "grad_norm": 96.09481048583984,
+      "kl": 3.234375,
+      "learning_rate": 6.605274281709929e-08,
+      "loss": 0.6046,
+      "reward": 1.3367561101913452,
+      "reward_std": 0.9484272599220276,
+      "rewards/connected_reward": 1.021130919456482,
+      "rewards/format_reward": 0.2857142984867096,
+      "rewards/tag_count_reward": 0.5758928656578064,
+      "step": 139
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 427.8035888671875,
+      "epoch": 0.0224,
+      "grad_norm": 112.52371978759766,
+      "kl": 2.671875,
+      "learning_rate": 5.463099816548578e-08,
+      "loss": 0.4703,
+      "reward": 1.6272321939468384,
+      "reward_std": 0.9923813939094543,
+      "rewards/connected_reward": 1.2071430683135986,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.65625,
+      "step": 140
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 442.1607360839844,
+      "epoch": 0.02256,
+      "grad_norm": 116.0601577758789,
+      "kl": 3.25,
+      "learning_rate": 4.428187317827848e-08,
+      "loss": 0.5996,
+      "reward": 1.4907739162445068,
+      "reward_std": 0.8477200269699097,
+      "rewards/connected_reward": 1.0898810625076294,
+      "rewards/format_reward": 0.4107142984867096,
+      "rewards/tag_count_reward": 0.6517857313156128,
+      "step": 141
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 394.21429443359375,
+      "epoch": 0.02272,
+      "grad_norm": 1539.869384765625,
+      "kl": 6.03125,
+      "learning_rate": 3.5009907323737826e-08,
+      "loss": 1.3606,
+      "reward": 1.6227679252624512,
+      "reward_std": 1.0998399257659912,
+      "rewards/connected_reward": 1.0982143878936768,
+      "rewards/format_reward": 0.6071428656578064,
+      "rewards/tag_count_reward": 0.7366071939468384,
+      "step": 142
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 361.9285888671875,
+      "epoch": 0.02288,
+      "grad_norm": 60.153076171875,
+      "kl": 2.84375,
+      "learning_rate": 2.681916759252917e-08,
+      "loss": 0.4273,
+      "reward": 1.530272126197815,
+      "reward_std": 1.0186634063720703,
+      "rewards/connected_reward": 1.1356292963027954,
+      "rewards/format_reward": 0.392857164144516,
+      "rewards/tag_count_reward": 0.660714328289032,
+      "step": 143
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 420.0000305175781,
+      "epoch": 0.02304,
+      "grad_norm": 57.555294036865234,
+      "kl": 2.4375,
+      "learning_rate": 1.9713246713805588e-08,
+      "loss": 0.4188,
+      "reward": 1.5285714864730835,
+      "reward_std": 1.0183099508285522,
+      "rewards/connected_reward": 1.0892857313156128,
+      "rewards/format_reward": 0.4821428656578064,
+      "rewards/tag_count_reward": 0.660714328289032,
+      "step": 144
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 327.7321472167969,
+      "epoch": 0.0232,
+      "grad_norm": 92.85120391845703,
+      "kl": 1.3828125,
+      "learning_rate": 1.3695261579316776e-08,
+      "loss": 0.2957,
+      "reward": 1.6851191520690918,
+      "reward_std": 0.9654322266578674,
+      "rewards/connected_reward": 1.2351192235946655,
+      "rewards/format_reward": 0.4821428656578064,
+      "rewards/tag_count_reward": 0.6964285969734192,
+      "step": 145
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 374.5000305175781,
+      "epoch": 0.02336,
+      "grad_norm": 448.3329772949219,
+      "kl": 3.171875,
+      "learning_rate": 8.767851876239075e-09,
+      "loss": 0.5714,
+      "reward": 1.5226191282272339,
+      "reward_std": 0.7929543852806091,
+      "rewards/connected_reward": 1.0904762744903564,
+      "rewards/format_reward": 0.4464285969734192,
+      "rewards/tag_count_reward": 0.6964285969734192,
+      "step": 146
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 430.26788330078125,
+      "epoch": 0.02352,
+      "grad_norm": 171.38597106933594,
+      "kl": 2.390625,
+      "learning_rate": 4.933178929321103e-09,
+      "loss": 0.4569,
+      "reward": 1.3949406147003174,
+      "reward_std": 0.719992458820343,
+      "rewards/connected_reward": 0.985119104385376,
+      "rewards/format_reward": 0.4285714626312256,
+      "rewards/tag_count_reward": 0.6517857313156128,
+      "step": 147
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 370.5000305175781,
+      "epoch": 0.02368,
+      "grad_norm": 80.20691680908203,
+      "kl": 1.7421875,
+      "learning_rate": 2.192924752854042e-09,
+      "loss": 0.2907,
+      "reward": 1.4619261026382446,
+      "reward_std": 0.9718099236488342,
+      "rewards/connected_reward": 1.0248724222183228,
+      "rewards/format_reward": 0.4642857313156128,
+      "rewards/tag_count_reward": 0.6830357313156128,
+      "step": 148
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 345.5714416503906,
+      "epoch": 0.02384,
+      "grad_norm": 57.89230728149414,
+      "kl": 0.96875,
+      "learning_rate": 5.48291312886251e-10,
+      "loss": 0.3204,
+      "reward": 1.9949404001235962,
+      "reward_std": 0.6536844968795776,
+      "rewards/connected_reward": 1.5922620296478271,
+      "rewards/format_reward": 0.392857164144516,
+      "rewards/tag_count_reward": 0.6875000596046448,
+      "step": 149
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 481.607177734375,
+      "epoch": 0.024,
+      "grad_norm": 70.68065643310547,
+      "kl": 2.9375,
+      "learning_rate": 0.0,
+      "loss": 0.3935,
+      "reward": 1.4215773344039917,
+      "reward_std": 1.039550542831421,
+      "rewards/connected_reward": 1.1175596714019775,
+      "rewards/format_reward": 0.267857164144516,
+      "rewards/tag_count_reward": 0.566964328289032,
+      "step": 150
+    },
+    {
+      "epoch": 0.024,
+      "step": 150,
+      "total_flos": 0.0,
+      "train_loss": 0.31898806129537965,
+      "train_runtime": 4133.207,
+      "train_samples_per_second": 2.032,
+      "train_steps_per_second": 0.036
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 150,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}