weltonwang88 committed on
Commit
798f7cc
·
verified ·
1 Parent(s): 0ea5745

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+ library_name: transformers
+ model_name: Qwen2.5-1.5B-Open-R1-GRPO-cot-v3
+ tags:
+ - generated_from_trainer
+ - trl
+ - grpo
+ licence: license
+ ---
+
+ # Model Card for Qwen2.5-1.5B-Open-R1-GRPO-cot-v3
+
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="weltonwang88/Qwen2.5-1.5B-Open-R1-GRPO-cot-v3", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/weltonwang88-stanford/huggingface/runs/ef81zz98)
+
+
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
+ ### Framework versions
+
+ - TRL: 0.16.0.dev0
+ - Transformers: 4.50.0.dev0
+ - Pytorch: 2.5.1+cu121
+ - Datasets: 3.3.2
+ - Tokenizers: 0.21.1
+
+ ## Citations
+
+ Cite GRPO as:
+
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+ year = 2024,
+ eprint = {arXiv:2402.03300},
+ }
+
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+ title = {{TRL: Transformer Reinforcement Learning}},
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+ year = 2020,
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
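The GRPO objective referenced in the model card scores each completion relative to the other completions sampled for the same prompt. As a rough illustration of that group-relative baseline (a minimal stdlib sketch, not TRL's actual implementation, which also handles epsilon smoothing and batching):

```python
import statistics

def group_relative_advantages(rewards):
    """Normalize each reward against the mean and (population) std of its
    sampled group, as in GRPO's group-relative baseline. Returns zeros when
    every completion in the group earned the same reward."""
    mean = statistics.mean(rewards)
    std = statistics.pstdev(rewards)
    if std == 0:
        return [0.0] * len(rewards)
    return [(r - mean) / std for r in rewards]

# Example: four sampled completions for one prompt, two of which were correct.
advs = group_relative_advantages([1.0, 0.0, 0.0, 1.0])
print(advs)  # [1.0, -1.0, -1.0, 1.0]
```

The zero-mean advantages are what replace a learned value baseline in GRPO; completions above the group average are reinforced, those below are penalized.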
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "total_flos": 0.0,
+ "train_loss": 0.6775813137989773,
+ "train_runtime": 20674.7257,
+ "train_samples": 50,
+ "train_samples_per_second": 0.087,
+ "train_steps_per_second": 0.005
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 151646,
+ "do_sample": true,
+ "eos_token_id": 151643,
+ "temperature": 0.6,
+ "top_p": 0.95,
+ "transformers_version": "4.50.0.dev0"
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "total_flos": 0.0,
+ "train_loss": 0.6775813137989773,
+ "train_runtime": 20674.7257,
+ "train_samples": 50,
+ "train_samples_per_second": 0.087,
+ "train_steps_per_second": 0.005
+ }
trainer_state.json ADDED
@@ -0,0 +1,1229 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.9955555555555557,
+ "eval_steps": 30,
+ "global_step": 112,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1992.2813301086426,
+ "epoch": 0.035555555555555556,
+ "grad_norm": 0.11222778239149554,
+ "kl": 0.0,
+ "learning_rate": 1.6666666666666667e-06,
+ "loss": 0.0509,
+ "reward": -8.534029252827168,
+ "reward_std": 3.1286671087145805,
+ "rewards/cot_length_penalty_reward": -8.792958237230778,
+ "rewards/math_latex_accuracy_reward": 0.2589285857975483,
+ "step": 1
+ },
+ {
+ "clip_ratio": 0.0,
+ "epoch": 0.07111111111111111,
+ "grad_norm": 0.11224900964736327,
+ "kl": 0.0,
+ "learning_rate": 3.3333333333333333e-06,
+ "loss": 0.0509,
+ "step": 2
+ },
+ {
+ "clip_ratio": 0.002639908329001628,
+ "epoch": 0.10666666666666667,
+ "grad_norm": 0.11190265883625335,
+ "kl": 0.0004132986068725586,
+ "learning_rate": 5e-06,
+ "loss": 0.051,
+ "step": 3
+ },
+ {
+ "clip_ratio": 0.0026859724457608536,
+ "epoch": 0.14222222222222222,
+ "grad_norm": 0.10842287053917446,
+ "kl": 0.00042808055877685547,
+ "learning_rate": 6.666666666666667e-06,
+ "loss": 0.0506,
+ "step": 4
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2354.6050567626953,
+ "epoch": 0.17777777777777778,
+ "grad_norm": 0.11660083282365401,
+ "kl": 0.0005452632904052734,
+ "learning_rate": 8.333333333333334e-06,
+ "loss": 0.0532,
+ "reward": -9.576663568615913,
+ "reward_std": 3.4961936213076115,
+ "rewards/cot_length_penalty_reward": -9.817734986543655,
+ "rewards/math_latex_accuracy_reward": 0.24107144074514508,
+ "step": 5
+ },
+ {
+ "clip_ratio": 0.004271271725883707,
+ "epoch": 0.21333333333333335,
+ "grad_norm": 0.15124528039054966,
+ "kl": 0.0023946762084960938,
+ "learning_rate": 1e-05,
+ "loss": 0.0521,
+ "step": 6
+ },
+ {
+ "clip_ratio": 0.00593576196115464,
+ "epoch": 0.24888888888888888,
+ "grad_norm": 0.22845939154064746,
+ "kl": 0.0017180442810058594,
+ "learning_rate": 1.1666666666666668e-05,
+ "loss": 0.0519,
+ "step": 7
+ },
+ {
+ "clip_ratio": 0.00681446076487191,
+ "epoch": 0.28444444444444444,
+ "grad_norm": 0.25815482225017694,
+ "kl": 0.002631664276123047,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 0.0484,
+ "step": 8
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2251.4844703674316,
+ "epoch": 0.32,
+ "grad_norm": 0.11515864264586664,
+ "kl": 0.0024547576904296875,
+ "learning_rate": 1.5000000000000002e-05,
+ "loss": 0.0261,
+ "reward": -8.878705874085426,
+ "reward_std": 3.5140193179249763,
+ "rewards/cot_length_penalty_reward": -9.128705888986588,
+ "rewards/math_latex_accuracy_reward": 0.2500000149011612,
+ "step": 9
+ },
+ {
+ "clip_ratio": 0.0070763813419034705,
+ "epoch": 0.35555555555555557,
+ "grad_norm": 0.2974695944541913,
+ "kl": 0.005417823791503906,
+ "learning_rate": 1.6666666666666667e-05,
+ "loss": 0.0255,
+ "step": 10
+ },
+ {
+ "clip_ratio": 0.010105093329912052,
+ "epoch": 0.39111111111111113,
+ "grad_norm": 85.04432926044146,
+ "kl": 0.007180213928222656,
+ "learning_rate": 1.8333333333333333e-05,
+ "loss": 19.5224,
+ "step": 11
+ },
+ {
+ "clip_ratio": 0.017627036664634943,
+ "epoch": 0.4266666666666667,
+ "grad_norm": 2.432235493998052,
+ "kl": 0.0702056884765625,
+ "learning_rate": 2e-05,
+ "loss": 0.0247,
+ "step": 12
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1870.0737648010254,
+ "epoch": 0.4622222222222222,
+ "grad_norm": 0.32693917550649865,
+ "kl": 0.022480010986328125,
+ "learning_rate": 1.9995065603657317e-05,
+ "loss": 0.0103,
+ "reward": -9.444286078214645,
+ "reward_std": 3.207320176064968,
+ "rewards/cot_length_penalty_reward": -9.848303943872452,
+ "rewards/math_latex_accuracy_reward": 0.4040178805589676,
+ "step": 13
+ },
+ {
+ "clip_ratio": 0.004404508654261008,
+ "epoch": 0.49777777777777776,
+ "grad_norm": 1.6311442510668763,
+ "kl": 0.010528564453125,
+ "learning_rate": 1.9980267284282718e-05,
+ "loss": 0.0093,
+ "step": 14
+ },
+ {
+ "clip_ratio": 0.0062026621017139405,
+ "epoch": 0.5333333333333333,
+ "grad_norm": 0.5615009008596469,
+ "kl": 0.051082611083984375,
+ "learning_rate": 1.99556196460308e-05,
+ "loss": 0.0076,
+ "step": 15
+ },
+ {
+ "clip_ratio": 0.006701507809339091,
+ "epoch": 0.5688888888888889,
+ "grad_norm": 0.15171374489080983,
+ "kl": 0.018802642822265625,
+ "learning_rate": 1.9921147013144782e-05,
+ "loss": 0.0041,
+ "step": 16
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2222.4309005737305,
+ "epoch": 0.6044444444444445,
+ "grad_norm": 0.11283429637288535,
+ "kl": 0.01442718505859375,
+ "learning_rate": 1.9876883405951378e-05,
+ "loss": 0.081,
+ "reward": -9.554554164409637,
+ "reward_std": 4.235630825161934,
+ "rewards/cot_length_penalty_reward": -9.844732716679573,
+ "rewards/math_latex_accuracy_reward": 0.29017858393490314,
+ "step": 17
+ },
+ {
+ "clip_ratio": 0.004120954225072637,
+ "epoch": 0.64,
+ "grad_norm": 0.10899171004441378,
+ "kl": 0.015628814697265625,
+ "learning_rate": 1.982287250728689e-05,
+ "loss": 0.2482,
+ "step": 18
+ },
+ {
+ "clip_ratio": 0.005419444481958635,
+ "epoch": 0.6755555555555556,
+ "grad_norm": 0.1157331857796372,
+ "kl": 0.01764678955078125,
+ "learning_rate": 1.9759167619387474e-05,
+ "loss": 0.2459,
+ "step": 19
+ },
+ {
+ "clip_ratio": 0.005958295805612579,
+ "epoch": 0.7111111111111111,
+ "grad_norm": 0.10652991525129403,
+ "kl": 0.01905059814453125,
+ "learning_rate": 1.9685831611286312e-05,
+ "loss": 0.2434,
+ "step": 20
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2501.2255668640137,
+ "epoch": 0.7466666666666667,
+ "grad_norm": 0.12065925342674215,
+ "kl": 0.020366668701171875,
+ "learning_rate": 1.9602936856769432e-05,
+ "loss": 0.033,
+ "reward": -11.581663489341736,
+ "reward_std": 4.305310405790806,
+ "rewards/cot_length_penalty_reward": -11.86737784743309,
+ "rewards/math_latex_accuracy_reward": 0.2857142973225564,
+ "step": 21
+ },
+ {
+ "clip_ratio": 0.0039043642027536407,
+ "epoch": 0.7822222222222223,
+ "grad_norm": 0.36463686705700576,
+ "kl": 0.019252777099609375,
+ "learning_rate": 1.9510565162951538e-05,
+ "loss": 0.0325,
+ "step": 22
+ },
+ {
+ "clip_ratio": 0.005750590149546042,
+ "epoch": 0.8177777777777778,
+ "grad_norm": 20283.4909421177,
+ "kl": 1147.058982849121,
+ "learning_rate": 1.9408807689542257e-05,
+ "loss": 46.027,
+ "step": 23
+ },
+ {
+ "clip_ratio": 0.008487990504363552,
+ "epoch": 0.8533333333333334,
+ "grad_norm": 0.17175661916148474,
+ "kl": 0.026885986328125,
+ "learning_rate": 1.9297764858882516e-05,
+ "loss": 0.0289,
+ "step": 24
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2333.062614440918,
+ "epoch": 0.8888888888888888,
+ "grad_norm": 0.10364225115067384,
+ "kl": 0.0201416015625,
+ "learning_rate": 1.9177546256839814e-05,
+ "loss": 0.0113,
+ "reward": -10.619498401880264,
+ "reward_std": 3.768970273435116,
+ "rewards/cot_length_penalty_reward": -10.869498312473297,
+ "rewards/math_latex_accuracy_reward": 0.2500000123400241,
+ "step": 25
+ },
+ {
+ "clip_ratio": 0.0032545153953833506,
+ "epoch": 0.9244444444444444,
+ "grad_norm": 0.10471903488067007,
+ "kl": 0.02140045166015625,
+ "learning_rate": 1.9048270524660197e-05,
+ "loss": 0.0103,
+ "step": 26
+ },
+ {
+ "clip_ratio": 0.004354664255515672,
+ "epoch": 0.96,
+ "grad_norm": 0.0965546219660842,
+ "kl": 0.0223541259765625,
+ "learning_rate": 1.891006524188368e-05,
+ "loss": 0.0085,
+ "step": 27
+ },
+ {
+ "clip_ratio": 0.005275880845147185,
+ "epoch": 0.9955555555555555,
+ "grad_norm": 0.10544368470129743,
+ "kl": 0.023712158203125,
+ "learning_rate": 1.8763066800438638e-05,
+ "loss": 0.0065,
+ "step": 28
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2427.296974182129,
+ "epoch": 1.0355555555555556,
+ "grad_norm": 0.42983301720943096,
+ "kl": 0.03394317626953125,
+ "learning_rate": 1.860742027003944e-05,
+ "loss": 0.0039,
+ "reward": -11.214900106191635,
+ "reward_std": 3.760936316102743,
+ "rewards/cot_length_penalty_reward": -11.5006143450737,
+ "rewards/math_latex_accuracy_reward": 0.285714297555387,
+ "step": 29
+ },
+ {
+ "epoch": 1.0711111111111111,
+ "grad_norm": 0.1067773530257121,
+ "learning_rate": 1.8443279255020153e-05,
+ "loss": 0.0071,
+ "step": 30
+ },
+ {
+ "epoch": 1.0711111111111111,
+ "eval_clip_ratio": 0.0,
+ "eval_completion_length": 2303.7637939453125,
+ "eval_kl": 0.025606595552884616,
+ "eval_loss": 0.04365207254886627,
+ "eval_reward": -8.784464891140278,
+ "eval_reward_std": 3.7600448498359094,
+ "eval_rewards/cot_length_penalty_reward": -9.116882379238422,
+ "eval_rewards/math_latex_accuracy_reward": 0.3324175958450024,
+ "eval_runtime": 448.0952,
+ "eval_samples_per_second": 0.112,
+ "eval_steps_per_second": 0.004,
+ "step": 30
+ },
+ {
+ "clip_ratio": 0.0037656883359886706,
+ "epoch": 1.1066666666666667,
+ "grad_norm": 0.7346983824268155,
+ "kl": 0.027835845947265625,
+ "learning_rate": 1.827080574274562e-05,
+ "loss": 0.0033,
+ "step": 31
+ },
+ {
+ "clip_ratio": 0.006145871157059446,
+ "epoch": 1.1422222222222222,
+ "grad_norm": 11.473259751864658,
+ "kl": 1.4422760009765625,
+ "learning_rate": 1.8090169943749477e-05,
+ "loss": 0.0549,
+ "step": 32
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2531.1674995422363,
+ "epoch": 1.1777777777777778,
+ "grad_norm": 0.11984100416609303,
+ "kl": 0.03124237060546875,
+ "learning_rate": 1.7901550123756906e-05,
+ "loss": 0.0239,
+ "reward": -8.361417755484581,
+ "reward_std": 4.016169548034668,
+ "rewards/cot_length_penalty_reward": -8.689542889595032,
+ "rewards/math_latex_accuracy_reward": 0.3281250139698386,
+ "step": 33
+ },
+ {
+ "clip_ratio": 0.004058451057062484,
+ "epoch": 1.2133333333333334,
+ "grad_norm": 0.16684935043495766,
+ "kl": 0.03450775146484375,
+ "learning_rate": 1.7705132427757895e-05,
+ "loss": 0.0232,
+ "step": 34
+ },
+ {
+ "clip_ratio": 0.006084064312744886,
+ "epoch": 1.248888888888889,
+ "grad_norm": 0.11163561617870978,
+ "kl": 0.0318145751953125,
+ "learning_rate": 1.7501110696304598e-05,
+ "loss": 0.0214,
+ "step": 35
+ },
+ {
+ "clip_ratio": 0.007263028150191531,
+ "epoch": 1.2844444444444445,
+ "grad_norm": 0.11128954549532989,
+ "kl": 0.03281402587890625,
+ "learning_rate": 1.7289686274214116e-05,
+ "loss": 0.0197,
+ "step": 36
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1888.4487342834473,
+ "epoch": 1.32,
+ "grad_norm": 0.6837402463902799,
+ "kl": 0.05413055419921875,
+ "learning_rate": 1.7071067811865477e-05,
+ "loss": 0.1151,
+ "reward": -7.783694684505463,
+ "reward_std": 3.098730646073818,
+ "rewards/cot_length_penalty_reward": -8.16985534131527,
+ "rewards/math_latex_accuracy_reward": 0.3861607341095805,
+ "step": 37
+ },
+ {
+ "clip_ratio": 0.0028788788622478023,
+ "epoch": 1.3555555555555556,
+ "grad_norm": 2.464864347264584,
+ "kl": 0.04084014892578125,
+ "learning_rate": 1.684547105928689e-05,
+ "loss": 0.3644,
+ "step": 38
+ },
+ {
+ "clip_ratio": 0.004996606716304086,
+ "epoch": 1.3911111111111112,
+ "grad_norm": 0.32236476835294475,
+ "kl": 0.04229736328125,
+ "learning_rate": 1.661311865323652e-05,
+ "loss": 0.1132,
+ "step": 39
+ },
+ {
+ "clip_ratio": 0.005848184140631929,
+ "epoch": 1.4266666666666667,
+ "grad_norm": 2.236113570550632,
+ "kl": 0.180694580078125,
+ "learning_rate": 1.63742398974869e-05,
+ "loss": 0.116,
+ "step": 40
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1681.7277793884277,
+ "epoch": 1.462222222222222,
+ "grad_norm": 55.91487468875501,
+ "kl": 1.1835174560546875,
+ "learning_rate": 1.6129070536529767e-05,
+ "loss": 0.0918,
+ "reward": -7.822701282799244,
+ "reward_std": 2.5297958850860596,
+ "rewards/cot_length_penalty_reward": -8.191005058586597,
+ "rewards/math_latex_accuracy_reward": 0.3683035862632096,
+ "step": 41
+ },
+ {
+ "clip_ratio": 0.003095990905421786,
+ "epoch": 1.4977777777777779,
+ "grad_norm": 3454.9772869499748,
+ "kl": 0.0470123291015625,
+ "learning_rate": 1.5877852522924733e-05,
+ "loss": 6.8427,
+ "step": 42
+ },
+ {
+ "clip_ratio": 0.005094703097711317,
+ "epoch": 1.5333333333333332,
+ "grad_norm": 15.584761637863307,
+ "kl": 1.0414886474609375,
+ "learning_rate": 1.5620833778521306e-05,
+ "loss": 0.0866,
+ "step": 43
+ },
+ {
+ "clip_ratio": 0.008272722363471985,
+ "epoch": 1.568888888888889,
+ "grad_norm": 1.2550063449844295,
+ "kl": 0.04656219482421875,
+ "learning_rate": 1.5358267949789968e-05,
+ "loss": 0.0502,
+ "step": 44
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2313.542510986328,
+ "epoch": 1.6044444444444443,
+ "grad_norm": 0.1122939508940078,
+ "kl": 0.038604736328125,
+ "learning_rate": 1.5090414157503715e-05,
+ "loss": 0.0762,
+ "reward": -9.223605461418629,
+ "reward_std": 3.827972359955311,
+ "rewards/cot_length_penalty_reward": -9.58521255850792,
+ "rewards/math_latex_accuracy_reward": 0.36160715692676604,
+ "step": 45
+ },
+ {
+ "clip_ratio": 0.0038211173960007727,
+ "epoch": 1.6400000000000001,
+ "grad_norm": 0.12378389214572502,
+ "kl": 0.04041290283203125,
+ "learning_rate": 1.4817536741017153e-05,
+ "loss": 0.0756,
+ "step": 46
+ },
+ {
+ "clip_ratio": 0.005922177180764265,
+ "epoch": 1.6755555555555555,
+ "grad_norm": 0.12771635689812005,
+ "kl": 0.04157257080078125,
+ "learning_rate": 1.4539904997395468e-05,
+ "loss": 0.0745,
+ "step": 47
+ },
+ {
+ "clip_ratio": 0.006733638554578647,
+ "epoch": 1.7111111111111112,
+ "grad_norm": 0.10941185512569215,
+ "kl": 0.04193115234375,
+ "learning_rate": 1.4257792915650728e-05,
+ "loss": 0.0731,
+ "step": 48
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1813.1072387695312,
+ "epoch": 1.7466666666666666,
+ "grad_norm": 3.0104818633298525,
+ "kl": 0.2118988037109375,
+ "learning_rate": 1.3971478906347806e-05,
+ "loss": -0.0205,
+ "reward": -10.289654642343521,
+ "reward_std": 3.131831008940935,
+ "rewards/cot_length_penalty_reward": -10.762868821620941,
+ "rewards/math_latex_accuracy_reward": 0.47321430779993534,
+ "step": 49
+ },
+ {
+ "clip_ratio": 0.002227201643108856,
+ "epoch": 1.7822222222222224,
+ "grad_norm": 0.11902180384589063,
+ "kl": 0.04238128662109375,
+ "learning_rate": 1.3681245526846782e-05,
+ "loss": -0.0276,
+ "step": 50
+ },
+ {
+ "clip_ratio": 0.0032200364221353084,
+ "epoch": 1.8177777777777777,
+ "grad_norm": 0.1256236704087802,
+ "kl": 0.04290008544921875,
+ "learning_rate": 1.3387379202452917e-05,
+ "loss": -0.0286,
+ "step": 51
+ },
+ {
+ "clip_ratio": 0.003942100578569807,
+ "epoch": 1.8533333333333335,
+ "grad_norm": 0.10119215538963353,
+ "kl": 0.0430450439453125,
+ "learning_rate": 1.3090169943749475e-05,
+ "loss": -0.03,
+ "step": 52
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2273.6608276367188,
+ "epoch": 1.8888888888888888,
+ "grad_norm": 0.17450385200895388,
+ "kl": 0.05017852783203125,
+ "learning_rate": 1.2789911060392295e-05,
+ "loss": 0.005,
+ "reward": -7.135257016867399,
+ "reward_std": 3.697649233043194,
+ "rewards/cot_length_penalty_reward": -7.5548999309539795,
+ "rewards/math_latex_accuracy_reward": 0.4196428684517741,
+ "step": 53
+ },
+ {
+ "clip_ratio": 0.002888819137297105,
+ "epoch": 1.9244444444444444,
+ "grad_norm": 0.10967403835477697,
+ "kl": 0.04810333251953125,
+ "learning_rate": 1.2486898871648552e-05,
+ "loss": 0.0038,
+ "step": 54
+ },
+ {
+ "clip_ratio": 0.005481840795255266,
+ "epoch": 1.96,
+ "grad_norm": 0.14987018246678602,
+ "kl": 0.05097198486328125,
+ "learning_rate": 1.2181432413965428e-05,
+ "loss": 0.0028,
+ "step": 55
+ },
+ {
+ "clip_ratio": 0.007852705341065302,
+ "epoch": 1.9955555555555555,
+ "grad_norm": 0.12818843141531608,
+ "kl": 0.0562286376953125,
+ "learning_rate": 1.187381314585725e-05,
+ "loss": 0.0013,
+ "step": 56
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2008.5916328430176,
+ "epoch": 2.0355555555555553,
+ "grad_norm": 0.1643995846810127,
+ "kl": 0.0548858642578125,
+ "learning_rate": 1.156434465040231e-05,
+ "loss": 0.0118,
+ "reward": -7.901306234300137,
+ "reward_std": 2.6794423200190067,
+ "rewards/cot_length_penalty_reward": -8.209341906011105,
+ "rewards/math_latex_accuracy_reward": 0.3080357303842902,
+ "step": 57
+ },
+ {
+ "clip_ratio": 0.0029279392474563792,
+ "epoch": 2.071111111111111,
+ "grad_norm": 0.12045324524183162,
+ "kl": 0.0587005615234375,
+ "learning_rate": 1.1253332335643043e-05,
+ "loss": 0.0108,
+ "step": 58
+ },
+ {
+ "clip_ratio": 0.005807226421893574,
+ "epoch": 2.1066666666666665,
+ "grad_norm": 0.14841036250602688,
+ "kl": 0.0640106201171875,
+ "learning_rate": 1.0941083133185146e-05,
+ "loss": 0.0097,
+ "step": 59
+ },
+ {
+ "epoch": 2.1422222222222222,
+ "grad_norm": 0.11089468253649072,
+ "learning_rate": 1.0627905195293135e-05,
+ "loss": 0.0084,
+ "step": 60
+ },
+ {
+ "epoch": 2.1422222222222222,
+ "eval_clip_ratio": 0.0,
+ "eval_completion_length": 2107.960148737981,
+ "eval_kl": 0.05983323317307692,
+ "eval_loss": -0.00015631201677024364,
+ "eval_reward": -8.494354761563814,
+ "eval_reward_std": 3.4025442325151882,
+ "eval_rewards/cot_length_penalty_reward": -8.876223013951229,
+ "eval_rewards/math_latex_accuracy_reward": 0.3818681509448932,
+ "eval_runtime": 422.1329,
+ "eval_samples_per_second": 0.118,
+ "eval_steps_per_second": 0.005,
+ "step": 60
+ },
+ {
+ "clip_ratio": 0.003193242686393205,
+ "completion_length": 1800.6674766540527,
+ "epoch": 2.1777777777777776,
+ "grad_norm": 0.1697521453747013,
+ "kl": 0.065582275390625,
+ "learning_rate": 1.0314107590781284e-05,
+ "loss": 0.0174,
+ "reward": -8.092556223273277,
+ "reward_std": 3.146493151783943,
+ "rewards/cot_length_penalty_reward": -8.458627462387085,
+ "rewards/math_latex_accuracy_reward": 0.3660714477300644,
+ "step": 61
+ },
+ {
+ "clip_ratio": 0.003192656353348866,
+ "epoch": 2.2133333333333334,
+ "grad_norm": 0.12330763778596482,
+ "kl": 0.0718231201171875,
+ "learning_rate": 1e-05,
+ "loss": 0.0163,
+ "step": 62
+ },
+ {
+ "clip_ratio": 0.0062453514110529795,
+ "epoch": 2.2488888888888887,
+ "grad_norm": 0.16098784282181033,
+ "kl": 0.078704833984375,
+ "learning_rate": 9.685892409218718e-06,
+ "loss": 0.0151,
+ "step": 63
+ },
+ {
+ "clip_ratio": 0.006978008910664357,
+ "epoch": 2.2844444444444445,
+ "grad_norm": 0.1406450810476633,
+ "kl": 0.0782470703125,
+ "learning_rate": 9.372094804706867e-06,
+ "loss": 0.0137,
+ "step": 64
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2595.997859954834,
+ "epoch": 2.32,
+ "grad_norm": 0.18250322704566987,
+ "kl": 0.0649871826171875,
+ "learning_rate": 9.058916866814857e-06,
+ "loss": 0.0147,
+ "reward": -9.348549716174603,
+ "reward_std": 3.3840084299445152,
+ "rewards/cot_length_penalty_reward": -9.70346000418067,
+ "rewards/math_latex_accuracy_reward": 0.35491072852164507,
+ "step": 65
+ },
+ {
+ "clip_ratio": 0.0031114893354242668,
+ "epoch": 2.3555555555555556,
+ "grad_norm": 0.13005553219968966,
+ "kl": 0.0695648193359375,
+ "learning_rate": 8.746667664356957e-06,
+ "loss": 0.014,
+ "step": 66
+ },
+ {
+ "clip_ratio": 0.0075038159266114235,
+ "epoch": 2.391111111111111,
+ "grad_norm": 0.19621512659848725,
+ "kl": 0.0780181884765625,
+ "learning_rate": 8.43565534959769e-06,
+ "loss": 0.0133,
+ "step": 67
+ },
+ {
+ "clip_ratio": 0.006932365708053112,
+ "epoch": 2.4266666666666667,
+ "grad_norm": 0.13215694629988284,
+ "kl": 0.07647705078125,
+ "learning_rate": 8.126186854142752e-06,
+ "loss": 0.0122,
+ "step": 68
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2154.4286880493164,
+ "epoch": 2.462222222222222,
+ "grad_norm": 0.27197173025048144,
+ "kl": 0.086151123046875,
+ "learning_rate": 7.818567586034578e-06,
+ "loss": 0.0247,
+ "reward": -8.337913118302822,
+ "reward_std": 3.0677984952926636,
+ "rewards/cot_length_penalty_reward": -8.806663155555725,
+ "rewards/math_latex_accuracy_reward": 0.4687500186264515,
+ "step": 69
+ },
+ {
+ "clip_ratio": 0.005053140237578191,
+ "epoch": 2.497777777777778,
+ "grad_norm": 0.20964263197545416,
+ "kl": 0.0977630615234375,
+ "learning_rate": 7.513101128351454e-06,
+ "loss": 0.0237,
+ "step": 70
+ },
+ {
+ "clip_ratio": 0.005771905358415097,
+ "epoch": 2.533333333333333,
+ "grad_norm": 0.15787820635605407,
+ "kl": 0.0987091064453125,
+ "learning_rate": 7.210088939607709e-06,
+ "loss": 0.0226,
+ "step": 71
+ },
+ {
+ "clip_ratio": 0.0062158564978744835,
+ "epoch": 2.568888888888889,
+ "grad_norm": 0.4007267449310534,
+ "kl": 0.0895538330078125,
+ "learning_rate": 6.909830056250527e-06,
+ "loss": 0.022,
+ "step": 72
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1935.1675262451172,
+ "epoch": 2.6044444444444443,
+ "grad_norm": 0.266781012315019,
+ "kl": 0.10394287109375,
+ "learning_rate": 6.612620797547087e-06,
+ "loss": 0.0125,
+ "reward": -7.354442303534597,
+ "reward_std": 2.94980551302433,
+ "rewards/cot_length_penalty_reward": -7.771853107959032,
+ "rewards/math_latex_accuracy_reward": 0.41741072852164507,
+ "step": 73
+ },
+ {
+ "clip_ratio": 0.01473489188356325,
+ "epoch": 2.64,
+ "grad_norm": 0.542093788713072,
+ "kl": 0.1417083740234375,
+ "learning_rate": 6.318754473153221e-06,
+ "loss": 0.0132,
+ "step": 74
+ },
+ {
+ "clip_ratio": 0.009351018321467564,
+ "epoch": 2.6755555555555555,
+ "grad_norm": 0.32832820257493534,
+ "kl": 0.1302490234375,
+ "learning_rate": 6.028521093652195e-06,
+ "loss": 0.0111,
+ "step": 75
+ },
+ {
+ "clip_ratio": 0.008401441504247487,
+ "epoch": 2.7111111111111112,
+ "grad_norm": 0.5313671762370776,
+ "kl": 0.106719970703125,
+ "learning_rate": 5.742207084349274e-06,
+ "loss": 0.0105,
+ "step": 76
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1853.8215065002441,
+ "epoch": 2.7466666666666666,
+ "grad_norm": 0.25601743476635025,
+ "kl": 0.128814697265625,
+ "learning_rate": 5.460095002604533e-06,
+ "loss": -0.018,
+ "reward": -7.114141087979078,
+ "reward_std": 2.6430138647556305,
+ "rewards/cot_length_penalty_reward": -7.493605274707079,
+ "rewards/math_latex_accuracy_reward": 0.37946430314332247,
+ "step": 77
+ },
+ {
+ "clip_ratio": 0.004884305511950515,
+ "epoch": 2.7822222222222224,
+ "grad_norm": 0.18667971276259676,
+ "kl": 0.1357421875,
+ "learning_rate": 5.1824632589828465e-06,
+ "loss": -0.019,
+ "step": 78
+ },
+ {
+ "clip_ratio": 0.008678867772687227,
+ "epoch": 2.8177777777777777,
+ "grad_norm": 0.2515967343714355,
+ "kl": 0.1392822265625,
+ "learning_rate": 4.909585842496287e-06,
+ "loss": -0.0199,
+ "step": 79
+ },
+ {
+ "clip_ratio": 0.008155457631801255,
+ "epoch": 2.8533333333333335,
+ "grad_norm": 0.18942366870295294,
+ "kl": 0.131805419921875,
+ "learning_rate": 4.641732050210032e-06,
+ "loss": -0.0211,
+ "step": 80
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 2211.9219856262207,
+ "epoch": 2.888888888888889,
+ "grad_norm": 0.22043905749174672,
+ "kl": 0.1049957275390625,
+ "learning_rate": 4.379166221478697e-06,
+ "loss": -0.0247,
+ "reward": -9.63335988484323,
+ "reward_std": 2.9292308390140533,
+ "rewards/cot_length_penalty_reward": -10.111038556322455,
+ "rewards/math_latex_accuracy_reward": 0.4776785969734192,
+ "step": 81
+ },
+ {
+ "clip_ratio": 0.002452510700095445,
+ "epoch": 2.924444444444444,
+ "grad_norm": 0.2264032851218282,
+ "kl": 0.1047821044921875,
+ "learning_rate": 4.12214747707527e-06,
+ "loss": -0.0248,
+ "step": 82
+ },
+ {
+ "clip_ratio": 0.0035177832323824987,
+ "epoch": 2.96,
+ "grad_norm": 0.14675046732213165,
+ "kl": 0.1127166748046875,
+ "learning_rate": 3.8709294634702374e-06,
+ "loss": -0.0259,
+ "step": 83
+ },
+ {
+ "clip_ratio": 0.0036856129445368424,
+ "epoch": 2.9955555555555557,
+ "grad_norm": 0.1630420640767936,
+ "kl": 0.09552001953125,
+ "learning_rate": 3.625760102513103e-06,
+ "loss": -0.0267,
+ "step": 84
+ },
+ {
+ "clip_ratio": 0.0,
+ "completion_length": 1545.3081169128418,
+ "epoch": 3.0355555555555553,
+ "grad_norm": 14.400966954596354,
+ "kl": 0.286773681640625,
+ "learning_rate": 3.3868813467634833e-06,
+ "loss": -0.0265,
+ "reward": -7.1288284212350845,
+ "reward_std": 1.927463386207819,
+ "rewards/cot_length_penalty_reward": -7.606507122516632,
+ "rewards/math_latex_accuracy_reward": 0.47767859511077404,
+ "step": 85
+ },
+ {
+ "clip_ratio": 0.0029736382130067796,
+ "epoch": 3.071111111111111,
+ "grad_norm": 0.46033460120838376,
+ "kl": 0.130645751953125,
+ "learning_rate": 3.1545289407131128e-06,
+ "loss": -0.0322,
+ "step": 86
+ },
+ {
+ "clip_ratio": 0.0043519225146155804,
+ "epoch": 3.1066666666666665,
+ "grad_norm": 0.2864629379691617,
+ "kl": 0.13775634765625,
+ "learning_rate": 2.9289321881345257e-06,
927
+ "loss": -0.0338,
928
+ "step": 87
929
+ },
930
+ {
931
+ "clip_ratio": 0.009186911847791635,
932
+ "epoch": 3.1422222222222222,
933
+ "grad_norm": 0.24547967049213823,
934
+ "kl": 0.155426025390625,
935
+ "learning_rate": 2.7103137257858867e-06,
936
+ "loss": -0.0347,
937
+ "step": 88
938
+ },
939
+ {
940
+ "clip_ratio": 0.0,
941
+ "completion_length": 1969.4442825317383,
942
+ "epoch": 3.1777777777777776,
943
+ "grad_norm": 0.433485726828516,
944
+ "kl": 0.1619415283203125,
945
+ "learning_rate": 2.4988893036954045e-06,
946
+ "loss": -0.0013,
947
+ "reward": -8.979866623878479,
948
+ "reward_std": 2.524892296642065,
949
+ "rewards/cot_length_penalty_reward": -9.319152384996414,
950
+ "rewards/math_latex_accuracy_reward": 0.339285729220137,
951
+ "step": 89
952
+ },
953
+ {
954
+ "epoch": 3.2133333333333334,
955
+ "grad_norm": 0.2766265099056435,
956
+ "learning_rate": 2.2948675722421086e-06,
957
+ "loss": -0.003,
958
+ "step": 90
959
+ },
960
+ {
961
+ "epoch": 3.2133333333333334,
962
+ "eval_clip_ratio": 0.0,
963
+ "eval_completion_length": 2205.780292217548,
964
+ "eval_kl": 0.18556565504807693,
965
+ "eval_loss": 0.01654699072241783,
966
+ "eval_reward": -7.743022455332371,
967
+ "eval_reward_std": 2.8156597109941335,
968
+ "eval_rewards/cot_length_penalty_reward": -8.072692573070526,
969
+ "eval_rewards/math_latex_accuracy_reward": 0.32967034670022816,
970
+ "eval_runtime": 490.0675,
971
+ "eval_samples_per_second": 0.102,
972
+ "eval_steps_per_second": 0.004,
973
+ "step": 90
974
+ },
975
+ {
976
+ "clip_ratio": 0.006224101292900741,
977
+ "epoch": 3.2488888888888887,
978
+ "grad_norm": 0.4063813985520313,
979
+ "kl": 0.197174072265625,
980
+ "learning_rate": 2.098449876243096e-06,
981
+ "loss": -0.0037,
982
+ "step": 91
983
+ },
984
+ {
985
+ "clip_ratio": 0.009523139509838074,
986
+ "epoch": 3.2844444444444445,
987
+ "grad_norm": 0.26707887580226697,
988
+ "kl": 0.196014404296875,
989
+ "learning_rate": 1.9098300562505266e-06,
990
+ "loss": -0.0047,
991
+ "step": 92
992
+ },
993
+ {
994
+ "clip_ratio": 0.0,
995
+ "completion_length": 2093.5982818603516,
996
+ "epoch": 3.32,
997
+ "grad_norm": 0.3581002181636425,
998
+ "kl": 0.199798583984375,
999
+ "learning_rate": 1.7291942572543806e-06,
1000
+ "loss": 0.0178,
1001
+ "reward": -7.027399688959122,
1002
+ "reward_std": 2.7810670882463455,
1003
+ "rewards/cot_length_penalty_reward": -7.435881897807121,
1004
+ "rewards/math_latex_accuracy_reward": 0.40848216507583857,
1005
+ "step": 93
1006
+ },
1007
+ {
1008
+ "clip_ratio": 0.002175023495510686,
1009
+ "epoch": 3.3555555555555556,
1010
+ "grad_norm": 0.35335279380104717,
1011
+ "kl": 0.19476318359375,
1012
+ "learning_rate": 1.5567207449798517e-06,
1013
+ "loss": 0.017,
1014
+ "step": 94
1015
+ },
1016
+ {
1017
+ "clip_ratio": 0.0038238488195929676,
1018
+ "epoch": 3.391111111111111,
1019
+ "grad_norm": 0.26816647206060557,
1020
+ "kl": 0.21075439453125,
1021
+ "learning_rate": 1.3925797299605649e-06,
1022
+ "loss": 0.0159,
1023
+ "step": 95
1024
+ },
1025
+ {
1026
+ "clip_ratio": 0.006063876280677505,
1027
+ "epoch": 3.4266666666666667,
1028
+ "grad_norm": 0.3302248399941887,
1029
+ "kl": 0.22137451171875,
1030
+ "learning_rate": 1.2369331995613664e-06,
1031
+ "loss": 0.0151,
1032
+ "step": 96
1033
+ },
1034
+ {
1035
+ "clip_ratio": 0.0,
1036
+ "completion_length": 2630.805938720703,
1037
+ "epoch": 3.462222222222222,
1038
+ "grad_norm": 1.7166272647509115,
1039
+ "kl": 0.281219482421875,
1040
+ "learning_rate": 1.0899347581163222e-06,
1041
+ "loss": 0.0839,
1042
+ "reward": -7.589185383694712,
1043
+ "reward_std": 3.134066376835108,
1044
+ "rewards/cot_length_penalty_reward": -7.917310604825616,
1045
+ "rewards/math_latex_accuracy_reward": 0.32812501839362085,
1046
+ "step": 97
1047
+ },
1048
+ {
1049
+ "clip_ratio": 0.007515597055316903,
1050
+ "epoch": 3.497777777777778,
1051
+ "grad_norm": 4.303565090937016,
1052
+ "kl": 0.204925537109375,
1053
+ "learning_rate": 9.517294753398066e-07,
1054
+ "loss": 0.0869,
1055
+ "step": 98
1056
+ },
1057
+ {
1058
+ "clip_ratio": 0.0075942349576507695,
1059
+ "epoch": 3.533333333333333,
1060
+ "grad_norm": 2.9889664346961444,
1061
+ "kl": 0.2061920166015625,
1062
+ "learning_rate": 8.224537431601886e-07,
1063
+ "loss": 0.0841,
1064
+ "step": 99
1065
+ },
1066
+ {
1067
+ "clip_ratio": 0.0051619461009977385,
1068
+ "epoch": 3.568888888888889,
1069
+ "grad_norm": 0.4431396626395719,
1070
+ "kl": 0.22930908203125,
1071
+ "learning_rate": 7.022351411174866e-07,
1072
+ "loss": 0.0814,
1073
+ "step": 100
1074
+ },
1075
+ {
1076
+ "clip_ratio": 0.0,
1077
+ "completion_length": 2545.8371925354004,
1078
+ "epoch": 3.6044444444444443,
1079
+ "grad_norm": 0.4301809872674547,
1080
+ "kl": 0.171051025390625,
1081
+ "learning_rate": 5.911923104577455e-07,
1082
+ "loss": 0.0196,
1083
+ "reward": -10.28242233581841,
1084
+ "reward_std": 3.0546065159142017,
1085
+ "rewards/cot_length_penalty_reward": -10.666351079940796,
1086
+ "rewards/math_latex_accuracy_reward": 0.3839285857975483,
1087
+ "step": 101
1088
+ },
1089
+ {
1090
+ "clip_ratio": 0.0024334693371201865,
1091
+ "epoch": 3.64,
1092
+ "grad_norm": 0.40563497309844976,
1093
+ "kl": 0.19537353515625,
1094
+ "learning_rate": 4.894348370484648e-07,
1095
+ "loss": 0.0191,
1096
+ "step": 102
1097
+ },
1098
+ {
1099
+ "clip_ratio": 0.00391879488597624,
1100
+ "epoch": 3.6755555555555555,
1101
+ "grad_norm": 0.5720892424115457,
1102
+ "kl": 0.21160888671875,
1103
+ "learning_rate": 3.9706314323056936e-07,
1104
+ "loss": 0.0191,
1105
+ "step": 103
1106
+ },
1107
+ {
1108
+ "clip_ratio": 0.005467013252200559,
1109
+ "epoch": 3.7111111111111112,
1110
+ "grad_norm": 0.5349890131537904,
1111
+ "kl": 0.21337890625,
1112
+ "learning_rate": 3.1416838871368925e-07,
1113
+ "loss": 0.0189,
1114
+ "step": 104
1115
+ },
1116
+ {
1117
+ "clip_ratio": 0.0,
1118
+ "completion_length": 2160.7389335632324,
1119
+ "epoch": 3.7466666666666666,
1120
+ "grad_norm": 13.207091106199918,
1121
+ "kl": 0.7435302734375,
1122
+ "learning_rate": 2.4083238061252565e-07,
1123
+ "loss": 0.0564,
1124
+ "reward": -7.942288625985384,
1125
+ "reward_std": 2.594830472022295,
1126
+ "rewards/cot_length_penalty_reward": -8.37309193611145,
1127
+ "rewards/math_latex_accuracy_reward": 0.43080358672887087,
1128
+ "step": 105
1129
+ },
1130
+ {
1131
+ "clip_ratio": 0.0028747237083734944,
1132
+ "epoch": 3.7822222222222224,
1133
+ "grad_norm": 3.5352628792853067,
1134
+ "kl": 0.472412109375,
1135
+ "learning_rate": 1.7712749271311392e-07,
1136
+ "loss": 0.0465,
1137
+ "step": 106
1138
+ },
1139
+ {
1140
+ "clip_ratio": 0.004760361814987846,
1141
+ "epoch": 3.8177777777777777,
1142
+ "grad_norm": 0.94419108473764,
1143
+ "kl": 0.3760833740234375,
1144
+ "learning_rate": 1.231165940486234e-07,
1145
+ "loss": 0.0439,
1146
+ "step": 107
1147
+ },
1148
+ {
1149
+ "clip_ratio": 0.0061231208674144,
1150
+ "epoch": 3.8533333333333335,
1151
+ "grad_norm": 1.7315584230873846,
1152
+ "kl": 0.3495025634765625,
1153
+ "learning_rate": 7.885298685522235e-08,
1154
+ "loss": 0.044,
1155
+ "step": 108
1156
+ },
1157
+ {
1158
+ "clip_ratio": 0.0,
1159
+ "completion_length": 1982.8750839233398,
1160
+ "epoch": 3.888888888888889,
1161
+ "grad_norm": 0.7007233776562277,
1162
+ "kl": 0.2796783447265625,
1163
+ "learning_rate": 4.438035396920004e-08,
1164
+ "loss": 0.0207,
1165
+ "reward": -9.675179054960608,
1166
+ "reward_std": 2.574063938111067,
1167
+ "rewards/cot_length_penalty_reward": -9.989911276847124,
1168
+ "rewards/math_latex_accuracy_reward": 0.314732160884887,
1169
+ "step": 109
1170
+ },
1171
+ {
1172
+ "clip_ratio": 0.001981915433134418,
1173
+ "epoch": 3.924444444444444,
1174
+ "grad_norm": 0.6787732749959883,
1175
+ "kl": 0.2747039794921875,
1176
+ "learning_rate": 1.973271571728441e-08,
1177
+ "loss": 0.0208,
1178
+ "step": 110
1179
+ },
1180
+ {
1181
+ "clip_ratio": 0.0019334297030582093,
1182
+ "epoch": 3.96,
1183
+ "grad_norm": 0.6415798052337234,
1184
+ "kl": 0.27728271484375,
1185
+ "learning_rate": 4.9343963426840006e-09,
1186
+ "loss": 0.0206,
1187
+ "step": 111
1188
+ },
1189
+ {
1190
+ "clip_ratio": 0.0017770093054423342,
1191
+ "epoch": 3.9955555555555557,
1192
+ "grad_norm": 0.6316769640873855,
1193
+ "kl": 0.30633544921875,
1194
+ "learning_rate": 0.0,
1195
+ "loss": 0.0207,
1196
+ "step": 112
1197
+ },
1198
+ {
1199
+ "epoch": 3.9955555555555557,
1200
+ "step": 112,
1201
+ "total_flos": 0.0,
1202
+ "train_loss": 0.6775813137989773,
1203
+ "train_runtime": 20674.7257,
1204
+ "train_samples_per_second": 0.087,
1205
+ "train_steps_per_second": 0.005
1206
+ }
1207
+ ],
1208
+ "logging_steps": 1,
1209
+ "max_steps": 112,
1210
+ "num_input_tokens_seen": 0,
1211
+ "num_train_epochs": 4,
1212
+ "save_steps": 10,
1213
+ "stateful_callbacks": {
1214
+ "TrainerControl": {
1215
+ "args": {
1216
+ "should_epoch_stop": false,
1217
+ "should_evaluate": false,
1218
+ "should_log": false,
1219
+ "should_save": true,
1220
+ "should_training_stop": true
1221
+ },
1222
+ "attributes": {}
1223
+ }
1224
+ },
1225
+ "total_flos": 0.0,
1226
+ "train_batch_size": 4,
1227
+ "trial_name": null,
1228
+ "trial_params": null
1229
+ }