qingyangzhang
/

Qwen2.5-3B-EMPO-TQA

@@ -1,10 +1,8 @@
 ---
-datasets: domenicrosati/TruthfulQA
 library_name: transformers
 model_name: Qwen2.5-3B-EMPO-TQA
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - grpo
 licence: license
@@ -12,7 +10,7 @@ licence: license
 # Model Card for Qwen2.5-3B-EMPO-TQA
-This model is a fine-tuned version of [None](https://huggingface.co/None) on the [domenicrosati/TruthfulQA](https://huggingface.co/datasets/domenicrosati/TruthfulQA) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -28,7 +26,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/gaqlrb6w)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ---
 library_name: transformers
 model_name: Qwen2.5-3B-EMPO-TQA
 tags:
 - generated_from_trainer
 - trl
 - grpo
 licence: license
 # Model Card for Qwen2.5-3B-EMPO-TQA
+This model is a fine-tuned version of a base model that is not recorded in this card (the generated link pointed to the placeholder `None`).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/s54psly4)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.00023655204281105568,
-    "train_runtime": 1010.415,
     "train_samples": 490,
-    "train_samples_per_second": 0.485,
-    "train_steps_per_second": 0.01
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.00038097099556277193,
+    "train_runtime": 1888.0541,
     "train_samples": 490,
+    "train_samples_per_second": 0.779,
+    "train_steps_per_second": 0.016
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.00023655204281105568,
-    "train_runtime": 1010.415,
     "train_samples": 490,
-    "train_samples_per_second": 0.485,
-    "train_steps_per_second": 0.01
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.00038097099556277193,
+    "train_runtime": 1888.0541,
     "train_samples": 490,
+    "train_samples_per_second": 0.779,
+    "train_steps_per_second": 0.016
 }

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.975609756097561,
   "eval_steps": 100,
-  "global_step": 10,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -129,19 +129,259 @@
       "step": 10
     },
     {
-      "epoch": 0.975609756097561,
-      "step": 10,
       "total_flos": 0.0,
-      "train_loss": 0.00023655204281105568,
-      "train_runtime": 1010.415,
-      "train_samples_per_second": 0.485,
-      "train_steps_per_second": 0.01
     }
   ],
   "logging_steps": 1,
-  "max_steps": 10,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.8780487804878048,
   "eval_steps": 100,
+  "global_step": 30,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "step": 10
     },
     {
+      "completion_length": 17.564236164093018,
+      "epoch": 1.0975609756097562,
+      "grad_norm": 0.6446972489356995,
+      "kl": 0.009983062744140625,
+      "learning_rate": 1.5971585917027862e-06,
+      "loss": 0.0004,
+      "reward": 0.5920138917863369,
+      "reward_std": 0.11490329634398222,
+      "rewards/semantic_entropy": 0.5920138917863369,
+      "step": 11
+    },
+    {
+      "completion_length": 20.96701431274414,
+      "epoch": 1.1951219512195121,
+      "grad_norm": 0.4366309642791748,
+      "kl": 0.012844085693359375,
+      "learning_rate": 1.5e-06,
+      "loss": 0.0005,
+      "reward": 0.5960648246109486,
+      "reward_std": 0.10349765885621309,
+      "rewards/semantic_entropy": 0.5960648246109486,
+      "step": 12
+    },
+    {
+      "completion_length": 20.28819489479065,
+      "epoch": 1.2926829268292683,
+      "grad_norm": 0.7926385998725891,
+      "kl": 0.017467498779296875,
+      "learning_rate": 1.3960797660391568e-06,
+      "loss": 0.0007,
+      "reward": 0.5014467723667622,
+      "reward_std": 0.12278107088059187,
+      "rewards/semantic_entropy": 0.5014467723667622,
+      "step": 13
+    },
+    {
+      "completion_length": 20.972223043441772,
+      "epoch": 1.3902439024390243,
+      "grad_norm": 0.67302006483078,
+      "kl": 0.017255783081054688,
+      "learning_rate": 1.2868032327110903e-06,
+      "loss": 0.0007,
+      "reward": 0.5095486305654049,
+      "reward_std": 0.09976449748501182,
+      "rewards/semantic_entropy": 0.5095486305654049,
+      "step": 14
+    },
+    {
+      "completion_length": 18.17013907432556,
+      "epoch": 1.4878048780487805,
+      "grad_norm": 0.43078961968421936,
+      "kl": 0.0076904296875,
+      "learning_rate": 1.1736481776669305e-06,
+      "loss": 0.0003,
+      "reward": 0.588252317160368,
+      "reward_std": 0.1104576913639903,
+      "rewards/semantic_entropy": 0.588252317160368,
+      "step": 15
+    },
+    {
+      "completion_length": 19.697917222976685,
+      "epoch": 1.5853658536585367,
+      "grad_norm": 0.4722602665424347,
+      "kl": 0.010288238525390625,
+      "learning_rate": 1.0581448289104758e-06,
+      "loss": 0.0004,
+      "reward": 0.559606496244669,
+      "reward_std": 0.11904297955334187,
+      "rewards/semantic_entropy": 0.559606496244669,
+      "step": 16
+    },
+    {
+      "completion_length": 19.661458730697632,
+      "epoch": 1.6829268292682928,
+      "grad_norm": 0.6996189951896667,
+      "kl": 0.012149810791015625,
+      "learning_rate": 9.418551710895241e-07,
+      "loss": 0.0005,
+      "reward": 0.591435182839632,
+      "reward_std": 0.10133868269622326,
+      "rewards/semantic_entropy": 0.591435182839632,
+      "step": 17
+    },
+    {
+      "completion_length": 21.19444465637207,
+      "epoch": 1.7804878048780488,
+      "grad_norm": 0.5424029231071472,
+      "kl": 0.023311614990234375,
+      "learning_rate": 8.263518223330696e-07,
+      "loss": 0.0009,
+      "reward": 0.5104166567325592,
+      "reward_std": 0.1390146454796195,
+      "rewards/semantic_entropy": 0.5104166567325592,
+      "step": 18
+    },
+    {
+      "completion_length": 21.192708253860474,
+      "epoch": 1.8780487804878048,
+      "grad_norm": 0.5004396438598633,
+      "kl": 0.007457733154296875,
+      "learning_rate": 7.1319676728891e-07,
+      "loss": 0.0003,
+      "reward": 0.5214120373129845,
+      "reward_std": 0.12240998912602663,
+      "rewards/semantic_entropy": 0.5214120373129845,
+      "step": 19
+    },
+    {
+      "completion_length": 19.510417342185974,
+      "epoch": 1.975609756097561,
+      "grad_norm": 0.5037679076194763,
+      "kl": 0.012096405029296875,
+      "learning_rate": 6.039202339608431e-07,
+      "loss": 0.0005,
+      "reward": 0.6221064738929272,
+      "reward_std": 0.11695278249680996,
+      "rewards/semantic_entropy": 0.6221064738929272,
+      "step": 20
+    },
+    {
+      "completion_length": 17.875,
+      "epoch": 2.0,
+      "grad_norm": 0.5037679076194763,
+      "kl": 0.03363037109375,
+      "learning_rate": 5.000000000000002e-07,
+      "loss": 0.0003,
+      "reward": 1.0,
+      "reward_std": 0.0,
+      "rewards/semantic_entropy": 1.0,
+      "step": 21
+    },
+    {
+      "completion_length": 18.83506989479065,
+      "epoch": 2.097560975609756,
+      "grad_norm": 0.727460503578186,
+      "kl": 0.01171875,
+      "learning_rate": 4.02841408297214e-07,
+      "loss": 0.0005,
+      "reward": 0.6105324216187,
+      "reward_std": 0.0801440766081214,
+      "rewards/semantic_entropy": 0.6105324216187,
+      "step": 22
+    },
+    {
+      "completion_length": 20.21701443195343,
+      "epoch": 2.1951219512195124,
+      "grad_norm": 0.6406362056732178,
+      "kl": 0.01398468017578125,
+      "learning_rate": 3.137583621312665e-07,
+      "loss": 0.0006,
+      "reward": 0.5815972313284874,
+      "reward_std": 0.12349289190024137,
+      "rewards/semantic_entropy": 0.5815972313284874,
+      "step": 23
+    },
+    {
+      "completion_length": 21.807291984558105,
+      "epoch": 2.292682926829268,
+      "grad_norm": 0.45703092217445374,
+      "kl": 0.012033462524414062,
+      "learning_rate": 2.339555568810221e-07,
+      "loss": 0.0005,
+      "reward": 0.5804398320615292,
+      "reward_std": 0.1076621082611382,
+      "rewards/semantic_entropy": 0.5804398320615292,
+      "step": 24
+    },
+    {
+      "completion_length": 20.090278387069702,
+      "epoch": 2.3902439024390243,
+      "grad_norm": 1.17950439453125,
+      "kl": 0.023633956909179688,
+      "learning_rate": 1.6451218858706372e-07,
+      "loss": 0.0009,
+      "reward": 0.4774305485188961,
+      "reward_std": 0.11873876117169857,
+      "rewards/semantic_entropy": 0.4774305485188961,
+      "step": 25
+    },
+    {
+      "completion_length": 19.114583492279053,
+      "epoch": 2.4878048780487805,
+      "grad_norm": 1.0232934951782227,
+      "kl": 0.021167755126953125,
+      "learning_rate": 1.0636735967658784e-07,
+      "loss": 0.0008,
+      "reward": 0.5917245410382748,
+      "reward_std": 0.10972362849861383,
+      "rewards/semantic_entropy": 0.5917245410382748,
+      "step": 26
+    },
+    {
+      "completion_length": 17.560763835906982,
+      "epoch": 2.5853658536585367,
+      "grad_norm": 0.45686405897140503,
+      "kl": 0.0112457275390625,
+      "learning_rate": 6.030737921409168e-08,
+      "loss": 0.0004,
+      "reward": 0.5879629701375961,
+      "reward_std": 0.10896958655212075,
+      "rewards/semantic_entropy": 0.5879629701375961,
+      "step": 27
+    },
+    {
+      "completion_length": 18.482638955116272,
+      "epoch": 2.682926829268293,
+      "grad_norm": 2.8627212047576904,
+      "kl": 0.028564453125,
+      "learning_rate": 2.6955129420176193e-08,
+      "loss": 0.0011,
+      "reward": 0.5341435223817825,
+      "reward_std": 0.11482170736417174,
+      "rewards/semantic_entropy": 0.5341435223817825,
+      "step": 28
+    },
+    {
+      "completion_length": 20.75868058204651,
+      "epoch": 2.7804878048780486,
+      "grad_norm": 0.60300213098526,
+      "kl": 0.014234542846679688,
+      "learning_rate": 6.761642258056976e-09,
+      "loss": 0.0006,
+      "reward": 0.47453703358769417,
+      "reward_std": 0.12850847654044628,
+      "rewards/semantic_entropy": 0.47453703358769417,
+      "step": 29
+    },
+    {
+      "completion_length": 19.67881965637207,
+      "epoch": 2.8780487804878048,
+      "grad_norm": 0.4153330624103546,
+      "kl": 0.01200103759765625,
+      "learning_rate": 0.0,
+      "loss": 0.0005,
+      "reward": 0.6588541865348816,
+      "reward_std": 0.10488813614938408,
+      "rewards/semantic_entropy": 0.6588541865348816,
+      "step": 30
+    },
+    {
+      "epoch": 2.8780487804878048,
+      "step": 30,
       "total_flos": 0.0,
+      "train_loss": 0.00038097099556277193,
+      "train_runtime": 1888.0541,
+      "train_samples_per_second": 0.779,
+      "train_steps_per_second": 0.016
     }
   ],
   "logging_steps": 1,
+  "max_steps": 30,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {