radna commited on
Commit
68cd72a
·
verified ·
1 Parent(s): 329f9c2

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoint-10/adapter_config.json +5 -5
  2. checkpoint-10/trainer_state.json +16 -15
  3. checkpoint-10/training_args.bin +1 -1
  4. checkpoint-12/adapter_config.json +5 -5
  5. checkpoint-12/trainer_state.json +19 -18
  6. checkpoint-12/training_args.bin +1 -1
  7. checkpoint-14/adapter_config.json +5 -5
  8. checkpoint-14/trainer_state.json +21 -20
  9. checkpoint-14/training_args.bin +1 -1
  10. checkpoint-16/adapter_config.json +5 -5
  11. checkpoint-16/trainer_state.json +23 -22
  12. checkpoint-16/training_args.bin +1 -1
  13. checkpoint-18/adapter_config.json +5 -5
  14. checkpoint-18/trainer_state.json +26 -25
  15. checkpoint-18/training_args.bin +1 -1
  16. checkpoint-2/adapter_config.json +5 -5
  17. checkpoint-2/trainer_state.json +6 -5
  18. checkpoint-2/training_args.bin +1 -1
  19. checkpoint-20/adapter_config.json +5 -5
  20. checkpoint-20/trainer_state.json +28 -27
  21. checkpoint-20/training_args.bin +1 -1
  22. checkpoint-22/adapter_config.json +5 -5
  23. checkpoint-22/trainer_state.json +31 -30
  24. checkpoint-22/training_args.bin +1 -1
  25. checkpoint-24/adapter_config.json +5 -5
  26. checkpoint-24/trainer_state.json +34 -33
  27. checkpoint-24/training_args.bin +1 -1
  28. checkpoint-26/adapter_config.json +5 -5
  29. checkpoint-26/trainer_state.json +36 -35
  30. checkpoint-26/training_args.bin +1 -1
  31. checkpoint-28/adapter_config.json +5 -5
  32. checkpoint-28/trainer_state.json +38 -37
  33. checkpoint-28/training_args.bin +1 -1
  34. checkpoint-30/adapter_config.json +5 -5
  35. checkpoint-30/global_step30/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt +3 -0
  36. checkpoint-30/global_step30/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt +3 -0
  37. checkpoint-30/global_step30/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt +3 -0
  38. checkpoint-30/global_step30/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt +3 -0
  39. checkpoint-30/global_step30/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt +3 -0
  40. checkpoint-30/global_step30/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt +3 -0
  41. checkpoint-30/global_step30/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt +3 -0
  42. checkpoint-30/global_step30/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt +3 -0
  43. checkpoint-30/rng_state_10.pth +3 -0
  44. checkpoint-30/rng_state_11.pth +3 -0
  45. checkpoint-30/rng_state_12.pth +3 -0
  46. checkpoint-30/rng_state_13.pth +3 -0
  47. checkpoint-30/rng_state_14.pth +3 -0
  48. checkpoint-30/rng_state_15.pth +3 -0
  49. checkpoint-30/rng_state_8.pth +3 -0
  50. checkpoint-30/rng_state_9.pth +3 -0
checkpoint-10/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-10/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.012996690347790718,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
4
  "epoch": 2.4210526315789473,
@@ -6,7 +7,7 @@
6
  "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  }
 
1
  {
2
+ "best_global_step": 6,
3
  "best_metric": 0.012996690347790718,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-6",
5
  "epoch": 2.4210526315789473,
 
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  }
checkpoint-10/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-12/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-12/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 2.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 12,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 2.8421052631578947,
 
7
  "global_step": 12,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
checkpoint-12/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-14/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-14/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 3.4210526315789473,
@@ -6,7 +7,7 @@
6
  "global_step": 14,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  }
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 3.4210526315789473,
 
7
  "global_step": 14,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  }
checkpoint-14/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-16/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-16/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03234308212995529,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
4
  "epoch": 3.8421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 16,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  }
 
1
  {
2
+ "best_global_step": 12,
3
  "best_metric": 0.03234308212995529,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-12",
5
  "epoch": 3.8421052631578947,
 
7
  "global_step": 16,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  }
checkpoint-16/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-18/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-18/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 4.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 18,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 4.421052631578947,
 
7
  "global_step": 18,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
checkpoint-18/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-2/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-2/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.42105263157894735,
@@ -6,7 +7,7 @@
6
  "global_step": 2,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  }
39
  ],
40
  "logging_steps": 1,
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 0.42105263157894735,
 
7
  "global_step": 2,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  }
40
  ],
41
  "logging_steps": 1,
checkpoint-2/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-20/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-20/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 4.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  }
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 4.842105263157895,
 
7
  "global_step": 20,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 180.29,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 180.29,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  }
checkpoint-20/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-22/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-22/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.03729328140616417,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
4
  "epoch": 5.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 22,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  }
 
1
  {
2
+ "best_global_step": 18,
3
  "best_metric": 0.03729328140616417,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-18",
5
  "epoch": 5.421052631578947,
 
7
  "global_step": 22,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 180.29,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 180.29,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 180.29,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 180.29,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  }
checkpoint-22/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-24/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-24/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 5.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 24,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 5.842105263157895,
 
7
  "global_step": 24,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 180.29,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 180.29,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 180.29,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 180.29,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 180.29,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 180.29,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0616,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
checkpoint-24/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-26/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-26/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 6.421052631578947,
@@ -6,7 +7,7 @@
6
  "global_step": 26,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
@@ -408,7 +409,7 @@
408
  "kl": 0.1800537109375,
409
  "learning_rate": 7.243995901002312e-05,
410
  "loss": -0.02097315341234207,
411
- "memory(GiB)": 182.91,
412
  "response_clip_ratio": 0.6171875,
413
  "reward": 0.03010205877944827,
414
  "reward_std": 0.10742511600255966,
@@ -424,7 +425,7 @@
424
  "kl": 0.18408203125,
425
  "learning_rate": 6.980398830195785e-05,
426
  "loss": -0.02103913575410843,
427
- "memory(GiB)": 182.91,
428
  "step": 26,
429
  "train_speed(iter/s)": 0.000421
430
  }
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 6.421052631578947,
 
7
  "global_step": 26,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 180.29,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 180.29,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 180.29,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 180.29,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 180.29,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 180.29,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0616,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
 
409
  "kl": 0.1800537109375,
410
  "learning_rate": 7.243995901002312e-05,
411
  "loss": -0.02097315341234207,
412
+ "memory(GiB)": 180.29,
413
  "response_clip_ratio": 0.6171875,
414
  "reward": 0.03010205877944827,
415
  "reward_std": 0.10742511600255966,
 
425
  "kl": 0.18408203125,
426
  "learning_rate": 6.980398830195785e-05,
427
  "loss": -0.02103913575410843,
428
+ "memory(GiB)": 180.29,
429
  "step": 26,
430
  "train_speed(iter/s)": 0.000421
431
  }
checkpoint-26/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-28/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-28/trainer_state.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "best_metric": 0.04339282959699631,
3
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
4
  "epoch": 6.842105263157895,
@@ -6,7 +7,7 @@
6
  "global_step": 28,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
@@ -16,14 +17,14 @@
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666667e-05,
18
  "loss": -0.11016345024108887,
19
- "memory(GiB)": 182.91,
20
  "response_clip_ratio": 0.11328125,
21
  "reward": -0.002658387296833098,
22
  "reward_std": 0.06134121119976044,
23
  "rewards/CosineReward": -0.0026579967816360295,
24
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
25
  "step": 1,
26
- "train_speed(iter/s)": 0.000242
27
  },
28
  {
29
  "clip_ratio": 0.0,
@@ -32,9 +33,9 @@
32
  "kl": 0.0,
33
  "learning_rate": 3.3333333333333335e-05,
34
  "loss": -0.11016345024108887,
35
- "memory(GiB)": 182.91,
36
  "step": 2,
37
- "train_speed(iter/s)": 0.000467
38
  },
39
  {
40
  "clip_ratio": 1.3441811461234465e-05,
@@ -44,7 +45,7 @@
44
  "kl": 9.50181856751442e-07,
45
  "learning_rate": 5e-05,
46
  "loss": -0.06604708731174469,
47
- "memory(GiB)": 182.91,
48
  "response_clip_ratio": 0.13671875,
49
  "reward": 0.0006296975770965219,
50
  "reward_std": 0.07172460854053497,
@@ -60,7 +61,7 @@
60
  "kl": 1.1101365089416504e-05,
61
  "learning_rate": 6.666666666666667e-05,
62
  "loss": -0.06727766245603561,
63
- "memory(GiB)": 182.91,
64
  "step": 4,
65
  "train_speed(iter/s)": 0.000458
66
  },
@@ -72,7 +73,7 @@
72
  "kl": 0.00017762184143066406,
73
  "learning_rate": 8.333333333333334e-05,
74
  "loss": -0.09315311908721924,
75
- "memory(GiB)": 182.91,
76
  "response_clip_ratio": 0.119140625,
77
  "reward": -0.005135859013535082,
78
  "reward_std": 0.07994875870645046,
@@ -86,9 +87,9 @@
86
  "grad_norm": 0.18263348937034607,
87
  "learning_rate": 0.0001,
88
  "loss": -0.1041698157787323,
89
- "memory(GiB)": 182.91,
90
  "step": 6,
91
- "train_speed(iter/s)": 0.000459
92
  },
93
  {
94
  "epoch": 1.4210526315789473,
@@ -101,7 +102,7 @@
101
  "eval_reward_std": 0.08769983053207397,
102
  "eval_rewards/CosineReward": 0.012996694073081017,
103
  "eval_rewards/RepetitionPenalty": 0.0,
104
- "eval_runtime": 1030.1127,
105
  "eval_samples_per_second": 0.001,
106
  "eval_steps_per_second": 0.001,
107
  "step": 6
@@ -114,7 +115,7 @@
114
  "kl": 0.017406463623046875,
115
  "learning_rate": 9.991540791356342e-05,
116
  "loss": -0.051375165581703186,
117
- "memory(GiB)": 182.91,
118
  "response_clip_ratio": 0.1484375,
119
  "reward": 0.004909618757665157,
120
  "reward_std": 0.08167182095348835,
@@ -130,7 +131,7 @@
130
  "kl": 0.089599609375,
131
  "learning_rate": 9.966191788709716e-05,
132
  "loss": -0.05105742812156677,
133
- "memory(GiB)": 182.91,
134
  "step": 8,
135
  "train_speed(iter/s)": 0.000433
136
  },
@@ -142,7 +143,7 @@
142
  "kl": 0.0963134765625,
143
  "learning_rate": 9.924038765061042e-05,
144
  "loss": -0.05842069163918495,
145
- "memory(GiB)": 182.91,
146
  "response_clip_ratio": 0.255859375,
147
  "reward": 0.03643610421568155,
148
  "reward_std": 0.11898956261575222,
@@ -158,7 +159,7 @@
158
  "kl": 0.1185302734375,
159
  "learning_rate": 9.865224352899119e-05,
160
  "loss": -0.06491819024085999,
161
- "memory(GiB)": 182.91,
162
  "step": 10,
163
  "train_speed(iter/s)": 0.000436
164
  },
@@ -170,7 +171,7 @@
170
  "kl": 0.1275634765625,
171
  "learning_rate": 9.789947561577445e-05,
172
  "loss": -0.04600231721997261,
173
- "memory(GiB)": 182.91,
174
  "response_clip_ratio": 0.361328125,
175
  "reward": 0.023204635945148766,
176
  "reward_std": 0.10593634657561779,
@@ -184,7 +185,7 @@
184
  "grad_norm": 0.05781339108943939,
185
  "learning_rate": 9.698463103929542e-05,
186
  "loss": -0.05069056898355484,
187
- "memory(GiB)": 182.91,
188
  "step": 12,
189
  "train_speed(iter/s)": 0.000439
190
  },
@@ -199,7 +200,7 @@
199
  "eval_reward_std": 0.10685288906097412,
200
  "eval_rewards/CosineReward": 0.03234308212995529,
201
  "eval_rewards/RepetitionPenalty": 0.0,
202
- "eval_runtime": 1025.9041,
203
  "eval_samples_per_second": 0.001,
204
  "eval_steps_per_second": 0.001,
205
  "step": 12
@@ -212,7 +213,7 @@
212
  "kl": 0.151123046875,
213
  "learning_rate": 9.591080534401371e-05,
214
  "loss": -0.02191038429737091,
215
- "memory(GiB)": 182.91,
216
  "response_clip_ratio": 0.419921875,
217
  "reward": 0.035983758978545666,
218
  "reward_std": 0.11553369648754597,
@@ -228,7 +229,7 @@
228
  "kl": 0.169189453125,
229
  "learning_rate": 9.468163201617062e-05,
230
  "loss": -0.022672578692436218,
231
- "memory(GiB)": 182.91,
232
  "step": 14,
233
  "train_speed(iter/s)": 0.000427
234
  },
@@ -240,7 +241,7 @@
240
  "kl": 0.166748046875,
241
  "learning_rate": 9.330127018922194e-05,
242
  "loss": -0.059799157083034515,
243
- "memory(GiB)": 182.91,
244
  "response_clip_ratio": 0.4765625,
245
  "reward": 0.03584331553429365,
246
  "reward_std": 0.11829411797225475,
@@ -256,7 +257,7 @@
256
  "kl": 0.16748046875,
257
  "learning_rate": 9.177439057064683e-05,
258
  "loss": -0.06071458384394646,
259
- "memory(GiB)": 182.91,
260
  "step": 16,
261
  "train_speed(iter/s)": 0.000431
262
  },
@@ -268,7 +269,7 @@
268
  "kl": 0.1787109375,
269
  "learning_rate": 9.01061596377522e-05,
270
  "loss": -0.04504441097378731,
271
- "memory(GiB)": 182.91,
272
  "response_clip_ratio": 0.5625,
273
  "reward": 0.027318883687257767,
274
  "reward_std": 0.10441224090754986,
@@ -282,7 +283,7 @@
282
  "grad_norm": 0.005998397711664438,
283
  "learning_rate": 8.83022221559489e-05,
284
  "loss": -0.045487549155950546,
285
- "memory(GiB)": 182.91,
286
  "step": 18,
287
  "train_speed(iter/s)": 0.000432
288
  },
@@ -297,7 +298,7 @@
297
  "eval_reward_std": 0.10691346973180771,
298
  "eval_rewards/CosineReward": 0.03729327768087387,
299
  "eval_rewards/RepetitionPenalty": 0.0,
300
- "eval_runtime": 1041.231,
301
  "eval_samples_per_second": 0.001,
302
  "eval_steps_per_second": 0.001,
303
  "step": 18
@@ -310,7 +311,7 @@
310
  "kl": 0.1820068359375,
311
  "learning_rate": 8.636868207865244e-05,
312
  "loss": -0.03466903418302536,
313
- "memory(GiB)": 182.91,
314
  "response_clip_ratio": 0.466796875,
315
  "reward": 0.04069916973821819,
316
  "reward_std": 0.11991005763411522,
@@ -326,7 +327,7 @@
326
  "kl": 0.19287109375,
327
  "learning_rate": 8.43120818934367e-05,
328
  "loss": -0.03502114117145538,
329
- "memory(GiB)": 182.91,
330
  "step": 20,
331
  "train_speed(iter/s)": 0.000424
332
  },
@@ -338,14 +339,14 @@
338
  "kl": 0.17626953125,
339
  "learning_rate": 8.213938048432697e-05,
340
  "loss": -0.008662773296236992,
341
- "memory(GiB)": 182.91,
342
  "response_clip_ratio": 0.5625,
343
  "reward": 0.04996980866417289,
344
  "reward_std": 0.13849420100450516,
345
  "rewards/CosineReward": 0.049969930201768875,
346
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
347
  "step": 21,
348
- "train_speed(iter/s)": 0.000408
349
  },
350
  {
351
  "clip_ratio": 5.869188044016482e-05,
@@ -354,7 +355,7 @@
354
  "kl": 0.178955078125,
355
  "learning_rate": 7.985792958513931e-05,
356
  "loss": -0.008743642829358578,
357
- "memory(GiB)": 182.91,
358
  "step": 22,
359
  "train_speed(iter/s)": 0.000426
360
  },
@@ -366,7 +367,7 @@
366
  "kl": 0.1796875,
367
  "learning_rate": 7.74754489035403e-05,
368
  "loss": -0.03423420712351799,
369
- "memory(GiB)": 182.91,
370
  "response_clip_ratio": 0.583984375,
371
  "reward": 0.034468831261619925,
372
  "reward_std": 0.11841745302081108,
@@ -380,7 +381,7 @@
380
  "grad_norm": 0.014131724834442139,
381
  "learning_rate": 7.500000000000001e-05,
382
  "loss": -0.03426633030176163,
383
- "memory(GiB)": 182.91,
384
  "step": 24,
385
  "train_speed(iter/s)": 0.000427
386
  },
@@ -395,7 +396,7 @@
395
  "eval_reward_std": 0.10456253588199615,
396
  "eval_rewards/CosineReward": 0.04339282959699631,
397
  "eval_rewards/RepetitionPenalty": 0.0,
398
- "eval_runtime": 1045.0632,
399
  "eval_samples_per_second": 0.001,
400
  "eval_steps_per_second": 0.001,
401
  "step": 24
@@ -408,7 +409,7 @@
408
  "kl": 0.1800537109375,
409
  "learning_rate": 7.243995901002312e-05,
410
  "loss": -0.02097315341234207,
411
- "memory(GiB)": 182.91,
412
  "response_clip_ratio": 0.6171875,
413
  "reward": 0.03010205877944827,
414
  "reward_std": 0.10742511600255966,
@@ -424,7 +425,7 @@
424
  "kl": 0.18408203125,
425
  "learning_rate": 6.980398830195785e-05,
426
  "loss": -0.02103913575410843,
427
- "memory(GiB)": 182.91,
428
  "step": 26,
429
  "train_speed(iter/s)": 0.000421
430
  },
@@ -436,7 +437,7 @@
436
  "kl": 0.174560546875,
437
  "learning_rate": 6.710100716628344e-05,
438
  "loss": -0.03593946248292923,
439
- "memory(GiB)": 182.91,
440
  "response_clip_ratio": 0.513671875,
441
  "reward": 0.04752760287374258,
442
  "reward_std": 0.14935147762298584,
@@ -452,7 +453,7 @@
452
  "kl": 0.182373046875,
453
  "learning_rate": 6.434016163555452e-05,
454
  "loss": -0.03595500811934471,
455
- "memory(GiB)": 182.91,
456
  "step": 28,
457
  "train_speed(iter/s)": 0.000422
458
  }
 
1
  {
2
+ "best_global_step": 24,
3
  "best_metric": 0.04339282959699631,
4
  "best_model_checkpoint": "/mnt/nvme5n1p1/trained_grpo_distill_14b_rl_70_s3/v3-20250330-200345/checkpoint-24",
5
  "epoch": 6.842105263157895,
 
7
  "global_step": 28,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
+ "is_world_process_zero": false,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
 
17
  "kl": 0.0,
18
  "learning_rate": 1.6666666666666667e-05,
19
  "loss": -0.11016345024108887,
20
+ "memory(GiB)": 180.29,
21
  "response_clip_ratio": 0.11328125,
22
  "reward": -0.002658387296833098,
23
  "reward_std": 0.06134121119976044,
24
  "rewards/CosineReward": -0.0026579967816360295,
25
  "rewards/RepetitionPenalty": -3.8975886695880035e-07,
26
  "step": 1,
27
+ "train_speed(iter/s)": 0.000241
28
  },
29
  {
30
  "clip_ratio": 0.0,
 
33
  "kl": 0.0,
34
  "learning_rate": 3.3333333333333335e-05,
35
  "loss": -0.11016345024108887,
36
+ "memory(GiB)": 180.29,
37
  "step": 2,
38
+ "train_speed(iter/s)": 0.000466
39
  },
40
  {
41
  "clip_ratio": 1.3441811461234465e-05,
 
45
  "kl": 9.50181856751442e-07,
46
  "learning_rate": 5e-05,
47
  "loss": -0.06604708731174469,
48
+ "memory(GiB)": 180.29,
49
  "response_clip_ratio": 0.13671875,
50
  "reward": 0.0006296975770965219,
51
  "reward_std": 0.07172460854053497,
 
61
  "kl": 1.1101365089416504e-05,
62
  "learning_rate": 6.666666666666667e-05,
63
  "loss": -0.06727766245603561,
64
+ "memory(GiB)": 180.29,
65
  "step": 4,
66
  "train_speed(iter/s)": 0.000458
67
  },
 
73
  "kl": 0.00017762184143066406,
74
  "learning_rate": 8.333333333333334e-05,
75
  "loss": -0.09315311908721924,
76
+ "memory(GiB)": 180.29,
77
  "response_clip_ratio": 0.119140625,
78
  "reward": -0.005135859013535082,
79
  "reward_std": 0.07994875870645046,
 
87
  "grad_norm": 0.18263348937034607,
88
  "learning_rate": 0.0001,
89
  "loss": -0.1041698157787323,
90
+ "memory(GiB)": 180.29,
91
  "step": 6,
92
+ "train_speed(iter/s)": 0.000458
93
  },
94
  {
95
  "epoch": 1.4210526315789473,
 
102
  "eval_reward_std": 0.08769983053207397,
103
  "eval_rewards/CosineReward": 0.012996694073081017,
104
  "eval_rewards/RepetitionPenalty": 0.0,
105
+ "eval_runtime": 1030.1223,
106
  "eval_samples_per_second": 0.001,
107
  "eval_steps_per_second": 0.001,
108
  "step": 6
 
115
  "kl": 0.017406463623046875,
116
  "learning_rate": 9.991540791356342e-05,
117
  "loss": -0.051375165581703186,
118
+ "memory(GiB)": 180.29,
119
  "response_clip_ratio": 0.1484375,
120
  "reward": 0.004909618757665157,
121
  "reward_std": 0.08167182095348835,
 
131
  "kl": 0.089599609375,
132
  "learning_rate": 9.966191788709716e-05,
133
  "loss": -0.05105742812156677,
134
+ "memory(GiB)": 180.29,
135
  "step": 8,
136
  "train_speed(iter/s)": 0.000433
137
  },
 
143
  "kl": 0.0963134765625,
144
  "learning_rate": 9.924038765061042e-05,
145
  "loss": -0.05842069163918495,
146
+ "memory(GiB)": 180.29,
147
  "response_clip_ratio": 0.255859375,
148
  "reward": 0.03643610421568155,
149
  "reward_std": 0.11898956261575222,
 
159
  "kl": 0.1185302734375,
160
  "learning_rate": 9.865224352899119e-05,
161
  "loss": -0.06491819024085999,
162
+ "memory(GiB)": 180.29,
163
  "step": 10,
164
  "train_speed(iter/s)": 0.000436
165
  },
 
171
  "kl": 0.1275634765625,
172
  "learning_rate": 9.789947561577445e-05,
173
  "loss": -0.04600231721997261,
174
+ "memory(GiB)": 180.29,
175
  "response_clip_ratio": 0.361328125,
176
  "reward": 0.023204635945148766,
177
  "reward_std": 0.10593634657561779,
 
185
  "grad_norm": 0.05781339108943939,
186
  "learning_rate": 9.698463103929542e-05,
187
  "loss": -0.05069056898355484,
188
+ "memory(GiB)": 180.29,
189
  "step": 12,
190
  "train_speed(iter/s)": 0.000439
191
  },
 
200
  "eval_reward_std": 0.10685288906097412,
201
  "eval_rewards/CosineReward": 0.03234308212995529,
202
  "eval_rewards/RepetitionPenalty": 0.0,
203
+ "eval_runtime": 1025.9045,
204
  "eval_samples_per_second": 0.001,
205
  "eval_steps_per_second": 0.001,
206
  "step": 12
 
213
  "kl": 0.151123046875,
214
  "learning_rate": 9.591080534401371e-05,
215
  "loss": -0.02191038429737091,
216
+ "memory(GiB)": 180.29,
217
  "response_clip_ratio": 0.419921875,
218
  "reward": 0.035983758978545666,
219
  "reward_std": 0.11553369648754597,
 
229
  "kl": 0.169189453125,
230
  "learning_rate": 9.468163201617062e-05,
231
  "loss": -0.022672578692436218,
232
+ "memory(GiB)": 180.29,
233
  "step": 14,
234
  "train_speed(iter/s)": 0.000427
235
  },
 
241
  "kl": 0.166748046875,
242
  "learning_rate": 9.330127018922194e-05,
243
  "loss": -0.059799157083034515,
244
+ "memory(GiB)": 180.29,
245
  "response_clip_ratio": 0.4765625,
246
  "reward": 0.03584331553429365,
247
  "reward_std": 0.11829411797225475,
 
257
  "kl": 0.16748046875,
258
  "learning_rate": 9.177439057064683e-05,
259
  "loss": -0.06071458384394646,
260
+ "memory(GiB)": 180.29,
261
  "step": 16,
262
  "train_speed(iter/s)": 0.000431
263
  },
 
269
  "kl": 0.1787109375,
270
  "learning_rate": 9.01061596377522e-05,
271
  "loss": -0.04504441097378731,
272
+ "memory(GiB)": 180.29,
273
  "response_clip_ratio": 0.5625,
274
  "reward": 0.027318883687257767,
275
  "reward_std": 0.10441224090754986,
 
283
  "grad_norm": 0.005998397711664438,
284
  "learning_rate": 8.83022221559489e-05,
285
  "loss": -0.045487549155950546,
286
+ "memory(GiB)": 180.29,
287
  "step": 18,
288
  "train_speed(iter/s)": 0.000432
289
  },
 
298
  "eval_reward_std": 0.10691346973180771,
299
  "eval_rewards/CosineReward": 0.03729327768087387,
300
  "eval_rewards/RepetitionPenalty": 0.0,
301
+ "eval_runtime": 1041.2321,
302
  "eval_samples_per_second": 0.001,
303
  "eval_steps_per_second": 0.001,
304
  "step": 18
 
311
  "kl": 0.1820068359375,
312
  "learning_rate": 8.636868207865244e-05,
313
  "loss": -0.03466903418302536,
314
+ "memory(GiB)": 180.29,
315
  "response_clip_ratio": 0.466796875,
316
  "reward": 0.04069916973821819,
317
  "reward_std": 0.11991005763411522,
 
327
  "kl": 0.19287109375,
328
  "learning_rate": 8.43120818934367e-05,
329
  "loss": -0.03502114117145538,
330
+ "memory(GiB)": 180.29,
331
  "step": 20,
332
  "train_speed(iter/s)": 0.000424
333
  },
 
339
  "kl": 0.17626953125,
340
  "learning_rate": 8.213938048432697e-05,
341
  "loss": -0.008662773296236992,
342
+ "memory(GiB)": 180.29,
343
  "response_clip_ratio": 0.5625,
344
  "reward": 0.04996980866417289,
345
  "reward_std": 0.13849420100450516,
346
  "rewards/CosineReward": 0.049969930201768875,
347
  "rewards/RepetitionPenalty": -1.1864573679076784e-07,
348
  "step": 21,
349
+ "train_speed(iter/s)": 0.000407
350
  },
351
  {
352
  "clip_ratio": 5.869188044016482e-05,
 
355
  "kl": 0.178955078125,
356
  "learning_rate": 7.985792958513931e-05,
357
  "loss": -0.008743642829358578,
358
+ "memory(GiB)": 180.29,
359
  "step": 22,
360
  "train_speed(iter/s)": 0.000426
361
  },
 
367
  "kl": 0.1796875,
368
  "learning_rate": 7.74754489035403e-05,
369
  "loss": -0.03423420712351799,
370
+ "memory(GiB)": 180.29,
371
  "response_clip_ratio": 0.583984375,
372
  "reward": 0.034468831261619925,
373
  "reward_std": 0.11841745302081108,
 
381
  "grad_norm": 0.014131724834442139,
382
  "learning_rate": 7.500000000000001e-05,
383
  "loss": -0.03426633030176163,
384
+ "memory(GiB)": 180.29,
385
  "step": 24,
386
  "train_speed(iter/s)": 0.000427
387
  },
 
396
  "eval_reward_std": 0.10456253588199615,
397
  "eval_rewards/CosineReward": 0.04339282959699631,
398
  "eval_rewards/RepetitionPenalty": 0.0,
399
+ "eval_runtime": 1045.0616,
400
  "eval_samples_per_second": 0.001,
401
  "eval_steps_per_second": 0.001,
402
  "step": 24
 
409
  "kl": 0.1800537109375,
410
  "learning_rate": 7.243995901002312e-05,
411
  "loss": -0.02097315341234207,
412
+ "memory(GiB)": 180.29,
413
  "response_clip_ratio": 0.6171875,
414
  "reward": 0.03010205877944827,
415
  "reward_std": 0.10742511600255966,
 
425
  "kl": 0.18408203125,
426
  "learning_rate": 6.980398830195785e-05,
427
  "loss": -0.02103913575410843,
428
+ "memory(GiB)": 180.29,
429
  "step": 26,
430
  "train_speed(iter/s)": 0.000421
431
  },
 
437
  "kl": 0.174560546875,
438
  "learning_rate": 6.710100716628344e-05,
439
  "loss": -0.03593946248292923,
440
+ "memory(GiB)": 180.29,
441
  "response_clip_ratio": 0.513671875,
442
  "reward": 0.04752760287374258,
443
  "reward_std": 0.14935147762298584,
 
453
  "kl": 0.182373046875,
454
  "learning_rate": 6.434016163555452e-05,
455
  "loss": -0.03595500811934471,
456
+ "memory(GiB)": 180.29,
457
  "step": 28,
458
  "train_speed(iter/s)": 0.000422
459
  }
checkpoint-28/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1207fcb9d91c7deb13a80104f3ca89016b4cff3ef13ebd136ee6320d5a9888bb
3
  size 9809
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7039ba231031019d68d36961179e879852b9f8a6de15562e5b792330bcbb4412
3
  size 9809
checkpoint-30/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
- "gate_proj",
28
- "k_proj",
29
  "o_proj",
30
- "up_proj",
31
  "q_proj",
32
- "down_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
 
 
27
  "o_proj",
 
28
  "q_proj",
29
+ "v_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "k_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-30/global_step30/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e55637d10a606b1c4548d11222cf2203be3732556568beb057466cdc664ad4d
3
+ size 51616527
checkpoint-30/global_step30/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2f918a8d78d02a6bb0c3b78aa535ba635edd803e57b1d29a398f8044ee1f23d
3
+ size 51616015
checkpoint-30/global_step30/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e9a17698868042b6c5d413425d6800e585ce6747e333ca3c1f237999bee440
3
+ size 51616527
checkpoint-30/global_step30/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5466499266f4b34f807e3c8219f5d1dc961145b81040242cdc2604123afb29f
3
+ size 51616015
checkpoint-30/global_step30/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cf7bbc813d8c4a9ddd84f99f272e1a3597a0aedd614f42be8b26292f36bd0d4
3
+ size 51616527
checkpoint-30/global_step30/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b43f39939991ea04c843fb1af82098f35e53f66d1ed284bfe74b2b156e3c90
3
+ size 51616015
checkpoint-30/global_step30/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c6064fa63ee43597a575223e5d27d1a9f78c46ca5d2249d2c286a8e11546906
3
+ size 51616517
checkpoint-30/global_step30/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:476e35fb26447a6dab0120c7bb123daf6c56d682fa095ba3520478075d361f49
3
+ size 51616005
checkpoint-30/rng_state_10.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e2c4ae8a7a72d5fa3fffa5b1790f7c05f343652ff279b267cb02c1ea1450019
3
+ size 16404
checkpoint-30/rng_state_11.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12a2477af9f9146885ae739aae7fce8caed34fdcc7b6ba670110eaf5af9bbd4f
3
+ size 16468
checkpoint-30/rng_state_12.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6e02e4e933e0565bcac02de812b35f312d7d3c1fa524e7ec565c0b58168b2c5
3
+ size 16340
checkpoint-30/rng_state_13.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83f60cc7fc25a6bf581d13962550c94c3f732a7fcebe112c13d88d7be34ff3ee
3
+ size 16468
checkpoint-30/rng_state_14.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e89c5c47e96e9a670905d6a024b717057f7e744cb15d841066c2543d38d493ca
3
+ size 16404
checkpoint-30/rng_state_15.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62a09586fa73abe0c5bd3addd127d400057b84dc07190df99c680834ca075d87
3
+ size 16404
checkpoint-30/rng_state_8.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:806ea740b8aa5b01e84b7e8a2de7b547cd7ef04618091a007b4e2b7e2edcc6cb
3
+ size 16389
checkpoint-30/rng_state_9.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c96fddfdb910bf5037b513e22fd76243525edf1f04813a9013acd71733a822f
3
+ size 16389