hazentr committed on
Commit
652c830
·
verified ·
1 Parent(s): 7cd7e07

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +5 -5
  3. model.safetensors +1 -1
  4. train_results.json +5 -5
  5. trainer_state.json +283 -90
README.md CHANGED
@@ -4,11 +4,11 @@ library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
7
- - rl-swarm
8
- - gensyn
9
  - grpo
 
10
  - I am quick timid frog
11
  - trl
 
12
  licence: license
13
  ---
14
 
 
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
 
 
7
  - grpo
8
+ - gensyn
9
  - I am quick timid frog
10
  - trl
11
+ - rl-swarm
12
  licence: license
13
  ---
14
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0874896764755249,
4
- "train_runtime": 937.823,
5
- "train_samples": 5,
6
- "train_samples_per_second": 0.171,
7
- "train_steps_per_second": 0.021
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.15644574165344238,
4
+ "train_runtime": 1523.8731,
5
+ "train_samples": 79,
6
+ "train_samples_per_second": 0.105,
7
+ "train_steps_per_second": 0.013
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:713264c96749c24604d08d142b497451f9725614fb94d291a0e5e9dee6936d71
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24593988fb48726d4a5db5e2e37fe74d6a678e8b4785c2c95c767b747a674505
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0874896764755249,
4
- "train_runtime": 937.823,
5
- "train_samples": 5,
6
- "train_samples_per_second": 0.171,
7
- "train_steps_per_second": 0.021
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.15644574165344238,
4
+ "train_runtime": 1523.8731,
5
+ "train_samples": 79,
6
+ "train_samples_per_second": 0.105,
7
+ "train_steps_per_second": 0.013
8
  }
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 9.8,
6
  "eval_steps": 500,
7
- "global_step": 10,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -15,22 +15,22 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.25,
19
- "completions/max_length": 938.0,
20
- "completions/max_terminated_length": 865.5,
21
- "completions/mean_length": 656.125,
22
- "completions/mean_terminated_length": 580.375,
23
- "completions/min_length": 287.5,
24
- "completions/min_terminated_length": 287.5,
25
- "epoch": 1.8,
26
  "frac_reward_zero_std": 0.0,
27
- "grad_norm": 2.378174066543579,
28
- "kl": 0.00024694142030057264,
29
  "learning_rate": 5e-07,
30
- "loss": -0.0692,
31
- "num_tokens": 14594.0,
32
- "reward": -0.09812777116894722,
33
- "reward_std": 0.16612516529858112,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
@@ -39,14 +39,14 @@
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
- "rewards/question_recreation_reward_func/mean": 0.01712222769856453,
43
- "rewards/question_recreation_reward_func/std": 0.01072776922956109,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
47
  "rewards/strict_format_reward_func/std": 0.0,
48
- "rewards/xmlcount_reward_func/mean": -0.11524999886751175,
49
- "rewards/xmlcount_reward_func/std": 0.32597625255584717,
50
  "step": 2
51
  },
52
  {
@@ -57,20 +57,20 @@
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
  "completions/max_length": 1024.0,
60
- "completions/max_terminated_length": 868.0,
61
- "completions/mean_length": 470.0625,
62
- "completions/mean_terminated_length": 390.9285888671875,
63
- "completions/min_length": 60.0,
64
- "completions/min_terminated_length": 60.0,
65
- "epoch": 3.8,
66
  "frac_reward_zero_std": 0.0,
67
- "grad_norm": 5.8317060470581055,
68
- "kl": 0.00036002036267746007,
69
  "learning_rate": 4.864543104251586e-07,
70
- "loss": 0.136,
71
- "num_tokens": 26211.0,
72
- "reward": 0.07699444144964218,
73
- "reward_std": 0.030455347150564194,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
@@ -79,14 +79,14 @@
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
- "rewards/question_recreation_reward_func/mean": 0.07699444144964218,
83
- "rewards/question_recreation_reward_func/std": 0.04443136602640152,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
87
  "rewards/strict_format_reward_func/std": 0.0,
88
- "rewards/xmlcount_reward_func/mean": 0.0,
89
- "rewards/xmlcount_reward_func/std": 0.0,
90
  "step": 4
91
  },
92
  {
@@ -96,21 +96,21 @@
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
  "completions/clipped_ratio": 0.125,
99
- "completions/max_length": 1024.0,
100
- "completions/max_terminated_length": 1003.5,
101
- "completions/mean_length": 548.625,
102
- "completions/mean_terminated_length": 480.7143096923828,
103
- "completions/min_length": 134.0,
104
- "completions/min_terminated_length": 134.0,
105
- "epoch": 5.8,
106
  "frac_reward_zero_std": 0.0,
107
- "grad_norm": 3.393465518951416,
108
- "kl": 0.00036590506533684675,
109
  "learning_rate": 4.472851273490984e-07,
110
- "loss": 0.1056,
111
- "num_tokens": 39085.0,
112
- "reward": 0.052806172519922256,
113
- "reward_std": 0.017434499226510525,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
@@ -119,8 +119,8 @@
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
- "rewards/question_recreation_reward_func/mean": 0.052806172519922256,
123
- "rewards/question_recreation_reward_func/std": 0.018543646670877934,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -135,22 +135,22 @@
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
- "completions/clipped_ratio": 0.0,
139
- "completions/max_length": 900.0,
140
- "completions/max_terminated_length": 900.0,
141
- "completions/mean_length": 458.8125,
142
- "completions/mean_terminated_length": 458.8125,
143
- "completions/min_length": 114.0,
144
- "completions/min_terminated_length": 114.0,
145
- "epoch": 7.8,
146
  "frac_reward_zero_std": 0.0,
147
- "grad_norm": 4.418057918548584,
148
- "kl": 0.0005609585878119105,
149
  "learning_rate": 3.867370395306068e-07,
150
- "loss": 0.0939,
151
- "num_tokens": 50522.0,
152
- "reward": 0.024604666978120804,
153
- "reward_std": 0.009030941408127546,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
@@ -159,8 +159,8 @@
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
- "rewards/question_recreation_reward_func/mean": 0.024604666978120804,
163
- "rewards/question_recreation_reward_func/std": 0.011531331343576312,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -176,21 +176,21 @@
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
  "completions/clipped_ratio": 0.0625,
179
- "completions/max_length": 928.5,
180
- "completions/max_terminated_length": 824.0,
181
- "completions/mean_length": 451.25,
182
- "completions/mean_terminated_length": 412.8482208251953,
183
- "completions/min_length": 56.0,
184
- "completions/min_terminated_length": 56.0,
185
- "epoch": 9.8,
186
  "frac_reward_zero_std": 0.0,
187
- "grad_norm": 4.358504772186279,
188
- "kl": 0.0006784129072912037,
189
  "learning_rate": 3.1137137178519977e-07,
190
- "loss": 0.1711,
191
- "num_tokens": 61838.0,
192
- "reward": 0.08084426820278168,
193
- "reward_std": 0.05193536356091499,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
  "rewards/consensus_reward_func/mean": 0.0,
@@ -199,8 +199,8 @@
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
- "rewards/question_recreation_reward_func/mean": 0.08084426820278168,
203
- "rewards/question_recreation_reward_func/std": 0.08748885244131088,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -210,19 +210,212 @@
210
  "step": 10
211
  },
212
  {
213
- "epoch": 9.8,
214
- "step": 10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  "total_flos": 0.0,
216
- "train_loss": 0.0874896764755249,
217
- "train_runtime": 937.823,
218
- "train_samples_per_second": 0.171,
219
- "train_steps_per_second": 0.021
220
  }
221
  ],
222
  "logging_steps": 2,
223
  "max_steps": 20,
224
- "num_input_tokens_seen": 61838,
225
- "num_train_epochs": 10,
226
  "save_steps": 25,
227
  "stateful_callbacks": {
228
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9620253164556962,
6
  "eval_steps": 500,
7
+ "global_step": 19,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.125,
19
+ "completions/max_length": 931.0,
20
+ "completions/max_terminated_length": 819.5,
21
+ "completions/mean_length": 412.1875,
22
+ "completions/mean_terminated_length": 329.5833435058594,
23
+ "completions/min_length": 87.0,
24
+ "completions/min_terminated_length": 87.0,
25
+ "epoch": 0.10126582278481013,
26
  "frac_reward_zero_std": 0.0,
27
+ "grad_norm": 5.04105806350708,
28
+ "kl": -5.448857809398078e-09,
29
  "learning_rate": 5e-07,
30
+ "loss": 0.1398,
31
+ "num_tokens": 10691.0,
32
+ "reward": 0.01983704511076212,
33
+ "reward_std": 0.008039223263040185,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
 
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.01983704511076212,
43
+ "rewards/question_recreation_reward_func/std": 0.012273336760699749,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
47
  "rewards/strict_format_reward_func/std": 0.0,
48
+ "rewards/xmlcount_reward_func/mean": 0.0,
49
+ "rewards/xmlcount_reward_func/std": 0.0,
50
  "step": 2
51
  },
52
  {
 
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
  "completions/max_length": 1024.0,
60
+ "completions/max_terminated_length": 569.0,
61
+ "completions/mean_length": 401.1875,
62
+ "completions/mean_terminated_length": 312.21429443359375,
63
+ "completions/min_length": 53.5,
64
+ "completions/min_terminated_length": 53.5,
65
+ "epoch": 0.20253164556962025,
66
  "frac_reward_zero_std": 0.0,
67
+ "grad_norm": 3.7304916381835938,
68
+ "kl": 0.0002279730260852375,
69
  "learning_rate": 4.864543104251586e-07,
70
+ "loss": 0.1637,
71
+ "num_tokens": 21206.0,
72
+ "reward": -0.1395124290138483,
73
+ "reward_std": 0.21926994435489178,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
 
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.010925075970590115,
83
+ "rewards/question_recreation_reward_func/std": 0.009428349556401372,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
87
  "rewards/strict_format_reward_func/std": 0.0,
88
+ "rewards/xmlcount_reward_func/mean": -0.15043750405311584,
89
+ "rewards/xmlcount_reward_func/std": 0.4255015254020691,
90
  "step": 4
91
  },
92
  {
 
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
  "completions/clipped_ratio": 0.125,
99
+ "completions/max_length": 887.0,
100
+ "completions/max_terminated_length": 672.5,
101
+ "completions/mean_length": 362.0,
102
+ "completions/mean_terminated_length": 257.3125,
103
+ "completions/min_length": 12.5,
104
+ "completions/min_terminated_length": 12.5,
105
+ "epoch": 0.3037974683544304,
106
  "frac_reward_zero_std": 0.0,
107
+ "grad_norm": 7.230990886688232,
108
+ "kl": 0.0008080850275291596,
109
  "learning_rate": 4.472851273490984e-07,
110
+ "loss": 0.2574,
111
+ "num_tokens": 31094.0,
112
+ "reward": 0.022412247024476528,
113
+ "reward_std": 0.011643366422504187,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
 
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
+ "rewards/question_recreation_reward_func/mean": 0.022412247024476528,
123
+ "rewards/question_recreation_reward_func/std": 0.01874966360628605,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
 
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
+ "completions/clipped_ratio": 0.1875,
139
+ "completions/max_length": 862.5,
140
+ "completions/max_terminated_length": 702.5,
141
+ "completions/mean_length": 461.6875,
142
+ "completions/mean_terminated_length": 357.3249969482422,
143
+ "completions/min_length": 67.0,
144
+ "completions/min_terminated_length": 67.0,
145
+ "epoch": 0.4050632911392405,
146
  "frac_reward_zero_std": 0.0,
147
+ "grad_norm": 3.539825677871704,
148
+ "kl": 0.0011349972373864148,
149
  "learning_rate": 3.867370395306068e-07,
150
+ "loss": 0.2018,
151
+ "num_tokens": 42577.0,
152
+ "reward": 0.10562402009963989,
153
+ "reward_std": 0.061309706419706345,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
 
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
+ "rewards/question_recreation_reward_func/mean": 0.10562402009963989,
163
+ "rewards/question_recreation_reward_func/std": 0.05985743924975395,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
 
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
  "completions/clipped_ratio": 0.0625,
179
+ "completions/max_length": 664.0,
180
+ "completions/max_terminated_length": 436.5,
181
+ "completions/mean_length": 222.4375,
182
+ "completions/mean_terminated_length": 173.0357208251953,
183
+ "completions/min_length": 4.5,
184
+ "completions/min_terminated_length": 4.5,
185
+ "epoch": 0.5063291139240507,
186
  "frac_reward_zero_std": 0.0,
187
+ "grad_norm": 13.28281021118164,
188
+ "kl": 0.0055584801666554995,
189
  "learning_rate": 3.1137137178519977e-07,
190
+ "loss": 0.0764,
191
+ "num_tokens": 50232.0,
192
+ "reward": 0.05675883777439594,
193
+ "reward_std": 0.017147527541965246,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
  "rewards/consensus_reward_func/mean": 0.0,
 
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
+ "rewards/question_recreation_reward_func/mean": 0.05675883777439594,
203
+ "rewards/question_recreation_reward_func/std": 0.02832796238362789,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
 
210
  "step": 10
211
  },
212
  {
213
+ "clip_ratio/high_max": 0.0,
214
+ "clip_ratio/high_mean": 0.0,
215
+ "clip_ratio/low_mean": 0.0,
216
+ "clip_ratio/low_min": 0.0,
217
+ "clip_ratio/region_mean": 0.0,
218
+ "completions/clipped_ratio": 0.125,
219
+ "completions/max_length": 681.5,
220
+ "completions/max_terminated_length": 319.0,
221
+ "completions/mean_length": 298.25,
222
+ "completions/mean_terminated_length": 195.8541717529297,
223
+ "completions/min_length": 73.5,
224
+ "completions/min_terminated_length": 73.5,
225
+ "epoch": 0.6075949367088608,
226
+ "frac_reward_zero_std": 0.0,
227
+ "grad_norm": 8.755450248718262,
228
+ "kl": 0.0035302894830238074,
229
+ "learning_rate": 2.2935516363191693e-07,
230
+ "loss": 0.2667,
231
+ "num_tokens": 59100.0,
232
+ "reward": 0.015352241694927216,
233
+ "reward_std": 0.0057732411660254,
234
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
235
+ "rewards/concensus_correctness_reward_func/std": 0.0,
236
+ "rewards/consensus_reward_func/mean": 0.0,
237
+ "rewards/consensus_reward_func/std": 0.0,
238
+ "rewards/cumulative_reward_2/mean": 0.0,
239
+ "rewards/cumulative_reward_2/std": 0.0,
240
+ "rewards/final_correctness_reward_func/mean": 0.0,
241
+ "rewards/final_correctness_reward_func/std": 0.0,
242
+ "rewards/question_recreation_reward_func/mean": 0.015352241694927216,
243
+ "rewards/question_recreation_reward_func/std": 0.006488756742328405,
244
+ "rewards/soft_format_reward_func/mean": 0.0,
245
+ "rewards/soft_format_reward_func/std": 0.0,
246
+ "rewards/strict_format_reward_func/mean": 0.0,
247
+ "rewards/strict_format_reward_func/std": 0.0,
248
+ "rewards/xmlcount_reward_func/mean": 0.0,
249
+ "rewards/xmlcount_reward_func/std": 0.0,
250
+ "step": 12
251
+ },
252
+ {
253
+ "clip_ratio/high_max": 0.0,
254
+ "clip_ratio/high_mean": 0.0,
255
+ "clip_ratio/low_mean": 0.0,
256
+ "clip_ratio/low_min": 0.0,
257
+ "clip_ratio/region_mean": 0.0,
258
+ "completions/clipped_ratio": 0.125,
259
+ "completions/max_length": 991.0,
260
+ "completions/max_terminated_length": 756.5,
261
+ "completions/mean_length": 509.4375,
262
+ "completions/mean_terminated_length": 421.9583435058594,
263
+ "completions/min_length": 36.5,
264
+ "completions/min_terminated_length": 36.5,
265
+ "epoch": 0.7088607594936709,
266
+ "frac_reward_zero_std": 0.0,
267
+ "grad_norm": 4.25427770614624,
268
+ "kl": 0.001760676721460186,
269
+ "learning_rate": 1.4957614383675767e-07,
270
+ "loss": 0.2682,
271
+ "num_tokens": 71347.0,
272
+ "reward": -0.05695461109280586,
273
+ "reward_std": 0.22896763868629932,
274
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
275
+ "rewards/concensus_correctness_reward_func/std": 0.0,
276
+ "rewards/consensus_reward_func/mean": 0.0,
277
+ "rewards/consensus_reward_func/std": 0.0,
278
+ "rewards/cumulative_reward_2/mean": 0.0,
279
+ "rewards/cumulative_reward_2/std": 0.0,
280
+ "rewards/final_correctness_reward_func/mean": 0.0,
281
+ "rewards/final_correctness_reward_func/std": 0.0,
282
+ "rewards/question_recreation_reward_func/mean": 0.05810788832604885,
283
+ "rewards/question_recreation_reward_func/std": 0.04673689045011997,
284
+ "rewards/soft_format_reward_func/mean": 0.0,
285
+ "rewards/soft_format_reward_func/std": 0.0,
286
+ "rewards/strict_format_reward_func/mean": 0.0,
287
+ "rewards/strict_format_reward_func/std": 0.0,
288
+ "rewards/xmlcount_reward_func/mean": -0.11506249755620956,
289
+ "rewards/xmlcount_reward_func/std": 0.3688294589519501,
290
+ "step": 14
291
+ },
292
+ {
293
+ "clip_ratio/high_max": 0.0,
294
+ "clip_ratio/high_mean": 0.0,
295
+ "clip_ratio/low_mean": 0.0,
296
+ "clip_ratio/low_min": 0.0,
297
+ "clip_ratio/region_mean": 0.0,
298
+ "completions/clipped_ratio": 0.0,
299
+ "completions/max_length": 214.0,
300
+ "completions/max_terminated_length": 214.0,
301
+ "completions/mean_length": 103.6875,
302
+ "completions/mean_terminated_length": 103.6875,
303
+ "completions/min_length": 5.0,
304
+ "completions/min_terminated_length": 5.0,
305
+ "epoch": 0.810126582278481,
306
+ "frac_reward_zero_std": 0.0,
307
+ "grad_norm": 12.23747444152832,
308
+ "kl": 0.007723030605120584,
309
+ "learning_rate": 8.067960709356478e-08,
310
+ "loss": 0.0009,
311
+ "num_tokens": 77102.0,
312
+ "reward": 0.17531824856996536,
313
+ "reward_std": 0.03592286352068186,
314
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
315
+ "rewards/concensus_correctness_reward_func/std": 0.0,
316
+ "rewards/consensus_reward_func/mean": 0.0,
317
+ "rewards/consensus_reward_func/std": 0.0,
318
+ "rewards/cumulative_reward_2/mean": 0.0,
319
+ "rewards/cumulative_reward_2/std": 0.0,
320
+ "rewards/final_correctness_reward_func/mean": 0.0,
321
+ "rewards/final_correctness_reward_func/std": 0.0,
322
+ "rewards/question_recreation_reward_func/mean": 0.17531824856996536,
323
+ "rewards/question_recreation_reward_func/std": 0.045727355405688286,
324
+ "rewards/soft_format_reward_func/mean": 0.0,
325
+ "rewards/soft_format_reward_func/std": 0.0,
326
+ "rewards/strict_format_reward_func/mean": 0.0,
327
+ "rewards/strict_format_reward_func/std": 0.0,
328
+ "rewards/xmlcount_reward_func/mean": 0.0,
329
+ "rewards/xmlcount_reward_func/std": 0.0,
330
+ "step": 16
331
+ },
332
+ {
333
+ "clip_ratio/high_max": 0.0,
334
+ "clip_ratio/high_mean": 0.0,
335
+ "clip_ratio/low_mean": 0.0,
336
+ "clip_ratio/low_min": 0.0,
337
+ "clip_ratio/region_mean": 0.0,
338
+ "completions/clipped_ratio": 0.0625,
339
+ "completions/max_length": 824.0,
340
+ "completions/max_terminated_length": 530.5,
341
+ "completions/mean_length": 194.3125,
342
+ "completions/mean_terminated_length": 137.34821701049805,
343
+ "completions/min_length": 3.5,
344
+ "completions/min_terminated_length": 3.5,
345
+ "epoch": 0.9113924050632911,
346
+ "frac_reward_zero_std": 0.125,
347
+ "grad_norm": 9.846582412719727,
348
+ "kl": 0.006351641248329543,
349
+ "learning_rate": 3.013156219837776e-08,
350
+ "loss": 0.0552,
351
+ "num_tokens": 84307.0,
352
+ "reward": 0.13931425474584103,
353
+ "reward_std": 0.058048633858561516,
354
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
355
+ "rewards/concensus_correctness_reward_func/std": 0.0,
356
+ "rewards/consensus_reward_func/mean": 0.0,
357
+ "rewards/consensus_reward_func/std": 0.0,
358
+ "rewards/cumulative_reward_2/mean": 0.0,
359
+ "rewards/cumulative_reward_2/std": 0.0,
360
+ "rewards/final_correctness_reward_func/mean": 0.0,
361
+ "rewards/final_correctness_reward_func/std": 0.0,
362
+ "rewards/question_recreation_reward_func/mean": 0.13931425474584103,
363
+ "rewards/question_recreation_reward_func/std": 0.15117722004652023,
364
+ "rewards/soft_format_reward_func/mean": 0.0,
365
+ "rewards/soft_format_reward_func/std": 0.0,
366
+ "rewards/strict_format_reward_func/mean": 0.0,
367
+ "rewards/strict_format_reward_func/std": 0.0,
368
+ "rewards/xmlcount_reward_func/mean": 0.0,
369
+ "rewards/xmlcount_reward_func/std": 0.0,
370
+ "step": 18
371
+ },
372
+ {
373
+ "clip_ratio/high_max": 0.0,
374
+ "clip_ratio/high_mean": 0.0,
375
+ "clip_ratio/low_mean": 0.0,
376
+ "clip_ratio/low_min": 0.0,
377
+ "clip_ratio/region_mean": 0.0,
378
+ "completions/clipped_ratio": 0.5,
379
+ "completions/max_length": 1024.0,
380
+ "completions/max_terminated_length": 565.0,
381
+ "completions/mean_length": 665.25,
382
+ "completions/mean_terminated_length": 306.5,
383
+ "completions/min_length": 5.0,
384
+ "completions/min_terminated_length": 5.0,
385
+ "epoch": 0.9620253164556962,
386
+ "frac_reward_zero_std": 0.0,
387
+ "kl": 0.002655731455888599,
388
+ "num_tokens": 91677.0,
389
+ "reward": 0.012782756239175797,
390
+ "reward_std": 0.005160772241652012,
391
+ "rewards/concensus_correctness_reward_func/mean": 0.0,
392
+ "rewards/concensus_correctness_reward_func/std": 0.0,
393
+ "rewards/consensus_reward_func/mean": 0.0,
394
+ "rewards/consensus_reward_func/std": 0.0,
395
+ "rewards/cumulative_reward_2/mean": 0.0,
396
+ "rewards/cumulative_reward_2/std": 0.0,
397
+ "rewards/final_correctness_reward_func/mean": 0.0,
398
+ "rewards/final_correctness_reward_func/std": 0.0,
399
+ "rewards/question_recreation_reward_func/mean": 0.012782756239175797,
400
+ "rewards/question_recreation_reward_func/std": 0.008415077812969685,
401
+ "rewards/soft_format_reward_func/mean": 0.0,
402
+ "rewards/soft_format_reward_func/std": 0.0,
403
+ "rewards/strict_format_reward_func/mean": 0.0,
404
+ "rewards/strict_format_reward_func/std": 0.0,
405
+ "rewards/xmlcount_reward_func/mean": 0.0,
406
+ "rewards/xmlcount_reward_func/std": 0.0,
407
+ "step": 19,
408
  "total_flos": 0.0,
409
+ "train_loss": 0.15644574165344238,
410
+ "train_runtime": 1523.8731,
411
+ "train_samples_per_second": 0.105,
412
+ "train_steps_per_second": 0.013
413
  }
414
  ],
415
  "logging_steps": 2,
416
  "max_steps": 20,
417
+ "num_input_tokens_seen": 91677,
418
+ "num_train_epochs": 1,
419
  "save_steps": 25,
420
  "stateful_callbacks": {
421
  "TrainerControl": {