hazentr committed on
Commit
93f193a
·
verified ·
1 Parent(s): 652c830

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -2
  2. all_results.json +4 -4
  3. model.safetensors +1 -1
  4. train_results.json +4 -4
  5. trainer_state.json +186 -179
README.md CHANGED
@@ -4,11 +4,11 @@ library_name: transformers
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
7
- - grpo
8
  - gensyn
9
- - I am quick timid frog
10
  - trl
11
  - rl-swarm
 
 
12
  licence: license
13
  ---
14
 
 
4
  model_name: Qwen2.5-0.5B-Instruct-Gensyn-Swarm-quick_timid_frog
5
  tags:
6
  - generated_from_trainer
 
7
  - gensyn
 
8
  - trl
9
  - rl-swarm
10
+ - I am quick timid frog
11
+ - grpo
12
  licence: license
13
  ---
14
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.15644574165344238,
4
- "train_runtime": 1523.8731,
5
- "train_samples": 79,
6
- "train_samples_per_second": 0.105,
7
  "train_steps_per_second": 0.013
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0773889608681202,
4
+ "train_runtime": 1482.6063,
5
+ "train_samples": 100,
6
+ "train_samples_per_second": 0.108,
7
  "train_steps_per_second": 0.013
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24593988fb48726d4a5db5e2e37fe74d6a678e8b4785c2c95c767b747a674505
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec1757c7bcfb69a7ca1f90936efaae4cbde51cad9aa6283191230f67cfe9dd0a
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.15644574165344238,
4
- "train_runtime": 1523.8731,
5
- "train_samples": 79,
6
- "train_samples_per_second": 0.105,
7
  "train_steps_per_second": 0.013
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0773889608681202,
4
+ "train_runtime": 1482.6063,
5
+ "train_samples": 100,
6
+ "train_samples_per_second": 0.108,
7
  "train_steps_per_second": 0.013
8
  }
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9620253164556962,
6
  "eval_steps": 500,
7
- "global_step": 19,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16,21 +16,21 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.125,
19
- "completions/max_length": 931.0,
20
- "completions/max_terminated_length": 819.5,
21
- "completions/mean_length": 412.1875,
22
- "completions/mean_terminated_length": 329.5833435058594,
23
- "completions/min_length": 87.0,
24
- "completions/min_terminated_length": 87.0,
25
- "epoch": 0.10126582278481013,
26
  "frac_reward_zero_std": 0.0,
27
- "grad_norm": 5.04105806350708,
28
- "kl": -5.448857809398078e-09,
29
  "learning_rate": 5e-07,
30
- "loss": 0.1398,
31
- "num_tokens": 10691.0,
32
- "reward": 0.01983704511076212,
33
- "reward_std": 0.008039223263040185,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
@@ -39,8 +39,8 @@
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
- "rewards/question_recreation_reward_func/mean": 0.01983704511076212,
43
- "rewards/question_recreation_reward_func/std": 0.012273336760699749,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -56,21 +56,21 @@
56
  "clip_ratio/low_min": 0.0,
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
- "completions/max_length": 1024.0,
60
- "completions/max_terminated_length": 569.0,
61
- "completions/mean_length": 401.1875,
62
- "completions/mean_terminated_length": 312.21429443359375,
63
- "completions/min_length": 53.5,
64
- "completions/min_terminated_length": 53.5,
65
- "epoch": 0.20253164556962025,
66
  "frac_reward_zero_std": 0.0,
67
- "grad_norm": 3.7304916381835938,
68
- "kl": 0.0002279730260852375,
69
  "learning_rate": 4.864543104251586e-07,
70
- "loss": 0.1637,
71
- "num_tokens": 21206.0,
72
- "reward": -0.1395124290138483,
73
- "reward_std": 0.21926994435489178,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
@@ -79,14 +79,14 @@
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
- "rewards/question_recreation_reward_func/mean": 0.010925075970590115,
83
- "rewards/question_recreation_reward_func/std": 0.009428349556401372,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
87
  "rewards/strict_format_reward_func/std": 0.0,
88
- "rewards/xmlcount_reward_func/mean": -0.15043750405311584,
89
- "rewards/xmlcount_reward_func/std": 0.4255015254020691,
90
  "step": 4
91
  },
92
  {
@@ -95,22 +95,22 @@
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
- "completions/clipped_ratio": 0.125,
99
- "completions/max_length": 887.0,
100
- "completions/max_terminated_length": 672.5,
101
- "completions/mean_length": 362.0,
102
- "completions/mean_terminated_length": 257.3125,
103
- "completions/min_length": 12.5,
104
- "completions/min_terminated_length": 12.5,
105
- "epoch": 0.3037974683544304,
106
  "frac_reward_zero_std": 0.0,
107
- "grad_norm": 7.230990886688232,
108
- "kl": 0.0008080850275291596,
109
  "learning_rate": 4.472851273490984e-07,
110
- "loss": 0.2574,
111
- "num_tokens": 31094.0,
112
- "reward": 0.022412247024476528,
113
- "reward_std": 0.011643366422504187,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
@@ -119,8 +119,8 @@
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
- "rewards/question_recreation_reward_func/mean": 0.022412247024476528,
123
- "rewards/question_recreation_reward_func/std": 0.01874966360628605,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -135,22 +135,22 @@
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
- "completions/clipped_ratio": 0.1875,
139
- "completions/max_length": 862.5,
140
- "completions/max_terminated_length": 702.5,
141
- "completions/mean_length": 461.6875,
142
- "completions/mean_terminated_length": 357.3249969482422,
143
- "completions/min_length": 67.0,
144
- "completions/min_terminated_length": 67.0,
145
- "epoch": 0.4050632911392405,
146
  "frac_reward_zero_std": 0.0,
147
- "grad_norm": 3.539825677871704,
148
- "kl": 0.0011349972373864148,
149
  "learning_rate": 3.867370395306068e-07,
150
- "loss": 0.2018,
151
- "num_tokens": 42577.0,
152
- "reward": 0.10562402009963989,
153
- "reward_std": 0.061309706419706345,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
@@ -159,8 +159,8 @@
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
- "rewards/question_recreation_reward_func/mean": 0.10562402009963989,
163
- "rewards/question_recreation_reward_func/std": 0.05985743924975395,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -175,32 +175,32 @@
175
  "clip_ratio/low_mean": 0.0,
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
- "completions/clipped_ratio": 0.0625,
179
- "completions/max_length": 664.0,
180
- "completions/max_terminated_length": 436.5,
181
- "completions/mean_length": 222.4375,
182
- "completions/mean_terminated_length": 173.0357208251953,
183
- "completions/min_length": 4.5,
184
- "completions/min_terminated_length": 4.5,
185
- "epoch": 0.5063291139240507,
186
  "frac_reward_zero_std": 0.0,
187
- "grad_norm": 13.28281021118164,
188
- "kl": 0.0055584801666554995,
189
  "learning_rate": 3.1137137178519977e-07,
190
- "loss": 0.0764,
191
- "num_tokens": 50232.0,
192
- "reward": 0.05675883777439594,
193
- "reward_std": 0.017147527541965246,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
- "rewards/consensus_reward_func/mean": 0.0,
197
- "rewards/consensus_reward_func/std": 0.0,
198
  "rewards/cumulative_reward_2/mean": 0.0,
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
- "rewards/question_recreation_reward_func/mean": 0.05675883777439594,
203
- "rewards/question_recreation_reward_func/std": 0.02832796238362789,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -215,32 +215,32 @@
215
  "clip_ratio/low_mean": 0.0,
216
  "clip_ratio/low_min": 0.0,
217
  "clip_ratio/region_mean": 0.0,
218
- "completions/clipped_ratio": 0.125,
219
- "completions/max_length": 681.5,
220
- "completions/max_terminated_length": 319.0,
221
- "completions/mean_length": 298.25,
222
- "completions/mean_terminated_length": 195.8541717529297,
223
- "completions/min_length": 73.5,
224
- "completions/min_terminated_length": 73.5,
225
- "epoch": 0.6075949367088608,
226
  "frac_reward_zero_std": 0.0,
227
- "grad_norm": 8.755450248718262,
228
- "kl": 0.0035302894830238074,
229
  "learning_rate": 2.2935516363191693e-07,
230
- "loss": 0.2667,
231
- "num_tokens": 59100.0,
232
- "reward": 0.015352241694927216,
233
- "reward_std": 0.0057732411660254,
234
- "rewards/concensus_correctness_reward_func/mean": 0.0,
235
- "rewards/concensus_correctness_reward_func/std": 0.0,
236
- "rewards/consensus_reward_func/mean": 0.0,
237
- "rewards/consensus_reward_func/std": 0.0,
238
  "rewards/cumulative_reward_2/mean": 0.0,
239
  "rewards/cumulative_reward_2/std": 0.0,
240
  "rewards/final_correctness_reward_func/mean": 0.0,
241
  "rewards/final_correctness_reward_func/std": 0.0,
242
- "rewards/question_recreation_reward_func/mean": 0.015352241694927216,
243
- "rewards/question_recreation_reward_func/std": 0.006488756742328405,
244
  "rewards/soft_format_reward_func/mean": 0.0,
245
  "rewards/soft_format_reward_func/std": 0.0,
246
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -255,22 +255,22 @@
255
  "clip_ratio/low_mean": 0.0,
256
  "clip_ratio/low_min": 0.0,
257
  "clip_ratio/region_mean": 0.0,
258
- "completions/clipped_ratio": 0.125,
259
- "completions/max_length": 991.0,
260
- "completions/max_terminated_length": 756.5,
261
- "completions/mean_length": 509.4375,
262
- "completions/mean_terminated_length": 421.9583435058594,
263
- "completions/min_length": 36.5,
264
- "completions/min_terminated_length": 36.5,
265
- "epoch": 0.7088607594936709,
266
  "frac_reward_zero_std": 0.0,
267
- "grad_norm": 4.25427770614624,
268
- "kl": 0.001760676721460186,
269
  "learning_rate": 1.4957614383675767e-07,
270
- "loss": 0.2682,
271
- "num_tokens": 71347.0,
272
- "reward": -0.05695461109280586,
273
- "reward_std": 0.22896763868629932,
274
  "rewards/concensus_correctness_reward_func/mean": 0.0,
275
  "rewards/concensus_correctness_reward_func/std": 0.0,
276
  "rewards/consensus_reward_func/mean": 0.0,
@@ -279,14 +279,14 @@
279
  "rewards/cumulative_reward_2/std": 0.0,
280
  "rewards/final_correctness_reward_func/mean": 0.0,
281
  "rewards/final_correctness_reward_func/std": 0.0,
282
- "rewards/question_recreation_reward_func/mean": 0.05810788832604885,
283
- "rewards/question_recreation_reward_func/std": 0.04673689045011997,
284
  "rewards/soft_format_reward_func/mean": 0.0,
285
  "rewards/soft_format_reward_func/std": 0.0,
286
  "rewards/strict_format_reward_func/mean": 0.0,
287
  "rewards/strict_format_reward_func/std": 0.0,
288
- "rewards/xmlcount_reward_func/mean": -0.11506249755620956,
289
- "rewards/xmlcount_reward_func/std": 0.3688294589519501,
290
  "step": 14
291
  },
292
  {
@@ -295,22 +295,22 @@
295
  "clip_ratio/low_mean": 0.0,
296
  "clip_ratio/low_min": 0.0,
297
  "clip_ratio/region_mean": 0.0,
298
- "completions/clipped_ratio": 0.0,
299
- "completions/max_length": 214.0,
300
- "completions/max_terminated_length": 214.0,
301
- "completions/mean_length": 103.6875,
302
- "completions/mean_terminated_length": 103.6875,
303
- "completions/min_length": 5.0,
304
- "completions/min_terminated_length": 5.0,
305
- "epoch": 0.810126582278481,
306
  "frac_reward_zero_std": 0.0,
307
- "grad_norm": 12.23747444152832,
308
- "kl": 0.007723030605120584,
309
  "learning_rate": 8.067960709356478e-08,
310
- "loss": 0.0009,
311
- "num_tokens": 77102.0,
312
- "reward": 0.17531824856996536,
313
- "reward_std": 0.03592286352068186,
314
  "rewards/concensus_correctness_reward_func/mean": 0.0,
315
  "rewards/concensus_correctness_reward_func/std": 0.0,
316
  "rewards/consensus_reward_func/mean": 0.0,
@@ -319,8 +319,8 @@
319
  "rewards/cumulative_reward_2/std": 0.0,
320
  "rewards/final_correctness_reward_func/mean": 0.0,
321
  "rewards/final_correctness_reward_func/std": 0.0,
322
- "rewards/question_recreation_reward_func/mean": 0.17531824856996536,
323
- "rewards/question_recreation_reward_func/std": 0.045727355405688286,
324
  "rewards/soft_format_reward_func/mean": 0.0,
325
  "rewards/soft_format_reward_func/std": 0.0,
326
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -335,32 +335,32 @@
335
  "clip_ratio/low_mean": 0.0,
336
  "clip_ratio/low_min": 0.0,
337
  "clip_ratio/region_mean": 0.0,
338
- "completions/clipped_ratio": 0.0625,
339
- "completions/max_length": 824.0,
340
- "completions/max_terminated_length": 530.5,
341
- "completions/mean_length": 194.3125,
342
- "completions/mean_terminated_length": 137.34821701049805,
343
  "completions/min_length": 3.5,
344
  "completions/min_terminated_length": 3.5,
345
- "epoch": 0.9113924050632911,
346
  "frac_reward_zero_std": 0.125,
347
- "grad_norm": 9.846582412719727,
348
- "kl": 0.006351641248329543,
349
  "learning_rate": 3.013156219837776e-08,
350
- "loss": 0.0552,
351
- "num_tokens": 84307.0,
352
- "reward": 0.13931425474584103,
353
- "reward_std": 0.058048633858561516,
354
- "rewards/concensus_correctness_reward_func/mean": 0.0,
355
- "rewards/concensus_correctness_reward_func/std": 0.0,
356
- "rewards/consensus_reward_func/mean": 0.0,
357
- "rewards/consensus_reward_func/std": 0.0,
358
  "rewards/cumulative_reward_2/mean": 0.0,
359
  "rewards/cumulative_reward_2/std": 0.0,
360
  "rewards/final_correctness_reward_func/mean": 0.0,
361
  "rewards/final_correctness_reward_func/std": 0.0,
362
- "rewards/question_recreation_reward_func/mean": 0.13931425474584103,
363
- "rewards/question_recreation_reward_func/std": 0.15117722004652023,
364
  "rewards/soft_format_reward_func/mean": 0.0,
365
  "rewards/soft_format_reward_func/std": 0.0,
366
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -375,46 +375,53 @@
375
  "clip_ratio/low_mean": 0.0,
376
  "clip_ratio/low_min": 0.0,
377
  "clip_ratio/region_mean": 0.0,
378
- "completions/clipped_ratio": 0.5,
379
- "completions/max_length": 1024.0,
380
- "completions/max_terminated_length": 565.0,
381
- "completions/mean_length": 665.25,
382
- "completions/mean_terminated_length": 306.5,
383
- "completions/min_length": 5.0,
384
- "completions/min_terminated_length": 5.0,
385
- "epoch": 0.9620253164556962,
386
  "frac_reward_zero_std": 0.0,
387
- "kl": 0.002655731455888599,
388
- "num_tokens": 91677.0,
389
- "reward": 0.012782756239175797,
390
- "reward_std": 0.005160772241652012,
 
 
 
391
  "rewards/concensus_correctness_reward_func/mean": 0.0,
392
  "rewards/concensus_correctness_reward_func/std": 0.0,
393
- "rewards/consensus_reward_func/mean": 0.0,
394
- "rewards/consensus_reward_func/std": 0.0,
395
  "rewards/cumulative_reward_2/mean": 0.0,
396
  "rewards/cumulative_reward_2/std": 0.0,
397
  "rewards/final_correctness_reward_func/mean": 0.0,
398
  "rewards/final_correctness_reward_func/std": 0.0,
399
- "rewards/question_recreation_reward_func/mean": 0.012782756239175797,
400
- "rewards/question_recreation_reward_func/std": 0.008415077812969685,
401
  "rewards/soft_format_reward_func/mean": 0.0,
402
  "rewards/soft_format_reward_func/std": 0.0,
403
  "rewards/strict_format_reward_func/mean": 0.0,
404
  "rewards/strict_format_reward_func/std": 0.0,
405
  "rewards/xmlcount_reward_func/mean": 0.0,
406
  "rewards/xmlcount_reward_func/std": 0.0,
407
- "step": 19,
 
 
 
 
408
  "total_flos": 0.0,
409
- "train_loss": 0.15644574165344238,
410
- "train_runtime": 1523.8731,
411
- "train_samples_per_second": 0.105,
412
  "train_steps_per_second": 0.013
413
  }
414
  ],
415
  "logging_steps": 2,
416
  "max_steps": 20,
417
- "num_input_tokens_seen": 91677,
418
  "num_train_epochs": 1,
419
  "save_steps": 25,
420
  "stateful_callbacks": {
@@ -423,8 +430,8 @@
423
  "should_epoch_stop": false,
424
  "should_evaluate": false,
425
  "should_log": false,
426
- "should_save": false,
427
- "should_training_stop": false
428
  },
429
  "attributes": {}
430
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8,
6
  "eval_steps": 500,
7
+ "global_step": 20,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.125,
19
+ "completions/max_length": 1024.0,
20
+ "completions/max_terminated_length": 605.0,
21
+ "completions/mean_length": 324.4375,
22
+ "completions/mean_terminated_length": 224.50000762939453,
23
+ "completions/min_length": 5.5,
24
+ "completions/min_terminated_length": 5.5,
25
+ "epoch": 0.08,
26
  "frac_reward_zero_std": 0.0,
27
+ "grad_norm": 9.534358024597168,
28
+ "kl": -5.265982289115456e-09,
29
  "learning_rate": 5e-07,
30
+ "loss": 0.0881,
31
+ "num_tokens": 9287.0,
32
+ "reward": 0.02333822101354599,
33
+ "reward_std": 0.011800897307693958,
34
  "rewards/concensus_correctness_reward_func/mean": 0.0,
35
  "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
 
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.02333822101354599,
43
+ "rewards/question_recreation_reward_func/std": 0.014200002886354923,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
 
56
  "clip_ratio/low_min": 0.0,
57
  "clip_ratio/region_mean": 0.0,
58
  "completions/clipped_ratio": 0.125,
59
+ "completions/max_length": 831.5,
60
+ "completions/max_terminated_length": 502.5,
61
+ "completions/mean_length": 283.4375,
62
+ "completions/mean_terminated_length": 169.41666793823242,
63
+ "completions/min_length": 14.0,
64
+ "completions/min_terminated_length": 14.0,
65
+ "epoch": 0.16,
66
  "frac_reward_zero_std": 0.0,
67
+ "grad_norm": 12.92307186126709,
68
+ "kl": 0.0008189797645172803,
69
  "learning_rate": 4.864543104251586e-07,
70
+ "loss": -0.0098,
71
+ "num_tokens": 17918.0,
72
+ "reward": 0.08256983105093241,
73
+ "reward_std": 0.01837824168615043,
74
  "rewards/concensus_correctness_reward_func/mean": 0.0,
75
  "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
 
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.08256983105093241,
83
+ "rewards/question_recreation_reward_func/std": 0.023414009949192405,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
87
  "rewards/strict_format_reward_func/std": 0.0,
88
+ "rewards/xmlcount_reward_func/mean": 0.0,
89
+ "rewards/xmlcount_reward_func/std": 0.0,
90
  "step": 4
91
  },
92
  {
 
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
+ "completions/clipped_ratio": 0.0,
99
+ "completions/max_length": 722.5,
100
+ "completions/max_terminated_length": 722.5,
101
+ "completions/mean_length": 253.75,
102
+ "completions/mean_terminated_length": 253.75,
103
+ "completions/min_length": 24.5,
104
+ "completions/min_terminated_length": 24.5,
105
+ "epoch": 0.24,
106
  "frac_reward_zero_std": 0.0,
107
+ "grad_norm": 6.703396320343018,
108
+ "kl": 0.0009663624296081252,
109
  "learning_rate": 4.472851273490984e-07,
110
+ "loss": 0.159,
111
+ "num_tokens": 26074.0,
112
+ "reward": 0.026149642653763294,
113
+ "reward_std": 0.01439021248370409,
114
  "rewards/concensus_correctness_reward_func/mean": 0.0,
115
  "rewards/concensus_correctness_reward_func/std": 0.0,
116
  "rewards/consensus_reward_func/mean": 0.0,
 
119
  "rewards/cumulative_reward_2/std": 0.0,
120
  "rewards/final_correctness_reward_func/mean": 0.0,
121
  "rewards/final_correctness_reward_func/std": 0.0,
122
+ "rewards/question_recreation_reward_func/mean": 0.026149642653763294,
123
+ "rewards/question_recreation_reward_func/std": 0.01520916074514389,
124
  "rewards/soft_format_reward_func/mean": 0.0,
125
  "rewards/soft_format_reward_func/std": 0.0,
126
  "rewards/strict_format_reward_func/mean": 0.0,
 
135
  "clip_ratio/low_mean": 0.0,
136
  "clip_ratio/low_min": 0.0,
137
  "clip_ratio/region_mean": 0.0,
138
+ "completions/clipped_ratio": 0.0,
139
+ "completions/max_length": 678.5,
140
+ "completions/max_terminated_length": 678.5,
141
+ "completions/mean_length": 289.75,
142
+ "completions/mean_terminated_length": 289.75,
143
+ "completions/min_length": 70.5,
144
+ "completions/min_terminated_length": 70.5,
145
+ "epoch": 0.32,
146
  "frac_reward_zero_std": 0.0,
147
+ "grad_norm": 7.256834030151367,
148
+ "kl": 0.0016369151817343663,
149
  "learning_rate": 3.867370395306068e-07,
150
+ "loss": -0.0972,
151
+ "num_tokens": 34806.0,
152
+ "reward": 0.060055448208004236,
153
+ "reward_std": 0.020912725245580077,
154
  "rewards/concensus_correctness_reward_func/mean": 0.0,
155
  "rewards/concensus_correctness_reward_func/std": 0.0,
156
  "rewards/consensus_reward_func/mean": 0.0,
 
159
  "rewards/cumulative_reward_2/std": 0.0,
160
  "rewards/final_correctness_reward_func/mean": 0.0,
161
  "rewards/final_correctness_reward_func/std": 0.0,
162
+ "rewards/question_recreation_reward_func/mean": 0.060055448208004236,
163
+ "rewards/question_recreation_reward_func/std": 0.034508606884628534,
164
  "rewards/soft_format_reward_func/mean": 0.0,
165
  "rewards/soft_format_reward_func/std": 0.0,
166
  "rewards/strict_format_reward_func/mean": 0.0,
 
175
  "clip_ratio/low_mean": 0.0,
176
  "clip_ratio/low_min": 0.0,
177
  "clip_ratio/region_mean": 0.0,
178
+ "completions/clipped_ratio": 0.0,
179
+ "completions/max_length": 555.5,
180
+ "completions/max_terminated_length": 555.5,
181
+ "completions/mean_length": 258.375,
182
+ "completions/mean_terminated_length": 258.375,
183
+ "completions/min_length": 3.5,
184
+ "completions/min_terminated_length": 3.5,
185
+ "epoch": 0.4,
186
  "frac_reward_zero_std": 0.0,
187
+ "grad_norm": 5.563640594482422,
188
+ "kl": 0.0017646014493948314,
189
  "learning_rate": 3.1137137178519977e-07,
190
+ "loss": 0.1804,
191
+ "num_tokens": 43036.0,
192
+ "reward": 0.1413715137168765,
193
+ "reward_std": 0.18358006980270147,
194
  "rewards/concensus_correctness_reward_func/mean": 0.0,
195
  "rewards/concensus_correctness_reward_func/std": 0.0,
196
+ "rewards/consensus_reward_func/mean": 0.125,
197
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
198
  "rewards/cumulative_reward_2/mean": 0.0,
199
  "rewards/cumulative_reward_2/std": 0.0,
200
  "rewards/final_correctness_reward_func/mean": 0.0,
201
  "rewards/final_correctness_reward_func/std": 0.0,
202
+ "rewards/question_recreation_reward_func/mean": 0.016371519304811954,
203
+ "rewards/question_recreation_reward_func/std": 0.011078037787228823,
204
  "rewards/soft_format_reward_func/mean": 0.0,
205
  "rewards/soft_format_reward_func/std": 0.0,
206
  "rewards/strict_format_reward_func/mean": 0.0,
 
215
  "clip_ratio/low_mean": 0.0,
216
  "clip_ratio/low_min": 0.0,
217
  "clip_ratio/region_mean": 0.0,
218
+ "completions/clipped_ratio": 0.0625,
219
+ "completions/max_length": 670.0,
220
+ "completions/max_terminated_length": 533.5,
221
+ "completions/mean_length": 258.75,
222
+ "completions/mean_terminated_length": 214.0,
223
+ "completions/min_length": 4.5,
224
+ "completions/min_terminated_length": 4.5,
225
+ "epoch": 0.48,
226
  "frac_reward_zero_std": 0.0,
227
+ "grad_norm": 6.797975063323975,
228
+ "kl": 0.003329606697661802,
229
  "learning_rate": 2.2935516363191693e-07,
230
+ "loss": 0.0231,
231
+ "num_tokens": 51272.0,
232
+ "reward": 0.30666957050561905,
233
+ "reward_std": 0.37889517843723297,
234
+ "rewards/concensus_correctness_reward_func/mean": 0.1197500005364418,
235
+ "rewards/concensus_correctness_reward_func/std": 0.3387041389942169,
236
+ "rewards/consensus_reward_func/mean": 0.125,
237
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
238
  "rewards/cumulative_reward_2/mean": 0.0,
239
  "rewards/cumulative_reward_2/std": 0.0,
240
  "rewards/final_correctness_reward_func/mean": 0.0,
241
  "rewards/final_correctness_reward_func/std": 0.0,
242
+ "rewards/question_recreation_reward_func/mean": 0.061919582076370716,
243
+ "rewards/question_recreation_reward_func/std": 0.04408737272024155,
244
  "rewards/soft_format_reward_func/mean": 0.0,
245
  "rewards/soft_format_reward_func/std": 0.0,
246
  "rewards/strict_format_reward_func/mean": 0.0,
 
255
  "clip_ratio/low_mean": 0.0,
256
  "clip_ratio/low_min": 0.0,
257
  "clip_ratio/region_mean": 0.0,
258
+ "completions/clipped_ratio": 0.0,
259
+ "completions/max_length": 881.0,
260
+ "completions/max_terminated_length": 881.0,
261
+ "completions/mean_length": 285.3125,
262
+ "completions/mean_terminated_length": 285.3125,
263
+ "completions/min_length": 49.0,
264
+ "completions/min_terminated_length": 49.0,
265
+ "epoch": 0.56,
266
  "frac_reward_zero_std": 0.0,
267
+ "grad_norm": 7.043668270111084,
268
+ "kl": 0.002082884529954754,
269
  "learning_rate": 1.4957614383675767e-07,
270
+ "loss": 0.35,
271
+ "num_tokens": 59933.0,
272
+ "reward": 0.06385299749672413,
273
+ "reward_std": 0.03737428830936551,
274
  "rewards/concensus_correctness_reward_func/mean": 0.0,
275
  "rewards/concensus_correctness_reward_func/std": 0.0,
276
  "rewards/consensus_reward_func/mean": 0.0,
 
279
  "rewards/cumulative_reward_2/std": 0.0,
280
  "rewards/final_correctness_reward_func/mean": 0.0,
281
  "rewards/final_correctness_reward_func/std": 0.0,
282
+ "rewards/question_recreation_reward_func/mean": 0.06385299749672413,
283
+ "rewards/question_recreation_reward_func/std": 0.03331646043807268,
284
  "rewards/soft_format_reward_func/mean": 0.0,
285
  "rewards/soft_format_reward_func/std": 0.0,
286
  "rewards/strict_format_reward_func/mean": 0.0,
287
  "rewards/strict_format_reward_func/std": 0.0,
288
+ "rewards/xmlcount_reward_func/mean": 0.0,
289
+ "rewards/xmlcount_reward_func/std": 0.0,
290
  "step": 14
291
  },
292
  {
 
295
  "clip_ratio/low_mean": 0.0,
296
  "clip_ratio/low_min": 0.0,
297
  "clip_ratio/region_mean": 0.0,
298
+ "completions/clipped_ratio": 0.125,
299
+ "completions/max_length": 853.5,
300
+ "completions/max_terminated_length": 685.0,
301
+ "completions/mean_length": 341.3125,
302
+ "completions/mean_terminated_length": 246.9166717529297,
303
+ "completions/min_length": 64.5,
304
+ "completions/min_terminated_length": 64.5,
305
+ "epoch": 0.64,
306
  "frac_reward_zero_std": 0.0,
307
+ "grad_norm": 5.556763172149658,
308
+ "kl": 0.0018842843419406563,
309
  "learning_rate": 8.067960709356478e-08,
310
+ "loss": 0.0971,
311
+ "num_tokens": 69490.0,
312
+ "reward": 0.06051425402984023,
313
+ "reward_std": 0.019080545171163976,
314
  "rewards/concensus_correctness_reward_func/mean": 0.0,
315
  "rewards/concensus_correctness_reward_func/std": 0.0,
316
  "rewards/consensus_reward_func/mean": 0.0,
 
319
  "rewards/cumulative_reward_2/std": 0.0,
320
  "rewards/final_correctness_reward_func/mean": 0.0,
321
  "rewards/final_correctness_reward_func/std": 0.0,
322
+ "rewards/question_recreation_reward_func/mean": 0.06051425402984023,
323
+ "rewards/question_recreation_reward_func/std": 0.030638275435194373,
324
  "rewards/soft_format_reward_func/mean": 0.0,
325
  "rewards/soft_format_reward_func/std": 0.0,
326
  "rewards/strict_format_reward_func/mean": 0.0,
 
335
  "clip_ratio/low_mean": 0.0,
336
  "clip_ratio/low_min": 0.0,
337
  "clip_ratio/region_mean": 0.0,
338
+ "completions/clipped_ratio": 0.0,
339
+ "completions/max_length": 509.5,
340
+ "completions/max_terminated_length": 509.5,
341
+ "completions/mean_length": 192.875,
342
+ "completions/mean_terminated_length": 192.875,
343
  "completions/min_length": 3.5,
344
  "completions/min_terminated_length": 3.5,
345
+ "epoch": 0.72,
346
  "frac_reward_zero_std": 0.125,
347
+ "grad_norm": 6.232974529266357,
348
+ "kl": 0.006690542242722586,
349
  "learning_rate": 3.013156219837776e-08,
350
+ "loss": -0.0781,
351
+ "num_tokens": 76672.0,
352
+ "reward": 2.7677047792822123,
353
+ "reward_std": 0.006125873536802828,
354
+ "rewards/concensus_correctness_reward_func/mean": 2.5,
355
+ "rewards/concensus_correctness_reward_func/std": 4.629100322723389,
356
+ "rewards/consensus_reward_func/mean": 0.25,
357
+ "rewards/consensus_reward_func/std": 0.4629100561141968,
358
  "rewards/cumulative_reward_2/mean": 0.0,
359
  "rewards/cumulative_reward_2/std": 0.0,
360
  "rewards/final_correctness_reward_func/mean": 0.0,
361
  "rewards/final_correctness_reward_func/std": 0.0,
362
+ "rewards/question_recreation_reward_func/mean": 0.01770483050495386,
363
+ "rewards/question_recreation_reward_func/std": 0.01125000836327672,
364
  "rewards/soft_format_reward_func/mean": 0.0,
365
  "rewards/soft_format_reward_func/std": 0.0,
366
  "rewards/strict_format_reward_func/mean": 0.0,
 
375
  "clip_ratio/low_mean": 0.0,
376
  "clip_ratio/low_min": 0.0,
377
  "clip_ratio/region_mean": 0.0,
378
+ "completions/clipped_ratio": 0.0,
379
+ "completions/max_length": 564.5,
380
+ "completions/max_terminated_length": 564.5,
381
+ "completions/mean_length": 184.0625,
382
+ "completions/mean_terminated_length": 184.0625,
383
+ "completions/min_length": 11.5,
384
+ "completions/min_terminated_length": 11.5,
385
+ "epoch": 0.8,
386
  "frac_reward_zero_std": 0.0,
387
+ "grad_norm": 9.507555961608887,
388
+ "kl": 0.002568137046182528,
389
+ "learning_rate": 3.4096741493194193e-09,
390
+ "loss": 0.0613,
391
+ "num_tokens": 83713.0,
392
+ "reward": 0.14513505343347788,
393
+ "reward_std": 0.180526792537421,
394
  "rewards/concensus_correctness_reward_func/mean": 0.0,
395
  "rewards/concensus_correctness_reward_func/std": 0.0,
396
+ "rewards/consensus_reward_func/mean": 0.125,
397
+ "rewards/consensus_reward_func/std": 0.3535533845424652,
398
  "rewards/cumulative_reward_2/mean": 0.0,
399
  "rewards/cumulative_reward_2/std": 0.0,
400
  "rewards/final_correctness_reward_func/mean": 0.0,
401
  "rewards/final_correctness_reward_func/std": 0.0,
402
+ "rewards/question_recreation_reward_func/mean": 0.02013504970818758,
403
+ "rewards/question_recreation_reward_func/std": 0.011476744432002306,
404
  "rewards/soft_format_reward_func/mean": 0.0,
405
  "rewards/soft_format_reward_func/std": 0.0,
406
  "rewards/strict_format_reward_func/mean": 0.0,
407
  "rewards/strict_format_reward_func/std": 0.0,
408
  "rewards/xmlcount_reward_func/mean": 0.0,
409
  "rewards/xmlcount_reward_func/std": 0.0,
410
+ "step": 20
411
+ },
412
+ {
413
+ "epoch": 0.8,
414
+ "step": 20,
415
  "total_flos": 0.0,
416
+ "train_loss": 0.0773889608681202,
417
+ "train_runtime": 1482.6063,
418
+ "train_samples_per_second": 0.108,
419
  "train_steps_per_second": 0.013
420
  }
421
  ],
422
  "logging_steps": 2,
423
  "max_steps": 20,
424
+ "num_input_tokens_seen": 83713,
425
  "num_train_epochs": 1,
426
  "save_steps": 25,
427
  "stateful_callbacks": {
 
430
  "should_epoch_stop": false,
431
  "should_evaluate": false,
432
  "should_log": false,
433
+ "should_save": true,
434
+ "should_training_stop": true
435
  },
436
  "attributes": {}
437
  }