llearningone commited on
Commit
d152cca
·
verified ·
1 Parent(s): 7b17e7f

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +4 -4
  2. model.safetensors +1 -1
  3. train_results.json +4 -4
  4. trainer_state.json +230 -230
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0026065433954499894,
4
- "train_runtime": 212.6075,
5
  "train_samples": 320,
6
- "train_samples_per_second": 3.01,
7
- "train_steps_per_second": 0.188
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.16417283475748262,
4
+ "train_runtime": 209.2973,
5
  "train_samples": 320,
6
+ "train_samples_per_second": 3.058,
7
+ "train_steps_per_second": 0.191
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e9e355898edf3698d3600a592eb43a57f5b97a8feb3b2ddc76884c22cd406fa
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa3b7c2eb22954ed5bdfe4babb455302b1f9ad01a8f89be61226951f602a331
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0026065433954499894,
4
- "train_runtime": 212.6075,
5
  "train_samples": 320,
6
- "train_samples_per_second": 3.01,
7
- "train_steps_per_second": 0.188
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.16417283475748262,
4
+ "train_runtime": 209.2973,
5
  "train_samples": 320,
6
+ "train_samples_per_second": 3.058,
7
+ "train_steps_per_second": 0.191
8
  }
trainer_state.json CHANGED
@@ -10,393 +10,393 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 115.71875,
14
  "epoch": 0.05,
15
- "grad_norm": 28.141786575317383,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
  "loss": 0.0,
19
- "reward": 3.080722341313958,
20
- "reward_std": 0.6955505819059908,
21
- "rewards/concensus_correctness_reward_func": 0.9308124892413616,
22
- "rewards/consensus_reward_func": 0.875,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.0,
25
- "rewards/question_recreation_reward_func": 0.42944102408364415,
26
  "rewards/soft_format_reward_func": 0.0,
27
  "rewards/strict_format_reward_func": 0.109375,
28
- "rewards/xmlcount_reward_func": 0.7360937483608723,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 115.15625,
33
  "epoch": 0.1,
34
- "grad_norm": 18.37566375732422,
35
- "kl": 0.017843228910351172,
36
  "learning_rate": 4.965903258506806e-07,
37
  "loss": 0.0,
38
- "reward": 4.108856491744518,
39
- "reward_std": 1.4645002749748528,
40
- "rewards/concensus_correctness_reward_func": 1.100500002503395,
41
- "rewards/consensus_reward_func": 1.1875,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.0,
44
- "rewards/question_recreation_reward_func": 0.5959815399255604,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.25,
47
- "rewards/xmlcount_reward_func": 0.9748750030994415,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 101.96875,
52
  "epoch": 0.15,
53
- "grad_norm": 24.032485961914062,
54
- "kl": 0.07059068378293887,
55
  "learning_rate": 4.864543104251586e-07,
56
  "loss": 0.0001,
57
- "reward": 5.183237671852112,
58
- "reward_std": 0.9490681896859314,
59
- "rewards/concensus_correctness_reward_func": 1.5907499957829714,
60
- "rewards/consensus_reward_func": 1.6875,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.125,
63
- "rewards/question_recreation_reward_func": 0.45873774215579033,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.25,
66
- "rewards/xmlcount_reward_func": 1.0712500140070915,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 106.53125,
71
  "epoch": 0.2,
72
- "grad_norm": 23.589221954345703,
73
- "kl": 0.9075778928818181,
74
  "learning_rate": 4.698684378016222e-07,
75
- "loss": 0.0009,
76
- "reward": 4.851357385516167,
77
- "reward_std": 1.071611185092479,
78
- "rewards/concensus_correctness_reward_func": 1.4482499975711107,
79
- "rewards/consensus_reward_func": 1.5625,
80
  "rewards/cumulative_reward_2": 0.0,
81
  "rewards/final_correctness_reward_func": 0.0,
82
- "rewards/question_recreation_reward_func": 0.5514824334532022,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.265625,
85
- "rewards/xmlcount_reward_func": 1.0235000029206276,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 107.4375,
90
  "epoch": 0.25,
91
- "grad_norm": 24.931913375854492,
92
- "kl": 1.187446946860291,
93
  "learning_rate": 4.472851273490984e-07,
94
- "loss": 0.0012,
95
- "reward": 4.945584326982498,
96
- "reward_std": 0.8899367619305849,
97
- "rewards/concensus_correctness_reward_func": 1.4574999995529652,
98
- "rewards/consensus_reward_func": 1.5,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.125,
101
- "rewards/question_recreation_reward_func": 0.6286780787631869,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.234375,
104
- "rewards/xmlcount_reward_func": 1.0000312514603138,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 108.625,
109
  "epoch": 0.3,
110
- "grad_norm": 21.122905731201172,
111
- "kl": 0.7102811370277777,
112
  "learning_rate": 4.193203929064353e-07,
113
- "loss": 0.0007,
114
- "reward": 5.369517683982849,
115
- "reward_std": 1.5619308175519109,
116
- "rewards/concensus_correctness_reward_func": 1.6689374819397926,
117
- "rewards/consensus_reward_func": 1.5,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.25,
120
- "rewards/question_recreation_reward_func": 0.615455181337893,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.265625,
123
- "rewards/xmlcount_reward_func": 1.0695000030100346,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 109.15625,
128
  "epoch": 0.35,
129
- "grad_norm": 59.90224075317383,
130
- "kl": 5.442961755674332,
131
  "learning_rate": 3.867370395306068e-07,
132
- "loss": 0.0054,
133
- "reward": 6.124573543667793,
134
- "reward_std": 1.8788381005870178,
135
- "rewards/concensus_correctness_reward_func": 2.6781250201165676,
136
  "rewards/consensus_reward_func": 1.5,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.125,
139
- "rewards/question_recreation_reward_func": 0.5945110125467181,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.234375,
142
- "rewards/xmlcount_reward_func": 0.992562510073185,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 106.625,
147
  "epoch": 0.4,
148
- "grad_norm": 21.15949249267578,
149
- "kl": 0.5498712724074721,
150
  "learning_rate": 3.5042385616324236e-07,
151
- "loss": 0.0005,
152
- "reward": 5.564565867185593,
153
- "reward_std": 0.9688403338659555,
154
- "rewards/concensus_correctness_reward_func": 1.6811875104904175,
155
- "rewards/consensus_reward_func": 1.6875,
156
  "rewards/cumulative_reward_2": 0.0,
157
  "rewards/final_correctness_reward_func": 0.125,
158
- "rewards/question_recreation_reward_func": 0.6686283685266972,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.296875,
161
- "rewards/xmlcount_reward_func": 1.1053750030696392,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 112.1875,
166
  "epoch": 0.45,
167
- "grad_norm": 34.32904052734375,
168
- "kl": 11.562028194312006,
169
  "learning_rate": 3.1137137178519977e-07,
170
- "loss": 0.0116,
171
- "reward": 5.582300066947937,
172
- "reward_std": 1.7333503647241741,
173
- "rewards/concensus_correctness_reward_func": 2.07431248947978,
174
- "rewards/consensus_reward_func": 1.4375,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.125,
177
- "rewards/question_recreation_reward_func": 0.7077374998480082,
178
  "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.234375,
180
- "rewards/xmlcount_reward_func": 1.0033750012516975,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 117.71875,
185
  "epoch": 0.5,
186
- "grad_norm": 26.058582305908203,
187
- "kl": 1.126984592527151,
188
  "learning_rate": 2.706448363680831e-07,
189
- "loss": 0.0011,
190
- "reward": 4.181420147418976,
191
- "reward_std": 1.3247168653761037,
192
- "rewards/concensus_correctness_reward_func": 1.1436875015497208,
193
- "rewards/consensus_reward_func": 1.25,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.0,
196
- "rewards/question_recreation_reward_func": 0.5951076466590166,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.21875,
199
- "rewards/xmlcount_reward_func": 0.9738749992102385,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 106.71875,
204
  "epoch": 0.55,
205
- "grad_norm": 31.642139434814453,
206
- "kl": 0.7342154434882104,
207
  "learning_rate": 2.2935516363191693e-07,
208
- "loss": 0.0007,
209
- "reward": 4.2624650448560715,
210
- "reward_std": 1.736710302066058,
211
- "rewards/concensus_correctness_reward_func": 1.1389375030994415,
212
- "rewards/consensus_reward_func": 1.25,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.0,
215
- "rewards/question_recreation_reward_func": 0.6370588149875402,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.21875,
218
- "rewards/xmlcount_reward_func": 1.0177187509834766,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 99.46875,
223
  "epoch": 0.6,
224
- "grad_norm": 65.90033721923828,
225
- "kl": 5.04171756003052,
226
  "learning_rate": 1.886286282148002e-07,
227
- "loss": 0.005,
228
- "reward": 4.612813338637352,
229
- "reward_std": 1.5941019735182635,
230
- "rewards/concensus_correctness_reward_func": 1.2605624943971634,
231
- "rewards/consensus_reward_func": 1.4375,
232
  "rewards/cumulative_reward_2": 0.0,
233
  "rewards/final_correctness_reward_func": 0.0,
234
- "rewards/question_recreation_reward_func": 0.4941882867133245,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.3125,
237
- "rewards/xmlcount_reward_func": 1.1080624982714653,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 106.1875,
242
  "epoch": 0.65,
243
- "grad_norm": 24.204206466674805,
244
- "kl": 1.1527493232861161,
245
  "learning_rate": 1.4957614383675767e-07,
246
- "loss": 0.0012,
247
- "reward": 4.555570490658283,
248
- "reward_std": 1.3411388245076523,
249
- "rewards/concensus_correctness_reward_func": 1.3198125027120113,
250
- "rewards/consensus_reward_func": 1.3125,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 0.0625,
253
- "rewards/question_recreation_reward_func": 0.5504454995971173,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.265625,
256
- "rewards/xmlcount_reward_func": 1.0446874983608723,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 110.03125,
261
  "epoch": 0.7,
262
- "grad_norm": 24.5333194732666,
263
- "kl": 9.58880496234633,
264
  "learning_rate": 1.1326296046939333e-07,
265
- "loss": 0.0096,
266
- "reward": 4.167467080056667,
267
- "reward_std": 1.495884626870975,
268
- "rewards/concensus_correctness_reward_func": 1.2058124989271164,
269
- "rewards/consensus_reward_func": 1.3125,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.0625,
272
- "rewards/question_recreation_reward_func": 0.5166545738466084,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.140625,
275
- "rewards/xmlcount_reward_func": 0.9293750040233135,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 105.4375,
280
  "epoch": 0.75,
281
- "grad_norm": 45.431358337402344,
282
- "kl": 1.2492264057509601,
283
  "learning_rate": 8.067960709356478e-08,
284
- "loss": 0.0013,
285
- "reward": 5.73547525331378,
286
- "reward_std": 2.064389802981168,
287
- "rewards/concensus_correctness_reward_func": 2.322999984025955,
288
- "rewards/consensus_reward_func": 1.5625,
289
  "rewards/cumulative_reward_2": 0.0,
290
  "rewards/final_correctness_reward_func": 0.125,
291
- "rewards/question_recreation_reward_func": 0.5407251240685582,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.25,
294
- "rewards/xmlcount_reward_func": 0.9342499934136868,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 97.625,
299
  "epoch": 0.8,
300
- "grad_norm": 22.755542755126953,
301
- "kl": 0.5674767629243433,
302
  "learning_rate": 5.271487265090163e-08,
303
- "loss": 0.0006,
304
- "reward": 4.501953202299774,
305
- "reward_std": 1.0095994778093882,
306
- "rewards/concensus_correctness_reward_func": 1.274562492966652,
307
- "rewards/consensus_reward_func": 1.375,
308
  "rewards/cumulative_reward_2": 0.0,
309
  "rewards/final_correctness_reward_func": 0.0,
310
- "rewards/question_recreation_reward_func": 0.5317657189443707,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.3125,
313
- "rewards/xmlcount_reward_func": 1.0081249997019768,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 111.75,
318
  "epoch": 0.85,
319
- "grad_norm": 48.653533935546875,
320
- "kl": 7.750168148428202,
321
  "learning_rate": 3.013156219837776e-08,
322
- "loss": 0.0077,
323
- "reward": 4.9789127856493,
324
- "reward_std": 1.1694186381064355,
325
- "rewards/concensus_correctness_reward_func": 1.4983125068247318,
326
  "rewards/consensus_reward_func": 1.5,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.125,
329
- "rewards/question_recreation_reward_func": 0.6224753325805068,
330
  "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.21875,
332
- "rewards/xmlcount_reward_func": 1.014375001192093,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 110.03125,
337
  "epoch": 0.9,
338
- "grad_norm": 22.457836151123047,
339
- "kl": 1.2734893589513376,
340
  "learning_rate": 1.3545689574841341e-08,
341
- "loss": 0.0013,
342
- "reward": 4.401850417256355,
343
- "reward_std": 1.3007537302328274,
344
- "rewards/concensus_correctness_reward_func": 1.2692500054836273,
345
- "rewards/consensus_reward_func": 1.375,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.0,
348
- "rewards/question_recreation_reward_func": 0.6151004622224718,
349
  "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.203125,
351
- "rewards/xmlcount_reward_func": 0.9393750000745058,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 109.84375,
356
  "epoch": 0.95,
357
- "grad_norm": 20.912979125976562,
358
- "kl": 0.6942824618890882,
359
  "learning_rate": 3.4096741493194193e-09,
360
- "loss": 0.0007,
361
- "reward": 4.380308553576469,
362
- "reward_std": 1.1902506751939654,
363
- "rewards/concensus_correctness_reward_func": 1.2075624912977219,
364
- "rewards/consensus_reward_func": 1.25,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.0,
367
- "rewards/question_recreation_reward_func": 0.6136210327968001,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.265625,
370
- "rewards/xmlcount_reward_func": 1.0435000024735928,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 108.0,
375
  "epoch": 1.0,
376
- "grad_norm": 21.68805503845215,
377
- "kl": 2.4967825568746775,
378
  "learning_rate": 0.0,
379
- "loss": 0.0025,
380
- "reward": 5.049069061875343,
381
- "reward_std": 1.0629464890807867,
382
- "rewards/concensus_correctness_reward_func": 1.4604999981820583,
383
  "rewards/consensus_reward_func": 1.5625,
384
  "rewards/cumulative_reward_2": 0.0,
385
  "rewards/final_correctness_reward_func": 0.125,
386
- "rewards/question_recreation_reward_func": 0.5341940210200846,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.28125,
389
- "rewards/xmlcount_reward_func": 1.0856250040233135,
390
  "step": 40
391
  },
392
  {
393
  "epoch": 1.0,
394
  "step": 40,
395
  "total_flos": 0.0,
396
- "train_loss": 0.0026065433954499894,
397
- "train_runtime": 212.6075,
398
- "train_samples_per_second": 3.01,
399
- "train_steps_per_second": 0.188
400
  }
401
  ],
402
  "logging_steps": 2,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 115.53125,
14
  "epoch": 0.05,
15
+ "grad_norm": 37.139556884765625,
16
  "kl": 0.0,
17
  "learning_rate": 5e-07,
18
  "loss": 0.0,
19
+ "reward": 2.4475310780107975,
20
+ "reward_std": 1.3624604484939482,
21
+ "rewards/concensus_correctness_reward_func": 0.4649375006556511,
22
+ "rewards/consensus_reward_func": 0.5,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.125,
25
+ "rewards/question_recreation_reward_func": 0.48984355479478836,
26
  "rewards/soft_format_reward_func": 0.0,
27
  "rewards/strict_format_reward_func": 0.109375,
28
+ "rewards/xmlcount_reward_func": 0.7583750076591969,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 108.34375,
33
  "epoch": 0.1,
34
+ "grad_norm": 20.202695846557617,
35
+ "kl": 0.014847530546830967,
36
  "learning_rate": 4.965903258506806e-07,
37
  "loss": 0.0,
38
+ "reward": 5.1692750453948975,
39
+ "reward_std": 1.3825315139256418,
40
+ "rewards/concensus_correctness_reward_func": 1.6117499954998493,
41
+ "rewards/consensus_reward_func": 1.625,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.0625,
44
+ "rewards/question_recreation_reward_func": 0.6315875351428986,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.265625,
47
+ "rewards/xmlcount_reward_func": 0.9728124998509884,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 107.15625,
52
  "epoch": 0.15,
53
+ "grad_norm": 21.550765991210938,
54
+ "kl": 0.08239698398392648,
55
  "learning_rate": 4.864543104251586e-07,
56
  "loss": 0.0001,
57
+ "reward": 4.982221730053425,
58
+ "reward_std": 0.6625452353619039,
59
+ "rewards/concensus_correctness_reward_func": 1.4067500066012144,
60
+ "rewards/consensus_reward_func": 1.625,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.0,
63
+ "rewards/question_recreation_reward_func": 0.5759404562413692,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.296875,
66
+ "rewards/xmlcount_reward_func": 1.0776562541723251,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 116.3125,
71
  "epoch": 0.2,
72
+ "grad_norm": 26.379663467407227,
73
+ "kl": 0.2936959662474692,
74
  "learning_rate": 4.698684378016222e-07,
75
+ "loss": 0.0003,
76
+ "reward": 5.107673645019531,
77
+ "reward_std": 2.3407255806960166,
78
+ "rewards/concensus_correctness_reward_func": 1.9551249891519547,
79
+ "rewards/consensus_reward_func": 1.3125,
80
  "rewards/cumulative_reward_2": 0.0,
81
  "rewards/final_correctness_reward_func": 0.0,
82
+ "rewards/question_recreation_reward_func": 0.6457360591739416,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.203125,
85
+ "rewards/xmlcount_reward_func": 0.9911875016987324,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 102.28125,
90
  "epoch": 0.25,
91
+ "grad_norm": 25.451066970825195,
92
+ "kl": 0.7054350507678464,
93
  "learning_rate": 4.472851273490984e-07,
94
+ "loss": 0.0007,
95
+ "reward": 6.026900440454483,
96
+ "reward_std": 0.6154376133927144,
97
+ "rewards/concensus_correctness_reward_func": 1.9788749888539314,
98
+ "rewards/consensus_reward_func": 1.75,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.3125,
101
+ "rewards/question_recreation_reward_func": 0.5206817177822813,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.328125,
104
+ "rewards/xmlcount_reward_func": 1.13671875,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 109.875,
109
  "epoch": 0.3,
110
+ "grad_norm": 24.288022994995117,
111
+ "kl": 2.1607862473465502,
112
  "learning_rate": 4.193203929064353e-07,
113
+ "loss": 0.0022,
114
+ "reward": 4.333971664309502,
115
+ "reward_std": 1.4009621833683923,
116
+ "rewards/concensus_correctness_reward_func": 1.214937500655651,
117
+ "rewards/consensus_reward_func": 1.25,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.0,
120
+ "rewards/question_recreation_reward_func": 0.6222216123715043,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.25,
123
+ "rewards/xmlcount_reward_func": 0.9968124963343143,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 104.625,
128
  "epoch": 0.35,
129
+ "grad_norm": 28.804235458374023,
130
+ "kl": 0.6891685710288584,
131
  "learning_rate": 3.867370395306068e-07,
132
+ "loss": 0.0007,
133
+ "reward": 5.217689886689186,
134
+ "reward_std": 0.9694035681604873,
135
+ "rewards/concensus_correctness_reward_func": 1.6384375020861626,
136
  "rewards/consensus_reward_func": 1.5,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.25,
139
+ "rewards/question_recreation_reward_func": 0.5610647844150662,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.25,
142
+ "rewards/xmlcount_reward_func": 1.0181875005364418,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 103.21875,
147
  "epoch": 0.4,
148
+ "grad_norm": 35.775413513183594,
149
+ "kl": 0.8724039853550494,
150
  "learning_rate": 3.5042385616324236e-07,
151
+ "loss": 0.0009,
152
+ "reward": 5.001754555851221,
153
+ "reward_std": 0.8620858918002341,
154
+ "rewards/concensus_correctness_reward_func": 1.480874989181757,
155
+ "rewards/consensus_reward_func": 1.5,
156
  "rewards/cumulative_reward_2": 0.0,
157
  "rewards/final_correctness_reward_func": 0.125,
158
+ "rewards/question_recreation_reward_func": 0.5975045184604824,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.25,
161
+ "rewards/xmlcount_reward_func": 1.0483750030398369,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 116.625,
166
  "epoch": 0.45,
167
+ "grad_norm": 21.354928970336914,
168
+ "kl": 0.34439645055681467,
169
  "learning_rate": 3.1137137178519977e-07,
170
+ "loss": 0.0003,
171
+ "reward": 5.742968708276749,
172
+ "reward_std": 0.8694476368837059,
173
+ "rewards/concensus_correctness_reward_func": 1.9676875174045563,
174
+ "rewards/consensus_reward_func": 1.8125,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.1875,
177
+ "rewards/question_recreation_reward_func": 0.6388748935423791,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.203125,
180
+ "rewards/xmlcount_reward_func": 0.9332812521606684,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 103.84375,
185
  "epoch": 0.5,
186
+ "grad_norm": 57.22490310668945,
187
+ "kl": 1.7470143185928464,
188
  "learning_rate": 2.706448363680831e-07,
189
+ "loss": 0.0017,
190
+ "reward": 6.413925901055336,
191
+ "reward_std": 1.2340805160347372,
192
+ "rewards/concensus_correctness_reward_func": 2.8225624971091747,
193
+ "rewards/consensus_reward_func": 1.4375,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.25,
196
+ "rewards/question_recreation_reward_func": 0.5284571875818074,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.328125,
199
+ "rewards/xmlcount_reward_func": 1.0472812503576279,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 104.8125,
204
  "epoch": 0.55,
205
+ "grad_norm": 24.951213836669922,
206
+ "kl": 0.47896549943834543,
207
  "learning_rate": 2.2935516363191693e-07,
208
+ "loss": 0.0005,
209
+ "reward": 5.5750699788331985,
210
+ "reward_std": 0.750914608783205,
211
+ "rewards/concensus_correctness_reward_func": 1.6819374971091747,
212
+ "rewards/consensus_reward_func": 1.5625,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.1875,
215
+ "rewards/question_recreation_reward_func": 0.6984137212857604,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.328125,
218
+ "rewards/xmlcount_reward_func": 1.1165937520563602,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 112.3125,
223
  "epoch": 0.6,
224
+ "grad_norm": 21.453693389892578,
225
+ "kl": 0.5181310146581382,
226
  "learning_rate": 1.886286282148002e-07,
227
+ "loss": 0.0005,
228
+ "reward": 5.20386229455471,
229
+ "reward_std": 0.9414823611150496,
230
+ "rewards/concensus_correctness_reward_func": 1.5962499864399433,
231
+ "rewards/consensus_reward_func": 1.8125,
232
  "rewards/cumulative_reward_2": 0.0,
233
  "rewards/final_correctness_reward_func": 0.0,
234
+ "rewards/question_recreation_reward_func": 0.5575497858226299,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.234375,
237
+ "rewards/xmlcount_reward_func": 1.0031874999403954,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 105.96875,
242
  "epoch": 0.65,
243
+ "grad_norm": 27.866104125976562,
244
+ "kl": 3222.633953056298,
245
  "learning_rate": 1.4957614383675767e-07,
246
+ "loss": 3.2226,
247
+ "reward": 5.229758635163307,
248
+ "reward_std": 1.963500058511272,
249
+ "rewards/concensus_correctness_reward_func": 1.6254375018179417,
250
+ "rewards/consensus_reward_func": 1.5,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.1875,
253
+ "rewards/question_recreation_reward_func": 0.5854774066247046,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.28125,
256
+ "rewards/xmlcount_reward_func": 1.0500937551259995,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 106.5,
261
  "epoch": 0.7,
262
+ "grad_norm": 34.11473083496094,
263
+ "kl": 21.088456489145756,
264
  "learning_rate": 1.1326296046939333e-07,
265
+ "loss": 0.0211,
266
+ "reward": 4.6278437823057175,
267
+ "reward_std": 0.7609359241323546,
268
+ "rewards/concensus_correctness_reward_func": 1.32150000333786,
269
+ "rewards/consensus_reward_func": 1.375,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.0,
272
+ "rewards/question_recreation_reward_func": 0.5827812571078539,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.265625,
275
+ "rewards/xmlcount_reward_func": 1.0829375013709068,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 97.34375,
280
  "epoch": 0.75,
281
+ "grad_norm": 24.32623291015625,
282
+ "kl": 9.820234577637166,
283
  "learning_rate": 8.067960709356478e-08,
284
+ "loss": 0.0098,
285
+ "reward": 5.891083002090454,
286
+ "reward_std": 1.2470036953454837,
287
+ "rewards/concensus_correctness_reward_func": 2.527562513947487,
288
+ "rewards/consensus_reward_func": 1.3125,
289
  "rewards/cumulative_reward_2": 0.0,
290
  "rewards/final_correctness_reward_func": 0.125,
291
+ "rewards/question_recreation_reward_func": 0.5704892324283719,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.296875,
294
+ "rewards/xmlcount_reward_func": 1.0586562529206276,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 95.84375,
299
  "epoch": 0.8,
300
+ "grad_norm": 32.80009078979492,
301
+ "kl": 1.8810399202629924,
302
  "learning_rate": 5.271487265090163e-08,
303
+ "loss": 0.0018,
304
+ "reward": 4.773266464471817,
305
+ "reward_std": 0.8334280538319945,
306
+ "rewards/concensus_correctness_reward_func": 1.3728750012814999,
307
+ "rewards/consensus_reward_func": 1.4375,
308
  "rewards/cumulative_reward_2": 0.0,
309
  "rewards/final_correctness_reward_func": 0.0,
310
+ "rewards/question_recreation_reward_func": 0.5120790302753448,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.34375,
313
+ "rewards/xmlcount_reward_func": 1.1070624999701977,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 113.5,
318
  "epoch": 0.85,
319
+ "grad_norm": 35.31277084350586,
320
+ "kl": 13.102250803261995,
321
  "learning_rate": 3.013156219837776e-08,
322
+ "loss": 0.0131,
323
+ "reward": 4.830469772219658,
324
+ "reward_std": 1.5596857847704086,
325
+ "rewards/concensus_correctness_reward_func": 1.4805624820291996,
326
  "rewards/consensus_reward_func": 1.5,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.0625,
329
+ "rewards/question_recreation_reward_func": 0.5856571896001697,
330
  "rewards/soft_format_reward_func": 0.0,
331
+ "rewards/strict_format_reward_func": 0.203125,
332
+ "rewards/xmlcount_reward_func": 0.9986250065267086,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 103.9375,
337
  "epoch": 0.9,
338
+ "grad_norm": 25.091636657714844,
339
+ "kl": 1.2431108206510544,
340
  "learning_rate": 1.3545689574841341e-08,
341
+ "loss": 0.0012,
342
+ "reward": 5.303227871656418,
343
+ "reward_std": 1.3237640612060204,
344
+ "rewards/concensus_correctness_reward_func": 1.5557500012218952,
345
+ "rewards/consensus_reward_func": 1.625,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 0.125,
348
+ "rewards/question_recreation_reward_func": 0.5443528575124219,
349
  "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.328125,
351
+ "rewards/xmlcount_reward_func": 1.125,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 104.90625,
356
  "epoch": 0.95,
357
+ "grad_norm": 34.469093322753906,
358
+ "kl": 1.4800151162780821,
359
  "learning_rate": 3.4096741493194193e-09,
360
+ "loss": 0.0015,
361
+ "reward": 4.909093216061592,
362
+ "reward_std": 1.483071091468446,
363
+ "rewards/concensus_correctness_reward_func": 1.5221874974668026,
364
+ "rewards/consensus_reward_func": 1.4375,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.0625,
367
+ "rewards/question_recreation_reward_func": 0.561686968896538,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.296875,
370
+ "rewards/xmlcount_reward_func": 1.028343752026558,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 103.34375,
375
  "epoch": 1.0,
376
+ "grad_norm": 39.51275634765625,
377
+ "kl": 4.372888013720512,
378
  "learning_rate": 0.0,
379
+ "loss": 0.0044,
380
+ "reward": 5.756643958389759,
381
+ "reward_std": 1.508879809640348,
382
+ "rewards/concensus_correctness_reward_func": 2.2557499706745148,
383
  "rewards/consensus_reward_func": 1.5625,
384
  "rewards/cumulative_reward_2": 0.0,
385
  "rewards/final_correctness_reward_func": 0.125,
386
+ "rewards/question_recreation_reward_func": 0.5359564470127225,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.25,
389
+ "rewards/xmlcount_reward_func": 1.027437500655651,
390
  "step": 40
391
  },
392
  {
393
  "epoch": 1.0,
394
  "step": 40,
395
  "total_flos": 0.0,
396
+ "train_loss": 0.16417283475748262,
397
+ "train_runtime": 209.2973,
398
+ "train_samples_per_second": 3.058,
399
+ "train_steps_per_second": 0.191
400
  }
401
  ],
402
  "logging_steps": 2,