Blancy committed on
Commit
74066ff
·
verified ·
1 Parent(s): 0db6b9c

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
- base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3
- datasets: Blancy/1ktestfrom10kwithdifficultyclasses
4
  library_name: transformers
5
  model_name: Qwen2.5-1.5B-Open-R1-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-1.5B-Open-R1-GRPO
15
 
16
- This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [Blancy/1ktestfrom10kwithdifficultyclasses](https://huggingface.co/datasets/Blancy/1ktestfrom10kwithdifficultyclasses) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/hy313twm)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
+ base_model: Qwen/Qwen3-1.7B
 
3
  library_name: transformers
4
  model_name: Qwen2.5-1.5B-Open-R1-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-1.5B-Open-R1-GRPO
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/lb954dij)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.004263622482312219,
4
- "train_runtime": 9387.6074,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.107,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.013564518235874741,
4
+ "train_runtime": 16822.299,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.059,
7
+ "train_steps_per_second": 0.002
8
  }
generation_config.json CHANGED
@@ -1,9 +1,13 @@
1
  {
2
- "_from_model_config": true,
3
- "bos_token_id": 151646,
4
  "do_sample": true,
5
- "eos_token_id": 151643,
 
 
 
 
6
  "temperature": 0.6,
 
7
  "top_p": 0.95,
8
  "transformers_version": "4.52.3"
9
  }
 
1
  {
2
+ "bos_token_id": 151643,
 
3
  "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
  "temperature": 0.6,
10
+ "top_k": 20,
11
  "top_p": 0.95,
12
  "transformers_version": "4.52.3"
13
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.004263622482312219,
4
- "train_runtime": 9387.6074,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.107,
7
- "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.013564518235874741,
4
+ "train_runtime": 16822.299,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.059,
7
+ "train_steps_per_second": 0.002
8
  }
trainer_state.json CHANGED
@@ -10,517 +10,517 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 3006.3005981445312,
14
  "epoch": 0.027972027972027972,
15
- "grad_norm": 0.5744067430496216,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
19
- "reward": 0.2819940522313118,
20
- "reward_std": 0.22714678570628166,
21
- "rewards/accuracy_reward": 0.0714285729918629,
22
  "rewards/format_reward": 0.0,
23
- "rewards/tag_count_reward": 0.210565485060215,
24
  "step": 1
25
  },
26
  {
27
- "completion_length": 3101.8661499023438,
28
  "epoch": 0.055944055944055944,
29
- "grad_norm": 0.17433099448680878,
30
  "kl": 0.0,
31
  "learning_rate": 5e-06,
32
  "loss": 0.0,
33
- "reward": 0.2872023843228817,
34
- "reward_std": 0.26565564423799515,
35
- "rewards/accuracy_reward": 0.09226190764456987,
36
  "rewards/format_reward": 0.0,
37
- "rewards/tag_count_reward": 0.1949404813349247,
38
  "step": 2
39
  },
40
  {
41
- "completion_length": 2855.636962890625,
42
  "epoch": 0.08391608391608392,
43
- "grad_norm": 1.1990246772766113,
44
- "kl": 0.00012302398681640625,
45
  "learning_rate": 1e-05,
46
  "loss": 0.0,
47
- "reward": 0.3303571417927742,
48
- "reward_std": 0.26726851612329483,
49
- "rewards/accuracy_reward": 0.10416667093522847,
50
  "rewards/format_reward": 0.0,
51
- "rewards/tag_count_reward": 0.2261904776096344,
52
  "step": 3
53
  },
54
  {
55
- "completion_length": 2929.4702758789062,
56
  "epoch": 0.11188811188811189,
57
- "grad_norm": 0.23280778527259827,
58
- "kl": 0.00044918060302734375,
59
  "learning_rate": 1.5000000000000002e-05,
60
- "loss": 0.0,
61
- "reward": 0.3162202350795269,
62
- "reward_std": 0.24618623778223991,
63
- "rewards/accuracy_reward": 0.09523809514939785,
64
  "rewards/format_reward": 0.0,
65
- "rewards/tag_count_reward": 0.2209821492433548,
66
  "step": 4
67
  },
68
  {
69
- "completion_length": 3607.6875610351562,
70
  "epoch": 0.13986013986013987,
71
- "grad_norm": 0.18857119977474213,
72
- "kl": 0.00334930419921875,
73
  "learning_rate": 2e-05,
74
- "loss": 0.0001,
75
- "reward": 0.3065476231276989,
76
- "reward_std": 0.2614261731505394,
77
- "rewards/accuracy_reward": 0.14583333395421505,
78
  "rewards/format_reward": 0.0,
79
- "rewards/tag_count_reward": 0.16071428544819355,
80
  "step": 5
81
  },
82
  {
83
- "completion_length": 4005.02685546875,
84
  "epoch": 0.16783216783216784,
85
- "grad_norm": 0.4723397195339203,
86
- "kl": 0.050689697265625,
87
  "learning_rate": 1.995184726672197e-05,
88
- "loss": 0.002,
89
- "reward": 0.19345238618552685,
90
- "reward_std": 0.22516153007745743,
91
- "rewards/accuracy_reward": 0.06845238339155912,
92
  "rewards/format_reward": 0.0,
93
- "rewards/tag_count_reward": 0.12500000558793545,
94
  "step": 6
95
  },
96
  {
97
- "completion_length": 4122.452392578125,
98
  "epoch": 0.1958041958041958,
99
- "grad_norm": 0.1486005038022995,
100
- "kl": 0.0227203369140625,
101
  "learning_rate": 1.9807852804032306e-05,
102
- "loss": 0.0009,
103
- "reward": 0.17633929196745157,
104
- "reward_std": 0.20393076166510582,
105
- "rewards/accuracy_reward": 0.07142857275903225,
106
  "rewards/format_reward": 0.0,
107
- "rewards/tag_count_reward": 0.10491071548312902,
108
  "step": 7
109
  },
110
  {
111
- "completion_length": 3902.4583129882812,
112
  "epoch": 0.22377622377622378,
113
- "grad_norm": 0.11479471623897552,
114
- "kl": 0.019866943359375,
115
  "learning_rate": 1.956940335732209e-05,
116
- "loss": 0.0008,
117
- "reward": 0.17261905409395695,
118
- "reward_std": 0.1820010170340538,
119
- "rewards/accuracy_reward": 0.04761904873885214,
120
  "rewards/format_reward": 0.0,
121
- "rewards/tag_count_reward": 0.12500000558793545,
122
  "step": 8
123
  },
124
  {
125
- "completion_length": 3809.5000610351562,
126
  "epoch": 0.2517482517482518,
127
- "grad_norm": 0.13024798035621643,
128
- "kl": 0.02362060546875,
129
  "learning_rate": 1.9238795325112867e-05,
130
- "loss": 0.0009,
131
- "reward": 0.2455357201397419,
132
- "reward_std": 0.24880634248256683,
133
- "rewards/accuracy_reward": 0.10714286006987095,
134
  "rewards/format_reward": 0.0,
135
- "rewards/tag_count_reward": 0.1383928582072258,
136
  "step": 9
137
  },
138
  {
139
- "completion_length": 3733.8929443359375,
140
  "epoch": 0.27972027972027974,
141
- "grad_norm": 0.13345155119895935,
142
- "kl": 0.029022216796875,
143
  "learning_rate": 1.881921264348355e-05,
144
- "loss": 0.0012,
145
- "reward": 0.2239583432674408,
146
- "reward_std": 0.23797475174069405,
147
- "rewards/accuracy_reward": 0.08333333674818277,
148
  "rewards/format_reward": 0.0,
149
- "rewards/tag_count_reward": 0.140625,
150
  "step": 10
151
  },
152
  {
153
- "completion_length": 3744.1756591796875,
154
  "epoch": 0.3076923076923077,
155
- "grad_norm": 0.26249733567237854,
156
- "kl": 0.045654296875,
157
  "learning_rate": 1.8314696123025456e-05,
158
- "loss": 0.0018,
159
- "reward": 0.2172619104385376,
160
- "reward_std": 0.2155022956430912,
161
- "rewards/accuracy_reward": 0.056547621730715036,
162
  "rewards/format_reward": 0.0,
163
- "rewards/tag_count_reward": 0.16071428917348385,
164
  "step": 11
165
  },
166
  {
167
- "completion_length": 3854.7827758789062,
168
  "epoch": 0.3356643356643357,
169
- "grad_norm": 0.19111429154872894,
170
- "kl": 0.05322265625,
171
  "learning_rate": 1.773010453362737e-05,
172
- "loss": 0.0021,
173
- "reward": 0.1696428582072258,
174
- "reward_std": 0.21632104367017746,
175
- "rewards/accuracy_reward": 0.029761905316263437,
176
  "rewards/format_reward": 0.0,
177
- "rewards/tag_count_reward": 0.13988095708191395,
178
  "step": 12
179
  },
180
  {
181
- "completion_length": 3603.0952758789062,
182
  "epoch": 0.36363636363636365,
183
- "grad_norm": 0.20008493959903717,
184
- "kl": 0.04888916015625,
185
  "learning_rate": 1.7071067811865477e-05,
186
- "loss": 0.002,
187
- "reward": 0.396577388048172,
188
- "reward_std": 0.330210629850626,
189
- "rewards/accuracy_reward": 0.2321428619325161,
190
  "rewards/format_reward": 0.0,
191
- "rewards/tag_count_reward": 0.1644345298409462,
192
  "step": 13
193
  },
194
  {
195
- "completion_length": 3576.5714721679688,
196
  "epoch": 0.3916083916083916,
197
- "grad_norm": 0.28434476256370544,
198
- "kl": 0.07000732421875,
199
  "learning_rate": 1.6343932841636455e-05,
200
- "loss": 0.0028,
201
- "reward": 0.4546131044626236,
202
- "reward_std": 0.38984502851963043,
203
- "rewards/accuracy_reward": 0.2797619141638279,
204
  "rewards/format_reward": 0.0,
205
- "rewards/tag_count_reward": 0.174851194024086,
206
  "step": 14
207
  },
208
  {
209
- "completion_length": 3697.041748046875,
210
  "epoch": 0.4195804195804196,
211
- "grad_norm": 0.1858980506658554,
212
- "kl": 0.076171875,
213
  "learning_rate": 1.5555702330196024e-05,
214
- "loss": 0.003,
215
- "reward": 0.351190485060215,
216
- "reward_std": 0.3252423033118248,
217
- "rewards/accuracy_reward": 0.1875000037252903,
218
  "rewards/format_reward": 0.0,
219
- "rewards/tag_count_reward": 0.1636904813349247,
220
  "step": 15
221
  },
222
  {
223
- "completion_length": 3621.52685546875,
224
  "epoch": 0.44755244755244755,
225
- "grad_norm": 0.2509734034538269,
226
- "kl": 0.0791015625,
227
  "learning_rate": 1.4713967368259981e-05,
228
- "loss": 0.0032,
229
- "reward": 0.4129464328289032,
230
- "reward_std": 0.36431775987148285,
231
- "rewards/accuracy_reward": 0.2291666679084301,
232
  "rewards/format_reward": 0.0,
233
- "rewards/tag_count_reward": 0.1837797649204731,
234
  "step": 16
235
  },
236
  {
237
- "completion_length": 3533.1578369140625,
238
  "epoch": 0.4755244755244755,
239
- "grad_norm": 0.20609919726848602,
240
- "kl": 0.1068115234375,
241
  "learning_rate": 1.3826834323650899e-05,
242
- "loss": 0.0043,
243
- "reward": 0.376488097012043,
244
- "reward_std": 0.36176759749650955,
245
- "rewards/accuracy_reward": 0.20238095708191395,
246
  "rewards/format_reward": 0.0,
247
- "rewards/tag_count_reward": 0.1741071492433548,
248
  "step": 17
249
  },
250
  {
251
- "completion_length": 3520.2650146484375,
252
  "epoch": 0.5034965034965035,
253
- "grad_norm": 0.17171701788902283,
254
- "kl": 0.09619140625,
255
  "learning_rate": 1.2902846772544625e-05,
256
- "loss": 0.0038,
257
- "reward": 0.3824404925107956,
258
- "reward_std": 0.35053466260433197,
259
- "rewards/accuracy_reward": 0.1964285708963871,
260
  "rewards/format_reward": 0.0,
261
- "rewards/tag_count_reward": 0.1860119104385376,
262
  "step": 18
263
  },
264
  {
265
- "completion_length": 3493.7142944335938,
266
  "epoch": 0.5314685314685315,
267
- "grad_norm": 0.2554529905319214,
268
- "kl": 0.0972900390625,
269
  "learning_rate": 1.1950903220161286e-05,
270
- "loss": 0.0039,
271
- "reward": 0.4367559552192688,
272
- "reward_std": 0.4041043147444725,
273
- "rewards/accuracy_reward": 0.2321428656578064,
274
  "rewards/format_reward": 0.0,
275
- "rewards/tag_count_reward": 0.2046131007373333,
276
  "step": 19
277
  },
278
  {
279
- "completion_length": 3441.1636962890625,
280
  "epoch": 0.5594405594405595,
281
- "grad_norm": 0.19453173875808716,
282
- "kl": 0.1204833984375,
283
  "learning_rate": 1.098017140329561e-05,
284
- "loss": 0.0048,
285
- "reward": 0.387648805975914,
286
- "reward_std": 0.3576350286602974,
287
- "rewards/accuracy_reward": 0.1815476231276989,
288
  "rewards/format_reward": 0.0,
289
- "rewards/tag_count_reward": 0.206101194024086,
290
  "step": 20
291
  },
292
  {
293
- "completion_length": 3643.386962890625,
294
  "epoch": 0.5874125874125874,
295
- "grad_norm": 0.20605766773223877,
296
- "kl": 0.150390625,
297
  "learning_rate": 1e-05,
298
- "loss": 0.006,
299
- "reward": 0.433779776096344,
300
- "reward_std": 0.4044775441288948,
301
- "rewards/accuracy_reward": 0.2351190522313118,
302
  "rewards/format_reward": 0.0,
303
- "rewards/tag_count_reward": 0.1986607201397419,
304
  "step": 21
305
  },
306
  {
307
- "completion_length": 3483.3482666015625,
308
  "epoch": 0.6153846153846154,
309
- "grad_norm": 0.26148831844329834,
310
- "kl": 0.164794921875,
311
  "learning_rate": 9.019828596704394e-06,
312
- "loss": 0.0066,
313
- "reward": 0.432291679084301,
314
- "reward_std": 0.40663766860961914,
315
- "rewards/accuracy_reward": 0.20535714365541935,
316
  "rewards/format_reward": 0.0,
317
- "rewards/tag_count_reward": 0.2269345298409462,
318
  "step": 22
319
  },
320
  {
321
- "completion_length": 3216.8928833007812,
322
  "epoch": 0.6433566433566433,
323
- "grad_norm": 0.9234132766723633,
324
- "kl": 0.19677734375,
325
  "learning_rate": 8.04909677983872e-06,
326
- "loss": 0.0079,
327
- "reward": 0.5305059626698494,
328
- "reward_std": 0.4592476785182953,
329
- "rewards/accuracy_reward": 0.2857142835855484,
330
  "rewards/format_reward": 0.0,
331
- "rewards/tag_count_reward": 0.2447916716337204,
332
  "step": 23
333
  },
334
  {
335
- "completion_length": 3309.6517944335938,
336
  "epoch": 0.6713286713286714,
337
- "grad_norm": 0.43054357171058655,
338
- "kl": 0.211669921875,
339
  "learning_rate": 7.097153227455379e-06,
340
- "loss": 0.0085,
341
- "reward": 0.4821428805589676,
342
- "reward_std": 0.421218641102314,
343
- "rewards/accuracy_reward": 0.23511905036866665,
344
  "rewards/format_reward": 0.0,
345
- "rewards/tag_count_reward": 0.2470238134264946,
346
  "step": 24
347
  },
348
  {
349
- "completion_length": 3227.5625610351562,
350
  "epoch": 0.6993006993006993,
351
- "grad_norm": 0.47960782051086426,
352
- "kl": 0.154541015625,
353
  "learning_rate": 6.173165676349103e-06,
354
- "loss": 0.0062,
355
- "reward": 0.5811012014746666,
356
- "reward_std": 0.45310985296964645,
357
- "rewards/accuracy_reward": 0.3005952425301075,
358
  "rewards/format_reward": 0.0,
359
- "rewards/tag_count_reward": 0.2805059626698494,
360
  "step": 25
361
  },
362
  {
363
- "completion_length": 3407.9524536132812,
364
  "epoch": 0.7272727272727273,
365
- "grad_norm": 0.22348760068416595,
366
- "kl": 0.145263671875,
367
  "learning_rate": 5.286032631740023e-06,
368
- "loss": 0.0058,
369
- "reward": 0.5096726343035698,
370
- "reward_std": 0.36870553344488144,
371
- "rewards/accuracy_reward": 0.2261904776096344,
372
  "rewards/format_reward": 0.0,
373
- "rewards/tag_count_reward": 0.2834821417927742,
374
  "step": 26
375
  },
376
  {
377
- "completion_length": 3143.4613647460938,
378
  "epoch": 0.7552447552447552,
379
- "grad_norm": 0.35857293009757996,
380
- "kl": 0.1617431640625,
381
  "learning_rate": 4.444297669803981e-06,
382
- "loss": 0.0065,
383
- "reward": 0.5260416865348816,
384
- "reward_std": 0.38500337302684784,
385
- "rewards/accuracy_reward": 0.23214286379516125,
386
  "rewards/format_reward": 0.0,
387
- "rewards/tag_count_reward": 0.293898805975914,
388
  "step": 27
389
  },
390
  {
391
- "completion_length": 3477.8214721679688,
392
  "epoch": 0.7832167832167832,
393
- "grad_norm": 0.22233816981315613,
394
- "kl": 0.16357421875,
395
  "learning_rate": 3.6560671583635467e-06,
396
- "loss": 0.0065,
397
- "reward": 0.3973214253783226,
398
- "reward_std": 0.297023706138134,
399
- "rewards/accuracy_reward": 0.11309523927047849,
400
  "rewards/format_reward": 0.0,
401
- "rewards/tag_count_reward": 0.284226194024086,
402
  "step": 28
403
  },
404
  {
405
- "completion_length": 3345.33935546875,
406
  "epoch": 0.8111888111888111,
407
- "grad_norm": 0.3590240180492401,
408
- "kl": 0.154541015625,
409
  "learning_rate": 2.9289321881345257e-06,
410
- "loss": 0.0062,
411
- "reward": 0.5654762014746666,
412
- "reward_std": 0.359061636030674,
413
- "rewards/accuracy_reward": 0.25892857648432255,
414
  "rewards/format_reward": 0.0,
415
- "rewards/tag_count_reward": 0.3065476268529892,
416
  "step": 29
417
  },
418
  {
419
- "completion_length": 3411.324462890625,
420
  "epoch": 0.8391608391608392,
421
- "grad_norm": 0.2620408833026886,
422
- "kl": 0.161376953125,
423
  "learning_rate": 2.26989546637263e-06,
424
- "loss": 0.0065,
425
- "reward": 0.4680059552192688,
426
- "reward_std": 0.3296178914606571,
427
- "rewards/accuracy_reward": 0.15773810190148652,
428
  "rewards/format_reward": 0.0,
429
- "rewards/tag_count_reward": 0.3102678656578064,
430
  "step": 30
431
  },
432
  {
433
- "completion_length": 3285.4940185546875,
434
  "epoch": 0.8671328671328671,
435
- "grad_norm": 0.27508166432380676,
436
- "kl": 0.1549072265625,
437
  "learning_rate": 1.6853038769745466e-06,
438
- "loss": 0.0062,
439
- "reward": 0.5357143133878708,
440
- "reward_std": 0.3590538948774338,
441
- "rewards/accuracy_reward": 0.2351190522313118,
442
  "rewards/format_reward": 0.0,
443
- "rewards/tag_count_reward": 0.3005952388048172,
444
  "step": 31
445
  },
446
  {
447
- "completion_length": 3199.0447387695312,
448
  "epoch": 0.8951048951048951,
449
- "grad_norm": 6.951571941375732,
450
- "kl": 0.44775390625,
451
  "learning_rate": 1.1807873565164507e-06,
452
- "loss": 0.0179,
453
- "reward": 0.5491071566939354,
454
- "reward_std": 0.34805624932050705,
455
- "rewards/accuracy_reward": 0.2380952425301075,
456
  "rewards/format_reward": 0.0,
457
- "rewards/tag_count_reward": 0.3110119178891182,
458
  "step": 32
459
  },
460
  {
461
- "completion_length": 3249.9732666015625,
462
  "epoch": 0.9230769230769231,
463
- "grad_norm": 0.3479785621166229,
464
- "kl": 0.165283203125,
465
  "learning_rate": 7.612046748871327e-07,
466
- "loss": 0.0066,
467
- "reward": 0.5736607313156128,
468
- "reward_std": 0.3619985207915306,
469
- "rewards/accuracy_reward": 0.2440476231276989,
470
  "rewards/format_reward": 0.0,
471
- "rewards/tag_count_reward": 0.3296131044626236,
472
  "step": 33
473
  },
474
  {
475
- "completion_length": 3028.9494018554688,
476
  "epoch": 0.951048951048951,
477
- "grad_norm": 0.3831622302532196,
478
- "kl": 0.178466796875,
479
  "learning_rate": 4.305966426779118e-07,
480
- "loss": 0.0071,
481
- "reward": 0.5811012089252472,
482
- "reward_std": 0.37812189757823944,
483
- "rewards/accuracy_reward": 0.2678571492433548,
484
  "rewards/format_reward": 0.0,
485
- "rewards/tag_count_reward": 0.3132440596818924,
486
  "step": 34
487
  },
488
  {
489
- "completion_length": 3763.1279907226562,
490
  "epoch": 0.9790209790209791,
491
- "grad_norm": 0.1765451729297638,
492
- "kl": 0.155029296875,
493
  "learning_rate": 1.921471959676957e-07,
494
- "loss": 0.0062,
495
- "reward": 0.3869047686457634,
496
- "reward_std": 0.28744565322995186,
497
- "rewards/accuracy_reward": 0.13095238525420427,
498
  "rewards/format_reward": 0.0,
499
- "rewards/tag_count_reward": 0.2559523917734623,
500
  "step": 35
501
  },
502
  {
503
- "completion_length": 3817.74560546875,
504
  "epoch": 1.0,
505
- "grad_norm": 0.1765451729297638,
506
- "kl": 0.17024739583333334,
507
  "learning_rate": 4.815273327803183e-08,
508
- "loss": 0.0051,
509
- "reward": 0.3432539800802867,
510
- "reward_std": 0.2780213952064514,
511
- "rewards/accuracy_reward": 0.0912698432803154,
512
  "rewards/format_reward": 0.0,
513
- "rewards/tag_count_reward": 0.2519841293493907,
514
  "step": 36
515
  },
516
  {
517
  "epoch": 1.0,
518
  "step": 36,
519
  "total_flos": 0.0,
520
- "train_loss": 0.004263622482312219,
521
- "train_runtime": 9387.6074,
522
- "train_samples_per_second": 0.107,
523
- "train_steps_per_second": 0.004
524
  }
525
  ],
526
  "logging_steps": 1,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 3466.5059814453125,
14
  "epoch": 0.027972027972027972,
15
+ "grad_norm": 0.8895137310028076,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
19
+ "reward": 0.4657738283276558,
20
+ "reward_std": 0.11145408265292645,
21
+ "rewards/accuracy_reward": 0.08630952658131719,
22
  "rewards/format_reward": 0.0,
23
+ "rewards/tag_count_reward": 0.379464291036129,
24
  "step": 1
25
  },
26
  {
27
+ "completion_length": 3675.1995239257812,
28
  "epoch": 0.055944055944055944,
29
+ "grad_norm": 0.545717716217041,
30
  "kl": 0.0,
31
  "learning_rate": 5e-06,
32
  "loss": 0.0,
33
+ "reward": 0.473214291036129,
34
+ "reward_std": 0.16795706376433372,
35
+ "rewards/accuracy_reward": 0.09523809934034944,
36
  "rewards/format_reward": 0.0,
37
+ "rewards/tag_count_reward": 0.377976194024086,
38
  "step": 2
39
  },
40
  {
41
+ "completion_length": 3419.232177734375,
42
  "epoch": 0.08391608391608392,
43
+ "grad_norm": 0.5103467106819153,
44
+ "kl": 0.0010442733764648438,
45
  "learning_rate": 1e-05,
46
  "loss": 0.0,
47
+ "reward": 0.5677083432674408,
48
+ "reward_std": 0.17301351390779018,
49
+ "rewards/accuracy_reward": 0.15178571664728224,
50
  "rewards/format_reward": 0.0,
51
+ "rewards/tag_count_reward": 0.4159226268529892,
52
  "step": 3
53
  },
54
  {
55
+ "completion_length": 3326.1488647460938,
56
  "epoch": 0.11188811188811189,
57
+ "grad_norm": 0.9779719114303589,
58
+ "kl": 0.01001739501953125,
59
  "learning_rate": 1.5000000000000002e-05,
60
+ "loss": 0.0004,
61
+ "reward": 0.5312500074505806,
62
+ "reward_std": 0.18081304244697094,
63
+ "rewards/accuracy_reward": 0.10416666511446238,
64
  "rewards/format_reward": 0.0,
65
+ "rewards/tag_count_reward": 0.4270833507180214,
66
  "step": 4
67
  },
68
  {
69
+ "completion_length": 3557.666748046875,
70
  "epoch": 0.13986013986013987,
71
+ "grad_norm": 0.7781193256378174,
72
+ "kl": 0.0257568359375,
73
  "learning_rate": 2e-05,
74
+ "loss": 0.001,
75
+ "reward": 0.5401785746216774,
76
+ "reward_std": 0.17106053419411182,
77
+ "rewards/accuracy_reward": 0.1547619067132473,
78
  "rewards/format_reward": 0.0,
79
+ "rewards/tag_count_reward": 0.385416679084301,
80
  "step": 5
81
  },
82
  {
83
+ "completion_length": 3472.3839111328125,
84
  "epoch": 0.16783216783216784,
85
+ "grad_norm": 0.3135736286640167,
86
+ "kl": 0.0687255859375,
87
  "learning_rate": 1.995184726672197e-05,
88
+ "loss": 0.0027,
89
+ "reward": 0.5096726268529892,
90
+ "reward_std": 0.1920771636068821,
91
+ "rewards/accuracy_reward": 0.10714285913854837,
92
  "rewards/format_reward": 0.0,
93
+ "rewards/tag_count_reward": 0.4025297686457634,
94
  "step": 6
95
  },
96
  {
97
+ "completion_length": 3685.6577758789062,
98
  "epoch": 0.1958041958041958,
99
+ "grad_norm": 0.23582090437412262,
100
+ "kl": 0.0960693359375,
101
  "learning_rate": 1.9807852804032306e-05,
102
+ "loss": 0.0038,
103
+ "reward": 0.5394345298409462,
104
+ "reward_std": 0.1671802718192339,
105
+ "rewards/accuracy_reward": 0.14583333395421505,
106
  "rewards/format_reward": 0.0,
107
+ "rewards/tag_count_reward": 0.3936012014746666,
108
  "step": 7
109
  },
110
  {
111
+ "completion_length": 3511.9107666015625,
112
  "epoch": 0.22377622377622378,
113
+ "grad_norm": 0.19193226099014282,
114
+ "kl": 0.146484375,
115
  "learning_rate": 1.956940335732209e-05,
116
+ "loss": 0.0059,
117
+ "reward": 0.5066964402794838,
118
+ "reward_std": 0.1798535594716668,
119
+ "rewards/accuracy_reward": 0.11309523973613977,
120
  "rewards/format_reward": 0.0,
121
+ "rewards/tag_count_reward": 0.3936012014746666,
122
  "step": 8
123
  },
124
  {
125
+ "completion_length": 3710.8781127929688,
126
  "epoch": 0.2517482517482518,
127
+ "grad_norm": 0.15887923538684845,
128
+ "kl": 0.1806640625,
129
  "learning_rate": 1.9238795325112867e-05,
130
+ "loss": 0.0072,
131
+ "reward": 0.543898805975914,
132
+ "reward_std": 0.16826315969228745,
133
+ "rewards/accuracy_reward": 0.1636904780752957,
134
  "rewards/format_reward": 0.0,
135
+ "rewards/tag_count_reward": 0.3802083432674408,
136
  "step": 9
137
  },
138
  {
139
+ "completion_length": 3805.0596313476562,
140
  "epoch": 0.27972027972027974,
141
+ "grad_norm": 0.17851299047470093,
142
+ "kl": 0.225341796875,
143
  "learning_rate": 1.881921264348355e-05,
144
+ "loss": 0.009,
145
+ "reward": 0.5074404925107956,
146
+ "reward_std": 0.1729382909834385,
147
+ "rewards/accuracy_reward": 0.13392857275903225,
148
  "rewards/format_reward": 0.0,
149
+ "rewards/tag_count_reward": 0.3735119104385376,
150
  "step": 10
151
  },
152
  {
153
+ "completion_length": 3785.4822998046875,
154
  "epoch": 0.3076923076923077,
155
+ "grad_norm": 0.18465878069400787,
156
+ "kl": 0.266845703125,
157
  "learning_rate": 1.8314696123025456e-05,
158
+ "loss": 0.0107,
159
+ "reward": 0.4315476343035698,
160
+ "reward_std": 0.11403452791273594,
161
+ "rewards/accuracy_reward": 0.07738095452077687,
162
  "rewards/format_reward": 0.0,
163
+ "rewards/tag_count_reward": 0.3541666716337204,
164
  "step": 11
165
  },
166
  {
167
+ "completion_length": 3463.0625610351562,
168
  "epoch": 0.3356643356643357,
169
+ "grad_norm": 0.8323477506637573,
170
+ "kl": 0.28662109375,
171
  "learning_rate": 1.773010453362737e-05,
172
+ "loss": 0.0115,
173
+ "reward": 0.4404762014746666,
174
+ "reward_std": 0.14237426407635212,
175
+ "rewards/accuracy_reward": 0.059523810632526875,
176
  "rewards/format_reward": 0.0,
177
+ "rewards/tag_count_reward": 0.380952388048172,
178
  "step": 12
179
  },
180
  {
181
+ "completion_length": 3813.7142944335938,
182
  "epoch": 0.36363636363636365,
183
+ "grad_norm": 0.19555552303791046,
184
+ "kl": 0.30908203125,
185
  "learning_rate": 1.7071067811865477e-05,
186
+ "loss": 0.0124,
187
+ "reward": 0.6488095223903656,
188
+ "reward_std": 0.16268891375511885,
189
+ "rewards/accuracy_reward": 0.2946428656578064,
190
  "rewards/format_reward": 0.0,
191
+ "rewards/tag_count_reward": 0.354166679084301,
192
  "step": 13
193
  },
194
  {
195
+ "completion_length": 3913.9733276367188,
196
  "epoch": 0.3916083916083916,
197
+ "grad_norm": 0.1619066596031189,
198
+ "kl": 0.3642578125,
199
  "learning_rate": 1.6343932841636455e-05,
200
+ "loss": 0.0146,
201
+ "reward": 0.685267873108387,
202
+ "reward_std": 0.22458449006080627,
203
+ "rewards/accuracy_reward": 0.3452380932867527,
204
  "rewards/format_reward": 0.0,
205
+ "rewards/tag_count_reward": 0.3400297686457634,
206
  "step": 14
207
  },
208
  {
209
+ "completion_length": 4126.169677734375,
210
  "epoch": 0.4195804195804196,
211
+ "grad_norm": 183.4498291015625,
212
+ "kl": 1.22119140625,
213
  "learning_rate": 1.5555702330196024e-05,
214
+ "loss": 0.0489,
215
+ "reward": 0.5327381044626236,
216
+ "reward_std": 0.168159706518054,
217
+ "rewards/accuracy_reward": 0.21726191230118275,
218
  "rewards/format_reward": 0.0,
219
+ "rewards/tag_count_reward": 0.3154762014746666,
220
  "step": 15
221
  },
222
  {
223
+ "completion_length": 4069.3423461914062,
224
  "epoch": 0.44755244755244755,
225
+ "grad_norm": 0.14787977933883667,
226
+ "kl": 0.39599609375,
227
  "learning_rate": 1.4713967368259981e-05,
228
+ "loss": 0.0158,
229
+ "reward": 0.5260416716337204,
230
+ "reward_std": 0.1734736319631338,
231
+ "rewards/accuracy_reward": 0.21130952797830105,
232
  "rewards/format_reward": 0.0,
233
+ "rewards/tag_count_reward": 0.3147321492433548,
234
  "step": 16
235
  },
236
  {
237
+ "completion_length": 4006.3988647460938,
238
  "epoch": 0.4755244755244755,
239
+ "grad_norm": 7.54030704498291,
240
+ "kl": 0.4892578125,
241
  "learning_rate": 1.3826834323650899e-05,
242
+ "loss": 0.0196,
243
+ "reward": 0.5386904776096344,
244
+ "reward_std": 0.21569561585783958,
245
+ "rewards/accuracy_reward": 0.21726191090419888,
246
  "rewards/format_reward": 0.0,
247
+ "rewards/tag_count_reward": 0.321428582072258,
248
  "step": 17
249
  },
250
  {
251
+ "completion_length": 3950.7174072265625,
252
  "epoch": 0.5034965034965035,
253
+ "grad_norm": 0.24552476406097412,
254
+ "kl": 0.3994140625,
255
  "learning_rate": 1.2902846772544625e-05,
256
+ "loss": 0.016,
257
+ "reward": 0.5223214477300644,
258
+ "reward_std": 0.21196102164685726,
259
+ "rewards/accuracy_reward": 0.19345238246023655,
260
  "rewards/format_reward": 0.0,
261
+ "rewards/tag_count_reward": 0.3288690596818924,
262
  "step": 18
263
  },
264
  {
265
+ "completion_length": 3340.389892578125,
266
  "epoch": 0.5314685314685315,
267
+ "grad_norm": 0.2469428926706314,
268
+ "kl": 0.3818359375,
269
  "learning_rate": 1.1950903220161286e-05,
270
+ "loss": 0.0153,
271
+ "reward": 0.7261904776096344,
272
+ "reward_std": 0.3062071055173874,
273
+ "rewards/accuracy_reward": 0.3363095298409462,
274
  "rewards/format_reward": 0.0,
275
+ "rewards/tag_count_reward": 0.3898809626698494,
276
  "step": 19
277
  },
278
  {
279
+ "completion_length": 3832.21142578125,
280
  "epoch": 0.5594405594405595,
281
+ "grad_norm": 0.20858395099639893,
282
+ "kl": 0.4306640625,
283
  "learning_rate": 1.098017140329561e-05,
284
+ "loss": 0.0172,
285
+ "reward": 0.5424107238650322,
286
+ "reward_std": 0.277970090508461,
287
+ "rewards/accuracy_reward": 0.18452381156384945,
288
  "rewards/format_reward": 0.0,
289
+ "rewards/tag_count_reward": 0.357886902987957,
290
  "step": 20
291
  },
292
  {
293
+ "completion_length": 3544.5535888671875,
294
  "epoch": 0.5874125874125874,
295
+ "grad_norm": 0.8287240266799927,
296
+ "kl": 0.39404296875,
297
  "learning_rate": 1e-05,
298
+ "loss": 0.0158,
299
+ "reward": 0.5654762089252472,
300
+ "reward_std": 0.29340360686182976,
301
+ "rewards/accuracy_reward": 0.19642857648432255,
302
  "rewards/format_reward": 0.0,
303
+ "rewards/tag_count_reward": 0.3690476268529892,
304
  "step": 21
305
  },
306
  {
307
+ "completion_length": 3684.431640625,
308
  "epoch": 0.6153846153846154,
309
+ "grad_norm": 0.18662597239017487,
310
+ "kl": 0.3759765625,
311
  "learning_rate": 9.019828596704394e-06,
312
+ "loss": 0.015,
313
+ "reward": 0.5915178656578064,
314
+ "reward_std": 0.26311009749770164,
315
+ "rewards/accuracy_reward": 0.22619047947227955,
316
  "rewards/format_reward": 0.0,
317
+ "rewards/tag_count_reward": 0.365327388048172,
318
  "step": 22
319
  },
320
  {
321
+ "completion_length": 3296.729248046875,
322
  "epoch": 0.6433566433566433,
323
+ "grad_norm": 2.3806121349334717,
324
+ "kl": 0.42236328125,
325
  "learning_rate": 8.04909677983872e-06,
326
+ "loss": 0.0169,
327
+ "reward": 0.7991071492433548,
328
+ "reward_std": 0.3090755343437195,
329
+ "rewards/accuracy_reward": 0.3988095298409462,
330
  "rewards/format_reward": 0.0,
331
+ "rewards/tag_count_reward": 0.4002976268529892,
332
  "step": 23
333
  },
334
  {
335
+ "completion_length": 3165.3363647460938,
336
  "epoch": 0.6713286713286714,
337
+ "grad_norm": 0.24191996455192566,
338
+ "kl": 0.3544921875,
339
  "learning_rate": 7.097153227455379e-06,
340
+ "loss": 0.0142,
341
+ "reward": 0.769345261156559,
342
+ "reward_std": 0.3235185295343399,
343
+ "rewards/accuracy_reward": 0.342261902987957,
344
  "rewards/format_reward": 0.0,
345
+ "rewards/tag_count_reward": 0.4270833358168602,
346
  "step": 24
347
  },
348
  {
349
+ "completion_length": 3155.2977294921875,
350
  "epoch": 0.6993006993006993,
351
+ "grad_norm": 0.2336803525686264,
352
+ "kl": 0.34521484375,
353
  "learning_rate": 6.173165676349103e-06,
354
+ "loss": 0.0138,
355
+ "reward": 0.8110119253396988,
356
+ "reward_std": 0.30220678821206093,
357
+ "rewards/accuracy_reward": 0.3630952462553978,
358
  "rewards/format_reward": 0.0,
359
+ "rewards/tag_count_reward": 0.4479166716337204,
360
  "step": 25
361
  },
362
  {
363
+ "completion_length": 3391.669677734375,
364
  "epoch": 0.7272727272727273,
365
+ "grad_norm": 0.20384787023067474,
366
+ "kl": 0.38623046875,
367
  "learning_rate": 5.286032631740023e-06,
368
+ "loss": 0.0154,
369
+ "reward": 0.7068452686071396,
370
+ "reward_std": 0.22004729695618153,
371
+ "rewards/accuracy_reward": 0.318452388048172,
372
  "rewards/format_reward": 0.0,
373
+ "rewards/tag_count_reward": 0.3883928656578064,
374
  "step": 26
375
  },
376
  {
377
+ "completion_length": 2888.4583740234375,
378
  "epoch": 0.7552447552447552,
379
+ "grad_norm": 0.21734486520290375,
380
+ "kl": 0.36328125,
381
  "learning_rate": 4.444297669803981e-06,
382
+ "loss": 0.0145,
383
+ "reward": 0.8251488208770752,
384
+ "reward_std": 0.23692942410707474,
385
+ "rewards/accuracy_reward": 0.354166679084301,
386
  "rewards/format_reward": 0.0,
387
+ "rewards/tag_count_reward": 0.4709821566939354,
388
  "step": 27
389
  },
390
  {
391
+ "completion_length": 3569.511962890625,
392
  "epoch": 0.7832167832167832,
393
+ "grad_norm": 0.22490565478801727,
394
+ "kl": 0.38720703125,
395
  "learning_rate": 3.6560671583635467e-06,
396
+ "loss": 0.0155,
397
+ "reward": 0.5215774029493332,
398
+ "reward_std": 0.1440001018345356,
399
+ "rewards/accuracy_reward": 0.12797619495540857,
400
  "rewards/format_reward": 0.0,
401
+ "rewards/tag_count_reward": 0.3936012014746666,
402
  "step": 28
403
  },
404
  {
405
+ "completion_length": 3503.2530517578125,
406
  "epoch": 0.8111888111888111,
407
+ "grad_norm": 0.161085844039917,
408
+ "kl": 0.32861328125,
409
  "learning_rate": 2.9289321881345257e-06,
410
+ "loss": 0.0131,
411
+ "reward": 0.658482164144516,
412
+ "reward_std": 0.23359560035169125,
413
+ "rewards/accuracy_reward": 0.23809524439275265,
414
  "rewards/format_reward": 0.0,
415
+ "rewards/tag_count_reward": 0.4203869178891182,
416
  "step": 29
417
  },
418
  {
419
+ "completion_length": 3520.7709350585938,
420
  "epoch": 0.8391608391608392,
421
+ "grad_norm": 0.6872840523719788,
422
+ "kl": 0.3876953125,
423
  "learning_rate": 2.26989546637263e-06,
424
+ "loss": 0.0155,
425
+ "reward": 0.5461309626698494,
426
+ "reward_std": 0.1894807331264019,
427
+ "rewards/accuracy_reward": 0.14880952727980912,
428
  "rewards/format_reward": 0.0,
429
+ "rewards/tag_count_reward": 0.3973214328289032,
430
  "step": 30
431
  },
432
  {
433
+ "completion_length": 3472.7084350585938,
434
  "epoch": 0.8671328671328671,
435
+ "grad_norm": 0.19596825540065765,
436
+ "kl": 0.345703125,
437
  "learning_rate": 1.6853038769745466e-06,
438
+ "loss": 0.0138,
439
+ "reward": 0.6770833432674408,
440
+ "reward_std": 0.2258823774755001,
441
+ "rewards/accuracy_reward": 0.2589285783469677,
442
  "rewards/format_reward": 0.0,
443
+ "rewards/tag_count_reward": 0.4181547686457634,
444
  "step": 31
445
  },
446
  {
447
+ "completion_length": 3174.8154907226562,
448
  "epoch": 0.8951048951048951,
449
+ "grad_norm": 0.16952919960021973,
450
+ "kl": 0.32080078125,
451
  "learning_rate": 1.1807873565164507e-06,
452
+ "loss": 0.0128,
453
+ "reward": 0.7038690596818924,
454
+ "reward_std": 0.2037985846400261,
455
+ "rewards/accuracy_reward": 0.2619047686457634,
456
  "rewards/format_reward": 0.0,
457
+ "rewards/tag_count_reward": 0.4419642984867096,
458
  "step": 32
459
  },
460
  {
461
+ "completion_length": 3343.431640625,
462
  "epoch": 0.9230769230769231,
463
+ "grad_norm": 0.38616687059402466,
464
+ "kl": 0.380859375,
465
  "learning_rate": 7.612046748871327e-07,
466
+ "loss": 0.0152,
467
+ "reward": 0.6845238357782364,
468
+ "reward_std": 0.25144897773861885,
469
+ "rewards/accuracy_reward": 0.258928582072258,
470
  "rewards/format_reward": 0.0,
471
+ "rewards/tag_count_reward": 0.4255952462553978,
472
  "step": 33
473
  },
474
  {
475
+ "completion_length": 2863.6845703125,
476
  "epoch": 0.951048951048951,
477
+ "grad_norm": 55.34281539916992,
478
+ "kl": 1.37109375,
479
  "learning_rate": 4.305966426779118e-07,
480
+ "loss": 0.0549,
481
+ "reward": 0.8660714328289032,
482
+ "reward_std": 0.29025010019540787,
483
+ "rewards/accuracy_reward": 0.348214291036129,
484
  "rewards/format_reward": 0.0,
485
+ "rewards/tag_count_reward": 0.5178571417927742,
486
  "step": 34
487
  },
488
  {
489
+ "completion_length": 3796.6666870117188,
490
  "epoch": 0.9790209790209791,
491
+ "grad_norm": 0.18026739358901978,
492
+ "kl": 0.412109375,
493
  "learning_rate": 1.921471959676957e-07,
494
+ "loss": 0.0165,
495
+ "reward": 0.4620535895228386,
496
+ "reward_std": 0.18349270708858967,
497
+ "rewards/accuracy_reward": 0.09523809794336557,
498
  "rewards/format_reward": 0.0,
499
+ "rewards/tag_count_reward": 0.366815485060215,
500
  "step": 35
501
  },
502
  {
503
+ "completion_length": 4218.495524088542,
504
  "epoch": 1.0,
505
+ "grad_norm": 0.18026739358901978,
506
+ "kl": 0.4368489583333333,
507
  "learning_rate": 4.815273327803183e-08,
508
+ "loss": 0.0131,
509
+ "reward": 0.3720238208770752,
510
+ "reward_std": 0.15442068378130594,
511
+ "rewards/accuracy_reward": 0.0476190485060215,
512
  "rewards/format_reward": 0.0,
513
+ "rewards/tag_count_reward": 0.3244047661622365,
514
  "step": 36
515
  },
516
  {
517
  "epoch": 1.0,
518
  "step": 36,
519
  "total_flos": 0.0,
520
+ "train_loss": 0.013564518235874741,
521
+ "train_runtime": 16822.299,
522
+ "train_samples_per_second": 0.059,
523
+ "train_steps_per_second": 0.002
524
  }
525
  ],
526
  "logging_steps": 1,