qingyangzhang committed on
Commit
6921ac4
·
verified ·
1 Parent(s): e4c8ad9

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +250 -10
README.md CHANGED
@@ -1,10 +1,8 @@
1
  ---
2
- datasets: domenicrosati/TruthfulQA
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-EMPO-TQA
5
  tags:
6
  - generated_from_trainer
7
- - open-r1
8
  - trl
9
  - grpo
10
  licence: license
@@ -12,7 +10,7 @@ licence: license
12
 
13
  # Model Card for Qwen2.5-3B-EMPO-TQA
14
 
15
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [domenicrosati/TruthfulQA](https://huggingface.co/datasets/domenicrosati/TruthfulQA) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
@@ -28,7 +26,7 @@ print(output["generated_text"])
28
 
29
  ## Training procedure
30
 
31
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/gaqlrb6w)
32
 
33
 
34
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
 
2
  library_name: transformers
3
  model_name: Qwen2.5-3B-EMPO-TQA
4
  tags:
5
  - generated_from_trainer
 
6
  - trl
7
  - grpo
8
  licence: license
 
10
 
11
  # Model Card for Qwen2.5-3B-EMPO-TQA
12
 
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/s54psly4)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00023655204281105568,
4
- "train_runtime": 1010.415,
5
  "train_samples": 490,
6
- "train_samples_per_second": 0.485,
7
- "train_steps_per_second": 0.01
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.00038097099556277193,
4
+ "train_runtime": 1888.0541,
5
  "train_samples": 490,
6
+ "train_samples_per_second": 0.779,
7
+ "train_steps_per_second": 0.016
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00023655204281105568,
4
- "train_runtime": 1010.415,
5
  "train_samples": 490,
6
- "train_samples_per_second": 0.485,
7
- "train_steps_per_second": 0.01
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.00038097099556277193,
4
+ "train_runtime": 1888.0541,
5
  "train_samples": 490,
6
+ "train_samples_per_second": 0.779,
7
+ "train_steps_per_second": 0.016
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.975609756097561,
5
  "eval_steps": 100,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -129,19 +129,259 @@
129
  "step": 10
130
  },
131
  {
132
- "epoch": 0.975609756097561,
133
- "step": 10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  "total_flos": 0.0,
135
- "train_loss": 0.00023655204281105568,
136
- "train_runtime": 1010.415,
137
- "train_samples_per_second": 0.485,
138
- "train_steps_per_second": 0.01
139
  }
140
  ],
141
  "logging_steps": 1,
142
- "max_steps": 10,
143
  "num_input_tokens_seen": 0,
144
- "num_train_epochs": 1,
145
  "save_steps": 500,
146
  "stateful_callbacks": {
147
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.8780487804878048,
5
  "eval_steps": 100,
6
+ "global_step": 30,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
129
  "step": 10
130
  },
131
  {
132
+ "completion_length": 17.564236164093018,
133
+ "epoch": 1.0975609756097562,
134
+ "grad_norm": 0.6446972489356995,
135
+ "kl": 0.009983062744140625,
136
+ "learning_rate": 1.5971585917027862e-06,
137
+ "loss": 0.0004,
138
+ "reward": 0.5920138917863369,
139
+ "reward_std": 0.11490329634398222,
140
+ "rewards/semantic_entropy": 0.5920138917863369,
141
+ "step": 11
142
+ },
143
+ {
144
+ "completion_length": 20.96701431274414,
145
+ "epoch": 1.1951219512195121,
146
+ "grad_norm": 0.4366309642791748,
147
+ "kl": 0.012844085693359375,
148
+ "learning_rate": 1.5e-06,
149
+ "loss": 0.0005,
150
+ "reward": 0.5960648246109486,
151
+ "reward_std": 0.10349765885621309,
152
+ "rewards/semantic_entropy": 0.5960648246109486,
153
+ "step": 12
154
+ },
155
+ {
156
+ "completion_length": 20.28819489479065,
157
+ "epoch": 1.2926829268292683,
158
+ "grad_norm": 0.7926385998725891,
159
+ "kl": 0.017467498779296875,
160
+ "learning_rate": 1.3960797660391568e-06,
161
+ "loss": 0.0007,
162
+ "reward": 0.5014467723667622,
163
+ "reward_std": 0.12278107088059187,
164
+ "rewards/semantic_entropy": 0.5014467723667622,
165
+ "step": 13
166
+ },
167
+ {
168
+ "completion_length": 20.972223043441772,
169
+ "epoch": 1.3902439024390243,
170
+ "grad_norm": 0.67302006483078,
171
+ "kl": 0.017255783081054688,
172
+ "learning_rate": 1.2868032327110903e-06,
173
+ "loss": 0.0007,
174
+ "reward": 0.5095486305654049,
175
+ "reward_std": 0.09976449748501182,
176
+ "rewards/semantic_entropy": 0.5095486305654049,
177
+ "step": 14
178
+ },
179
+ {
180
+ "completion_length": 18.17013907432556,
181
+ "epoch": 1.4878048780487805,
182
+ "grad_norm": 0.43078961968421936,
183
+ "kl": 0.0076904296875,
184
+ "learning_rate": 1.1736481776669305e-06,
185
+ "loss": 0.0003,
186
+ "reward": 0.588252317160368,
187
+ "reward_std": 0.1104576913639903,
188
+ "rewards/semantic_entropy": 0.588252317160368,
189
+ "step": 15
190
+ },
191
+ {
192
+ "completion_length": 19.697917222976685,
193
+ "epoch": 1.5853658536585367,
194
+ "grad_norm": 0.4722602665424347,
195
+ "kl": 0.010288238525390625,
196
+ "learning_rate": 1.0581448289104758e-06,
197
+ "loss": 0.0004,
198
+ "reward": 0.559606496244669,
199
+ "reward_std": 0.11904297955334187,
200
+ "rewards/semantic_entropy": 0.559606496244669,
201
+ "step": 16
202
+ },
203
+ {
204
+ "completion_length": 19.661458730697632,
205
+ "epoch": 1.6829268292682928,
206
+ "grad_norm": 0.6996189951896667,
207
+ "kl": 0.012149810791015625,
208
+ "learning_rate": 9.418551710895241e-07,
209
+ "loss": 0.0005,
210
+ "reward": 0.591435182839632,
211
+ "reward_std": 0.10133868269622326,
212
+ "rewards/semantic_entropy": 0.591435182839632,
213
+ "step": 17
214
+ },
215
+ {
216
+ "completion_length": 21.19444465637207,
217
+ "epoch": 1.7804878048780488,
218
+ "grad_norm": 0.5424029231071472,
219
+ "kl": 0.023311614990234375,
220
+ "learning_rate": 8.263518223330696e-07,
221
+ "loss": 0.0009,
222
+ "reward": 0.5104166567325592,
223
+ "reward_std": 0.1390146454796195,
224
+ "rewards/semantic_entropy": 0.5104166567325592,
225
+ "step": 18
226
+ },
227
+ {
228
+ "completion_length": 21.192708253860474,
229
+ "epoch": 1.8780487804878048,
230
+ "grad_norm": 0.5004396438598633,
231
+ "kl": 0.007457733154296875,
232
+ "learning_rate": 7.1319676728891e-07,
233
+ "loss": 0.0003,
234
+ "reward": 0.5214120373129845,
235
+ "reward_std": 0.12240998912602663,
236
+ "rewards/semantic_entropy": 0.5214120373129845,
237
+ "step": 19
238
+ },
239
+ {
240
+ "completion_length": 19.510417342185974,
241
+ "epoch": 1.975609756097561,
242
+ "grad_norm": 0.5037679076194763,
243
+ "kl": 0.012096405029296875,
244
+ "learning_rate": 6.039202339608431e-07,
245
+ "loss": 0.0005,
246
+ "reward": 0.6221064738929272,
247
+ "reward_std": 0.11695278249680996,
248
+ "rewards/semantic_entropy": 0.6221064738929272,
249
+ "step": 20
250
+ },
251
+ {
252
+ "completion_length": 17.875,
253
+ "epoch": 2.0,
254
+ "grad_norm": 0.5037679076194763,
255
+ "kl": 0.03363037109375,
256
+ "learning_rate": 5.000000000000002e-07,
257
+ "loss": 0.0003,
258
+ "reward": 1.0,
259
+ "reward_std": 0.0,
260
+ "rewards/semantic_entropy": 1.0,
261
+ "step": 21
262
+ },
263
+ {
264
+ "completion_length": 18.83506989479065,
265
+ "epoch": 2.097560975609756,
266
+ "grad_norm": 0.727460503578186,
267
+ "kl": 0.01171875,
268
+ "learning_rate": 4.02841408297214e-07,
269
+ "loss": 0.0005,
270
+ "reward": 0.6105324216187,
271
+ "reward_std": 0.0801440766081214,
272
+ "rewards/semantic_entropy": 0.6105324216187,
273
+ "step": 22
274
+ },
275
+ {
276
+ "completion_length": 20.21701443195343,
277
+ "epoch": 2.1951219512195124,
278
+ "grad_norm": 0.6406362056732178,
279
+ "kl": 0.01398468017578125,
280
+ "learning_rate": 3.137583621312665e-07,
281
+ "loss": 0.0006,
282
+ "reward": 0.5815972313284874,
283
+ "reward_std": 0.12349289190024137,
284
+ "rewards/semantic_entropy": 0.5815972313284874,
285
+ "step": 23
286
+ },
287
+ {
288
+ "completion_length": 21.807291984558105,
289
+ "epoch": 2.292682926829268,
290
+ "grad_norm": 0.45703092217445374,
291
+ "kl": 0.012033462524414062,
292
+ "learning_rate": 2.339555568810221e-07,
293
+ "loss": 0.0005,
294
+ "reward": 0.5804398320615292,
295
+ "reward_std": 0.1076621082611382,
296
+ "rewards/semantic_entropy": 0.5804398320615292,
297
+ "step": 24
298
+ },
299
+ {
300
+ "completion_length": 20.090278387069702,
301
+ "epoch": 2.3902439024390243,
302
+ "grad_norm": 1.17950439453125,
303
+ "kl": 0.023633956909179688,
304
+ "learning_rate": 1.6451218858706372e-07,
305
+ "loss": 0.0009,
306
+ "reward": 0.4774305485188961,
307
+ "reward_std": 0.11873876117169857,
308
+ "rewards/semantic_entropy": 0.4774305485188961,
309
+ "step": 25
310
+ },
311
+ {
312
+ "completion_length": 19.114583492279053,
313
+ "epoch": 2.4878048780487805,
314
+ "grad_norm": 1.0232934951782227,
315
+ "kl": 0.021167755126953125,
316
+ "learning_rate": 1.0636735967658784e-07,
317
+ "loss": 0.0008,
318
+ "reward": 0.5917245410382748,
319
+ "reward_std": 0.10972362849861383,
320
+ "rewards/semantic_entropy": 0.5917245410382748,
321
+ "step": 26
322
+ },
323
+ {
324
+ "completion_length": 17.560763835906982,
325
+ "epoch": 2.5853658536585367,
326
+ "grad_norm": 0.45686405897140503,
327
+ "kl": 0.0112457275390625,
328
+ "learning_rate": 6.030737921409168e-08,
329
+ "loss": 0.0004,
330
+ "reward": 0.5879629701375961,
331
+ "reward_std": 0.10896958655212075,
332
+ "rewards/semantic_entropy": 0.5879629701375961,
333
+ "step": 27
334
+ },
335
+ {
336
+ "completion_length": 18.482638955116272,
337
+ "epoch": 2.682926829268293,
338
+ "grad_norm": 2.8627212047576904,
339
+ "kl": 0.028564453125,
340
+ "learning_rate": 2.6955129420176193e-08,
341
+ "loss": 0.0011,
342
+ "reward": 0.5341435223817825,
343
+ "reward_std": 0.11482170736417174,
344
+ "rewards/semantic_entropy": 0.5341435223817825,
345
+ "step": 28
346
+ },
347
+ {
348
+ "completion_length": 20.75868058204651,
349
+ "epoch": 2.7804878048780486,
350
+ "grad_norm": 0.60300213098526,
351
+ "kl": 0.014234542846679688,
352
+ "learning_rate": 6.761642258056976e-09,
353
+ "loss": 0.0006,
354
+ "reward": 0.47453703358769417,
355
+ "reward_std": 0.12850847654044628,
356
+ "rewards/semantic_entropy": 0.47453703358769417,
357
+ "step": 29
358
+ },
359
+ {
360
+ "completion_length": 19.67881965637207,
361
+ "epoch": 2.8780487804878048,
362
+ "grad_norm": 0.4153330624103546,
363
+ "kl": 0.01200103759765625,
364
+ "learning_rate": 0.0,
365
+ "loss": 0.0005,
366
+ "reward": 0.6588541865348816,
367
+ "reward_std": 0.10488813614938408,
368
+ "rewards/semantic_entropy": 0.6588541865348816,
369
+ "step": 30
370
+ },
371
+ {
372
+ "epoch": 2.8780487804878048,
373
+ "step": 30,
374
  "total_flos": 0.0,
375
+ "train_loss": 0.00038097099556277193,
376
+ "train_runtime": 1888.0541,
377
+ "train_samples_per_second": 0.779,
378
+ "train_steps_per_second": 0.016
379
  }
380
  ],
381
  "logging_steps": 1,
382
+ "max_steps": 30,
383
  "num_input_tokens_seen": 0,
384
+ "num_train_epochs": 3,
385
  "save_steps": 500,
386
  "stateful_callbacks": {
387
  "TrainerControl": {