Lansechen committed on
Commit
10650a2
·
verified ·
1 Parent(s): 9bac3e9

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-3B-Instruct
3
- datasets: Lansechen/om220k_collection_filtered_easy_maxlength32768
4
  library_name: transformers
5
  model_name: Qwen2.5-3B-Instruct-Distill-om220k-fem32768-batch32-epoch3-8192-SORTED
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-3B-Instruct-Distill-om220k-fem32768-batch32-epoch3-8192-SORTED
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) on the [Lansechen/om220k_collection_filtered_easy_maxlength32768](https://huggingface.co/datasets/Lansechen/om220k_collection_filtered_easy_maxlength32768) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenran1995-the-chinese-university-of-hong-kong/huggingface/runs/lctnu2ca)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2.5-3B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-Instruct-Distill-om220k-fem32768-batch32-epoch3-8192-SORTED
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-3B-Instruct-Distill-om220k-fem32768-batch32-epoch3-8192-SORTED
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenran1995-the-chinese-university-of-hong-kong/huggingface/runs/rk7o4un1)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 487266319859712.0,
3
- "train_loss": 0.4755451946547537,
4
- "train_runtime": 8692.0512,
5
  "train_samples": 14090,
6
- "train_samples_per_second": 4.863,
7
- "train_steps_per_second": 0.038
8
  }
 
1
  {
2
+ "total_flos": 462777357434880.0,
3
+ "train_loss": 0.5179573546827229,
4
+ "train_runtime": 8239.8327,
5
  "train_samples": 14090,
6
+ "train_samples_per_second": 4.764,
7
+ "train_steps_per_second": 0.037
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2012204f22afb8d3fff387d7aea0393e487cee227a595908f641521dd8d1b68a
3
  size 4957560304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad833d57d0b1eb76651d5b858ae2d07e6c8b91a3f4c34caebd88f76814a8350f
3
  size 4957560304
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5dac48cba186c8391cdf5567f226f16b298eac1f67ea0b8bd2405207d8b76a8
3
  size 1214366696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9bc9d2ff8756128c2da47bcb02eb188a1f70654f5018f9b492f5b022e845a2a
3
  size 1214366696
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 487266319859712.0,
3
- "train_loss": 0.4755451946547537,
4
- "train_runtime": 8692.0512,
5
  "train_samples": 14090,
6
- "train_samples_per_second": 4.863,
7
- "train_steps_per_second": 0.038
8
  }
 
1
  {
2
+ "total_flos": 462777357434880.0,
3
+ "train_loss": 0.5179573546827229,
4
+ "train_runtime": 8239.8327,
5
  "train_samples": 14090,
6
+ "train_samples_per_second": 4.764,
7
+ "train_steps_per_second": 0.037
8
  }
trainer_state.json CHANGED
@@ -1,553 +1,514 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9807037457434733,
5
  "eval_steps": 500,
6
- "global_step": 330,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.04540295119182747,
13
- "grad_norm": 2.2832868099212646,
14
- "learning_rate": 1.4705882352941177e-05,
15
- "loss": 0.8501,
16
- "mean_token_accuracy": 0.7685548841953278,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.09080590238365494,
21
- "grad_norm": 1.0126409530639648,
22
- "learning_rate": 2.9411764705882354e-05,
23
- "loss": 0.737,
24
- "mean_token_accuracy": 0.7846408411860466,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.1362088535754824,
29
- "grad_norm": 0.6942329406738281,
30
- "learning_rate": 4.411764705882353e-05,
31
- "loss": 0.6464,
32
- "mean_token_accuracy": 0.803183613717556,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.18161180476730987,
37
- "grad_norm": 0.43081212043762207,
38
- "learning_rate": 4.9989800631379443e-05,
39
- "loss": 0.6263,
40
- "mean_token_accuracy": 0.8069212257862091,
41
  "step": 20
42
  },
43
  {
44
- "epoch": 0.22701475595913734,
45
- "grad_norm": 0.3860848844051361,
46
- "learning_rate": 4.992750463988114e-05,
47
- "loss": 0.5918,
48
- "mean_token_accuracy": 0.8146931797266006,
49
  "step": 25
50
  },
51
  {
52
- "epoch": 0.2724177071509648,
53
- "grad_norm": 0.3675363063812256,
54
- "learning_rate": 4.9808735645324125e-05,
55
- "loss": 0.5639,
56
- "mean_token_accuracy": 0.8224288642406463,
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.3178206583427923,
61
- "grad_norm": 0.3828999698162079,
62
- "learning_rate": 4.963379271093012e-05,
63
- "loss": 0.5616,
64
- "mean_token_accuracy": 0.8219838857650756,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.36322360953461974,
69
- "grad_norm": 0.3435409665107727,
70
- "learning_rate": 4.9403116347269866e-05,
71
- "loss": 0.5552,
72
- "mean_token_accuracy": 0.8231068581342698,
73
  "step": 40
74
  },
75
  {
76
- "epoch": 0.4086265607264472,
77
- "grad_norm": 0.34848248958587646,
78
- "learning_rate": 4.9117287403046766e-05,
79
- "loss": 0.5448,
80
- "mean_token_accuracy": 0.8261528626084328,
81
  "step": 45
82
  },
83
  {
84
- "epoch": 0.4540295119182747,
85
- "grad_norm": 0.3018947243690491,
86
- "learning_rate": 4.87770256025057e-05,
87
- "loss": 0.5387,
88
- "mean_token_accuracy": 0.8274358585476875,
89
  "step": 50
90
  },
91
  {
92
- "epoch": 0.49943246311010214,
93
- "grad_norm": 0.29304125905036926,
94
- "learning_rate": 4.8383187733149814e-05,
95
- "loss": 0.5286,
96
- "mean_token_accuracy": 0.8303813666105271,
97
  "step": 55
98
  },
99
  {
100
- "epoch": 0.5448354143019296,
101
- "grad_norm": 0.3363098204135895,
102
- "learning_rate": 4.7936765488328794e-05,
103
- "loss": 0.5338,
104
- "mean_token_accuracy": 0.8282567322254181,
105
  "step": 60
106
  },
107
  {
108
- "epoch": 0.5902383654937571,
109
- "grad_norm": 0.3552097678184509,
110
- "learning_rate": 4.7438882970130756e-05,
111
- "loss": 0.5262,
112
- "mean_token_accuracy": 0.8306325897574425,
113
  "step": 65
114
  },
115
  {
116
- "epoch": 0.6356413166855845,
117
- "grad_norm": 0.3871628940105438,
118
- "learning_rate": 4.6890793858865865e-05,
119
- "loss": 0.5315,
120
- "mean_token_accuracy": 0.8285770401358604,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.681044267877412,
125
- "grad_norm": 0.3822765350341797,
126
- "learning_rate": 4.629387825626875e-05,
127
- "loss": 0.5281,
128
- "mean_token_accuracy": 0.8299038335680962,
129
  "step": 75
130
  },
131
  {
132
- "epoch": 0.7264472190692395,
133
- "grad_norm": 0.4144015312194824,
134
- "learning_rate": 4.5649639210368714e-05,
135
- "loss": 0.5234,
136
- "mean_token_accuracy": 0.8312319874763489,
137
  "step": 80
138
  },
139
  {
140
- "epoch": 0.771850170261067,
141
- "grad_norm": 0.3905055820941925,
142
- "learning_rate": 4.4959698930778184e-05,
143
- "loss": 0.5214,
144
- "mean_token_accuracy": 0.8313256472349166,
145
  "step": 85
146
  },
147
  {
148
- "epoch": 0.8172531214528944,
149
- "grad_norm": 0.34477153420448303,
150
- "learning_rate": 4.422579470392941e-05,
151
- "loss": 0.5219,
152
- "mean_token_accuracy": 0.8311609581112862,
153
  "step": 90
154
  },
155
  {
156
- "epoch": 0.8626560726447219,
157
- "grad_norm": 0.3220530152320862,
158
- "learning_rate": 4.3449774518544837e-05,
159
- "loss": 0.5187,
160
- "mean_token_accuracy": 0.8318586707115173,
161
  "step": 95
162
  },
163
  {
164
- "epoch": 0.9080590238365494,
165
- "grad_norm": 0.36191967129707336,
166
- "learning_rate": 4.263359241235657e-05,
167
- "loss": 0.5095,
168
- "mean_token_accuracy": 0.8344970971345902,
169
  "step": 100
170
  },
171
  {
172
- "epoch": 0.9534619750283768,
173
- "grad_norm": 0.333427369594574,
174
- "learning_rate": 4.1779303551791695e-05,
175
- "loss": 0.5083,
176
- "mean_token_accuracy": 0.8353533893823624,
177
  "step": 105
178
  },
179
  {
180
- "epoch": 0.9988649262202043,
181
- "grad_norm": 0.35690993070602417,
182
- "learning_rate": 4.088905905701316e-05,
183
- "loss": 0.5166,
184
- "mean_token_accuracy": 0.8316411137580871,
185
  "step": 110
186
  },
187
  {
188
- "epoch": 1.036322360953462,
189
- "grad_norm": 0.5551149845123291,
190
- "learning_rate": 3.996510058534682e-05,
191
- "loss": 0.5007,
192
- "mean_token_accuracy": 0.8440239573969985,
193
  "step": 115
194
  },
195
  {
196
- "epoch": 1.0817253121452894,
197
- "grad_norm": 0.47316232323646545,
198
- "learning_rate": 3.900975468673368e-05,
199
- "loss": 0.4582,
200
- "mean_token_accuracy": 0.8475249111652374,
201
  "step": 120
202
  },
203
  {
204
- "epoch": 1.127128263337117,
205
- "grad_norm": 0.416748583316803,
206
- "learning_rate": 3.8025426945420426e-05,
207
- "loss": 0.4556,
208
- "mean_token_accuracy": 0.848306542634964,
209
  "step": 125
210
  },
211
  {
212
- "epoch": 1.1725312145289444,
213
- "grad_norm": 0.3407842218875885,
214
- "learning_rate": 3.701459592263974e-05,
215
- "loss": 0.4654,
216
- "mean_token_accuracy": 0.8451288223266602,
217
  "step": 130
218
  },
219
  {
220
- "epoch": 1.2179341657207718,
221
- "grad_norm": 0.319158673286438,
222
- "learning_rate": 3.59798069155327e-05,
223
- "loss": 0.4574,
224
- "mean_token_accuracy": 0.8477920219302177,
225
  "step": 135
226
  },
227
  {
228
- "epoch": 1.2633371169125993,
229
- "grad_norm": 0.3241802155971527,
230
- "learning_rate": 3.492366554802856e-05,
231
- "loss": 0.4583,
232
- "mean_token_accuracy": 0.8482500284910202,
233
  "step": 140
234
  },
235
  {
236
- "epoch": 1.3087400681044268,
237
- "grad_norm": 0.31475913524627686,
238
- "learning_rate": 3.384883120982027e-05,
239
- "loss": 0.4611,
240
- "mean_token_accuracy": 0.8468337655067444,
241
  "step": 145
242
  },
243
  {
244
- "epoch": 1.3541430192962542,
245
- "grad_norm": 0.2913879454135895,
246
- "learning_rate": 3.2758010359956376e-05,
247
- "loss": 0.4507,
248
- "mean_token_accuracy": 0.8497300013899803,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 1.3995459704880817,
253
- "grad_norm": 0.32135486602783203,
254
- "learning_rate": 3.165394971191125e-05,
255
- "loss": 0.4553,
256
- "mean_token_accuracy": 0.8484074637293816,
257
  "step": 155
258
  },
259
  {
260
- "epoch": 1.4449489216799092,
261
- "grad_norm": 0.3051537871360779,
262
- "learning_rate": 3.053942931729365e-05,
263
- "loss": 0.4649,
264
- "mean_token_accuracy": 0.8455980613827705,
265
  "step": 160
266
  },
267
  {
268
- "epoch": 1.4903518728717366,
269
- "grad_norm": 0.31487956643104553,
270
- "learning_rate": 2.9417255565608982e-05,
271
- "loss": 0.4579,
272
- "mean_token_accuracy": 0.8473424926400185,
273
  "step": 165
274
  },
275
  {
276
- "epoch": 1.5357548240635641,
277
- "grad_norm": 0.29999110102653503,
278
- "learning_rate": 2.8290254117702204e-05,
279
- "loss": 0.4505,
280
- "mean_token_accuracy": 0.8498715803027153,
281
  "step": 170
282
  },
283
  {
284
- "epoch": 1.5811577752553916,
285
- "grad_norm": 0.33342310786247253,
286
- "learning_rate": 2.7161262790675013e-05,
287
- "loss": 0.4557,
288
- "mean_token_accuracy": 0.8481177806854248,
289
  "step": 175
290
  },
291
  {
292
- "epoch": 1.626560726447219,
293
- "grad_norm": 0.31186771392822266,
294
- "learning_rate": 2.6033124412193167e-05,
295
- "loss": 0.4561,
296
- "mean_token_accuracy": 0.8478582665324211,
297
  "step": 180
298
  },
299
  {
300
- "epoch": 1.6719636776390465,
301
- "grad_norm": 0.3249659538269043,
302
- "learning_rate": 2.4908679662177216e-05,
303
- "loss": 0.4579,
304
- "mean_token_accuracy": 0.8474476784467697,
305
  "step": 185
306
  },
307
  {
308
- "epoch": 1.717366628830874,
309
- "grad_norm": 0.26083502173423767,
310
- "learning_rate": 2.379075991990126e-05,
311
- "loss": 0.4556,
312
- "mean_token_accuracy": 0.8481593802571297,
313
  "step": 190
314
  },
315
  {
316
- "epoch": 1.7627695800227015,
317
- "grad_norm": 0.29590320587158203,
318
- "learning_rate": 2.2682180134510943e-05,
319
- "loss": 0.4574,
320
- "mean_token_accuracy": 0.8473795130848885,
321
  "step": 195
322
  },
323
  {
324
- "epoch": 1.808172531214529,
325
- "grad_norm": 0.2711656391620636,
326
- "learning_rate": 2.1585731736912922e-05,
327
- "loss": 0.4514,
328
- "mean_token_accuracy": 0.8492301076650619,
329
  "step": 200
330
  },
331
  {
332
- "epoch": 1.8535754824063564,
333
- "grad_norm": 0.26013004779815674,
334
- "learning_rate": 2.0504175610883876e-05,
335
- "loss": 0.4496,
336
- "mean_token_accuracy": 0.8500913769006729,
337
  "step": 205
338
  },
339
  {
340
- "epoch": 1.8989784335981839,
341
- "grad_norm": 0.2390243411064148,
342
- "learning_rate": 1.944023514109784e-05,
343
- "loss": 0.4454,
344
- "mean_token_accuracy": 0.8509305417537689,
345
  "step": 210
346
  },
347
  {
348
- "epoch": 1.9443813847900113,
349
- "grad_norm": 0.23900045454502106,
350
- "learning_rate": 1.83965893555773e-05,
351
- "loss": 0.4527,
352
- "mean_token_accuracy": 0.8493241637945175,
353
  "step": 215
354
  },
355
  {
356
- "epoch": 1.9897843359818388,
357
- "grad_norm": 0.237857386469841,
358
- "learning_rate": 1.737586617983534e-05,
359
- "loss": 0.453,
360
- "mean_token_accuracy": 0.8486923798918724,
361
  "step": 220
362
  },
363
  {
364
- "epoch": 2.0272417707150963,
365
- "grad_norm": 0.32961711287498474,
366
- "learning_rate": 1.6380635819695172e-05,
367
- "loss": 0.4169,
368
- "mean_token_accuracy": 0.8589759830272559,
369
  "step": 225
370
  },
371
  {
372
- "epoch": 2.072644721906924,
373
- "grad_norm": 0.3402288556098938,
374
- "learning_rate": 1.541340428944929e-05,
375
- "loss": 0.4057,
376
- "mean_token_accuracy": 0.861960718035698,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 2.1180476730987516,
381
- "grad_norm": 0.31487351655960083,
382
- "learning_rate": 1.44766071016544e-05,
383
- "loss": 0.3978,
384
- "mean_token_accuracy": 0.8647524937987328,
385
  "step": 235
386
  },
387
  {
388
- "epoch": 2.163450624290579,
389
- "grad_norm": 0.27213922142982483,
390
- "learning_rate": 1.3572603134451479e-05,
391
- "loss": 0.4021,
392
- "mean_token_accuracy": 0.8630986794829368,
393
  "step": 240
394
  },
395
  {
396
- "epoch": 2.208853575482406,
397
- "grad_norm": 0.28667008876800537,
398
- "learning_rate": 1.2703668691853155e-05,
399
- "loss": 0.4033,
400
- "mean_token_accuracy": 0.862886956334114,
401
  "step": 245
402
  },
403
  {
404
- "epoch": 2.254256526674234,
405
- "grad_norm": 0.2372172772884369,
406
- "learning_rate": 1.1871991771954748e-05,
407
- "loss": 0.3977,
408
- "mean_token_accuracy": 0.8642744541168212,
409
  "step": 250
410
  },
411
  {
412
- "epoch": 2.2996594778660615,
413
- "grad_norm": 0.23380817472934723,
414
- "learning_rate": 1.1079666557501736e-05,
415
- "loss": 0.3989,
416
- "mean_token_accuracy": 0.8641743138432503,
417
  "step": 255
418
  },
419
  {
420
- "epoch": 2.3450624290578888,
421
- "grad_norm": 0.24048562347888947,
422
- "learning_rate": 1.0328688142686627e-05,
423
- "loss": 0.402,
424
- "mean_token_accuracy": 0.8630986511707306,
425
  "step": 260
426
  },
427
  {
428
- "epoch": 2.390465380249716,
429
- "grad_norm": 0.2319810539484024,
430
- "learning_rate": 9.620947509453155e-06,
431
- "loss": 0.3917,
432
- "mean_token_accuracy": 0.866476172208786,
433
  "step": 265
434
  },
435
  {
436
- "epoch": 2.4358683314415437,
437
- "grad_norm": 0.23446470499038696,
438
- "learning_rate": 8.958226765957655e-06,
439
- "loss": 0.3991,
440
- "mean_token_accuracy": 0.8653428852558136,
441
  "step": 270
442
  },
443
  {
444
- "epoch": 2.4812712826333714,
445
- "grad_norm": 0.23361656069755554,
446
- "learning_rate": 8.342194659177358e-06,
447
- "loss": 0.3973,
448
- "mean_token_accuracy": 0.864533805847168,
449
  "step": 275
450
  },
451
  {
452
- "epoch": 2.5266742338251986,
453
- "grad_norm": 0.24215641617774963,
454
- "learning_rate": 7.774402372964833e-06,
455
- "loss": 0.4069,
456
- "mean_token_accuracy": 0.8616099029779434,
457
  "step": 280
458
  },
459
  {
460
- "epoch": 2.572077185017026,
461
- "grad_norm": 0.23187781870365143,
462
- "learning_rate": 7.256279622129215e-06,
463
- "loss": 0.4077,
464
- "mean_token_accuracy": 0.8610607802867889,
465
  "step": 285
466
  },
467
  {
468
- "epoch": 2.6174801362088536,
469
- "grad_norm": 0.2463005632162094,
470
- "learning_rate": 6.789131052379549e-06,
471
- "loss": 0.3955,
472
- "mean_token_accuracy": 0.8653189897537231,
473
  "step": 290
474
  },
475
  {
476
- "epoch": 2.6628830874006812,
477
- "grad_norm": 0.25329700112342834,
478
- "learning_rate": 6.374132955195062e-06,
479
- "loss": 0.399,
480
- "mean_token_accuracy": 0.8640432074666023,
481
  "step": 295
482
  },
483
  {
484
- "epoch": 2.7082860385925085,
485
- "grad_norm": 0.23303279280662537,
486
- "learning_rate": 6.012330305894584e-06,
487
- "loss": 0.4019,
488
- "mean_token_accuracy": 0.8630045488476753,
489
  "step": 300
490
  },
491
  {
492
- "epoch": 2.7536889897843357,
493
- "grad_norm": 0.23156684637069702,
494
- "learning_rate": 5.704634132363239e-06,
495
- "loss": 0.3993,
496
- "mean_token_accuracy": 0.8640620157122612,
497
  "step": 305
498
  },
499
  {
500
- "epoch": 2.7990919409761634,
501
- "grad_norm": 0.2185850739479065,
502
- "learning_rate": 5.451819221062024e-06,
503
- "loss": 0.4009,
504
- "mean_token_accuracy": 0.863526065647602,
505
- "step": 310
506
- },
507
- {
508
- "epoch": 2.844494892167991,
509
- "grad_norm": 0.22054985165596008,
510
- "learning_rate": 5.254522166096635e-06,
511
- "loss": 0.4016,
512
- "mean_token_accuracy": 0.8630533397197724,
513
- "step": 315
514
- },
515
- {
516
- "epoch": 2.8898978433598184,
517
- "grad_norm": 0.23411568999290466,
518
- "learning_rate": 5.113239766257999e-06,
519
- "loss": 0.398,
520
- "mean_token_accuracy": 0.8644332170486451,
521
- "step": 320
522
- },
523
- {
524
- "epoch": 2.9353007945516456,
525
- "grad_norm": 0.24771377444267273,
526
- "learning_rate": 5.028327774070807e-06,
527
- "loss": 0.4119,
528
- "mean_token_accuracy": 0.8622763112187386,
529
- "step": 325
530
- },
531
- {
532
- "epoch": 2.9807037457434733,
533
- "grad_norm": 0.21957579255104065,
534
- "learning_rate": 5e-06,
535
- "loss": 0.3961,
536
- "mean_token_accuracy": 0.8651398867368698,
537
- "step": 330
538
- },
539
- {
540
- "epoch": 2.9807037457434733,
541
- "step": 330,
542
- "total_flos": 487266319859712.0,
543
- "train_loss": 0.4755451946547537,
544
- "train_runtime": 8692.0512,
545
- "train_samples_per_second": 4.863,
546
- "train_steps_per_second": 0.038
547
  }
548
  ],
549
  "logging_steps": 5,
550
- "max_steps": 330,
551
  "num_input_tokens_seen": 0,
552
  "num_train_epochs": 3,
553
  "save_steps": 100,
@@ -563,7 +524,7 @@
563
  "attributes": {}
564
  }
565
  },
566
- "total_flos": 487266319859712.0,
567
  "train_batch_size": 4,
568
  "trial_name": null,
569
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.97799511002445,
5
  "eval_steps": 500,
6
+ "global_step": 306,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0488997555012225,
13
+ "grad_norm": 2.0351288318634033,
14
+ "learning_rate": 1.5625e-05,
15
+ "loss": 0.8708,
16
+ "mean_token_accuracy": 0.7660431817173958,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.097799511002445,
21
+ "grad_norm": 0.9313664436340332,
22
+ "learning_rate": 3.125e-05,
23
+ "loss": 0.7742,
24
+ "mean_token_accuracy": 0.777975857257843,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 0.1466992665036675,
29
+ "grad_norm": 0.6834899187088013,
30
+ "learning_rate": 4.6875e-05,
31
+ "loss": 0.6951,
32
+ "mean_token_accuracy": 0.7929965913295746,
33
  "step": 15
34
  },
35
  {
36
+ "epoch": 0.19559902200489,
37
+ "grad_norm": 0.4830226004123688,
38
+ "learning_rate": 4.997887930048948e-05,
39
+ "loss": 0.6486,
40
+ "mean_token_accuracy": 0.8033514261245728,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 0.24449877750611246,
45
+ "grad_norm": 0.3558278977870941,
46
+ "learning_rate": 4.989314441231019e-05,
47
+ "loss": 0.6276,
48
+ "mean_token_accuracy": 0.8075312823057175,
49
  "step": 25
50
  },
51
  {
52
+ "epoch": 0.293398533007335,
53
+ "grad_norm": 0.29995062947273254,
54
+ "learning_rate": 4.9741726573281165e-05,
55
+ "loss": 0.6126,
56
+ "mean_token_accuracy": 0.8111417979001999,
57
  "step": 30
58
  },
59
  {
60
+ "epoch": 0.3422982885085575,
61
+ "grad_norm": 0.3162994980812073,
62
+ "learning_rate": 4.95250699180132e-05,
63
+ "loss": 0.6036,
64
+ "mean_token_accuracy": 0.8128530994057656,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.39119804400978,
69
+ "grad_norm": 0.25494644045829773,
70
+ "learning_rate": 4.9243809937805656e-05,
71
+ "loss": 0.59,
72
+ "mean_token_accuracy": 0.8160383135080338,
73
  "step": 40
74
  },
75
  {
76
+ "epoch": 0.4400977995110024,
77
+ "grad_norm": 0.34374159574508667,
78
+ "learning_rate": 4.889877161664096e-05,
79
+ "loss": 0.586,
80
+ "mean_token_accuracy": 0.8166863277554512,
81
  "step": 45
82
  },
83
  {
84
+ "epoch": 0.4889975550122249,
85
+ "grad_norm": 0.34626278281211853,
86
+ "learning_rate": 4.8490967011364394e-05,
87
+ "loss": 0.5791,
88
+ "mean_token_accuracy": 0.818395821750164,
89
  "step": 50
90
  },
91
  {
92
+ "epoch": 0.5378973105134475,
93
+ "grad_norm": 0.27951428294181824,
94
+ "learning_rate": 4.8021592283147074e-05,
95
+ "loss": 0.5784,
96
+ "mean_token_accuracy": 0.8185736656188964,
97
  "step": 55
98
  },
99
  {
100
+ "epoch": 0.58679706601467,
101
+ "grad_norm": 0.31336385011672974,
102
+ "learning_rate": 4.749202418893905e-05,
103
+ "loss": 0.5667,
104
+ "mean_token_accuracy": 0.8213964581489563,
105
  "step": 60
106
  },
107
  {
108
+ "epoch": 0.6356968215158925,
109
+ "grad_norm": 0.3097311556339264,
110
+ "learning_rate": 4.690381604320405e-05,
111
+ "loss": 0.5616,
112
+ "mean_token_accuracy": 0.822909340262413,
113
  "step": 65
114
  },
115
  {
116
+ "epoch": 0.684596577017115,
117
+ "grad_norm": 0.3286179304122925,
118
+ "learning_rate": 4.625869316178043e-05,
119
+ "loss": 0.5609,
120
+ "mean_token_accuracy": 0.8229187726974487,
121
  "step": 70
122
  },
123
  {
124
+ "epoch": 0.7334963325183375,
125
+ "grad_norm": 0.32643210887908936,
126
+ "learning_rate": 4.5558547801232645e-05,
127
+ "loss": 0.5575,
128
+ "mean_token_accuracy": 0.8237524956464768,
129
  "step": 75
130
  },
131
  {
132
+ "epoch": 0.78239608801956,
133
+ "grad_norm": 0.30941641330718994,
134
+ "learning_rate": 4.480543360853665e-05,
135
+ "loss": 0.5633,
136
+ "mean_token_accuracy": 0.8215603858232499,
137
  "step": 80
138
  },
139
  {
140
+ "epoch": 0.8312958435207825,
141
+ "grad_norm": 0.31723615527153015,
142
+ "learning_rate": 4.4001559597379503e-05,
143
+ "loss": 0.5534,
144
+ "mean_token_accuracy": 0.8242628321051597,
145
  "step": 85
146
  },
147
  {
148
+ "epoch": 0.8801955990220048,
149
+ "grad_norm": 0.37186378240585327,
150
+ "learning_rate": 4.3149283668741616e-05,
151
+ "loss": 0.5611,
152
+ "mean_token_accuracy": 0.8221262007951736,
153
  "step": 90
154
  },
155
  {
156
+ "epoch": 0.9290953545232273,
157
+ "grad_norm": 0.3594883978366852,
158
+ "learning_rate": 4.225110569476691e-05,
159
+ "loss": 0.5532,
160
+ "mean_token_accuracy": 0.824306620657444,
161
  "step": 95
162
  },
163
  {
164
+ "epoch": 0.9779951100244498,
165
+ "grad_norm": 0.34556683897972107,
166
+ "learning_rate": 4.1309660186207094e-05,
167
+ "loss": 0.5511,
168
+ "mean_token_accuracy": 0.8247394233942031,
169
  "step": 100
170
  },
171
  {
172
+ "epoch": 1.019559902200489,
173
+ "grad_norm": 0.47831159830093384,
174
+ "learning_rate": 4.0327708564947725e-05,
175
+ "loss": 0.5286,
176
+ "mean_token_accuracy": 0.8304704760803896,
177
  "step": 105
178
  },
179
  {
180
+ "epoch": 1.0684596577017116,
181
+ "grad_norm": 0.38634800910949707,
182
+ "learning_rate": 3.930813106428202e-05,
183
+ "loss": 0.511,
184
+ "mean_token_accuracy": 0.8345152765512467,
185
  "step": 110
186
  },
187
  {
188
+ "epoch": 1.117359413202934,
189
+ "grad_norm": 0.32826101779937744,
190
+ "learning_rate": 3.825391828069032e-05,
191
+ "loss": 0.5022,
192
+ "mean_token_accuracy": 0.8373170182108879,
193
  "step": 115
194
  },
195
  {
196
+ "epoch": 1.1662591687041566,
197
+ "grad_norm": 0.32246339321136475,
198
+ "learning_rate": 3.7168162401905414e-05,
199
+ "loss": 0.5074,
200
+ "mean_token_accuracy": 0.8353763505816459,
201
  "step": 120
202
  },
203
  {
204
+ "epoch": 1.215158924205379,
205
+ "grad_norm": 0.2942683696746826,
206
+ "learning_rate": 3.605404813699307e-05,
207
+ "loss": 0.4969,
208
+ "mean_token_accuracy": 0.8387214556336403,
209
  "step": 125
210
  },
211
  {
212
+ "epoch": 1.2640586797066016,
213
+ "grad_norm": 0.32895198464393616,
214
+ "learning_rate": 3.491484337505166e-05,
215
+ "loss": 0.4994,
216
+ "mean_token_accuracy": 0.837771400809288,
217
  "step": 130
218
  },
219
  {
220
+ "epoch": 1.312958435207824,
221
+ "grad_norm": 0.3600136339664459,
222
+ "learning_rate": 3.375388959993037e-05,
223
+ "loss": 0.4996,
224
+ "mean_token_accuracy": 0.8377428948879242,
225
  "step": 135
226
  },
227
  {
228
+ "epoch": 1.3618581907090466,
229
+ "grad_norm": 0.27240756154060364,
230
+ "learning_rate": 3.2574592089081374e-05,
231
+ "loss": 0.5001,
232
+ "mean_token_accuracy": 0.8374892711639405,
233
  "step": 140
234
  },
235
  {
236
+ "epoch": 1.410757946210269,
237
+ "grad_norm": 0.2881303131580353,
238
+ "learning_rate": 3.1380409925294285e-05,
239
+ "loss": 0.4915,
240
+ "mean_token_accuracy": 0.8399991437792778,
241
  "step": 145
242
  },
243
  {
244
+ "epoch": 1.4596577017114916,
245
+ "grad_norm": 0.2721833884716034,
246
+ "learning_rate": 3.0174845850610395e-05,
247
+ "loss": 0.495,
248
+ "mean_token_accuracy": 0.8390778675675392,
249
  "step": 150
250
  },
251
  {
252
+ "epoch": 1.508557457212714,
253
+ "grad_norm": 0.25566068291664124,
254
+ "learning_rate": 2.8961435992176745e-05,
255
+ "loss": 0.5031,
256
+ "mean_token_accuracy": 0.8365625068545341,
257
  "step": 155
258
  },
259
  {
260
+ "epoch": 1.5574572127139366,
261
+ "grad_norm": 0.2996728718280792,
262
+ "learning_rate": 2.7743739490175903e-05,
263
+ "loss": 0.5002,
264
+ "mean_token_accuracy": 0.8372272953391076,
265
  "step": 160
266
  },
267
  {
268
+ "epoch": 1.606356968215159,
269
+ "grad_norm": 0.38631442189216614,
270
+ "learning_rate": 2.6525328058254604e-05,
271
+ "loss": 0.4926,
272
+ "mean_token_accuracy": 0.8397956892848015,
273
  "step": 165
274
  },
275
  {
276
+ "epoch": 1.6552567237163816,
277
+ "grad_norm": 0.2722865045070648,
278
+ "learning_rate": 2.5309775507072236e-05,
279
+ "loss": 0.5055,
280
+ "mean_token_accuracy": 0.8355408251285553,
281
  "step": 170
282
  },
283
  {
284
+ "epoch": 1.704156479217604,
285
+ "grad_norm": 0.24514281749725342,
286
+ "learning_rate": 2.4100647261698377e-05,
287
+ "loss": 0.4891,
288
+ "mean_token_accuracy": 0.840637344121933,
289
  "step": 175
290
  },
291
  {
292
+ "epoch": 1.7530562347188265,
293
+ "grad_norm": 0.24317032098770142,
294
+ "learning_rate": 2.2901489903606794e-05,
295
+ "loss": 0.5004,
296
+ "mean_token_accuracy": 0.8374736487865448,
297
  "step": 180
298
  },
299
  {
300
+ "epoch": 1.801955990220049,
301
+ "grad_norm": 0.2663393020629883,
302
+ "learning_rate": 2.171582076794088e-05,
303
+ "loss": 0.4871,
304
+ "mean_token_accuracy": 0.8413556531071663,
305
  "step": 185
306
  },
307
  {
308
+ "epoch": 1.8508557457212715,
309
+ "grad_norm": 0.24300876259803772,
310
+ "learning_rate": 2.054711762656369e-05,
311
+ "loss": 0.4951,
312
+ "mean_token_accuracy": 0.8386538654565812,
313
  "step": 190
314
  },
315
  {
316
+ "epoch": 1.899755501222494,
317
+ "grad_norm": 0.23298484086990356,
318
+ "learning_rate": 1.939880848715378e-05,
319
+ "loss": 0.4961,
320
+ "mean_token_accuracy": 0.8383206754922867,
321
  "step": 195
322
  },
323
  {
324
+ "epoch": 1.9486552567237165,
325
+ "grad_norm": 0.2504541873931885,
326
+ "learning_rate": 1.827426153826814e-05,
327
+ "loss": 0.4834,
328
+ "mean_token_accuracy": 0.842043687403202,
329
  "step": 200
330
  },
331
  {
332
+ "epoch": 1.997555012224939,
333
+ "grad_norm": 0.2542346119880676,
334
+ "learning_rate": 1.7176775269864764e-05,
335
+ "loss": 0.4956,
336
+ "mean_token_accuracy": 0.8383987873792649,
337
  "step": 205
338
  },
339
  {
340
+ "epoch": 2.039119804400978,
341
+ "grad_norm": 0.2872137725353241,
342
+ "learning_rate": 1.6109568798263285e-05,
343
+ "loss": 0.4561,
344
+ "mean_token_accuracy": 0.8502998352050781,
345
  "step": 210
346
  },
347
  {
348
+ "epoch": 2.0880195599022007,
349
+ "grad_norm": 0.28450778126716614,
350
+ "learning_rate": 1.5075772423922191e-05,
351
+ "loss": 0.4526,
352
+ "mean_token_accuracy": 0.8503674760460853,
353
  "step": 215
354
  },
355
  {
356
+ "epoch": 2.136919315403423,
357
+ "grad_norm": 0.26681244373321533,
358
+ "learning_rate": 1.4078418449728237e-05,
359
+ "loss": 0.4442,
360
+ "mean_token_accuracy": 0.8526661232113838,
361
  "step": 220
362
  },
363
  {
364
+ "epoch": 2.1858190709046457,
365
+ "grad_norm": 0.25027355551719666,
366
+ "learning_rate": 1.3120432286729548e-05,
367
+ "loss": 0.4487,
368
+ "mean_token_accuracy": 0.8512815818190574,
369
  "step": 225
370
  },
371
  {
372
+ "epoch": 2.234718826405868,
373
+ "grad_norm": 0.2478124052286148,
374
+ "learning_rate": 1.2204623873401074e-05,
375
+ "loss": 0.4461,
376
+ "mean_token_accuracy": 0.8521116316318512,
377
  "step": 230
378
  },
379
  {
380
+ "epoch": 2.28361858190709,
381
+ "grad_norm": 0.23781350255012512,
382
+ "learning_rate": 1.1333679433610849e-05,
383
+ "loss": 0.4419,
384
+ "mean_token_accuracy": 0.8534019276499748,
385
  "step": 235
386
  },
387
  {
388
+ "epoch": 2.332518337408313,
389
+ "grad_norm": 0.23598520457744598,
390
+ "learning_rate": 1.0510153597462537e-05,
391
+ "loss": 0.4454,
392
+ "mean_token_accuracy": 0.852020500600338,
393
  "step": 240
394
  },
395
  {
396
+ "epoch": 2.381418092909535,
397
+ "grad_norm": 0.2218804508447647,
398
+ "learning_rate": 9.736461908125213e-06,
399
+ "loss": 0.4422,
400
+ "mean_token_accuracy": 0.8531288385391236,
401
  "step": 245
402
  },
403
  {
404
+ "epoch": 2.430317848410758,
405
+ "grad_norm": 0.22182683646678925,
406
+ "learning_rate": 9.014873736629045e-06,
407
+ "loss": 0.4551,
408
+ "mean_token_accuracy": 0.8493878960609436,
409
  "step": 250
410
  },
411
  {
412
+ "epoch": 2.47921760391198,
413
+ "grad_norm": 0.20888520777225494,
414
+ "learning_rate": 8.347505625409046e-06,
415
+ "loss": 0.4499,
416
+ "mean_token_accuracy": 0.8507405325770379,
417
  "step": 255
418
  },
419
  {
420
+ "epoch": 2.528117359413203,
421
+ "grad_norm": 0.2135392725467682,
422
+ "learning_rate": 7.73631508012146e-06,
423
+ "loss": 0.4482,
424
+ "mean_token_accuracy": 0.85140231102705,
425
  "step": 260
426
  },
427
  {
428
+ "epoch": 2.577017114914425,
429
+ "grad_norm": 0.21610134840011597,
430
+ "learning_rate": 7.183094827942359e-06,
431
+ "loss": 0.4454,
432
+ "mean_token_accuracy": 0.8524220660328865,
433
  "step": 265
434
  },
435
  {
436
+ "epoch": 2.625916870415648,
437
+ "grad_norm": 0.19675850868225098,
438
+ "learning_rate": 6.689467559190002e-06,
439
+ "loss": 0.4448,
440
+ "mean_token_accuracy": 0.8523233592510223,
441
  "step": 270
442
  },
443
  {
444
+ "epoch": 2.67481662591687,
445
+ "grad_norm": 0.20300239324569702,
446
+ "learning_rate": 6.256881167694502e-06,
447
+ "loss": 0.4482,
448
+ "mean_token_accuracy": 0.8513378649950027,
449
  "step": 275
450
  },
451
  {
452
+ "epoch": 2.723716381418093,
453
+ "grad_norm": 0.2115592211484909,
454
+ "learning_rate": 5.886604503875781e-06,
455
+ "loss": 0.4464,
456
+ "mean_token_accuracy": 0.8521811455488205,
457
  "step": 280
458
  },
459
  {
460
+ "epoch": 2.772616136919315,
461
+ "grad_norm": 0.19751520454883575,
462
+ "learning_rate": 5.579723652986655e-06,
463
+ "loss": 0.446,
464
+ "mean_token_accuracy": 0.8521023660898208,
465
  "step": 285
466
  },
467
  {
468
+ "epoch": 2.821515892420538,
469
+ "grad_norm": 0.19509336352348328,
470
+ "learning_rate": 5.337138749437662e-06,
471
+ "loss": 0.4449,
472
+ "mean_token_accuracy": 0.8526278078556061,
473
  "step": 290
474
  },
475
  {
476
+ "epoch": 2.87041564792176,
477
+ "grad_norm": 0.21524977684020996,
478
+ "learning_rate": 5.159561336547745e-06,
479
+ "loss": 0.4511,
480
+ "mean_token_accuracy": 0.8506037518382072,
481
  "step": 295
482
  },
483
  {
484
+ "epoch": 2.919315403422983,
485
+ "grad_norm": 0.1960809975862503,
486
+ "learning_rate": 5.047512279465102e-06,
487
+ "loss": 0.4423,
488
+ "mean_token_accuracy": 0.8532179862260818,
489
  "step": 300
490
  },
491
  {
492
+ "epoch": 2.968215158924205,
493
+ "grad_norm": 0.19248132407665253,
494
+ "learning_rate": 5.001320237379956e-06,
495
+ "loss": 0.4461,
496
+ "mean_token_accuracy": 0.8518981352448464,
497
  "step": 305
498
  },
499
  {
500
+ "epoch": 2.97799511002445,
501
+ "mean_token_accuracy": 0.8586726412177086,
502
+ "step": 306,
503
+ "total_flos": 462777357434880.0,
504
+ "train_loss": 0.5179573546827229,
505
+ "train_runtime": 8239.8327,
506
+ "train_samples_per_second": 4.764,
507
+ "train_steps_per_second": 0.037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  }
509
  ],
510
  "logging_steps": 5,
511
+ "max_steps": 306,
512
  "num_input_tokens_seen": 0,
513
  "num_train_epochs": 3,
514
  "save_steps": 100,
 
524
  "attributes": {}
525
  }
526
  },
527
+ "total_flos": 462777357434880.0,
528
  "train_batch_size": 4,
529
  "trial_name": null,
530
  "trial_params": null