a-F1 committed on
Commit
a49391f
·
verified ·
1 Parent(s): e736b57

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +5 -10
  3. train_results.json +5 -5
  4. trainer_state.json +373 -323
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/fandou-team/huggingface/runs/mlhp0slv)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/fandou-team/huggingface/runs/m3e8fia3)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "eval_loss": 0.8189318776130676,
3
- "eval_runtime": 25.7782,
4
- "eval_samples": 100,
5
- "eval_samples_per_second": 5.004,
6
- "eval_steps_per_second": 1.28,
7
- "total_flos": 7.883277455484518e+16,
8
- "train_loss": 1.23619465266957,
9
- "train_runtime": 1783.7305,
10
  "train_samples": 1000,
11
- "train_samples_per_second": 1.374,
12
- "train_steps_per_second": 0.172
13
  }
 
1
  {
2
+ "total_flos": 43871077662720.0,
3
+ "train_loss": 0.9090226414915803,
4
+ "train_runtime": 2127.6426,
 
 
 
 
 
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 23.091,
7
+ "train_steps_per_second": 0.181
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 7.883277455484518e+16,
3
- "train_loss": 1.23619465266957,
4
- "train_runtime": 1783.7305,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 1.374,
7
- "train_steps_per_second": 0.172
8
  }
 
1
  {
2
+ "total_flos": 43871077662720.0,
3
+ "train_loss": 0.9090226414915803,
4
+ "train_runtime": 2127.6426,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 23.091,
7
+ "train_steps_per_second": 0.181
8
  }
trainer_state.json CHANGED
@@ -1,531 +1,581 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.99836867862969,
5
- "eval_steps": 100,
6
- "global_step": 306,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01631321370309951,
13
- "grad_norm": 2.53125,
14
- "learning_rate": 3.225806451612903e-06,
15
- "loss": 1.4286,
16
- "mean_token_accuracy": 0.6584272754958135,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.03262642740619902,
21
- "grad_norm": 2.4375,
22
- "learning_rate": 6.451612903225806e-06,
23
- "loss": 1.4087,
24
- "mean_token_accuracy": 0.662085565849035,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.048939641109298535,
29
- "grad_norm": 2.640625,
30
- "learning_rate": 9.67741935483871e-06,
31
- "loss": 1.5058,
32
- "mean_token_accuracy": 0.6418076518243186,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.06525285481239804,
37
- "grad_norm": 1.7421875,
38
- "learning_rate": 1.2903225806451613e-05,
39
- "loss": 1.4118,
40
- "mean_token_accuracy": 0.6534564204767823,
41
  "step": 20
42
  },
43
  {
44
- "epoch": 0.08156606851549755,
45
- "grad_norm": 1.515625,
46
- "learning_rate": 1.6129032258064517e-05,
47
- "loss": 1.3321,
48
- "mean_token_accuracy": 0.6651619066075792,
49
  "step": 25
50
  },
51
  {
52
- "epoch": 0.09787928221859707,
53
- "grad_norm": 1.3671875,
54
- "learning_rate": 1.935483870967742e-05,
55
- "loss": 1.3164,
56
- "mean_token_accuracy": 0.6717782021288032,
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.11419249592169657,
61
- "grad_norm": 1.3359375,
62
- "learning_rate": 1.9989561243382313e-05,
63
- "loss": 1.1912,
64
- "mean_token_accuracy": 0.694698959999242,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.13050570962479607,
69
- "grad_norm": 1.3203125,
70
- "learning_rate": 1.9947191143073185e-05,
71
- "loss": 1.2719,
72
- "mean_token_accuracy": 0.6780847732470013,
73
  "step": 40
74
  },
75
  {
76
- "epoch": 0.1468189233278956,
77
- "grad_norm": 1.234375,
78
- "learning_rate": 1.9872375372801627e-05,
79
- "loss": 1.1799,
80
- "mean_token_accuracy": 0.6968185226866065,
81
  "step": 45
82
  },
83
  {
84
- "epoch": 0.1631321370309951,
85
- "grad_norm": 1.2421875,
86
- "learning_rate": 1.9765357966059638e-05,
87
- "loss": 1.2948,
88
- "mean_token_accuracy": 0.672758400550632,
89
  "step": 50
90
  },
91
  {
92
- "epoch": 0.17944535073409462,
93
- "grad_norm": 1.203125,
94
- "learning_rate": 1.9626487991384194e-05,
95
- "loss": 1.217,
96
- "mean_token_accuracy": 0.6891739777463799,
97
  "step": 55
98
  },
99
  {
100
- "epoch": 0.19575856443719414,
101
- "grad_norm": 1.2109375,
102
- "learning_rate": 1.945621841376825e-05,
103
- "loss": 1.1489,
104
- "mean_token_accuracy": 0.7052165754765654,
105
  "step": 60
106
  },
107
  {
108
- "epoch": 0.21207177814029363,
109
- "grad_norm": 1.296875,
110
- "learning_rate": 1.9255104617183068e-05,
111
- "loss": 1.2015,
112
- "mean_token_accuracy": 0.6921520639873796,
113
  "step": 65
114
  },
115
  {
116
- "epoch": 0.22838499184339314,
117
- "grad_norm": 1.140625,
118
- "learning_rate": 1.9023802593031156e-05,
119
- "loss": 1.2204,
120
- "mean_token_accuracy": 0.6918742825268815,
121
  "step": 70
122
  },
123
  {
124
- "epoch": 0.24469820554649266,
125
- "grad_norm": 1.234375,
126
- "learning_rate": 1.8763066800438638e-05,
127
- "loss": 1.2876,
128
- "mean_token_accuracy": 0.6744394009627726,
129
  "step": 75
130
  },
131
  {
132
- "epoch": 0.26101141924959215,
133
- "grad_norm": 1.2421875,
134
- "learning_rate": 1.8473747705366427e-05,
135
- "loss": 1.2127,
136
- "mean_token_accuracy": 0.6919606949918741,
137
  "step": 80
138
  },
139
  {
140
- "epoch": 0.27732463295269166,
141
- "grad_norm": 1.234375,
142
- "learning_rate": 1.8156789006567018e-05,
143
- "loss": 1.2829,
144
- "mean_token_accuracy": 0.6730983288833745,
145
  "step": 85
146
  },
147
  {
148
- "epoch": 0.2936378466557912,
149
- "grad_norm": 1.171875,
150
- "learning_rate": 1.7813224557435313e-05,
151
- "loss": 1.2617,
152
- "mean_token_accuracy": 0.677167603530814,
153
  "step": 90
154
  },
155
  {
156
- "epoch": 0.3099510603588907,
157
- "grad_norm": 1.1953125,
158
- "learning_rate": 1.744417499379372e-05,
159
- "loss": 1.2741,
160
- "mean_token_accuracy": 0.6805341821135611,
161
  "step": 95
162
  },
163
  {
164
- "epoch": 0.3262642740619902,
165
- "grad_norm": 1.125,
166
- "learning_rate": 1.7050844078611058e-05,
167
- "loss": 1.2389,
168
- "mean_token_accuracy": 0.6857057946710582,
169
  "step": 100
170
  },
171
  {
172
- "epoch": 0.3425774877650897,
173
- "grad_norm": 1.2578125,
174
- "learning_rate": 1.663451477557792e-05,
175
- "loss": 1.2223,
176
- "mean_token_accuracy": 0.687469468283924,
177
  "step": 105
178
  },
179
  {
180
- "epoch": 0.35889070146818924,
181
- "grad_norm": 1.234375,
182
- "learning_rate": 1.6196545064345813e-05,
183
- "loss": 1.214,
184
- "mean_token_accuracy": 0.6902886780811677,
185
  "step": 110
186
  },
187
  {
188
- "epoch": 0.37520391517128876,
189
- "grad_norm": 1.203125,
190
- "learning_rate": 1.5738363511079776e-05,
191
- "loss": 1.2245,
192
- "mean_token_accuracy": 0.6857961919369919,
193
  "step": 115
194
  },
195
  {
196
- "epoch": 0.3915171288743883,
197
- "grad_norm": 1.2109375,
198
- "learning_rate": 1.5261464608772487e-05,
199
- "loss": 1.1697,
200
- "mean_token_accuracy": 0.7004586354413661,
201
  "step": 120
202
  },
203
  {
204
- "epoch": 0.4078303425774878,
205
- "grad_norm": 1.1875,
206
- "learning_rate": 1.476740390251875e-05,
207
- "loss": 1.1864,
208
- "mean_token_accuracy": 0.6946914374684998,
209
  "step": 125
210
  },
211
  {
212
- "epoch": 0.42414355628058725,
213
- "grad_norm": 1.2578125,
214
- "learning_rate": 1.4257792915650728e-05,
215
- "loss": 1.2256,
216
- "mean_token_accuracy": 0.6917642628755305,
217
  "step": 130
218
  },
219
  {
220
- "epoch": 0.44045676998368677,
221
- "grad_norm": 1.2109375,
222
- "learning_rate": 1.3734293893283783e-05,
223
- "loss": 1.2234,
224
- "mean_token_accuracy": 0.6803009834502376,
225
  "step": 135
226
  },
227
  {
228
- "epoch": 0.4567699836867863,
229
- "grad_norm": 1.296875,
230
- "learning_rate": 1.3198614380418412e-05,
231
- "loss": 1.2891,
232
- "mean_token_accuracy": 0.66906340898559,
233
  "step": 140
234
  },
235
  {
236
- "epoch": 0.4730831973898858,
237
- "grad_norm": 1.140625,
238
- "learning_rate": 1.2652501652283378e-05,
239
- "loss": 1.2833,
240
- "mean_token_accuracy": 0.6748017583716992,
241
  "step": 145
242
  },
243
  {
244
- "epoch": 0.4893964110929853,
245
- "grad_norm": 1.234375,
246
- "learning_rate": 1.2097737015087094e-05,
247
- "loss": 1.1936,
248
- "mean_token_accuracy": 0.691278874465356,
249
  "step": 150
250
  },
251
  {
252
- "epoch": 0.5057096247960848,
253
- "grad_norm": 1.140625,
254
- "learning_rate": 1.1536129995766995e-05,
255
- "loss": 1.1923,
256
- "mean_token_accuracy": 0.6915639418258935,
257
  "step": 155
258
  },
259
  {
260
- "epoch": 0.5220228384991843,
261
- "grad_norm": 1.2578125,
262
- "learning_rate": 1.0969512439688816e-05,
263
- "loss": 1.2689,
264
- "mean_token_accuracy": 0.6795721711260161,
265
  "step": 160
266
  },
267
  {
268
- "epoch": 0.5383360522022839,
269
- "grad_norm": 1.2109375,
270
- "learning_rate": 1.0399732535547735e-05,
271
- "loss": 1.2322,
272
- "mean_token_accuracy": 0.6911618495404431,
273
  "step": 165
274
  },
275
  {
276
- "epoch": 0.5546492659053833,
277
- "grad_norm": 1.140625,
278
- "learning_rate": 9.828648786961009e-06,
279
- "loss": 1.3013,
280
- "mean_token_accuracy": 0.6691213045203754,
281
  "step": 170
282
  },
283
  {
284
- "epoch": 0.5709624796084829,
285
- "grad_norm": 1.140625,
286
- "learning_rate": 9.25812395041548e-06,
287
- "loss": 1.2362,
288
- "mean_token_accuracy": 0.6846231446931508,
289
  "step": 175
290
  },
291
  {
292
- "epoch": 0.5872756933115824,
293
- "grad_norm": 1.1171875,
294
- "learning_rate": 8.690018959343071e-06,
295
- "loss": 1.1778,
296
- "mean_token_accuracy": 0.6980786468955738,
297
  "step": 180
298
  },
299
  {
300
- "epoch": 0.6035889070146819,
301
- "grad_norm": 1.1796875,
302
- "learning_rate": 8.126186854142752e-06,
303
- "loss": 1.1546,
304
- "mean_token_accuracy": 0.698524151500448,
305
  "step": 185
306
  },
307
  {
308
- "epoch": 0.6199021207177814,
309
- "grad_norm": 1.1875,
310
- "learning_rate": 7.568466737947905e-06,
311
- "loss": 1.1121,
312
- "mean_token_accuracy": 0.7087621864221246,
313
  "step": 190
314
  },
315
  {
316
- "epoch": 0.636215334420881,
317
- "grad_norm": 1.28125,
318
- "learning_rate": 7.018677777854158e-06,
319
- "loss": 1.2979,
320
- "mean_token_accuracy": 0.6694032774839085,
321
  "step": 195
322
  },
323
  {
324
- "epoch": 0.6525285481239804,
325
- "grad_norm": 1.2734375,
326
- "learning_rate": 6.478613271174453e-06,
327
- "loss": 1.3048,
328
- "mean_token_accuracy": 0.6659829648734708,
329
  "step": 200
330
  },
331
  {
332
- "epoch": 0.6688417618270799,
333
- "grad_norm": 1.2265625,
334
- "learning_rate": 5.950034796075948e-06,
335
- "loss": 1.2435,
336
- "mean_token_accuracy": 0.6830356432134138,
337
  "step": 205
338
  },
339
  {
340
- "epoch": 0.6851549755301795,
341
- "grad_norm": 1.203125,
342
- "learning_rate": 5.434666465678176e-06,
343
- "loss": 1.3219,
344
- "mean_token_accuracy": 0.6673393247330371,
345
  "step": 210
346
  },
347
  {
348
- "epoch": 0.7014681892332789,
349
- "grad_norm": 1.3046875,
350
- "learning_rate": 4.934189304354418e-06,
351
- "loss": 1.2301,
352
- "mean_token_accuracy": 0.6839774101528373,
353
  "step": 215
354
  },
355
  {
356
- "epoch": 0.7177814029363785,
357
- "grad_norm": 1.1328125,
358
- "learning_rate": 4.450235764579598e-06,
359
- "loss": 1.2208,
360
- "mean_token_accuracy": 0.6862650424423485,
361
  "step": 220
362
  },
363
  {
364
- "epoch": 0.734094616639478,
365
- "grad_norm": 1.265625,
366
- "learning_rate": 3.984384402209613e-06,
367
- "loss": 1.2143,
368
- "mean_token_accuracy": 0.6849734049971641,
369
  "step": 225
370
  },
371
  {
372
- "epoch": 0.7504078303425775,
373
- "grad_norm": 1.140625,
374
- "learning_rate": 3.538154727560259e-06,
375
- "loss": 1.1558,
376
- "mean_token_accuracy": 0.6994565541878519,
377
  "step": 230
378
  },
379
  {
380
- "epoch": 0.766721044045677,
381
- "grad_norm": 1.09375,
382
- "learning_rate": 3.1130022490803856e-06,
383
- "loss": 1.1774,
384
- "mean_token_accuracy": 0.6945823398256653,
385
  "step": 235
386
  },
387
  {
388
- "epoch": 0.7830342577487766,
389
- "grad_norm": 1.25,
390
- "learning_rate": 2.7103137257858867e-06,
391
- "loss": 1.142,
392
- "mean_token_accuracy": 0.7025941539981695,
393
  "step": 240
394
  },
395
  {
396
- "epoch": 0.799347471451876,
397
- "grad_norm": 1.2109375,
398
- "learning_rate": 2.3314026439400217e-06,
399
- "loss": 1.2946,
400
- "mean_token_accuracy": 0.6670428901128733,
401
  "step": 245
402
  },
403
  {
404
- "epoch": 0.8156606851549756,
405
- "grad_norm": 1.09375,
406
- "learning_rate": 1.9775049327342486e-06,
407
- "loss": 1.1889,
408
- "mean_token_accuracy": 0.6966172545481755,
409
  "step": 250
410
  },
411
  {
412
- "epoch": 0.831973898858075,
413
- "grad_norm": 1.1640625,
414
- "learning_rate": 1.649774932944075e-06,
415
- "loss": 1.1777,
416
- "mean_token_accuracy": 0.7020263962587155,
417
  "step": 255
418
  },
419
  {
420
- "epoch": 0.8482871125611745,
421
- "grad_norm": 1.1796875,
422
- "learning_rate": 1.3492816317093894e-06,
423
- "loss": 1.1444,
424
- "mean_token_accuracy": 0.7035694430910295,
425
  "step": 260
426
  },
427
  {
428
- "epoch": 0.8646003262642741,
429
- "grad_norm": 1.1171875,
430
- "learning_rate": 1.0770051757206078e-06,
431
- "loss": 1.1997,
432
- "mean_token_accuracy": 0.6922763738951797,
433
  "step": 265
434
  },
435
  {
436
- "epoch": 0.8809135399673735,
437
- "grad_norm": 1.140625,
438
- "learning_rate": 8.338336741838837e-07,
439
- "loss": 1.1993,
440
- "mean_token_accuracy": 0.6892363770014897,
441
  "step": 270
442
  },
443
  {
444
- "epoch": 0.8972267536704731,
445
- "grad_norm": 1.1328125,
446
- "learning_rate": 6.205603019934791e-07,
447
- "loss": 1.2311,
448
- "mean_token_accuracy": 0.6834054369061775,
449
  "step": 275
450
  },
451
  {
452
- "epoch": 0.9135399673735726,
453
- "grad_norm": 1.1328125,
454
- "learning_rate": 4.3788071256013033e-07,
455
- "loss": 1.2076,
456
- "mean_token_accuracy": 0.6909252205130498,
457
  "step": 280
458
  },
459
  {
460
- "epoch": 0.9298531810766721,
461
- "grad_norm": 1.1640625,
462
- "learning_rate": 2.863907687341949e-07,
463
- "loss": 1.1762,
464
- "mean_token_accuracy": 0.69716578948841,
465
  "step": 285
466
  },
467
  {
468
- "epoch": 0.9461663947797716,
469
- "grad_norm": 1.1484375,
470
- "learning_rate": 1.665845992249071e-07,
471
- "loss": 1.1792,
472
- "mean_token_accuracy": 0.6928531967188305,
473
  "step": 290
474
  },
475
  {
476
- "epoch": 0.9624796084828712,
477
- "grad_norm": 1.3046875,
478
- "learning_rate": 7.885298685522235e-08,
479
- "loss": 1.1642,
480
- "mean_token_accuracy": 0.694887794722856,
481
  "step": 295
482
  },
483
  {
484
- "epoch": 0.9787928221859706,
485
- "grad_norm": 1.28125,
486
- "learning_rate": 2.348209390947376e-08,
487
- "loss": 1.234,
488
- "mean_token_accuracy": 0.6835920887329172,
489
  "step": 300
490
  },
491
  {
492
- "epoch": 0.9951060358890701,
493
- "grad_norm": 1.203125,
494
- "learning_rate": 6.525287314851358e-10,
495
- "loss": 1.1268,
496
- "mean_token_accuracy": 0.7035911209586428,
497
  "step": 305
498
  },
499
  {
500
- "epoch": 0.99836867862969,
501
- "mean_token_accuracy": 0.7046485858069031,
502
- "step": 306,
503
- "total_flos": 7.883277455484518e+16,
504
- "train_loss": 1.23619465266957,
505
- "train_runtime": 1783.7305,
506
- "train_samples_per_second": 1.374,
507
- "train_steps_per_second": 0.172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  }
509
  ],
510
  "logging_steps": 5,
511
- "max_steps": 306,
512
  "num_input_tokens_seen": 0,
513
- "num_train_epochs": 1,
514
- "save_steps": 500,
515
  "stateful_callbacks": {
516
  "TrainerControl": {
517
  "args": {
518
  "should_epoch_stop": false,
519
  "should_evaluate": false,
520
  "should_log": false,
521
- "should_save": false,
522
- "should_training_stop": false
523
  },
524
  "attributes": {}
525
  }
526
  },
527
- "total_flos": 7.883277455484518e+16,
528
- "train_batch_size": 2,
529
  "trial_name": null,
530
  "trial_params": null
531
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 385,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.06493506493506493,
13
+ "grad_norm": 2.600665284354585,
14
+ "learning_rate": 1.25e-05,
15
+ "loss": 1.6075,
 
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.12987012987012986,
20
+ "grad_norm": 2.3704504908479493,
21
+ "learning_rate": 2.5e-05,
22
+ "loss": 1.4792,
 
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.19480519480519481,
27
+ "grad_norm": 1.2779795738968422,
28
+ "learning_rate": 3.7500000000000003e-05,
29
+ "loss": 1.4115,
 
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.2597402597402597,
34
+ "grad_norm": 1.267787561996043,
35
+ "learning_rate": 5e-05,
36
+ "loss": 1.4018,
 
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.3246753246753247,
41
+ "grad_norm": 0.9468473252185556,
42
+ "learning_rate": 4.9979167589800175e-05,
43
+ "loss": 1.3367,
 
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.38961038961038963,
48
+ "grad_norm": 0.9893253972313399,
49
+ "learning_rate": 4.991670893602868e-05,
50
+ "loss": 1.3354,
 
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.45454545454545453,
55
+ "grad_norm": 1.0881069970614872,
56
+ "learning_rate": 4.9812739697734024e-05,
57
+ "loss": 1.3007,
 
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.5194805194805194,
62
+ "grad_norm": 1.0194980858529235,
63
+ "learning_rate": 4.9667452402011365e-05,
64
+ "loss": 1.2934,
 
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.5844155844155844,
69
+ "grad_norm": 0.9223444885083613,
70
+ "learning_rate": 4.94811160874866e-05,
71
+ "loss": 1.3045,
 
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.6493506493506493,
76
+ "grad_norm": 0.923065081102753,
77
+ "learning_rate": 4.925407580611875e-05,
78
+ "loss": 1.2753,
 
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.7142857142857143,
83
+ "grad_norm": 1.0718722181581042,
84
+ "learning_rate": 4.898675198424325e-05,
85
+ "loss": 1.2756,
 
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.7792207792207793,
90
+ "grad_norm": 1.052542976749598,
91
+ "learning_rate": 4.867963964403906e-05,
92
+ "loss": 1.2911,
 
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.8441558441558441,
97
+ "grad_norm": 0.8128323214762116,
98
+ "learning_rate": 4.833330748686162e-05,
99
+ "loss": 1.2779,
 
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.9090909090909091,
104
+ "grad_norm": 0.8676636174920215,
105
+ "learning_rate": 4.794839684013882e-05,
106
+ "loss": 1.2848,
 
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.974025974025974,
111
+ "grad_norm": 1.042065531092415,
112
+ "learning_rate": 4.7525620469780227e-05,
113
+ "loss": 1.2567,
 
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 1.0389610389610389,
118
+ "grad_norm": 0.972909624095194,
119
+ "learning_rate": 4.7065761260298747e-05,
120
+ "loss": 1.1721,
 
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 1.103896103896104,
125
+ "grad_norm": 1.032654503817413,
126
+ "learning_rate": 4.6569670765088703e-05,
127
+ "loss": 1.0906,
 
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 1.1688311688311688,
132
+ "grad_norm": 1.0351195035386815,
133
+ "learning_rate": 4.603826762954497e-05,
134
+ "loss": 1.1172,
 
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 1.2337662337662338,
139
+ "grad_norm": 0.96162795934013,
140
+ "learning_rate": 4.5472535889943214e-05,
141
+ "loss": 1.0908,
 
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 1.2987012987012987,
146
+ "grad_norm": 0.8351069694923322,
147
+ "learning_rate": 4.487352315123119e-05,
148
+ "loss": 1.1043,
 
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 1.3636363636363638,
153
+ "grad_norm": 1.0009708201516048,
154
+ "learning_rate": 4.424233864710562e-05,
155
+ "loss": 1.0782,
 
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 1.4285714285714286,
160
+ "grad_norm": 0.8512417675384085,
161
+ "learning_rate": 4.3580151185966625e-05,
162
+ "loss": 1.0982,
 
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 1.4935064935064934,
167
+ "grad_norm": 1.0103004383513716,
168
+ "learning_rate": 4.288818698655374e-05,
169
+ "loss": 1.0942,
 
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 1.5584415584415585,
174
+ "grad_norm": 0.8573478770737569,
175
+ "learning_rate": 4.216772740727103e-05,
176
+ "loss": 1.0897,
 
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 1.6233766233766234,
181
+ "grad_norm": 0.781876249476916,
182
+ "learning_rate": 4.142010657340632e-05,
183
+ "loss": 1.0763,
 
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 1.6883116883116882,
188
+ "grad_norm": 0.9579078710565259,
189
+ "learning_rate": 4.064670890663829e-05,
190
+ "loss": 1.1087,
 
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 1.7532467532467533,
195
+ "grad_norm": 0.7939190983394226,
196
+ "learning_rate": 3.9848966561406185e-05,
197
+ "loss": 1.0746,
 
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 1.8181818181818183,
202
+ "grad_norm": 1.0342389011696922,
203
+ "learning_rate": 3.902835677288954e-05,
204
+ "loss": 1.0677,
 
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 1.883116883116883,
209
+ "grad_norm": 0.7587581170618278,
210
+ "learning_rate": 3.818639912150864e-05,
211
+ "loss": 1.0916,
 
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 1.948051948051948,
216
+ "grad_norm": 0.7978087410640827,
217
+ "learning_rate": 3.7324652719011446e-05,
218
+ "loss": 1.0789,
 
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 2.012987012987013,
223
+ "grad_norm": 1.1656631051781983,
224
+ "learning_rate": 3.644471332135751e-05,
225
+ "loss": 1.0629,
 
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 2.0779220779220777,
230
+ "grad_norm": 1.0903515201097143,
231
+ "learning_rate": 3.554821037374533e-05,
232
+ "loss": 0.8856,
 
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 2.142857142857143,
237
+ "grad_norm": 0.7595990161921076,
238
+ "learning_rate": 3.463680399325489e-05,
239
+ "loss": 0.8855,
 
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 2.207792207792208,
244
+ "grad_norm": 0.7774624026109681,
245
+ "learning_rate": 3.371218189469306e-05,
246
+ "loss": 0.8659,
 
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 2.2727272727272725,
251
+ "grad_norm": 0.7461072755462708,
252
+ "learning_rate": 3.277605626533422e-05,
253
+ "loss": 0.8772,
 
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 2.3376623376623376,
258
+ "grad_norm": 0.7432368434791046,
259
+ "learning_rate": 3.183016059434367e-05,
260
+ "loss": 0.8525,
 
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 2.4025974025974026,
265
+ "grad_norm": 0.7590366429814798,
266
+ "learning_rate": 3.0876246462754685e-05,
267
+ "loss": 0.8742,
 
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 2.4675324675324677,
272
+ "grad_norm": 0.7247155849317677,
273
+ "learning_rate": 2.9916080299943672e-05,
274
+ "loss": 0.9026,
 
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 2.5324675324675323,
279
+ "grad_norm": 0.7310534548444702,
280
+ "learning_rate": 2.8951440112609623e-05,
281
+ "loss": 0.8883,
 
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 2.5974025974025974,
286
+ "grad_norm": 0.7129005263727524,
287
+ "learning_rate": 2.7984112192315004e-05,
288
+ "loss": 0.8736,
 
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 2.6623376623376624,
293
+ "grad_norm": 0.7437690616597665,
294
+ "learning_rate": 2.7015887807685002e-05,
295
+ "loss": 0.8981,
 
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 2.7272727272727275,
300
+ "grad_norm": 0.7385255276801924,
301
+ "learning_rate": 2.604855988739039e-05,
302
+ "loss": 0.8895,
 
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 2.792207792207792,
307
+ "grad_norm": 0.7144722424573203,
308
+ "learning_rate": 2.5083919700056337e-05,
309
+ "loss": 0.8744,
 
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 2.857142857142857,
314
+ "grad_norm": 0.7010146967901848,
315
+ "learning_rate": 2.412375353724532e-05,
316
+ "loss": 0.896,
 
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 2.9220779220779223,
321
+ "grad_norm": 0.7504002323519033,
322
+ "learning_rate": 2.316983940565633e-05,
323
+ "loss": 0.8868,
 
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 2.987012987012987,
328
+ "grad_norm": 0.7865383039242797,
329
+ "learning_rate": 2.2223943734665787e-05,
330
+ "loss": 0.8745,
 
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 3.051948051948052,
335
+ "grad_norm": 1.0159335857839107,
336
+ "learning_rate": 2.128781810530695e-05,
337
+ "loss": 0.7511,
 
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 3.116883116883117,
342
+ "grad_norm": 1.2002393858292175,
343
+ "learning_rate": 2.0363196006745117e-05,
344
+ "loss": 0.6809,
 
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 3.1818181818181817,
349
+ "grad_norm": 0.7878609155092301,
350
+ "learning_rate": 1.9451789626254672e-05,
351
+ "loss": 0.6976,
 
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 3.2467532467532467,
356
+ "grad_norm": 0.8016037192685418,
357
+ "learning_rate": 1.8555286678642496e-05,
358
+ "loss": 0.6747,
 
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 3.311688311688312,
363
+ "grad_norm": 0.7587512144299318,
364
+ "learning_rate": 1.7675347280988562e-05,
365
+ "loss": 0.6964,
 
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 3.3766233766233764,
370
+ "grad_norm": 0.7356037246911953,
371
+ "learning_rate": 1.6813600878491376e-05,
372
+ "loss": 0.6865,
 
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 3.4415584415584415,
377
+ "grad_norm": 0.7631434364636613,
378
+ "learning_rate": 1.597164322711047e-05,
379
+ "loss": 0.6909,
 
380
  "step": 265
381
  },
382
  {
383
+ "epoch": 3.5064935064935066,
384
+ "grad_norm": 0.7131404046904652,
385
+ "learning_rate": 1.5151033438593826e-05,
386
+ "loss": 0.6934,
 
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 3.571428571428571,
391
+ "grad_norm": 0.711551265262125,
392
+ "learning_rate": 1.4353291093361709e-05,
393
+ "loss": 0.6973,
 
394
  "step": 275
395
  },
396
  {
397
+ "epoch": 3.6363636363636362,
398
+ "grad_norm": 0.705346898473505,
399
+ "learning_rate": 1.3579893426593681e-05,
400
+ "loss": 0.68,
 
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 3.7012987012987013,
405
+ "grad_norm": 0.7118443145611075,
406
+ "learning_rate": 1.2832272592728966e-05,
407
+ "loss": 0.6904,
 
408
  "step": 285
409
  },
410
  {
411
+ "epoch": 3.7662337662337664,
412
+ "grad_norm": 0.7427739908839855,
413
+ "learning_rate": 1.211181301344627e-05,
414
+ "loss": 0.6793,
 
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 3.8311688311688314,
419
+ "grad_norm": 0.6999298394629496,
420
+ "learning_rate": 1.141984881403338e-05,
421
+ "loss": 0.6876,
 
422
  "step": 295
423
  },
424
  {
425
+ "epoch": 3.896103896103896,
426
+ "grad_norm": 0.7120263608160156,
427
+ "learning_rate": 1.0757661352894394e-05,
428
+ "loss": 0.6893,
 
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 3.961038961038961,
433
+ "grad_norm": 0.7144317661772965,
434
+ "learning_rate": 1.0126476848768805e-05,
435
+ "loss": 0.6789,
 
436
  "step": 305
437
  },
438
  {
439
+ "epoch": 4.025974025974026,
440
+ "grad_norm": 1.063865329311315,
441
+ "learning_rate": 9.527464110056795e-06,
442
+ "loss": 0.6269,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 4.090909090909091,
447
+ "grad_norm": 1.2451272279185113,
448
+ "learning_rate": 8.961732370455032e-06,
449
+ "loss": 0.5587,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 4.1558441558441555,
454
+ "grad_norm": 0.8874131926210624,
455
+ "learning_rate": 8.430329234911305e-06,
456
+ "loss": 0.5358,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 4.220779220779221,
461
+ "grad_norm": 0.8464865813880585,
462
+ "learning_rate": 7.934238739701252e-06,
463
+ "loss": 0.5414,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 4.285714285714286,
468
+ "grad_norm": 0.8425143948713799,
469
+ "learning_rate": 7.4743795302197754e-06,
470
+ "loss": 0.5494,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 4.35064935064935,
475
+ "grad_norm": 0.7956269077359945,
476
+ "learning_rate": 7.051603159861185e-06,
477
+ "loss": 0.5434,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 4.415584415584416,
482
+ "grad_norm": 0.7869754406789251,
483
+ "learning_rate": 6.66669251313838e-06,
484
+ "loss": 0.5545,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 4.48051948051948,
489
+ "grad_norm": 0.7360290448864131,
490
+ "learning_rate": 6.320360355960941e-06,
491
+ "loss": 0.5292,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 4.545454545454545,
496
+ "grad_norm": 0.7623311010292727,
497
+ "learning_rate": 6.013248015756759e-06,
498
+ "loss": 0.547,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 4.6103896103896105,
503
+ "grad_norm": 0.7371275230206066,
504
+ "learning_rate": 5.745924193881257e-06,
505
+ "loss": 0.5501,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 4.675324675324675,
510
+ "grad_norm": 0.7480895176553981,
511
+ "learning_rate": 5.518883912513413e-06,
512
+ "loss": 0.5357,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 4.740259740259741,
517
+ "grad_norm": 0.7362818934957748,
518
+ "learning_rate": 5.332547597988636e-06,
519
+ "loss": 0.5451,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 4.805194805194805,
524
+ "grad_norm": 0.7678966619355423,
525
+ "learning_rate": 5.1872603022659765e-06,
526
+ "loss": 0.5297,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 4.87012987012987,
531
+ "grad_norm": 0.7535710804010259,
532
+ "learning_rate": 5.083291063971324e-06,
533
+ "loss": 0.5356,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 4.935064935064935,
538
+ "grad_norm": 0.7578922717450597,
539
+ "learning_rate": 5.020832410199826e-06,
540
+ "loss": 0.5347,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 5.0,
545
+ "grad_norm": 0.7276287792812202,
546
+ "learning_rate": 5e-06,
547
+ "loss": 0.5505,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 5.0,
552
+ "step": 385,
553
+ "total_flos": 43871077662720.0,
554
+ "train_loss": 0.9090226414915803,
555
+ "train_runtime": 2127.6426,
556
+ "train_samples_per_second": 23.091,
557
+ "train_steps_per_second": 0.181
558
  }
559
  ],
560
  "logging_steps": 5,
561
+ "max_steps": 385,
562
  "num_input_tokens_seen": 0,
563
+ "num_train_epochs": 5,
564
+ "save_steps": 100,
565
  "stateful_callbacks": {
566
  "TrainerControl": {
567
  "args": {
568
  "should_epoch_stop": false,
569
  "should_evaluate": false,
570
  "should_log": false,
571
+ "should_save": true,
572
+ "should_training_stop": true
573
  },
574
  "attributes": {}
575
  }
576
  },
577
+ "total_flos": 43871077662720.0,
578
+ "train_batch_size": 16,
579
  "trial_name": null,
580
  "trial_params": null
581
  }