sedrickkeh commited on
Commit
455ba30
·
verified ·
1 Parent(s): 2f6f914

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_glaive_code_assistant
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_glaive_code_assistant
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6738
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_glaive_code_assistant
 
16
 
17
  # OH_DCFT_V3_wo_glaive_code_assistant
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_glaive_code_assistant dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6738
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 2.9952,
3
- "eval_loss": 0.6776005029678345,
4
- "eval_runtime": 168.3612,
5
- "eval_samples_per_second": 50.0,
6
- "eval_steps_per_second": 0.392,
7
  "total_flos": 1567416102420480.0,
8
- "train_loss": 0.6459089978637859,
9
- "train_runtime": 28277.8546,
10
- "train_samples_per_second": 16.967,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 2.9952,
3
+ "eval_loss": 0.6738138794898987,
4
+ "eval_runtime": 167.1791,
5
+ "eval_samples_per_second": 50.353,
6
+ "eval_steps_per_second": 0.395,
7
  "total_flos": 1567416102420480.0,
8
+ "train_loss": 0.647635899293117,
9
+ "train_runtime": 28091.6471,
10
+ "train_samples_per_second": 17.079,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 2.9952,
3
- "eval_loss": 0.6776005029678345,
4
- "eval_runtime": 168.3612,
5
- "eval_samples_per_second": 50.0,
6
- "eval_steps_per_second": 0.392
7
  }
 
1
  {
2
  "epoch": 2.9952,
3
+ "eval_loss": 0.6738138794898987,
4
+ "eval_runtime": 167.1791,
5
+ "eval_samples_per_second": 50.353,
6
+ "eval_steps_per_second": 0.395
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.9952,
3
  "total_flos": 1567416102420480.0,
4
- "train_loss": 0.6459089978637859,
5
- "train_runtime": 28277.8546,
6
- "train_samples_per_second": 16.967,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 2.9952,
3
  "total_flos": 1567416102420480.0,
4
+ "train_loss": 0.647635899293117,
5
+ "train_runtime": 28091.6471,
6
+ "train_samples_per_second": 17.079,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,686 +10,686 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.032,
13
- "grad_norm": 4.681128681680185,
14
  "learning_rate": 5e-06,
15
- "loss": 0.944,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.064,
20
- "grad_norm": 1.4664616243938466,
21
  "learning_rate": 5e-06,
22
- "loss": 0.8428,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.096,
27
- "grad_norm": 1.8998988032315547,
28
  "learning_rate": 5e-06,
29
- "loss": 0.7982,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.128,
34
- "grad_norm": 1.92324741635604,
35
  "learning_rate": 5e-06,
36
- "loss": 0.7714,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 2.4002264952882335,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7676,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.192,
48
- "grad_norm": 1.2695794968340761,
49
  "learning_rate": 5e-06,
50
- "loss": 0.7461,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.224,
55
- "grad_norm": 0.9170291045990026,
56
  "learning_rate": 5e-06,
57
- "loss": 0.7377,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.256,
62
- "grad_norm": 1.2359528897417442,
63
  "learning_rate": 5e-06,
64
- "loss": 0.7232,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.288,
69
- "grad_norm": 0.8951767220444212,
70
  "learning_rate": 5e-06,
71
- "loss": 0.7189,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
- "grad_norm": 0.7069383759659909,
77
  "learning_rate": 5e-06,
78
- "loss": 0.7113,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.352,
83
- "grad_norm": 1.0819828977879882,
84
  "learning_rate": 5e-06,
85
- "loss": 0.7162,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.384,
90
- "grad_norm": 1.304901636304286,
91
  "learning_rate": 5e-06,
92
- "loss": 0.713,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.416,
97
- "grad_norm": 0.6373106432007576,
98
  "learning_rate": 5e-06,
99
- "loss": 0.711,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.448,
104
- "grad_norm": 0.6304264579464928,
105
  "learning_rate": 5e-06,
106
- "loss": 0.7103,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.48,
111
- "grad_norm": 0.7649551484704428,
112
  "learning_rate": 5e-06,
113
- "loss": 0.7074,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.512,
118
- "grad_norm": 0.6150803591520316,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6995,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.544,
125
- "grad_norm": 0.60459672442792,
126
  "learning_rate": 5e-06,
127
- "loss": 0.7,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.576,
132
- "grad_norm": 0.5617178085268674,
133
  "learning_rate": 5e-06,
134
- "loss": 0.7039,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.608,
139
- "grad_norm": 0.6434460581488514,
140
  "learning_rate": 5e-06,
141
- "loss": 0.704,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.64,
146
- "grad_norm": 0.7168502004019935,
147
  "learning_rate": 5e-06,
148
- "loss": 0.693,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.672,
153
- "grad_norm": 0.7600011834386737,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6971,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.704,
160
- "grad_norm": 0.6044774583977053,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6973,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.736,
167
- "grad_norm": 0.65068804006829,
168
  "learning_rate": 5e-06,
169
- "loss": 0.691,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.768,
174
- "grad_norm": 0.5634185501342828,
175
  "learning_rate": 5e-06,
176
- "loss": 0.6956,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.8,
181
- "grad_norm": 0.6768741126198279,
182
  "learning_rate": 5e-06,
183
- "loss": 0.693,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.832,
188
- "grad_norm": 0.6013038960725747,
189
  "learning_rate": 5e-06,
190
- "loss": 0.695,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.864,
195
- "grad_norm": 0.5521163219149124,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6956,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.896,
202
- "grad_norm": 0.6101943227594748,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6863,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.928,
209
- "grad_norm": 0.7006837763958106,
210
  "learning_rate": 5e-06,
211
- "loss": 0.686,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.96,
216
- "grad_norm": 0.7462008507832877,
217
  "learning_rate": 5e-06,
218
- "loss": 0.679,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.992,
223
- "grad_norm": 0.5329536650880176,
224
  "learning_rate": 5e-06,
225
- "loss": 0.6722,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9984,
230
- "eval_loss": 0.684551477432251,
231
- "eval_runtime": 168.4477,
232
- "eval_samples_per_second": 49.974,
233
- "eval_steps_per_second": 0.392,
234
  "step": 312
235
  },
236
  {
237
  "epoch": 1.024,
238
- "grad_norm": 0.877682536384727,
239
  "learning_rate": 5e-06,
240
- "loss": 0.6394,
241
  "step": 320
242
  },
243
  {
244
  "epoch": 1.056,
245
- "grad_norm": 0.6299661618385989,
246
  "learning_rate": 5e-06,
247
- "loss": 0.6282,
248
  "step": 330
249
  },
250
  {
251
  "epoch": 1.088,
252
- "grad_norm": 0.9299954358565179,
253
  "learning_rate": 5e-06,
254
- "loss": 0.6327,
255
  "step": 340
256
  },
257
  {
258
  "epoch": 1.12,
259
- "grad_norm": 0.6355052821952561,
260
  "learning_rate": 5e-06,
261
- "loss": 0.6326,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.152,
266
- "grad_norm": 0.6315825615381476,
267
  "learning_rate": 5e-06,
268
- "loss": 0.6289,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.184,
273
- "grad_norm": 0.5622112607620553,
274
  "learning_rate": 5e-06,
275
- "loss": 0.6448,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.216,
280
- "grad_norm": 0.6577442757912503,
281
  "learning_rate": 5e-06,
282
- "loss": 0.6314,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.248,
287
- "grad_norm": 0.6462394250281904,
288
  "learning_rate": 5e-06,
289
- "loss": 0.6307,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.28,
294
- "grad_norm": 0.6305710848857419,
295
  "learning_rate": 5e-06,
296
- "loss": 0.6335,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.312,
301
- "grad_norm": 0.6739586427264055,
302
  "learning_rate": 5e-06,
303
- "loss": 0.6305,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.3439999999999999,
308
- "grad_norm": 0.5926724148807778,
309
  "learning_rate": 5e-06,
310
- "loss": 0.6244,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.376,
315
- "grad_norm": 0.5553139599738963,
316
  "learning_rate": 5e-06,
317
- "loss": 0.6385,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.408,
322
- "grad_norm": 0.6469857046293095,
323
  "learning_rate": 5e-06,
324
- "loss": 0.6266,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.44,
329
- "grad_norm": 0.5756297365306258,
330
  "learning_rate": 5e-06,
331
- "loss": 0.6274,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.472,
336
- "grad_norm": 0.5935989657812096,
337
  "learning_rate": 5e-06,
338
- "loss": 0.643,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.504,
343
- "grad_norm": 0.646580130562456,
344
  "learning_rate": 5e-06,
345
- "loss": 0.6347,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.536,
350
- "grad_norm": 0.6662432182668429,
351
  "learning_rate": 5e-06,
352
- "loss": 0.637,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.568,
357
- "grad_norm": 0.6098294179034147,
358
  "learning_rate": 5e-06,
359
- "loss": 0.6339,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.6,
364
- "grad_norm": 0.5297071787734379,
365
  "learning_rate": 5e-06,
366
- "loss": 0.6385,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.6320000000000001,
371
- "grad_norm": 0.719632379699023,
372
  "learning_rate": 5e-06,
373
- "loss": 0.626,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.6640000000000001,
378
- "grad_norm": 0.5076459047888962,
379
  "learning_rate": 5e-06,
380
- "loss": 0.6359,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.696,
385
- "grad_norm": 0.5629232051091821,
386
  "learning_rate": 5e-06,
387
- "loss": 0.635,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.728,
392
- "grad_norm": 0.5166400346895539,
393
  "learning_rate": 5e-06,
394
- "loss": 0.6255,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.76,
399
- "grad_norm": 0.6147003690488106,
400
  "learning_rate": 5e-06,
401
- "loss": 0.6328,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.792,
406
- "grad_norm": 0.6210621909628569,
407
  "learning_rate": 5e-06,
408
- "loss": 0.6375,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.8239999999999998,
413
- "grad_norm": 0.6198915290710809,
414
  "learning_rate": 5e-06,
415
- "loss": 0.6421,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.8559999999999999,
420
- "grad_norm": 0.5425204952547927,
421
  "learning_rate": 5e-06,
422
- "loss": 0.636,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.888,
427
- "grad_norm": 0.4753843958385197,
428
  "learning_rate": 5e-06,
429
- "loss": 0.6214,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.92,
434
- "grad_norm": 0.5703253197864113,
435
  "learning_rate": 5e-06,
436
- "loss": 0.6245,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.952,
441
- "grad_norm": 0.6133997537829436,
442
  "learning_rate": 5e-06,
443
- "loss": 0.6315,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.984,
448
- "grad_norm": 0.5628290861601559,
449
  "learning_rate": 5e-06,
450
- "loss": 0.6363,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 2.0,
455
- "eval_loss": 0.6738976836204529,
456
- "eval_runtime": 168.9768,
457
- "eval_samples_per_second": 49.817,
458
- "eval_steps_per_second": 0.391,
459
  "step": 625
460
  },
461
  {
462
  "epoch": 2.016,
463
- "grad_norm": 0.8168281752833556,
464
  "learning_rate": 5e-06,
465
- "loss": 0.6036,
466
  "step": 630
467
  },
468
  {
469
  "epoch": 2.048,
470
- "grad_norm": 0.7580260940938055,
471
  "learning_rate": 5e-06,
472
- "loss": 0.5682,
473
  "step": 640
474
  },
475
  {
476
  "epoch": 2.08,
477
- "grad_norm": 0.7062292404432674,
478
  "learning_rate": 5e-06,
479
- "loss": 0.5747,
480
  "step": 650
481
  },
482
  {
483
  "epoch": 2.112,
484
- "grad_norm": 0.5992453206728847,
485
  "learning_rate": 5e-06,
486
- "loss": 0.5836,
487
  "step": 660
488
  },
489
  {
490
  "epoch": 2.144,
491
- "grad_norm": 0.5619811705811777,
492
  "learning_rate": 5e-06,
493
- "loss": 0.581,
494
  "step": 670
495
  },
496
  {
497
  "epoch": 2.176,
498
- "grad_norm": 0.5958747225446613,
499
  "learning_rate": 5e-06,
500
- "loss": 0.578,
501
  "step": 680
502
  },
503
  {
504
  "epoch": 2.208,
505
- "grad_norm": 0.6497415815637109,
506
  "learning_rate": 5e-06,
507
- "loss": 0.5732,
508
  "step": 690
509
  },
510
  {
511
  "epoch": 2.24,
512
- "grad_norm": 0.5780693478259846,
513
  "learning_rate": 5e-06,
514
- "loss": 0.5816,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.2720000000000002,
519
- "grad_norm": 0.596184390481402,
520
  "learning_rate": 5e-06,
521
- "loss": 0.5741,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.304,
526
- "grad_norm": 0.6306865439208578,
527
  "learning_rate": 5e-06,
528
- "loss": 0.5833,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.336,
533
- "grad_norm": 0.583701490553111,
534
  "learning_rate": 5e-06,
535
- "loss": 0.5867,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.368,
540
- "grad_norm": 0.6055766388646271,
541
  "learning_rate": 5e-06,
542
- "loss": 0.5894,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.4,
547
- "grad_norm": 0.67228235956439,
548
  "learning_rate": 5e-06,
549
- "loss": 0.5802,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.432,
554
- "grad_norm": 0.6434293537696463,
555
  "learning_rate": 5e-06,
556
- "loss": 0.5797,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.464,
561
- "grad_norm": 0.8446283351461378,
562
  "learning_rate": 5e-06,
563
- "loss": 0.5847,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.496,
568
- "grad_norm": 0.6805593232272164,
569
  "learning_rate": 5e-06,
570
- "loss": 0.5791,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.528,
575
- "grad_norm": 0.6396132909626955,
576
  "learning_rate": 5e-06,
577
- "loss": 0.576,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.56,
582
- "grad_norm": 0.5939211838981279,
583
  "learning_rate": 5e-06,
584
- "loss": 0.5762,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.592,
589
- "grad_norm": 0.617055354774866,
590
  "learning_rate": 5e-06,
591
- "loss": 0.5828,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.624,
596
- "grad_norm": 0.8572554082731082,
597
  "learning_rate": 5e-06,
598
- "loss": 0.5879,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.656,
603
- "grad_norm": 0.6390611228450237,
604
  "learning_rate": 5e-06,
605
- "loss": 0.5849,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.6879999999999997,
610
- "grad_norm": 0.616019451907882,
611
  "learning_rate": 5e-06,
612
- "loss": 0.5836,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.7199999999999998,
617
- "grad_norm": 0.6021038686149264,
618
  "learning_rate": 5e-06,
619
- "loss": 0.582,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.752,
624
- "grad_norm": 0.6506064567719853,
625
  "learning_rate": 5e-06,
626
- "loss": 0.5895,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.784,
631
- "grad_norm": 0.5976578928465824,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5888,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.816,
638
- "grad_norm": 0.7052250144169027,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5835,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.848,
645
- "grad_norm": 0.5708818723724719,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5873,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.88,
652
- "grad_norm": 0.5395580422880416,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5813,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.912,
659
- "grad_norm": 0.6226733841326677,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5845,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.944,
666
- "grad_norm": 0.9233745991272709,
667
  "learning_rate": 5e-06,
668
- "loss": 0.5937,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.976,
673
- "grad_norm": 0.6314988399397768,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5882,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.9952,
680
- "eval_loss": 0.6776005029678345,
681
- "eval_runtime": 169.1842,
682
- "eval_samples_per_second": 49.756,
683
- "eval_steps_per_second": 0.39,
684
  "step": 936
685
  },
686
  {
687
  "epoch": 2.9952,
688
  "step": 936,
689
  "total_flos": 1567416102420480.0,
690
- "train_loss": 0.6459089978637859,
691
- "train_runtime": 28277.8546,
692
- "train_samples_per_second": 16.967,
693
  "train_steps_per_second": 0.033
694
  }
695
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.032,
13
+ "grad_norm": 11.096345041133668,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.9124,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.064,
20
+ "grad_norm": 2.0005484215366707,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.8231,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.096,
27
+ "grad_norm": 1.7931305619074691,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.7884,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.128,
34
+ "grad_norm": 0.851189870496473,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7635,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 0.9889043585826149,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.7591,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.192,
48
+ "grad_norm": 0.8749373478404373,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.7381,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.224,
55
+ "grad_norm": 1.0946174467023697,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.7318,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.256,
62
+ "grad_norm": 0.9291846880748199,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.7179,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.288,
69
+ "grad_norm": 0.5509636442658649,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.7139,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
+ "grad_norm": 0.593197960775897,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.7063,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.352,
83
+ "grad_norm": 0.9507443053374502,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.7116,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.384,
90
+ "grad_norm": 0.8188616654629195,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.7087,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.416,
97
+ "grad_norm": 0.8138893569857725,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.7071,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.448,
104
+ "grad_norm": 0.6168823207810579,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.707,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.48,
111
+ "grad_norm": 0.5851796811481069,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.7042,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.512,
118
+ "grad_norm": 0.6381484672996786,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6966,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.544,
125
+ "grad_norm": 1.1453372921610079,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6971,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.576,
132
+ "grad_norm": 0.45645996706514147,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.7013,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.608,
139
+ "grad_norm": 0.6963644924074289,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.7016,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.64,
146
+ "grad_norm": 0.6629526941218145,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6907,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.672,
153
+ "grad_norm": 0.46342013498108614,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.6947,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.704,
160
+ "grad_norm": 0.5188693861900772,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6947,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.736,
167
+ "grad_norm": 0.46151796592999117,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6888,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.768,
174
+ "grad_norm": 0.5322678506039507,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.6933,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.8,
181
+ "grad_norm": 0.4694684079232731,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6907,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.832,
188
+ "grad_norm": 0.5841370812658152,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6932,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.864,
195
+ "grad_norm": 0.49903482096358526,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6937,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.896,
202
+ "grad_norm": 0.5304368954337139,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.6845,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.928,
209
+ "grad_norm": 0.6838692426374734,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6842,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.96,
216
+ "grad_norm": 0.6180107533054702,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6772,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.992,
223
+ "grad_norm": 0.550556182335329,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6705,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9984,
230
+ "eval_loss": 0.6830303072929382,
231
+ "eval_runtime": 166.7227,
232
+ "eval_samples_per_second": 50.491,
233
+ "eval_steps_per_second": 0.396,
234
  "step": 312
235
  },
236
  {
237
  "epoch": 1.024,
238
+ "grad_norm": 0.7196672666972574,
239
  "learning_rate": 5e-06,
240
+ "loss": 0.6414,
241
  "step": 320
242
  },
243
  {
244
  "epoch": 1.056,
245
+ "grad_norm": 0.6864929334563745,
246
  "learning_rate": 5e-06,
247
+ "loss": 0.6315,
248
  "step": 330
249
  },
250
  {
251
  "epoch": 1.088,
252
+ "grad_norm": 0.7530113138351922,
253
  "learning_rate": 5e-06,
254
+ "loss": 0.6361,
255
  "step": 340
256
  },
257
  {
258
  "epoch": 1.12,
259
+ "grad_norm": 0.5287710086719117,
260
  "learning_rate": 5e-06,
261
+ "loss": 0.6356,
262
  "step": 350
263
  },
264
  {
265
  "epoch": 1.152,
266
+ "grad_norm": 0.5840075309781476,
267
  "learning_rate": 5e-06,
268
+ "loss": 0.6322,
269
  "step": 360
270
  },
271
  {
272
  "epoch": 1.184,
273
+ "grad_norm": 0.5008293781573286,
274
  "learning_rate": 5e-06,
275
+ "loss": 0.648,
276
  "step": 370
277
  },
278
  {
279
  "epoch": 1.216,
280
+ "grad_norm": 0.5734259910470129,
281
  "learning_rate": 5e-06,
282
+ "loss": 0.6344,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.248,
287
+ "grad_norm": 0.5742943072984884,
288
  "learning_rate": 5e-06,
289
+ "loss": 0.634,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.28,
294
+ "grad_norm": 0.5646558201841438,
295
  "learning_rate": 5e-06,
296
+ "loss": 0.6367,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.312,
301
+ "grad_norm": 0.582405938579332,
302
  "learning_rate": 5e-06,
303
+ "loss": 0.6333,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.3439999999999999,
308
+ "grad_norm": 0.5464516067637085,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.627,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.376,
315
+ "grad_norm": 0.526922439327205,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.6413,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.408,
322
+ "grad_norm": 0.5460672292153811,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.6293,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.44,
329
+ "grad_norm": 0.5139358671369992,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.63,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.472,
336
+ "grad_norm": 0.5915093665608144,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.6459,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.504,
343
+ "grad_norm": 0.5570676911617528,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.6376,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.536,
350
+ "grad_norm": 0.5728608013877954,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.6401,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.568,
357
+ "grad_norm": 0.4896266564265616,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.6369,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.6,
364
+ "grad_norm": 0.4962652191813324,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.6415,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.6320000000000001,
371
+ "grad_norm": 0.5518896628136374,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.6287,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.6640000000000001,
378
+ "grad_norm": 0.43944356606683,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.6387,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.696,
385
+ "grad_norm": 0.5183777489924765,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.6376,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.728,
392
+ "grad_norm": 0.4913940452170003,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.6284,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.76,
399
+ "grad_norm": 0.6170566214388444,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.6356,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.792,
406
+ "grad_norm": 0.6377778383394684,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.6403,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.8239999999999998,
413
+ "grad_norm": 0.5663340732106481,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.6449,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.8559999999999999,
420
+ "grad_norm": 0.5736342081997541,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.6387,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.888,
427
+ "grad_norm": 0.49306085144025097,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.6241,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.92,
434
+ "grad_norm": 0.5775658004357387,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.6271,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.952,
441
+ "grad_norm": 0.5577832354086514,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.634,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.984,
448
+ "grad_norm": 0.5187494053729239,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.6388,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 2.0,
455
+ "eval_loss": 0.6722739934921265,
456
+ "eval_runtime": 167.9944,
457
+ "eval_samples_per_second": 50.109,
458
+ "eval_steps_per_second": 0.393,
459
  "step": 625
460
  },
461
  {
462
  "epoch": 2.016,
463
+ "grad_norm": 0.7554552806950715,
464
  "learning_rate": 5e-06,
465
+ "loss": 0.609,
466
  "step": 630
467
  },
468
  {
469
  "epoch": 2.048,
470
+ "grad_norm": 0.7015587384161144,
471
  "learning_rate": 5e-06,
472
+ "loss": 0.5764,
473
  "step": 640
474
  },
475
  {
476
  "epoch": 2.08,
477
+ "grad_norm": 0.6710139474563853,
478
  "learning_rate": 5e-06,
479
+ "loss": 0.583,
480
  "step": 650
481
  },
482
  {
483
  "epoch": 2.112,
484
+ "grad_norm": 0.5068349693062657,
485
  "learning_rate": 5e-06,
486
+ "loss": 0.592,
487
  "step": 660
488
  },
489
  {
490
  "epoch": 2.144,
491
+ "grad_norm": 0.5399107301121943,
492
  "learning_rate": 5e-06,
493
+ "loss": 0.5891,
494
  "step": 670
495
  },
496
  {
497
  "epoch": 2.176,
498
+ "grad_norm": 0.5827025174626241,
499
  "learning_rate": 5e-06,
500
+ "loss": 0.5862,
501
  "step": 680
502
  },
503
  {
504
  "epoch": 2.208,
505
+ "grad_norm": 0.6588198083574598,
506
  "learning_rate": 5e-06,
507
+ "loss": 0.581,
508
  "step": 690
509
  },
510
  {
511
  "epoch": 2.24,
512
+ "grad_norm": 0.5284463724994527,
513
  "learning_rate": 5e-06,
514
+ "loss": 0.5895,
515
  "step": 700
516
  },
517
  {
518
  "epoch": 2.2720000000000002,
519
+ "grad_norm": 0.5554858601981788,
520
  "learning_rate": 5e-06,
521
+ "loss": 0.582,
522
  "step": 710
523
  },
524
  {
525
  "epoch": 2.304,
526
+ "grad_norm": 0.5076164990838873,
527
  "learning_rate": 5e-06,
528
+ "loss": 0.591,
529
  "step": 720
530
  },
531
  {
532
  "epoch": 2.336,
533
+ "grad_norm": 0.5467190742693613,
534
  "learning_rate": 5e-06,
535
+ "loss": 0.5947,
536
  "step": 730
537
  },
538
  {
539
  "epoch": 2.368,
540
+ "grad_norm": 0.6169750992589544,
541
  "learning_rate": 5e-06,
542
+ "loss": 0.5972,
543
  "step": 740
544
  },
545
  {
546
  "epoch": 2.4,
547
+ "grad_norm": 0.48044936244386016,
548
  "learning_rate": 5e-06,
549
+ "loss": 0.5876,
550
  "step": 750
551
  },
552
  {
553
  "epoch": 2.432,
554
+ "grad_norm": 0.6032706176552372,
555
  "learning_rate": 5e-06,
556
+ "loss": 0.587,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.464,
561
+ "grad_norm": 0.6063975913360607,
562
  "learning_rate": 5e-06,
563
+ "loss": 0.5925,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.496,
568
+ "grad_norm": 0.6086757160459484,
569
  "learning_rate": 5e-06,
570
+ "loss": 0.5866,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.528,
575
+ "grad_norm": 0.5468098219597467,
576
  "learning_rate": 5e-06,
577
+ "loss": 0.5833,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.56,
582
+ "grad_norm": 0.5487999795278167,
583
  "learning_rate": 5e-06,
584
+ "loss": 0.5833,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.592,
589
+ "grad_norm": 0.549327963090587,
590
  "learning_rate": 5e-06,
591
+ "loss": 0.5901,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.624,
596
+ "grad_norm": 0.7150840433104585,
597
  "learning_rate": 5e-06,
598
+ "loss": 0.5953,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.656,
603
+ "grad_norm": 0.5334250102952837,
604
  "learning_rate": 5e-06,
605
+ "loss": 0.5921,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.6879999999999997,
610
+ "grad_norm": 0.4984756567264573,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.5909,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.7199999999999998,
617
+ "grad_norm": 0.5260124201884769,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5891,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.752,
624
+ "grad_norm": 0.4768707573361185,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.5966,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.784,
631
+ "grad_norm": 0.6307214695791044,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5956,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.816,
638
+ "grad_norm": 0.6369711933029217,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5906,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.848,
645
+ "grad_norm": 0.5331564084605998,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5944,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.88,
652
+ "grad_norm": 0.49279217252034724,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.588,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.912,
659
+ "grad_norm": 0.570357514747828,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.5915,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.944,
666
+ "grad_norm": 0.78678569178475,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.6007,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.976,
673
+ "grad_norm": 0.561666413771167,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5952,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.9952,
680
+ "eval_loss": 0.6738138794898987,
681
+ "eval_runtime": 168.233,
682
+ "eval_samples_per_second": 50.038,
683
+ "eval_steps_per_second": 0.392,
684
  "step": 936
685
  },
686
  {
687
  "epoch": 2.9952,
688
  "step": 936,
689
  "total_flos": 1567416102420480.0,
690
+ "train_loss": 0.647635899293117,
691
+ "train_runtime": 28091.6471,
692
+ "train_samples_per_second": 17.079,
693
  "train_steps_per_second": 0.033
694
  }
695
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED