vaishnavi18 commited on
Commit
0344a46
1 Parent(s): 9fb3f45

Model save

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 1.3544
24
 
25
  ## Model description
26
 
@@ -56,9 +56,9 @@ The following hyperparameters were used during training:
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
- | 1.2658 | 1.0 | 326 | 1.2135 |
60
- | 0.8841 | 2.0 | 652 | 1.2135 |
61
- | 0.7012 | 3.0 | 978 | 1.3544 |
62
 
63
 
64
  ### Framework versions
 
20
 
21
  This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 1.3529
24
 
25
  ## Model description
26
 
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
+ | 1.265 | 1.0 | 326 | 1.2147 |
60
+ | 0.8983 | 2.0 | 652 | 1.2119 |
61
+ | 0.7007 | 3.0 | 978 | 1.3529 |
62
 
63
 
64
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 3642602029056.0,
4
- "train_loss": 1.0360665048314506,
5
- "train_runtime": 586.176,
6
  "train_samples": 2023,
7
- "train_samples_per_second": 6.674,
8
- "train_steps_per_second": 1.668
9
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 3642602029056.0,
4
+ "train_loss": 1.0182966589927673,
5
+ "train_runtime": 588.3296,
6
  "train_samples": 2023,
7
+ "train_samples_per_second": 6.649,
8
+ "train_steps_per_second": 1.662
9
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f03ed4af2ea91e8b3cbbc4638db271c30b32401880813e3fffa2a20ad9a26c9
3
  size 4945242264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60939b5e22709c00ec96520ade09dcf2b14bdabe4696c70cb6682237cc9a2418
3
  size 4945242264
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3070c72853fac5da48f03f70e9348094fc053d7b892771281116c46eed2ff01
3
  size 67121608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4723cbfe53b0efa358675b46c28db4b13ac7b62f1db5b842c3fb64d4c56a47c
3
  size 67121608
runs/Oct30_00-32-00_tinkywinky.edc.iee.ucsb.edu/events.out.tfevents.1730273532.tinkywinky.edc.iee.ucsb.edu.2174734.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbc6cf7a78668c3b6db42bb5db843fabf5fc63b475186313ebd5e38c8f8d797f
3
+ size 48234
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 3642602029056.0,
4
- "train_loss": 1.0360665048314506,
5
- "train_runtime": 586.176,
6
  "train_samples": 2023,
7
- "train_samples_per_second": 6.674,
8
- "train_steps_per_second": 1.668
9
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 3642602029056.0,
4
+ "train_loss": 1.0182966589927673,
5
+ "train_runtime": 588.3296,
6
  "train_samples": 2023,
7
+ "train_samples_per_second": 6.649,
8
+ "train_steps_per_second": 1.662
9
  }
trainer_state.json CHANGED
@@ -10,1408 +10,1408 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003067484662576687,
13
- "grad_norm": 177.2131714128933,
14
  "learning_rate": 2.0408163265306121e-07,
15
  "loss": 3.0795,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.015337423312883436,
20
- "grad_norm": 241.89792017630023,
21
  "learning_rate": 1.0204081632653063e-06,
22
- "loss": 2.738,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03067484662576687,
27
- "grad_norm": 97.9005821739992,
28
  "learning_rate": 2.0408163265306125e-06,
29
- "loss": 2.573,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.046012269938650305,
34
- "grad_norm": 29.935305969711827,
35
  "learning_rate": 3.0612244897959185e-06,
36
- "loss": 2.4113,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.06134969325153374,
41
- "grad_norm": 6.7865011448005985,
42
  "learning_rate": 4.081632653061225e-06,
43
- "loss": 1.9856,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.07668711656441718,
48
- "grad_norm": 8.264677565869711,
49
  "learning_rate": 5.1020408163265315e-06,
50
- "loss": 1.8726,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09202453987730061,
55
- "grad_norm": 19.32485419745683,
56
  "learning_rate": 6.122448979591837e-06,
57
- "loss": 1.7009,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.10736196319018405,
62
- "grad_norm": 10.162135754390622,
63
  "learning_rate": 7.1428571428571436e-06,
64
- "loss": 1.6191,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.12269938650306748,
69
- "grad_norm": 3.0916095898500555,
70
  "learning_rate": 8.16326530612245e-06,
71
- "loss": 1.5164,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.13803680981595093,
76
- "grad_norm": 43.413150705371685,
77
  "learning_rate": 9.183673469387756e-06,
78
- "loss": 1.5097,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.15337423312883436,
83
- "grad_norm": 49.53965734626817,
84
  "learning_rate": 1.0204081632653063e-05,
85
- "loss": 1.5785,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.1687116564417178,
90
- "grad_norm": 10.98873172852826,
91
  "learning_rate": 1.1224489795918367e-05,
92
- "loss": 1.4715,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.18404907975460122,
97
- "grad_norm": 2.7742625393808584,
98
  "learning_rate": 1.2244897959183674e-05,
99
- "loss": 1.4263,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.19938650306748465,
104
- "grad_norm": 23.101404729255652,
105
  "learning_rate": 1.326530612244898e-05,
106
- "loss": 1.4043,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.2147239263803681,
111
- "grad_norm": 12.92305440472016,
112
  "learning_rate": 1.4285714285714287e-05,
113
- "loss": 1.3793,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.23006134969325154,
118
- "grad_norm": 2.882766144097806,
119
  "learning_rate": 1.530612244897959e-05,
120
- "loss": 1.4183,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.24539877300613497,
125
- "grad_norm": 11.111007796269925,
126
  "learning_rate": 1.63265306122449e-05,
127
- "loss": 1.4517,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2607361963190184,
132
- "grad_norm": 2.897394026367856,
133
  "learning_rate": 1.7346938775510206e-05,
134
- "loss": 1.3393,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.27607361963190186,
139
- "grad_norm": 2.9177477230316566,
140
  "learning_rate": 1.836734693877551e-05,
141
- "loss": 1.428,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.29141104294478526,
146
- "grad_norm": 2.912560429885746,
147
  "learning_rate": 1.9387755102040817e-05,
148
- "loss": 1.304,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3067484662576687,
153
- "grad_norm": 2.5615070520092704,
154
  "learning_rate": 1.9999745104274995e-05,
155
- "loss": 1.2997,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3220858895705521,
160
- "grad_norm": 2.9480036513492336,
161
  "learning_rate": 1.9996877676598733e-05,
162
- "loss": 1.4086,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.3374233128834356,
167
- "grad_norm": 2.5407276803281333,
168
  "learning_rate": 1.9990825118233958e-05,
169
- "loss": 1.3028,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.35276073619631904,
174
- "grad_norm": 3.239233594777726,
175
  "learning_rate": 1.9981589357601727e-05,
176
- "loss": 1.4185,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.36809815950920244,
181
- "grad_norm": 2.771896819307909,
182
  "learning_rate": 1.9969173337331283e-05,
183
- "loss": 1.3523,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3834355828220859,
188
- "grad_norm": 2.7643269799604724,
189
  "learning_rate": 1.9953581013322503e-05,
190
- "loss": 1.3175,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.3987730061349693,
195
- "grad_norm": 3.508281226717504,
196
  "learning_rate": 1.99348173534855e-05,
197
- "loss": 1.2981,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.41411042944785276,
202
- "grad_norm": 2.4302828555210025,
203
  "learning_rate": 1.9912888336157793e-05,
204
- "loss": 1.3109,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.4294478527607362,
209
- "grad_norm": 2.3273613141747975,
210
  "learning_rate": 1.9887800948199496e-05,
211
- "loss": 1.2887,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.4447852760736196,
216
- "grad_norm": 2.502367541877984,
217
  "learning_rate": 1.9859563182767268e-05,
218
- "loss": 1.2735,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4601226993865031,
223
- "grad_norm": 2.8584207562233113,
224
  "learning_rate": 1.9828184036767556e-05,
225
- "loss": 1.2271,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4754601226993865,
230
- "grad_norm": 2.190347531165954,
231
  "learning_rate": 1.9793673507990086e-05,
232
- "loss": 1.2896,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.49079754601226994,
237
- "grad_norm": 2.2761435625476545,
238
  "learning_rate": 1.9756042591922436e-05,
239
- "loss": 1.3294,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.5061349693251533,
244
- "grad_norm": 2.220138706256322,
245
  "learning_rate": 1.9715303278246724e-05,
246
- "loss": 1.3188,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.5214723926380368,
251
- "grad_norm": 2.294310010674355,
252
  "learning_rate": 1.9671468547019575e-05,
253
- "loss": 1.2199,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.5368098159509203,
258
- "grad_norm": 2.513372192362633,
259
  "learning_rate": 1.9624552364536472e-05,
260
- "loss": 1.3106,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.5521472392638037,
265
- "grad_norm": 2.510468468716461,
266
  "learning_rate": 1.9574569678881965e-05,
267
- "loss": 1.3708,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5674846625766872,
272
- "grad_norm": 2.041921641107936,
273
  "learning_rate": 1.952153641516698e-05,
274
- "loss": 1.1784,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5828220858895705,
279
- "grad_norm": 2.27820722339295,
280
  "learning_rate": 1.94654694704549e-05,
281
- "loss": 1.1902,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.598159509202454,
286
- "grad_norm": 2.328841014102968,
287
  "learning_rate": 1.9406386708377956e-05,
288
- "loss": 1.316,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.6134969325153374,
293
- "grad_norm": 2.2926147060214066,
294
  "learning_rate": 1.9344306953445632e-05,
295
- "loss": 1.3843,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.6288343558282209,
300
- "grad_norm": 3.113673918623077,
301
  "learning_rate": 1.9279249985046948e-05,
302
- "loss": 1.3199,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.6441717791411042,
307
- "grad_norm": 2.105413236829464,
308
  "learning_rate": 1.92112365311485e-05,
309
- "loss": 1.2593,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.6595092024539877,
314
- "grad_norm": 2.2971282506758204,
315
  "learning_rate": 1.9140288261690278e-05,
316
- "loss": 1.2316,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6748466257668712,
321
- "grad_norm": 3.060568400530995,
322
  "learning_rate": 1.9066427781681314e-05,
323
- "loss": 1.1986,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6901840490797546,
328
- "grad_norm": 2.0851577530985663,
329
  "learning_rate": 1.8989678623997506e-05,
330
- "loss": 1.2465,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.7055214723926381,
335
- "grad_norm": 2.34599375716676,
336
  "learning_rate": 1.891006524188368e-05,
337
- "loss": 1.3111,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.7208588957055214,
342
- "grad_norm": 2.0045103451147352,
343
  "learning_rate": 1.8827613001162534e-05,
344
- "loss": 1.2048,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.7361963190184049,
349
- "grad_norm": 2.0865280550933663,
350
  "learning_rate": 1.8742348172152728e-05,
351
- "loss": 1.3875,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.7515337423312883,
356
- "grad_norm": 2.1899054440028602,
357
  "learning_rate": 1.8654297921298862e-05,
358
- "loss": 1.2177,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.7668711656441718,
363
- "grad_norm": 2.146615168451316,
364
  "learning_rate": 1.856349030251589e-05,
365
- "loss": 1.2302,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7822085889570553,
370
- "grad_norm": 5.215245761172734,
371
  "learning_rate": 1.846995424825079e-05,
372
- "loss": 1.1558,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.7975460122699386,
377
- "grad_norm": 2.1303631206592253,
378
  "learning_rate": 1.837371956026433e-05,
379
- "loss": 1.2666,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.8128834355828221,
384
- "grad_norm": 3.324267620281515,
385
  "learning_rate": 1.8274816900135842e-05,
386
- "loss": 1.2796,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.8282208588957055,
391
- "grad_norm": 2.0369143733640174,
392
  "learning_rate": 1.817327777949407e-05,
393
- "loss": 1.2191,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.843558282208589,
398
- "grad_norm": 2.268965846881571,
399
  "learning_rate": 1.806913454997717e-05,
400
- "loss": 1.2382,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.8588957055214724,
405
- "grad_norm": 1.9534994546311044,
406
  "learning_rate": 1.7962420392925066e-05,
407
- "loss": 1.3029,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.8742331288343558,
412
- "grad_norm": 1.9407859425510332,
413
  "learning_rate": 1.785316930880745e-05,
414
- "loss": 1.183,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8895705521472392,
419
- "grad_norm": 1.9539166093233689,
420
  "learning_rate": 1.7741416106390828e-05,
421
- "loss": 1.1725,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.9049079754601227,
426
- "grad_norm": 2.2999481127748216,
427
  "learning_rate": 1.7627196391647982e-05,
428
- "loss": 1.2114,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.9202453987730062,
433
- "grad_norm": 1.9126415566238704,
434
  "learning_rate": 1.75105465564135e-05,
435
- "loss": 1.2206,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.9355828220858896,
440
- "grad_norm": 2.3510054432341887,
441
  "learning_rate": 1.739150376678883e-05,
442
- "loss": 1.301,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.950920245398773,
447
- "grad_norm": 2.5269459534002596,
448
  "learning_rate": 1.727010595130074e-05,
449
- "loss": 1.3914,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.9662576687116564,
454
- "grad_norm": 2.045224623846914,
455
  "learning_rate": 1.714639178881678e-05,
456
- "loss": 1.2722,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.9815950920245399,
461
- "grad_norm": 2.0917960158205107,
462
  "learning_rate": 1.7020400696221737e-05,
463
- "loss": 1.2093,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.9969325153374233,
468
- "grad_norm": 2.017009094857716,
469
  "learning_rate": 1.6892172815858896e-05,
470
- "loss": 1.2658,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 1.0,
475
- "eval_loss": 1.2135429382324219,
476
- "eval_runtime": 6.9064,
477
- "eval_samples_per_second": 23.022,
478
- "eval_steps_per_second": 5.792,
479
  "step": 326
480
  },
481
  {
482
  "epoch": 1.0122699386503067,
483
- "grad_norm": 2.1076821373912433,
484
  "learning_rate": 1.6761749002740195e-05,
485
- "loss": 1.126,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 1.0276073619631902,
490
- "grad_norm": 2.532948886360268,
491
  "learning_rate": 1.662917081152932e-05,
492
- "loss": 1.0517,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 1.0429447852760736,
497
- "grad_norm": 2.159362066124978,
498
  "learning_rate": 1.6494480483301836e-05,
499
- "loss": 0.9849,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 1.058282208588957,
504
- "grad_norm": 1.9634133830158156,
505
  "learning_rate": 1.635772093208669e-05,
506
- "loss": 0.961,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 1.0736196319018405,
511
- "grad_norm": 2.643597563770428,
512
  "learning_rate": 1.6218935731193223e-05,
513
- "loss": 0.929,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 1.0889570552147239,
518
- "grad_norm": 2.2336296848587986,
519
  "learning_rate": 1.6078169099328196e-05,
520
- "loss": 0.9695,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 1.1042944785276074,
525
- "grad_norm": 2.5120125840182803,
526
  "learning_rate": 1.5935465886507143e-05,
527
- "loss": 1.1179,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 1.1196319018404908,
532
- "grad_norm": 2.0896191655801735,
533
  "learning_rate": 1.579087155976459e-05,
534
- "loss": 1.1294,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 1.1349693251533743,
539
- "grad_norm": 2.325319374896529,
540
  "learning_rate": 1.5644432188667695e-05,
541
- "loss": 0.9826,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 1.1503067484662577,
546
- "grad_norm": 2.1728488841214095,
547
  "learning_rate": 1.5496194430637903e-05,
548
- "loss": 0.9251,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 1.165644171779141,
553
- "grad_norm": 2.424880159324192,
554
  "learning_rate": 1.5346205516085305e-05,
555
- "loss": 1.0463,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 1.1809815950920246,
560
- "grad_norm": 3.026265515790728,
561
  "learning_rate": 1.5194513233360439e-05,
562
- "loss": 0.9217,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 1.196319018404908,
567
- "grad_norm": 3.1617293471411654,
568
  "learning_rate": 1.504116591352832e-05,
569
- "loss": 1.1437,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 1.2116564417177913,
574
- "grad_norm": 2.2307237557720003,
575
  "learning_rate": 1.4886212414969551e-05,
576
- "loss": 0.967,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 1.2269938650306749,
581
- "grad_norm": 2.430911245376782,
582
  "learning_rate": 1.4729702107813438e-05,
583
- "loss": 0.999,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 1.2423312883435582,
588
- "grad_norm": 2.1612231748678896,
589
  "learning_rate": 1.4571684858208045e-05,
590
- "loss": 1.004,
591
  "step": 405
592
  },
593
  {
594
  "epoch": 1.2576687116564418,
595
- "grad_norm": 2.6596296516845563,
596
  "learning_rate": 1.4412211012432213e-05,
597
- "loss": 1.0454,
598
  "step": 410
599
  },
600
  {
601
  "epoch": 1.2730061349693251,
602
- "grad_norm": 2.093932365865954,
603
  "learning_rate": 1.4251331380854602e-05,
604
- "loss": 1.0569,
605
  "step": 415
606
  },
607
  {
608
  "epoch": 1.2883435582822087,
609
- "grad_norm": 2.72153245614344,
610
  "learning_rate": 1.408909722174487e-05,
611
- "loss": 0.9593,
612
  "step": 420
613
  },
614
  {
615
  "epoch": 1.303680981595092,
616
- "grad_norm": 2.6462362091450515,
617
  "learning_rate": 1.3925560224942145e-05,
618
- "loss": 0.9271,
619
  "step": 425
620
  },
621
  {
622
  "epoch": 1.3190184049079754,
623
- "grad_norm": 2.267205865998314,
624
  "learning_rate": 1.3760772495385998e-05,
625
- "loss": 0.9582,
626
  "step": 430
627
  },
628
  {
629
  "epoch": 1.334355828220859,
630
- "grad_norm": 14.299415996684987,
631
  "learning_rate": 1.3594786536515154e-05,
632
- "loss": 0.9659,
633
  "step": 435
634
  },
635
  {
636
  "epoch": 1.3496932515337423,
637
- "grad_norm": 2.8919289960885157,
638
  "learning_rate": 1.3427655233539227e-05,
639
- "loss": 0.922,
640
  "step": 440
641
  },
642
  {
643
  "epoch": 1.3650306748466257,
644
- "grad_norm": 2.0424273794301344,
645
  "learning_rate": 1.3259431836588843e-05,
646
- "loss": 0.9932,
647
  "step": 445
648
  },
649
  {
650
  "epoch": 1.3803680981595092,
651
- "grad_norm": 6.610636759407049,
652
  "learning_rate": 1.3090169943749475e-05,
653
- "loss": 0.9837,
654
  "step": 450
655
  },
656
  {
657
  "epoch": 1.3957055214723926,
658
- "grad_norm": 2.2497868172019357,
659
  "learning_rate": 1.2919923483984415e-05,
660
- "loss": 0.9149,
661
  "step": 455
662
  },
663
  {
664
  "epoch": 1.4110429447852761,
665
- "grad_norm": 2.2842331908069125,
666
  "learning_rate": 1.2748746699952338e-05,
667
- "loss": 1.0342,
668
  "step": 460
669
  },
670
  {
671
  "epoch": 1.4263803680981595,
672
- "grad_norm": 2.1102577421193085,
673
  "learning_rate": 1.2576694130724905e-05,
674
- "loss": 1.1568,
675
  "step": 465
676
  },
677
  {
678
  "epoch": 1.441717791411043,
679
- "grad_norm": 2.127882920435348,
680
  "learning_rate": 1.2403820594409926e-05,
681
- "loss": 0.9638,
682
  "step": 470
683
  },
684
  {
685
  "epoch": 1.4570552147239264,
686
- "grad_norm": 2.584359793762212,
687
  "learning_rate": 1.2230181170685636e-05,
688
- "loss": 0.9673,
689
  "step": 475
690
  },
691
  {
692
  "epoch": 1.4723926380368098,
693
- "grad_norm": 2.264911154520169,
694
  "learning_rate": 1.2055831183251608e-05,
695
- "loss": 1.0154,
696
  "step": 480
697
  },
698
  {
699
  "epoch": 1.4877300613496933,
700
- "grad_norm": 2.0087330179243335,
701
  "learning_rate": 1.1880826182201926e-05,
702
- "loss": 0.8973,
703
  "step": 485
704
  },
705
  {
706
  "epoch": 1.5030674846625767,
707
- "grad_norm": 1.9321511470889676,
708
  "learning_rate": 1.170522192632624e-05,
709
- "loss": 0.9371,
710
  "step": 490
711
  },
712
  {
713
  "epoch": 1.51840490797546,
714
- "grad_norm": 2.121122226050571,
715
  "learning_rate": 1.1529074365344302e-05,
716
- "loss": 1.033,
717
  "step": 495
718
  },
719
  {
720
  "epoch": 1.5337423312883436,
721
- "grad_norm": 2.029011447995027,
722
  "learning_rate": 1.1352439622079689e-05,
723
- "loss": 0.9768,
724
  "step": 500
725
  },
726
  {
727
  "epoch": 1.5490797546012272,
728
- "grad_norm": 2.108744801443365,
729
  "learning_rate": 1.1175373974578378e-05,
730
- "loss": 0.9089,
731
  "step": 505
732
  },
733
  {
734
  "epoch": 1.5644171779141103,
735
- "grad_norm": 2.2204587915399876,
736
  "learning_rate": 1.0997933838177828e-05,
737
- "loss": 1.0449,
738
  "step": 510
739
  },
740
  {
741
  "epoch": 1.5797546012269938,
742
- "grad_norm": 2.1535325181232294,
743
  "learning_rate": 1.0820175747532373e-05,
744
- "loss": 1.0172,
745
  "step": 515
746
  },
747
  {
748
  "epoch": 1.5950920245398774,
749
- "grad_norm": 21.819229878115912,
750
  "learning_rate": 1.064215633860055e-05,
751
- "loss": 0.9769,
752
  "step": 520
753
  },
754
  {
755
  "epoch": 1.6104294478527608,
756
- "grad_norm": 2.1296751297142897,
757
  "learning_rate": 1.0463932330600197e-05,
758
- "loss": 0.9483,
759
  "step": 525
760
  },
761
  {
762
  "epoch": 1.6257668711656441,
763
- "grad_norm": 2.1616752600527134,
764
  "learning_rate": 1.0285560507936962e-05,
765
- "loss": 4.3216,
766
  "step": 530
767
  },
768
  {
769
  "epoch": 1.6411042944785277,
770
- "grad_norm": 3.2786842755698795,
771
  "learning_rate": 1.010709770211212e-05,
772
- "loss": 1.1255,
773
  "step": 535
774
  },
775
  {
776
  "epoch": 1.656441717791411,
777
- "grad_norm": 2.4763448399092147,
778
  "learning_rate": 9.928600773615306e-06,
779
- "loss": 1.0498,
780
  "step": 540
781
  },
782
  {
783
  "epoch": 1.6717791411042944,
784
- "grad_norm": 2.006782735936947,
785
  "learning_rate": 9.750126593808083e-06,
786
- "loss": 0.927,
787
  "step": 545
788
  },
789
  {
790
  "epoch": 1.687116564417178,
791
- "grad_norm": 2.2437759465461777,
792
  "learning_rate": 9.571732026803978e-06,
793
- "loss": 0.906,
794
  "step": 550
795
  },
796
  {
797
  "epoch": 1.7024539877300615,
798
- "grad_norm": 1.9556898555009257,
799
  "learning_rate": 9.393473911350895e-06,
800
- "loss": 0.9715,
801
  "step": 555
802
  },
803
  {
804
  "epoch": 1.7177914110429446,
805
- "grad_norm": 2.082737830638052,
806
  "learning_rate": 9.215409042721553e-06,
807
- "loss": 1.0,
808
  "step": 560
809
  },
810
  {
811
  "epoch": 1.7331288343558282,
812
- "grad_norm": 2.3756819460619534,
813
  "learning_rate": 9.037594154617811e-06,
814
- "loss": 0.9755,
815
  "step": 565
816
  },
817
  {
818
  "epoch": 1.7484662576687118,
819
- "grad_norm": 2.7832428710692265,
820
  "learning_rate": 8.860085901094595e-06,
821
- "loss": 0.9694,
822
  "step": 570
823
  },
824
  {
825
  "epoch": 1.7638036809815951,
826
- "grad_norm": 2.299830262573217,
827
  "learning_rate": 8.682940838509206e-06,
828
- "loss": 0.8829,
829
  "step": 575
830
  },
831
  {
832
  "epoch": 1.7791411042944785,
833
- "grad_norm": 1.9164299833335463,
834
  "learning_rate": 8.50621540750175e-06,
835
- "loss": 0.9634,
836
  "step": 580
837
  },
838
  {
839
  "epoch": 1.794478527607362,
840
- "grad_norm": 2.0803428687947023,
841
  "learning_rate": 8.329965915012451e-06,
842
- "loss": 1.0047,
843
  "step": 585
844
  },
845
  {
846
  "epoch": 1.8098159509202454,
847
- "grad_norm": 1.961542308344144,
848
  "learning_rate": 8.154248516341547e-06,
849
- "loss": 0.9783,
850
  "step": 590
851
  },
852
  {
853
  "epoch": 1.8251533742331287,
854
- "grad_norm": 1.98951654932812,
855
  "learning_rate": 7.979119197257505e-06,
856
- "loss": 1.0009,
857
  "step": 595
858
  },
859
  {
860
  "epoch": 1.8404907975460123,
861
- "grad_norm": 2.1181463512012653,
862
  "learning_rate": 7.804633756159258e-06,
863
- "loss": 0.9263,
864
  "step": 600
865
  },
866
  {
867
  "epoch": 1.8558282208588959,
868
- "grad_norm": 2.017971907316727,
869
  "learning_rate": 7.63084778629813e-06,
870
- "loss": 0.9933,
871
  "step": 605
872
  },
873
  {
874
  "epoch": 1.871165644171779,
875
- "grad_norm": 2.1164163856970135,
876
  "learning_rate": 7.4578166580651335e-06,
877
- "loss": 0.9392,
878
  "step": 610
879
  },
880
  {
881
  "epoch": 1.8865030674846626,
882
- "grad_norm": 2.118557747327772,
883
  "learning_rate": 7.285595501349259e-06,
884
- "loss": 0.9537,
885
  "step": 615
886
  },
887
  {
888
  "epoch": 1.9018404907975461,
889
- "grad_norm": 2.087163676278093,
890
  "learning_rate": 7.114239187972416e-06,
891
- "loss": 0.8722,
892
  "step": 620
893
  },
894
  {
895
  "epoch": 1.9171779141104295,
896
- "grad_norm": 2.222443842922782,
897
  "learning_rate": 6.94380231420656e-06,
898
- "loss": 0.9518,
899
  "step": 625
900
  },
901
  {
902
  "epoch": 1.9325153374233128,
903
- "grad_norm": 2.0519691059771503,
904
  "learning_rate": 6.774339183378663e-06,
905
- "loss": 0.9595,
906
  "step": 630
907
  },
908
  {
909
  "epoch": 1.9478527607361964,
910
- "grad_norm": 2.395088570054642,
911
  "learning_rate": 6.605903788568962e-06,
912
- "loss": 0.9082,
913
  "step": 635
914
  },
915
  {
916
  "epoch": 1.9631901840490797,
917
- "grad_norm": 4.89604575465679,
918
  "learning_rate": 6.438549795408107e-06,
919
- "loss": 0.9101,
920
  "step": 640
921
  },
922
  {
923
  "epoch": 1.978527607361963,
924
- "grad_norm": 2.1171931357011236,
925
  "learning_rate": 6.272330524978613e-06,
926
- "loss": 0.9687,
927
  "step": 645
928
  },
929
  {
930
  "epoch": 1.9938650306748467,
931
- "grad_norm": 2.0986919385384137,
932
  "learning_rate": 6.107298936826086e-06,
933
- "loss": 0.8841,
934
  "step": 650
935
  },
936
  {
937
  "epoch": 2.0,
938
- "eval_loss": 1.2134720087051392,
939
- "eval_runtime": 7.1411,
940
- "eval_samples_per_second": 22.266,
941
- "eval_steps_per_second": 5.601,
942
  "step": 652
943
  },
944
  {
945
  "epoch": 2.0092024539877302,
946
- "grad_norm": 2.1628044565671556,
947
  "learning_rate": 5.943507612085661e-06,
948
- "loss": 0.8251,
949
  "step": 655
950
  },
951
  {
952
  "epoch": 2.0245398773006134,
953
- "grad_norm": 1.980162767088004,
954
  "learning_rate": 5.781008736728975e-06,
955
- "loss": 0.6487,
956
  "step": 660
957
  },
958
  {
959
  "epoch": 2.039877300613497,
960
- "grad_norm": 2.862228737237209,
961
  "learning_rate": 5.619854084937085e-06,
962
- "loss": 0.6706,
963
  "step": 665
964
  },
965
  {
966
  "epoch": 2.0552147239263805,
967
- "grad_norm": 2.2439249118077136,
968
  "learning_rate": 5.460095002604533e-06,
969
- "loss": 0.6652,
970
  "step": 670
971
  },
972
  {
973
  "epoch": 2.0705521472392636,
974
- "grad_norm": 2.254257809075137,
975
  "learning_rate": 5.3017823909799295e-06,
976
- "loss": 0.7293,
977
  "step": 675
978
  },
979
  {
980
  "epoch": 2.085889570552147,
981
- "grad_norm": 2.223082549239095,
982
  "learning_rate": 5.144966690448159e-06,
983
- "loss": 0.755,
984
  "step": 680
985
  },
986
  {
987
  "epoch": 2.1012269938650308,
988
- "grad_norm": 2.0645657168776617,
989
  "learning_rate": 4.9896978644594516e-06,
990
- "loss": 0.5896,
991
  "step": 685
992
  },
993
  {
994
  "epoch": 2.116564417177914,
995
- "grad_norm": 2.543974903041173,
996
  "learning_rate": 4.836025383610382e-06,
997
- "loss": 0.7457,
998
  "step": 690
999
  },
1000
  {
1001
  "epoch": 2.1319018404907975,
1002
- "grad_norm": 2.1096988101915524,
1003
  "learning_rate": 4.683998209881943e-06,
1004
- "loss": 0.6847,
1005
  "step": 695
1006
  },
1007
  {
1008
  "epoch": 2.147239263803681,
1009
- "grad_norm": 2.36099194700885,
1010
  "learning_rate": 4.533664781039622e-06,
1011
- "loss": 0.715,
1012
  "step": 700
1013
  },
1014
  {
1015
  "epoch": 2.1625766871165646,
1016
- "grad_norm": 2.633833128566785,
1017
  "learning_rate": 4.385072995200532e-06,
1018
- "loss": 0.626,
1019
  "step": 705
1020
  },
1021
  {
1022
  "epoch": 2.1779141104294477,
1023
- "grad_norm": 2.506530086487105,
1024
  "learning_rate": 4.2382701955724724e-06,
1025
- "loss": 0.7172,
1026
  "step": 710
1027
  },
1028
  {
1029
  "epoch": 2.1932515337423313,
1030
- "grad_norm": 2.3802287660865553,
1031
  "learning_rate": 4.093303155369771e-06,
1032
- "loss": 0.6839,
1033
  "step": 715
1034
  },
1035
  {
1036
  "epoch": 2.208588957055215,
1037
- "grad_norm": 2.24220538972355,
1038
  "learning_rate": 3.950218062910776e-06,
1039
- "loss": 0.5827,
1040
  "step": 720
1041
  },
1042
  {
1043
  "epoch": 2.223926380368098,
1044
- "grad_norm": 2.2370918425887267,
1045
  "learning_rate": 3.8090605069016596e-06,
1046
- "loss": 0.6599,
1047
  "step": 725
1048
  },
1049
  {
1050
  "epoch": 2.2392638036809815,
1051
- "grad_norm": 2.1088652822907448,
1052
  "learning_rate": 3.6698754619112974e-06,
1053
- "loss": 0.7594,
1054
  "step": 730
1055
  },
1056
  {
1057
  "epoch": 2.254601226993865,
1058
- "grad_norm": 2.083592158309122,
1059
  "learning_rate": 3.53270727404179e-06,
1060
- "loss": 0.6702,
1061
  "step": 735
1062
  },
1063
  {
1064
  "epoch": 2.2699386503067487,
1065
- "grad_norm": 2.567792989862267,
1066
  "learning_rate": 3.3975996467992557e-06,
1067
- "loss": 0.6419,
1068
  "step": 740
1069
  },
1070
  {
1071
  "epoch": 2.285276073619632,
1072
- "grad_norm": 2.334913734729685,
1073
  "learning_rate": 3.2645956271693257e-06,
1074
- "loss": 0.6634,
1075
  "step": 745
1076
  },
1077
  {
1078
  "epoch": 2.3006134969325154,
1079
- "grad_norm": 2.3429523817504716,
1080
  "learning_rate": 3.133737591901864e-06,
1081
- "loss": 0.6939,
1082
  "step": 750
1083
  },
1084
  {
1085
  "epoch": 2.315950920245399,
1086
- "grad_norm": 2.2619584916199615,
1087
  "learning_rate": 3.0050672340091723e-06,
1088
- "loss": 0.6678,
1089
  "step": 755
1090
  },
1091
  {
1092
  "epoch": 2.331288343558282,
1093
- "grad_norm": 2.6296479257995777,
1094
  "learning_rate": 2.878625549482084e-06,
1095
- "loss": 0.7031,
1096
  "step": 760
1097
  },
1098
  {
1099
  "epoch": 2.3466257668711656,
1100
- "grad_norm": 2.8130762384906864,
1101
  "learning_rate": 2.7544528242281323e-06,
1102
- "loss": 0.5788,
1103
  "step": 765
1104
  },
1105
  {
1106
  "epoch": 2.361963190184049,
1107
- "grad_norm": 2.150686530040835,
1108
  "learning_rate": 2.6325886212359496e-06,
1109
- "loss": 0.6077,
1110
  "step": 770
1111
  },
1112
  {
1113
  "epoch": 2.3773006134969323,
1114
- "grad_norm": 2.2806678502354476,
1115
  "learning_rate": 2.51307176797001e-06,
1116
- "loss": 0.7225,
1117
  "step": 775
1118
  },
1119
  {
1120
  "epoch": 2.392638036809816,
1121
- "grad_norm": 2.257367372279919,
1122
  "learning_rate": 2.395940343999691e-06,
1123
- "loss": 0.66,
1124
  "step": 780
1125
  },
1126
  {
1127
  "epoch": 2.4079754601226995,
1128
- "grad_norm": 2.105998185034246,
1129
  "learning_rate": 2.2812316688666735e-06,
1130
- "loss": 0.5642,
1131
  "step": 785
1132
  },
1133
  {
1134
  "epoch": 2.4233128834355826,
1135
- "grad_norm": 2.6751693267999164,
1136
  "learning_rate": 2.1689822901944456e-06,
1137
- "loss": 0.6999,
1138
  "step": 790
1139
  },
1140
  {
1141
  "epoch": 2.438650306748466,
1142
- "grad_norm": 2.337986334515246,
1143
  "learning_rate": 2.0592279720437856e-06,
1144
- "loss": 0.6405,
1145
  "step": 795
1146
  },
1147
  {
1148
  "epoch": 2.4539877300613497,
1149
- "grad_norm": 2.5090481285811026,
1150
  "learning_rate": 1.9520036835178667e-06,
1151
- "loss": 0.7335,
1152
  "step": 800
1153
  },
1154
  {
1155
  "epoch": 2.4693251533742333,
1156
- "grad_norm": 2.222070003754594,
1157
  "learning_rate": 1.8473435876206792e-06,
1158
- "loss": 0.6446,
1159
  "step": 805
1160
  },
1161
  {
1162
  "epoch": 2.4846625766871164,
1163
- "grad_norm": 2.3534318368821467,
1164
  "learning_rate": 1.74528103037226e-06,
1165
- "loss": 0.7604,
1166
  "step": 810
1167
  },
1168
  {
1169
  "epoch": 2.5,
1170
- "grad_norm": 2.1957922076650225,
1171
  "learning_rate": 1.645848530184233e-06,
1172
- "loss": 0.6602,
1173
  "step": 815
1174
  },
1175
  {
1176
  "epoch": 2.5153374233128836,
1177
- "grad_norm": 2.248469614521019,
1178
  "learning_rate": 1.5490777674990376e-06,
1179
- "loss": 0.7518,
1180
  "step": 820
1181
  },
1182
  {
1183
  "epoch": 2.530674846625767,
1184
- "grad_norm": 2.358970858496667,
1185
  "learning_rate": 1.4549995746961332e-06,
1186
- "loss": 0.6974,
1187
  "step": 825
1188
  },
1189
  {
1190
  "epoch": 2.5460122699386503,
1191
- "grad_norm": 2.409137169378918,
1192
  "learning_rate": 1.3636439262684299e-06,
1193
- "loss": 0.6054,
1194
  "step": 830
1195
  },
1196
  {
1197
  "epoch": 2.561349693251534,
1198
- "grad_norm": 2.3008344145258275,
1199
  "learning_rate": 1.2750399292720284e-06,
1200
- "loss": 0.6626,
1201
  "step": 835
1202
  },
1203
  {
1204
  "epoch": 2.5766871165644174,
1205
- "grad_norm": 2.2498393839801403,
1206
  "learning_rate": 1.1892158140523546e-06,
1207
- "loss": 0.6923,
1208
  "step": 840
1209
  },
1210
  {
1211
  "epoch": 2.5920245398773005,
1212
- "grad_norm": 2.287309751844316,
1213
  "learning_rate": 1.1061989252496053e-06,
1214
- "loss": 0.6596,
1215
  "step": 845
1216
  },
1217
  {
1218
  "epoch": 2.607361963190184,
1219
- "grad_norm": 3.2190551280360937,
1220
  "learning_rate": 1.0260157130864178e-06,
1221
- "loss": 0.6771,
1222
  "step": 850
1223
  },
1224
  {
1225
  "epoch": 2.6226993865030677,
1226
- "grad_norm": 2.1948501519870782,
1227
  "learning_rate": 9.486917249404815e-07,
1228
- "loss": 0.6886,
1229
  "step": 855
1230
  },
1231
  {
1232
  "epoch": 2.638036809815951,
1233
- "grad_norm": 2.1803684507178955,
1234
  "learning_rate": 8.742515972048404e-07,
1235
- "loss": 0.7121,
1236
  "step": 860
1237
  },
1238
  {
1239
  "epoch": 2.6533742331288344,
1240
- "grad_norm": 2.089857540368277,
1241
  "learning_rate": 8.027190474384127e-07,
1242
- "loss": 0.5654,
1243
  "step": 865
1244
  },
1245
  {
1246
  "epoch": 2.668711656441718,
1247
- "grad_norm": 2.900973015953986,
1248
  "learning_rate": 7.341168668092857e-07,
1249
- "loss": 0.5899,
1250
  "step": 870
1251
  },
1252
  {
1253
  "epoch": 2.684049079754601,
1254
- "grad_norm": 2.269220220981122,
1255
  "learning_rate": 6.684669128331655e-07,
1256
- "loss": 0.7626,
1257
  "step": 875
1258
  },
1259
  {
1260
  "epoch": 2.6993865030674846,
1261
- "grad_norm": 2.177246970471886,
1262
  "learning_rate": 6.057901024092949e-07,
1263
- "loss": 0.6272,
1264
  "step": 880
1265
  },
1266
  {
1267
  "epoch": 2.714723926380368,
1268
- "grad_norm": 2.265583420009415,
1269
  "learning_rate": 5.461064051560705e-07,
1270
- "loss": 0.7539,
1271
  "step": 885
1272
  },
1273
  {
1274
  "epoch": 2.7300613496932513,
1275
- "grad_norm": 2.097337573364251,
1276
  "learning_rate": 4.894348370484648e-07,
1277
- "loss": 0.6404,
1278
  "step": 890
1279
  },
1280
  {
1281
  "epoch": 2.745398773006135,
1282
- "grad_norm": 2.6099811023622483,
1283
  "learning_rate": 4.3579345435930454e-07,
1284
- "loss": 0.781,
1285
  "step": 895
1286
  },
1287
  {
1288
  "epoch": 2.7607361963190185,
1289
- "grad_norm": 2.2217027523814155,
1290
  "learning_rate": 3.851993479063154e-07,
1291
- "loss": 0.7015,
1292
  "step": 900
1293
  },
1294
  {
1295
  "epoch": 2.7760736196319016,
1296
- "grad_norm": 2.325907671442105,
1297
  "learning_rate": 3.3766863760676947e-07,
1298
- "loss": 0.6844,
1299
  "step": 905
1300
  },
1301
  {
1302
  "epoch": 2.791411042944785,
1303
- "grad_norm": 2.264535197628352,
1304
  "learning_rate": 2.93216467341475e-07,
1305
- "loss": 0.6508,
1306
  "step": 910
1307
  },
1308
  {
1309
  "epoch": 2.8067484662576687,
1310
- "grad_norm": 2.4713808636104533,
1311
  "learning_rate": 2.5185700012975603e-07,
1312
- "loss": 0.7762,
1313
  "step": 915
1314
  },
1315
  {
1316
  "epoch": 2.8220858895705523,
1317
- "grad_norm": 2.402679896607673,
1318
  "learning_rate": 2.1360341361692517e-07,
1319
- "loss": 0.6639,
1320
  "step": 920
1321
  },
1322
  {
1323
  "epoch": 2.837423312883436,
1324
- "grad_norm": 2.179790948612016,
1325
  "learning_rate": 1.784678958757291e-07,
1326
- "loss": 0.65,
1327
  "step": 925
1328
  },
1329
  {
1330
  "epoch": 2.852760736196319,
1331
- "grad_norm": 2.076797574324857,
1332
  "learning_rate": 1.464616415230702e-07,
1333
- "loss": 0.6342,
1334
  "step": 930
1335
  },
1336
  {
1337
  "epoch": 2.8680981595092025,
1338
- "grad_norm": 2.1047162156910804,
1339
  "learning_rate": 1.1759484815326294e-07,
1340
- "loss": 0.6047,
1341
  "step": 935
1342
  },
1343
  {
1344
  "epoch": 2.883435582822086,
1345
- "grad_norm": 2.1660216273789237,
1346
  "learning_rate": 9.187671308895418e-08,
1347
- "loss": 0.6571,
1348
  "step": 940
1349
  },
1350
  {
1351
  "epoch": 2.8987730061349692,
1352
- "grad_norm": 2.713364435220778,
1353
  "learning_rate": 6.931543045073708e-08,
1354
- "loss": 0.8338,
1355
  "step": 945
1356
  },
1357
  {
1358
  "epoch": 2.914110429447853,
1359
- "grad_norm": 2.6713007123159724,
1360
  "learning_rate": 4.991818854640396e-08,
1361
- "loss": 0.6373,
1362
  "step": 950
1363
  },
1364
  {
1365
  "epoch": 2.9294478527607364,
1366
- "grad_norm": 2.1069338794602723,
1367
  "learning_rate": 3.369116758066171e-08,
1368
- "loss": 0.6385,
1369
  "step": 955
1370
  },
1371
  {
1372
  "epoch": 2.9447852760736195,
1373
- "grad_norm": 2.312520932069723,
1374
  "learning_rate": 2.063953768603799e-08,
1375
- "loss": 0.7596,
1376
  "step": 960
1377
  },
1378
  {
1379
  "epoch": 2.960122699386503,
1380
- "grad_norm": 2.1046770032748077,
1381
  "learning_rate": 1.0767457275615567e-08,
1382
- "loss": 0.634,
1383
  "step": 965
1384
  },
1385
  {
1386
  "epoch": 2.9754601226993866,
1387
- "grad_norm": 2.207645312141143,
1388
  "learning_rate": 4.0780717181077015e-09,
1389
- "loss": 0.7555,
1390
  "step": 970
1391
  },
1392
  {
1393
  "epoch": 2.9907975460122698,
1394
- "grad_norm": 2.457929371194042,
1395
  "learning_rate": 5.735123357042405e-10,
1396
- "loss": 0.7012,
1397
  "step": 975
1398
  },
1399
  {
1400
  "epoch": 3.0,
1401
- "eval_loss": 1.3543522357940674,
1402
- "eval_runtime": 6.5158,
1403
- "eval_samples_per_second": 24.402,
1404
- "eval_steps_per_second": 6.139,
1405
  "step": 978
1406
  },
1407
  {
1408
  "epoch": 3.0,
1409
  "step": 978,
1410
  "total_flos": 3642602029056.0,
1411
- "train_loss": 1.0360665048314506,
1412
- "train_runtime": 586.176,
1413
- "train_samples_per_second": 6.674,
1414
- "train_steps_per_second": 1.668
1415
  }
1416
  ],
1417
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003067484662576687,
13
+ "grad_norm": 177.21839775031285,
14
  "learning_rate": 2.0408163265306121e-07,
15
  "loss": 3.0795,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.015337423312883436,
20
+ "grad_norm": 81.30898025729103,
21
  "learning_rate": 1.0204081632653063e-06,
22
+ "loss": 2.7402,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03067484662576687,
27
+ "grad_norm": 107.55431230634855,
28
  "learning_rate": 2.0408163265306125e-06,
29
+ "loss": 2.5637,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.046012269938650305,
34
+ "grad_norm": 17.843083480868863,
35
  "learning_rate": 3.0612244897959185e-06,
36
+ "loss": 2.3819,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.06134969325153374,
41
+ "grad_norm": 5.541698118155843,
42
  "learning_rate": 4.081632653061225e-06,
43
+ "loss": 1.9572,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.07668711656441718,
48
+ "grad_norm": 10.409288444104131,
49
  "learning_rate": 5.1020408163265315e-06,
50
+ "loss": 1.8593,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09202453987730061,
55
+ "grad_norm": 5.082002576187776,
56
  "learning_rate": 6.122448979591837e-06,
57
+ "loss": 1.6901,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.10736196319018405,
62
+ "grad_norm": 6.765872287855098,
63
  "learning_rate": 7.1428571428571436e-06,
64
+ "loss": 1.6053,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.12269938650306748,
69
+ "grad_norm": 11.409653818105163,
70
  "learning_rate": 8.16326530612245e-06,
71
+ "loss": 1.5093,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.13803680981595093,
76
+ "grad_norm": 6.4162354324510895,
77
  "learning_rate": 9.183673469387756e-06,
78
+ "loss": 1.5069,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.15337423312883436,
83
+ "grad_norm": 3.7675519341487136,
84
  "learning_rate": 1.0204081632653063e-05,
85
+ "loss": 1.575,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.1687116564417178,
90
+ "grad_norm": 4.369929690874893,
91
  "learning_rate": 1.1224489795918367e-05,
92
+ "loss": 1.464,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.18404907975460122,
97
+ "grad_norm": 5.732870449440104,
98
  "learning_rate": 1.2244897959183674e-05,
99
+ "loss": 1.4131,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.19938650306748465,
104
+ "grad_norm": 2.9718012621338654,
105
  "learning_rate": 1.326530612244898e-05,
106
+ "loss": 1.3903,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.2147239263803681,
111
+ "grad_norm": 2.986883686963562,
112
  "learning_rate": 1.4285714285714287e-05,
113
+ "loss": 1.3699,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.23006134969325154,
118
+ "grad_norm": 3.0609005774260116,
119
  "learning_rate": 1.530612244897959e-05,
120
+ "loss": 1.4099,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.24539877300613497,
125
+ "grad_norm": 3.391633696280962,
126
  "learning_rate": 1.63265306122449e-05,
127
+ "loss": 1.4246,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.2607361963190184,
132
+ "grad_norm": 2.446544689335844,
133
  "learning_rate": 1.7346938775510206e-05,
134
+ "loss": 1.3225,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.27607361963190186,
139
+ "grad_norm": 2.892905300954704,
140
  "learning_rate": 1.836734693877551e-05,
141
+ "loss": 1.4131,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.29141104294478526,
146
+ "grad_norm": 2.7276895053980588,
147
  "learning_rate": 1.9387755102040817e-05,
148
+ "loss": 1.2981,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.3067484662576687,
153
+ "grad_norm": 3.1096897823921212,
154
  "learning_rate": 1.9999745104274995e-05,
155
+ "loss": 1.2867,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.3220858895705521,
160
+ "grad_norm": 2.6017423936904605,
161
  "learning_rate": 1.9996877676598733e-05,
162
+ "loss": 1.3923,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.3374233128834356,
167
+ "grad_norm": 2.357782746709104,
168
  "learning_rate": 1.9990825118233958e-05,
169
+ "loss": 1.2889,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.35276073619631904,
174
+ "grad_norm": 4.4128065982273155,
175
  "learning_rate": 1.9981589357601727e-05,
176
+ "loss": 1.4107,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.36809815950920244,
181
+ "grad_norm": 3.425193387107373,
182
  "learning_rate": 1.9969173337331283e-05,
183
+ "loss": 1.3417,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.3834355828220859,
188
+ "grad_norm": 2.8732197303122518,
189
  "learning_rate": 1.9953581013322503e-05,
190
+ "loss": 1.3121,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.3987730061349693,
195
+ "grad_norm": 3.121331048192887,
196
  "learning_rate": 1.99348173534855e-05,
197
+ "loss": 1.2892,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.41411042944785276,
202
+ "grad_norm": 2.340124072432182,
203
  "learning_rate": 1.9912888336157793e-05,
204
+ "loss": 1.3019,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.4294478527607362,
209
+ "grad_norm": 2.5077068756467393,
210
  "learning_rate": 1.9887800948199496e-05,
211
+ "loss": 1.2821,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.4447852760736196,
216
+ "grad_norm": 5.649859070977053,
217
  "learning_rate": 1.9859563182767268e-05,
218
+ "loss": 1.2652,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.4601226993865031,
223
+ "grad_norm": 2.852076057226547,
224
  "learning_rate": 1.9828184036767556e-05,
225
+ "loss": 1.2215,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.4754601226993865,
230
+ "grad_norm": 2.3152692963670853,
231
  "learning_rate": 1.9793673507990086e-05,
232
+ "loss": 1.2821,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.49079754601226994,
237
+ "grad_norm": 2.9501006805800682,
238
  "learning_rate": 1.9756042591922436e-05,
239
+ "loss": 1.3246,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.5061349693251533,
244
+ "grad_norm": 2.208339912352479,
245
  "learning_rate": 1.9715303278246724e-05,
246
+ "loss": 1.3105,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.5214723926380368,
251
+ "grad_norm": 2.377033071477205,
252
  "learning_rate": 1.9671468547019575e-05,
253
+ "loss": 1.2158,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.5368098159509203,
258
+ "grad_norm": 2.2769169863909706,
259
  "learning_rate": 1.9624552364536472e-05,
260
+ "loss": 1.3063,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.5521472392638037,
265
+ "grad_norm": 3.488802301164089,
266
  "learning_rate": 1.9574569678881965e-05,
267
+ "loss": 1.3641,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.5674846625766872,
272
+ "grad_norm": 3.2567021860207923,
273
  "learning_rate": 1.952153641516698e-05,
274
+ "loss": 1.1764,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.5828220858895705,
279
+ "grad_norm": 2.5611122809445117,
280
  "learning_rate": 1.94654694704549e-05,
281
+ "loss": 1.187,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.598159509202454,
286
+ "grad_norm": 3.2553565189736906,
287
  "learning_rate": 1.9406386708377956e-05,
288
+ "loss": 1.3129,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.6134969325153374,
293
+ "grad_norm": 2.866192118065629,
294
  "learning_rate": 1.9344306953445632e-05,
295
+ "loss": 1.3816,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.6288343558282209,
300
+ "grad_norm": 2.729878771251503,
301
  "learning_rate": 1.9279249985046948e-05,
302
+ "loss": 1.3208,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.6441717791411042,
307
+ "grad_norm": 2.1585127050514115,
308
  "learning_rate": 1.92112365311485e-05,
309
+ "loss": 1.2483,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.6595092024539877,
314
+ "grad_norm": 2.884411746638962,
315
  "learning_rate": 1.9140288261690278e-05,
316
+ "loss": 1.2221,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.6748466257668712,
321
+ "grad_norm": 2.3400822227419704,
322
  "learning_rate": 1.9066427781681314e-05,
323
+ "loss": 1.1909,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.6901840490797546,
328
+ "grad_norm": 2.2424426077461224,
329
  "learning_rate": 1.8989678623997506e-05,
330
+ "loss": 1.2427,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.7055214723926381,
335
+ "grad_norm": 2.3532187548733847,
336
  "learning_rate": 1.891006524188368e-05,
337
+ "loss": 1.3086,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.7208588957055214,
342
+ "grad_norm": 2.1575864354340557,
343
  "learning_rate": 1.8827613001162534e-05,
344
+ "loss": 1.2051,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.7361963190184049,
349
+ "grad_norm": 2.2528114135340807,
350
  "learning_rate": 1.8742348172152728e-05,
351
+ "loss": 1.3883,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.7515337423312883,
356
+ "grad_norm": 2.20955115974882,
357
  "learning_rate": 1.8654297921298862e-05,
358
+ "loss": 1.2169,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.7668711656441718,
363
+ "grad_norm": 2.1026008601865933,
364
  "learning_rate": 1.856349030251589e-05,
365
+ "loss": 1.2356,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.7822085889570553,
370
+ "grad_norm": 4.721033676079731,
371
  "learning_rate": 1.846995424825079e-05,
372
+ "loss": 1.1575,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.7975460122699386,
377
+ "grad_norm": 2.1852216414908976,
378
  "learning_rate": 1.837371956026433e-05,
379
+ "loss": 1.2681,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.8128834355828221,
384
+ "grad_norm": 2.4315513970788682,
385
  "learning_rate": 1.8274816900135842e-05,
386
+ "loss": 1.2805,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.8282208588957055,
391
+ "grad_norm": 1.9970132500864828,
392
  "learning_rate": 1.817327777949407e-05,
393
+ "loss": 1.2204,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.843558282208589,
398
+ "grad_norm": 2.438536266127501,
399
  "learning_rate": 1.806913454997717e-05,
400
+ "loss": 1.2388,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.8588957055214724,
405
+ "grad_norm": 1.9096886848188332,
406
  "learning_rate": 1.7962420392925066e-05,
407
+ "loss": 1.3021,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.8742331288343558,
412
+ "grad_norm": 2.0120912351402875,
413
  "learning_rate": 1.785316930880745e-05,
414
+ "loss": 1.1835,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.8895705521472392,
419
+ "grad_norm": 1.982796410249312,
420
  "learning_rate": 1.7741416106390828e-05,
421
+ "loss": 1.1753,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.9049079754601227,
426
+ "grad_norm": 2.4058710177193525,
427
  "learning_rate": 1.7627196391647982e-05,
428
+ "loss": 1.2109,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.9202453987730062,
433
+ "grad_norm": 1.9621072259264853,
434
  "learning_rate": 1.75105465564135e-05,
435
+ "loss": 1.2202,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.9355828220858896,
440
+ "grad_norm": 8.348303745211982,
441
  "learning_rate": 1.739150376678883e-05,
442
+ "loss": 1.299,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.950920245398773,
447
+ "grad_norm": 2.188760659995286,
448
  "learning_rate": 1.727010595130074e-05,
449
+ "loss": 1.3906,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.9662576687116564,
454
+ "grad_norm": 2.2138363998997828,
455
  "learning_rate": 1.714639178881678e-05,
456
+ "loss": 1.2704,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.9815950920245399,
461
+ "grad_norm": 7.207718966765089,
462
  "learning_rate": 1.7020400696221737e-05,
463
+ "loss": 1.227,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.9969325153374233,
468
+ "grad_norm": 2.033313094726782,
469
  "learning_rate": 1.6892172815858896e-05,
470
+ "loss": 1.265,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 1.0,
475
+ "eval_loss": 1.214716911315918,
476
+ "eval_runtime": 7.4506,
477
+ "eval_samples_per_second": 21.341,
478
+ "eval_steps_per_second": 5.369,
479
  "step": 326
480
  },
481
  {
482
  "epoch": 1.0122699386503067,
483
+ "grad_norm": 2.246404161643353,
484
  "learning_rate": 1.6761749002740195e-05,
485
+ "loss": 1.1231,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 1.0276073619631902,
490
+ "grad_norm": 2.6307191299518595,
491
  "learning_rate": 1.662917081152932e-05,
492
+ "loss": 1.0474,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 1.0429447852760736,
497
+ "grad_norm": 2.2466529263098596,
498
  "learning_rate": 1.6494480483301836e-05,
499
+ "loss": 0.9948,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 1.058282208588957,
504
+ "grad_norm": 2.0590187620332308,
505
  "learning_rate": 1.635772093208669e-05,
506
+ "loss": 0.9675,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 1.0736196319018405,
511
+ "grad_norm": 2.5079641466397864,
512
  "learning_rate": 1.6218935731193223e-05,
513
+ "loss": 0.9397,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 1.0889570552147239,
518
+ "grad_norm": 2.2015613521602053,
519
  "learning_rate": 1.6078169099328196e-05,
520
+ "loss": 0.9576,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 1.1042944785276074,
525
+ "grad_norm": 2.5678723465584303,
526
  "learning_rate": 1.5935465886507143e-05,
527
+ "loss": 1.1066,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 1.1196319018404908,
532
+ "grad_norm": 2.1152311130980976,
533
  "learning_rate": 1.579087155976459e-05,
534
+ "loss": 1.1337,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 1.1349693251533743,
539
+ "grad_norm": 2.36694508137547,
540
  "learning_rate": 1.5644432188667695e-05,
541
+ "loss": 1.0043,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 1.1503067484662577,
546
+ "grad_norm": 2.257597203782575,
547
  "learning_rate": 1.5496194430637903e-05,
548
+ "loss": 0.9548,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 1.165644171779141,
553
+ "grad_norm": 2.224173969408768,
554
  "learning_rate": 1.5346205516085305e-05,
555
+ "loss": 1.0267,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 1.1809815950920246,
560
+ "grad_norm": 2.785039079502963,
561
  "learning_rate": 1.5194513233360439e-05,
562
+ "loss": 0.9385,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 1.196319018404908,
567
+ "grad_norm": 2.8341094388368715,
568
  "learning_rate": 1.504116591352832e-05,
569
+ "loss": 1.1401,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 1.2116564417177913,
574
+ "grad_norm": 2.5109029462747303,
575
  "learning_rate": 1.4886212414969551e-05,
576
+ "loss": 0.9811,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 1.2269938650306749,
581
+ "grad_norm": 2.364253368402221,
582
  "learning_rate": 1.4729702107813438e-05,
583
+ "loss": 0.9896,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 1.2423312883435582,
588
+ "grad_norm": 2.833077039900493,
589
  "learning_rate": 1.4571684858208045e-05,
590
+ "loss": 1.0038,
591
  "step": 405
592
  },
593
  {
594
  "epoch": 1.2576687116564418,
595
+ "grad_norm": 2.986726012938249,
596
  "learning_rate": 1.4412211012432213e-05,
597
+ "loss": 1.0369,
598
  "step": 410
599
  },
600
  {
601
  "epoch": 1.2730061349693251,
602
+ "grad_norm": 2.19472166570541,
603
  "learning_rate": 1.4251331380854602e-05,
604
+ "loss": 1.0644,
605
  "step": 415
606
  },
607
  {
608
  "epoch": 1.2883435582822087,
609
+ "grad_norm": 2.539259944541222,
610
  "learning_rate": 1.408909722174487e-05,
611
+ "loss": 0.9615,
612
  "step": 420
613
  },
614
  {
615
  "epoch": 1.303680981595092,
616
+ "grad_norm": 2.3203428811857667,
617
  "learning_rate": 1.3925560224942145e-05,
618
+ "loss": 0.933,
619
  "step": 425
620
  },
621
  {
622
  "epoch": 1.3190184049079754,
623
+ "grad_norm": 2.190402503264761,
624
  "learning_rate": 1.3760772495385998e-05,
625
+ "loss": 0.9614,
626
  "step": 430
627
  },
628
  {
629
  "epoch": 1.334355828220859,
630
+ "grad_norm": 2.4516332947183055,
631
  "learning_rate": 1.3594786536515154e-05,
632
+ "loss": 0.9784,
633
  "step": 435
634
  },
635
  {
636
  "epoch": 1.3496932515337423,
637
+ "grad_norm": 2.1838918342014897,
638
  "learning_rate": 1.3427655233539227e-05,
639
+ "loss": 0.9477,
640
  "step": 440
641
  },
642
  {
643
  "epoch": 1.3650306748466257,
644
+ "grad_norm": 2.1974830315305867,
645
  "learning_rate": 1.3259431836588843e-05,
646
+ "loss": 0.9802,
647
  "step": 445
648
  },
649
  {
650
  "epoch": 1.3803680981595092,
651
+ "grad_norm": 2.337562201817739,
652
  "learning_rate": 1.3090169943749475e-05,
653
+ "loss": 1.001,
654
  "step": 450
655
  },
656
  {
657
  "epoch": 1.3957055214723926,
658
+ "grad_norm": 2.657566030569729,
659
  "learning_rate": 1.2919923483984415e-05,
660
+ "loss": 0.9072,
661
  "step": 455
662
  },
663
  {
664
  "epoch": 1.4110429447852761,
665
+ "grad_norm": 2.5556378147753787,
666
  "learning_rate": 1.2748746699952338e-05,
667
+ "loss": 1.0344,
668
  "step": 460
669
  },
670
  {
671
  "epoch": 1.4263803680981595,
672
+ "grad_norm": 2.490197731186031,
673
  "learning_rate": 1.2576694130724905e-05,
674
+ "loss": 1.1441,
675
  "step": 465
676
  },
677
  {
678
  "epoch": 1.441717791411043,
679
+ "grad_norm": 2.456053324999455,
680
  "learning_rate": 1.2403820594409926e-05,
681
+ "loss": 0.9665,
682
  "step": 470
683
  },
684
  {
685
  "epoch": 1.4570552147239264,
686
+ "grad_norm": 2.7963958883475586,
687
  "learning_rate": 1.2230181170685636e-05,
688
+ "loss": 0.9733,
689
  "step": 475
690
  },
691
  {
692
  "epoch": 1.4723926380368098,
693
+ "grad_norm": 2.3556048604718955,
694
  "learning_rate": 1.2055831183251608e-05,
695
+ "loss": 1.0198,
696
  "step": 480
697
  },
698
  {
699
  "epoch": 1.4877300613496933,
700
+ "grad_norm": 2.7492055859532365,
701
  "learning_rate": 1.1880826182201926e-05,
702
+ "loss": 0.9002,
703
  "step": 485
704
  },
705
  {
706
  "epoch": 1.5030674846625767,
707
+ "grad_norm": 2.138357362754652,
708
  "learning_rate": 1.170522192632624e-05,
709
+ "loss": 0.9411,
710
  "step": 490
711
  },
712
  {
713
  "epoch": 1.51840490797546,
714
+ "grad_norm": 2.1341509025323644,
715
  "learning_rate": 1.1529074365344302e-05,
716
+ "loss": 1.0553,
717
  "step": 495
718
  },
719
  {
720
  "epoch": 1.5337423312883436,
721
+ "grad_norm": 2.1458405128765627,
722
  "learning_rate": 1.1352439622079689e-05,
723
+ "loss": 1.0077,
724
  "step": 500
725
  },
726
  {
727
  "epoch": 1.5490797546012272,
728
+ "grad_norm": 2.1132360957639698,
729
  "learning_rate": 1.1175373974578378e-05,
730
+ "loss": 0.9168,
731
  "step": 505
732
  },
733
  {
734
  "epoch": 1.5644171779141103,
735
+ "grad_norm": 2.5113223130286686,
736
  "learning_rate": 1.0997933838177828e-05,
737
+ "loss": 1.0312,
738
  "step": 510
739
  },
740
  {
741
  "epoch": 1.5797546012269938,
742
+ "grad_norm": 2.0666811534447906,
743
  "learning_rate": 1.0820175747532373e-05,
744
+ "loss": 1.0088,
745
  "step": 515
746
  },
747
  {
748
  "epoch": 1.5950920245398774,
749
+ "grad_norm": 2.18910972259404,
750
  "learning_rate": 1.064215633860055e-05,
751
+ "loss": 0.9566,
752
  "step": 520
753
  },
754
  {
755
  "epoch": 1.6104294478527608,
756
+ "grad_norm": 2.1335643657003764,
757
  "learning_rate": 1.0463932330600197e-05,
758
+ "loss": 0.9537,
759
  "step": 525
760
  },
761
  {
762
  "epoch": 1.6257668711656441,
763
+ "grad_norm": 2.211695419294786,
764
  "learning_rate": 1.0285560507936962e-05,
765
+ "loss": 0.9519,
766
  "step": 530
767
  },
768
  {
769
  "epoch": 1.6411042944785277,
770
+ "grad_norm": 1.9927545915886413,
771
  "learning_rate": 1.010709770211212e-05,
772
+ "loss": 1.0909,
773
  "step": 535
774
  },
775
  {
776
  "epoch": 1.656441717791411,
777
+ "grad_norm": 2.1799108916751266,
778
  "learning_rate": 9.928600773615306e-06,
779
+ "loss": 1.0668,
780
  "step": 540
781
  },
782
  {
783
  "epoch": 1.6717791411042944,
784
+ "grad_norm": 2.0870390082509824,
785
  "learning_rate": 9.750126593808083e-06,
786
+ "loss": 0.9425,
787
  "step": 545
788
  },
789
  {
790
  "epoch": 1.687116564417178,
791
+ "grad_norm": 2.219919825754393,
792
  "learning_rate": 9.571732026803978e-06,
793
+ "loss": 0.9223,
794
  "step": 550
795
  },
796
  {
797
  "epoch": 1.7024539877300615,
798
+ "grad_norm": 2.0158507556929712,
799
  "learning_rate": 9.393473911350895e-06,
800
+ "loss": 0.9779,
801
  "step": 555
802
  },
803
  {
804
  "epoch": 1.7177914110429446,
805
+ "grad_norm": 2.146438648027479,
806
  "learning_rate": 9.215409042721553e-06,
807
+ "loss": 0.9929,
808
  "step": 560
809
  },
810
  {
811
  "epoch": 1.7331288343558282,
812
+ "grad_norm": 2.2820045196023115,
813
  "learning_rate": 9.037594154617811e-06,
814
+ "loss": 0.9746,
815
  "step": 565
816
  },
817
  {
818
  "epoch": 1.7484662576687118,
819
+ "grad_norm": 2.1811945127821626,
820
  "learning_rate": 8.860085901094595e-06,
821
+ "loss": 0.9463,
822
  "step": 570
823
  },
824
  {
825
  "epoch": 1.7638036809815951,
826
+ "grad_norm": 2.2195282023793976,
827
  "learning_rate": 8.682940838509206e-06,
828
+ "loss": 0.8636,
829
  "step": 575
830
  },
831
  {
832
  "epoch": 1.7791411042944785,
833
+ "grad_norm": 1.9502264600710417,
834
  "learning_rate": 8.50621540750175e-06,
835
+ "loss": 0.9595,
836
  "step": 580
837
  },
838
  {
839
  "epoch": 1.794478527607362,
840
+ "grad_norm": 2.1028286450122784,
841
  "learning_rate": 8.329965915012451e-06,
842
+ "loss": 1.0149,
843
  "step": 585
844
  },
845
  {
846
  "epoch": 1.8098159509202454,
847
+ "grad_norm": 1.996134849337679,
848
  "learning_rate": 8.154248516341547e-06,
849
+ "loss": 0.9714,
850
  "step": 590
851
  },
852
  {
853
  "epoch": 1.8251533742331287,
854
+ "grad_norm": 2.0229355706203673,
855
  "learning_rate": 7.979119197257505e-06,
856
+ "loss": 1.0042,
857
  "step": 595
858
  },
859
  {
860
  "epoch": 1.8404907975460123,
861
+ "grad_norm": 2.198065583918555,
862
  "learning_rate": 7.804633756159258e-06,
863
+ "loss": 0.9354,
864
  "step": 600
865
  },
866
  {
867
  "epoch": 1.8558282208588959,
868
+ "grad_norm": 2.0001193503472248,
869
  "learning_rate": 7.63084778629813e-06,
870
+ "loss": 0.9796,
871
  "step": 605
872
  },
873
  {
874
  "epoch": 1.871165644171779,
875
+ "grad_norm": 2.150238538587084,
876
  "learning_rate": 7.4578166580651335e-06,
877
+ "loss": 0.9495,
878
  "step": 610
879
  },
880
  {
881
  "epoch": 1.8865030674846626,
882
+ "grad_norm": 2.1692276656591543,
883
  "learning_rate": 7.285595501349259e-06,
884
+ "loss": 0.972,
885
  "step": 615
886
  },
887
  {
888
  "epoch": 1.9018404907975461,
889
+ "grad_norm": 2.2030115157082073,
890
  "learning_rate": 7.114239187972416e-06,
891
+ "loss": 0.8763,
892
  "step": 620
893
  },
894
  {
895
  "epoch": 1.9171779141104295,
896
+ "grad_norm": 13.25110262390118,
897
  "learning_rate": 6.94380231420656e-06,
898
+ "loss": 0.9911,
899
  "step": 625
900
  },
901
  {
902
  "epoch": 1.9325153374233128,
903
+ "grad_norm": 2.128565521517256,
904
  "learning_rate": 6.774339183378663e-06,
905
+ "loss": 0.98,
906
  "step": 630
907
  },
908
  {
909
  "epoch": 1.9478527607361964,
910
+ "grad_norm": 2.446455411382673,
911
  "learning_rate": 6.605903788568962e-06,
912
+ "loss": 0.9163,
913
  "step": 635
914
  },
915
  {
916
  "epoch": 1.9631901840490797,
917
+ "grad_norm": 3.6457472773842055,
918
  "learning_rate": 6.438549795408107e-06,
919
+ "loss": 0.893,
920
  "step": 640
921
  },
922
  {
923
  "epoch": 1.978527607361963,
924
+ "grad_norm": 2.1918272282922615,
925
  "learning_rate": 6.272330524978613e-06,
926
+ "loss": 0.9862,
927
  "step": 645
928
  },
929
  {
930
  "epoch": 1.9938650306748467,
931
+ "grad_norm": 2.173238130186426,
932
  "learning_rate": 6.107298936826086e-06,
933
+ "loss": 0.8983,
934
  "step": 650
935
  },
936
  {
937
  "epoch": 2.0,
938
+ "eval_loss": 1.211948275566101,
939
+ "eval_runtime": 6.699,
940
+ "eval_samples_per_second": 23.735,
941
+ "eval_steps_per_second": 5.971,
942
  "step": 652
943
  },
944
  {
945
  "epoch": 2.0092024539877302,
946
+ "grad_norm": 2.2524003786276254,
947
  "learning_rate": 5.943507612085661e-06,
948
+ "loss": 0.8223,
949
  "step": 655
950
  },
951
  {
952
  "epoch": 2.0245398773006134,
953
+ "grad_norm": 1.9974383389908232,
954
  "learning_rate": 5.781008736728975e-06,
955
+ "loss": 0.6493,
956
  "step": 660
957
  },
958
  {
959
  "epoch": 2.039877300613497,
960
+ "grad_norm": 2.9682960559686897,
961
  "learning_rate": 5.619854084937085e-06,
962
+ "loss": 0.6784,
963
  "step": 665
964
  },
965
  {
966
  "epoch": 2.0552147239263805,
967
+ "grad_norm": 2.2655157887127024,
968
  "learning_rate": 5.460095002604533e-06,
969
+ "loss": 0.6748,
970
  "step": 670
971
  },
972
  {
973
  "epoch": 2.0705521472392636,
974
+ "grad_norm": 2.2085119385865863,
975
  "learning_rate": 5.3017823909799295e-06,
976
+ "loss": 0.7027,
977
  "step": 675
978
  },
979
  {
980
  "epoch": 2.085889570552147,
981
+ "grad_norm": 2.27766004579248,
982
  "learning_rate": 5.144966690448159e-06,
983
+ "loss": 0.7278,
984
  "step": 680
985
  },
986
  {
987
  "epoch": 2.1012269938650308,
988
+ "grad_norm": 2.128219709033872,
989
  "learning_rate": 4.9896978644594516e-06,
990
+ "loss": 0.5989,
991
  "step": 685
992
  },
993
  {
994
  "epoch": 2.116564417177914,
995
+ "grad_norm": 2.4873822979923315,
996
  "learning_rate": 4.836025383610382e-06,
997
+ "loss": 0.7357,
998
  "step": 690
999
  },
1000
  {
1001
  "epoch": 2.1319018404907975,
1002
+ "grad_norm": 2.1568784749495244,
1003
  "learning_rate": 4.683998209881943e-06,
1004
+ "loss": 0.714,
1005
  "step": 695
1006
  },
1007
  {
1008
  "epoch": 2.147239263803681,
1009
+ "grad_norm": 2.3234411443951126,
1010
  "learning_rate": 4.533664781039622e-06,
1011
+ "loss": 0.7281,
1012
  "step": 700
1013
  },
1014
  {
1015
  "epoch": 2.1625766871165646,
1016
+ "grad_norm": 2.735689720360169,
1017
  "learning_rate": 4.385072995200532e-06,
1018
+ "loss": 0.6257,
1019
  "step": 705
1020
  },
1021
  {
1022
  "epoch": 2.1779141104294477,
1023
+ "grad_norm": 3.0839192153525934,
1024
  "learning_rate": 4.2382701955724724e-06,
1025
+ "loss": 0.7162,
1026
  "step": 710
1027
  },
1028
  {
1029
  "epoch": 2.1932515337423313,
1030
+ "grad_norm": 2.4589255584801624,
1031
  "learning_rate": 4.093303155369771e-06,
1032
+ "loss": 0.6726,
1033
  "step": 715
1034
  },
1035
  {
1036
  "epoch": 2.208588957055215,
1037
+ "grad_norm": 2.3530595892180637,
1038
  "learning_rate": 3.950218062910776e-06,
1039
+ "loss": 0.6077,
1040
  "step": 720
1041
  },
1042
  {
1043
  "epoch": 2.223926380368098,
1044
+ "grad_norm": 2.2994180114078375,
1045
  "learning_rate": 3.8090605069016596e-06,
1046
+ "loss": 0.6484,
1047
  "step": 725
1048
  },
1049
  {
1050
  "epoch": 2.2392638036809815,
1051
+ "grad_norm": 2.1614299270488844,
1052
  "learning_rate": 3.6698754619112974e-06,
1053
+ "loss": 0.7606,
1054
  "step": 730
1055
  },
1056
  {
1057
  "epoch": 2.254601226993865,
1058
+ "grad_norm": 2.148628125479035,
1059
  "learning_rate": 3.53270727404179e-06,
1060
+ "loss": 0.6735,
1061
  "step": 735
1062
  },
1063
  {
1064
  "epoch": 2.2699386503067487,
1065
+ "grad_norm": 2.791988335379792,
1066
  "learning_rate": 3.3975996467992557e-06,
1067
+ "loss": 0.6685,
1068
  "step": 740
1069
  },
1070
  {
1071
  "epoch": 2.285276073619632,
1072
+ "grad_norm": 2.4166794649978036,
1073
  "learning_rate": 3.2645956271693257e-06,
1074
+ "loss": 0.6727,
1075
  "step": 745
1076
  },
1077
  {
1078
  "epoch": 2.3006134969325154,
1079
+ "grad_norm": 2.270680126335755,
1080
  "learning_rate": 3.133737591901864e-06,
1081
+ "loss": 0.6946,
1082
  "step": 750
1083
  },
1084
  {
1085
  "epoch": 2.315950920245399,
1086
+ "grad_norm": 2.2773152491745043,
1087
  "learning_rate": 3.0050672340091723e-06,
1088
+ "loss": 0.6436,
1089
  "step": 755
1090
  },
1091
  {
1092
  "epoch": 2.331288343558282,
1093
+ "grad_norm": 2.351791962083414,
1094
  "learning_rate": 2.878625549482084e-06,
1095
+ "loss": 0.671,
1096
  "step": 760
1097
  },
1098
  {
1099
  "epoch": 2.3466257668711656,
1100
+ "grad_norm": 2.5602963650641604,
1101
  "learning_rate": 2.7544528242281323e-06,
1102
+ "loss": 0.559,
1103
  "step": 765
1104
  },
1105
  {
1106
  "epoch": 2.361963190184049,
1107
+ "grad_norm": 2.919982636047529,
1108
  "learning_rate": 2.6325886212359496e-06,
1109
+ "loss": 0.608,
1110
  "step": 770
1111
  },
1112
  {
1113
  "epoch": 2.3773006134969323,
1114
+ "grad_norm": 2.283862507375789,
1115
  "learning_rate": 2.51307176797001e-06,
1116
+ "loss": 0.7129,
1117
  "step": 775
1118
  },
1119
  {
1120
  "epoch": 2.392638036809816,
1121
+ "grad_norm": 2.5119018741262273,
1122
  "learning_rate": 2.395940343999691e-06,
1123
+ "loss": 0.6617,
1124
  "step": 780
1125
  },
1126
  {
1127
  "epoch": 2.4079754601226995,
1128
+ "grad_norm": 2.2326958137785997,
1129
  "learning_rate": 2.2812316688666735e-06,
1130
+ "loss": 0.6001,
1131
  "step": 785
1132
  },
1133
  {
1134
  "epoch": 2.4233128834355826,
1135
+ "grad_norm": 2.4717197828958875,
1136
  "learning_rate": 2.1689822901944456e-06,
1137
+ "loss": 0.6849,
1138
  "step": 790
1139
  },
1140
  {
1141
  "epoch": 2.438650306748466,
1142
+ "grad_norm": 2.428100475473941,
1143
  "learning_rate": 2.0592279720437856e-06,
1144
+ "loss": 0.6541,
1145
  "step": 795
1146
  },
1147
  {
1148
  "epoch": 2.4539877300613497,
1149
+ "grad_norm": 2.453498212161667,
1150
  "learning_rate": 1.9520036835178667e-06,
1151
+ "loss": 0.7362,
1152
  "step": 800
1153
  },
1154
  {
1155
  "epoch": 2.4693251533742333,
1156
+ "grad_norm": 2.202254537031373,
1157
  "learning_rate": 1.8473435876206792e-06,
1158
+ "loss": 0.6467,
1159
  "step": 805
1160
  },
1161
  {
1162
  "epoch": 2.4846625766871164,
1163
+ "grad_norm": 2.5704434258789557,
1164
  "learning_rate": 1.74528103037226e-06,
1165
+ "loss": 0.7467,
1166
  "step": 810
1167
  },
1168
  {
1169
  "epoch": 2.5,
1170
+ "grad_norm": 2.278793792368562,
1171
  "learning_rate": 1.645848530184233e-06,
1172
+ "loss": 0.6641,
1173
  "step": 815
1174
  },
1175
  {
1176
  "epoch": 2.5153374233128836,
1177
+ "grad_norm": 2.3206459000474338,
1178
  "learning_rate": 1.5490777674990376e-06,
1179
+ "loss": 0.7399,
1180
  "step": 820
1181
  },
1182
  {
1183
  "epoch": 2.530674846625767,
1184
+ "grad_norm": 2.2760200456954105,
1185
  "learning_rate": 1.4549995746961332e-06,
1186
+ "loss": 0.6896,
1187
  "step": 825
1188
  },
1189
  {
1190
  "epoch": 2.5460122699386503,
1191
+ "grad_norm": 2.0966897996478626,
1192
  "learning_rate": 1.3636439262684299e-06,
1193
+ "loss": 0.5792,
1194
  "step": 830
1195
  },
1196
  {
1197
  "epoch": 2.561349693251534,
1198
+ "grad_norm": 2.273137273352939,
1199
  "learning_rate": 1.2750399292720284e-06,
1200
+ "loss": 0.6727,
1201
  "step": 835
1202
  },
1203
  {
1204
  "epoch": 2.5766871165644174,
1205
+ "grad_norm": 2.232573385750787,
1206
  "learning_rate": 1.1892158140523546e-06,
1207
+ "loss": 0.7411,
1208
  "step": 840
1209
  },
1210
  {
1211
  "epoch": 2.5920245398773005,
1212
+ "grad_norm": 2.2477459570297427,
1213
  "learning_rate": 1.1061989252496053e-06,
1214
+ "loss": 0.637,
1215
  "step": 845
1216
  },
1217
  {
1218
  "epoch": 2.607361963190184,
1219
+ "grad_norm": 2.4503366494149152,
1220
  "learning_rate": 1.0260157130864178e-06,
1221
+ "loss": 0.6591,
1222
  "step": 850
1223
  },
1224
  {
1225
  "epoch": 2.6226993865030677,
1226
+ "grad_norm": 2.2727030330816707,
1227
  "learning_rate": 9.486917249404815e-07,
1228
+ "loss": 0.6791,
1229
  "step": 855
1230
  },
1231
  {
1232
  "epoch": 2.638036809815951,
1233
+ "grad_norm": 2.4331538301144695,
1234
  "learning_rate": 8.742515972048404e-07,
1235
+ "loss": 0.689,
1236
  "step": 860
1237
  },
1238
  {
1239
  "epoch": 2.6533742331288344,
1240
+ "grad_norm": 2.171632535993997,
1241
  "learning_rate": 8.027190474384127e-07,
1242
+ "loss": 0.588,
1243
  "step": 865
1244
  },
1245
  {
1246
  "epoch": 2.668711656441718,
1247
+ "grad_norm": 2.274712548175332,
1248
  "learning_rate": 7.341168668092857e-07,
1249
+ "loss": 0.5984,
1250
  "step": 870
1251
  },
1252
  {
1253
  "epoch": 2.684049079754601,
1254
+ "grad_norm": 2.187602871418788,
1255
  "learning_rate": 6.684669128331655e-07,
1256
+ "loss": 0.7899,
1257
  "step": 875
1258
  },
1259
  {
1260
  "epoch": 2.6993865030674846,
1261
+ "grad_norm": 2.172889231318168,
1262
  "learning_rate": 6.057901024092949e-07,
1263
+ "loss": 0.6107,
1264
  "step": 880
1265
  },
1266
  {
1267
  "epoch": 2.714723926380368,
1268
+ "grad_norm": 5.181833794013852,
1269
  "learning_rate": 5.461064051560705e-07,
1270
+ "loss": 0.747,
1271
  "step": 885
1272
  },
1273
  {
1274
  "epoch": 2.7300613496932513,
1275
+ "grad_norm": 2.1284344076378163,
1276
  "learning_rate": 4.894348370484648e-07,
1277
+ "loss": 0.6449,
1278
  "step": 890
1279
  },
1280
  {
1281
  "epoch": 2.745398773006135,
1282
+ "grad_norm": 2.742405951465162,
1283
  "learning_rate": 4.3579345435930454e-07,
1284
+ "loss": 0.7748,
1285
  "step": 895
1286
  },
1287
  {
1288
  "epoch": 2.7607361963190185,
1289
+ "grad_norm": 2.453933407406173,
1290
  "learning_rate": 3.851993479063154e-07,
1291
+ "loss": 0.7247,
1292
  "step": 900
1293
  },
1294
  {
1295
  "epoch": 2.7760736196319016,
1296
+ "grad_norm": 2.3349240625236307,
1297
  "learning_rate": 3.3766863760676947e-07,
1298
+ "loss": 0.6742,
1299
  "step": 905
1300
  },
1301
  {
1302
  "epoch": 2.791411042944785,
1303
+ "grad_norm": 2.346238889067428,
1304
  "learning_rate": 2.93216467341475e-07,
1305
+ "loss": 0.6736,
1306
  "step": 910
1307
  },
1308
  {
1309
  "epoch": 2.8067484662576687,
1310
+ "grad_norm": 2.682770493121025,
1311
  "learning_rate": 2.5185700012975603e-07,
1312
+ "loss": 0.7891,
1313
  "step": 915
1314
  },
1315
  {
1316
  "epoch": 2.8220858895705523,
1317
+ "grad_norm": 2.418319757792411,
1318
  "learning_rate": 2.1360341361692517e-07,
1319
+ "loss": 0.6781,
1320
  "step": 920
1321
  },
1322
  {
1323
  "epoch": 2.837423312883436,
1324
+ "grad_norm": 2.1865263805130737,
1325
  "learning_rate": 1.784678958757291e-07,
1326
+ "loss": 0.6458,
1327
  "step": 925
1328
  },
1329
  {
1330
  "epoch": 2.852760736196319,
1331
+ "grad_norm": 2.15121097086734,
1332
  "learning_rate": 1.464616415230702e-07,
1333
+ "loss": 0.639,
1334
  "step": 930
1335
  },
1336
  {
1337
  "epoch": 2.8680981595092025,
1338
+ "grad_norm": 2.1733157849896547,
1339
  "learning_rate": 1.1759484815326294e-07,
1340
+ "loss": 0.6231,
1341
  "step": 935
1342
  },
1343
  {
1344
  "epoch": 2.883435582822086,
1345
+ "grad_norm": 2.2028794055351235,
1346
  "learning_rate": 9.187671308895418e-08,
1347
+ "loss": 0.6718,
1348
  "step": 940
1349
  },
1350
  {
1351
  "epoch": 2.8987730061349692,
1352
+ "grad_norm": 2.7622530458915655,
1353
  "learning_rate": 6.931543045073708e-08,
1354
+ "loss": 0.8227,
1355
  "step": 945
1356
  },
1357
  {
1358
  "epoch": 2.914110429447853,
1359
+ "grad_norm": 2.1282608794755364,
1360
  "learning_rate": 4.991818854640396e-08,
1361
+ "loss": 0.6131,
1362
  "step": 950
1363
  },
1364
  {
1365
  "epoch": 2.9294478527607364,
1366
+ "grad_norm": 2.1900416244520295,
1367
  "learning_rate": 3.369116758066171e-08,
1368
+ "loss": 0.6493,
1369
  "step": 955
1370
  },
1371
  {
1372
  "epoch": 2.9447852760736195,
1373
+ "grad_norm": 2.3718267533912125,
1374
  "learning_rate": 2.063953768603799e-08,
1375
+ "loss": 0.7364,
1376
  "step": 960
1377
  },
1378
  {
1379
  "epoch": 2.960122699386503,
1380
+ "grad_norm": 2.2239316758606975,
1381
  "learning_rate": 1.0767457275615567e-08,
1382
+ "loss": 0.6457,
1383
  "step": 965
1384
  },
1385
  {
1386
  "epoch": 2.9754601226993866,
1387
+ "grad_norm": 2.6083276517224374,
1388
  "learning_rate": 4.0780717181077015e-09,
1389
+ "loss": 0.7664,
1390
  "step": 970
1391
  },
1392
  {
1393
  "epoch": 2.9907975460122698,
1394
+ "grad_norm": 2.554211412831243,
1395
  "learning_rate": 5.735123357042405e-10,
1396
+ "loss": 0.7007,
1397
  "step": 975
1398
  },
1399
  {
1400
  "epoch": 3.0,
1401
+ "eval_loss": 1.3528565168380737,
1402
+ "eval_runtime": 6.5488,
1403
+ "eval_samples_per_second": 24.279,
1404
+ "eval_steps_per_second": 6.108,
1405
  "step": 978
1406
  },
1407
  {
1408
  "epoch": 3.0,
1409
  "step": 978,
1410
  "total_flos": 3642602029056.0,
1411
+ "train_loss": 1.0182966589927673,
1412
+ "train_runtime": 588.3296,
1413
+ "train_samples_per_second": 6.649,
1414
+ "train_steps_per_second": 1.662
1415
  }
1416
  ],
1417
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53624d092600031a396068be5c97cc3df1725958186d1ebe3846858ecc969c0e
3
- size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eebff96f179673b60d9f5f767d4d743d7be42e01e13c2259d4a1c04b186e03dc
3
+ size 7224