Allen Poston commited on
Commit
43a6ae0
·
verified ·
1 Parent(s): eb2a98c

Adaptive Senko AI - 30,000 examples, gpt2 base

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cbeee3dfc268cbdbc22f727c37945ebd4275a3ea49512bd0ee049c583c3112b
3
  size 3253104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c31c1fc80a43fba4f08f077ad1ede5d3c76eed5f71dd49542cc83bc90eb9efcd
3
  size 3253104
checkpoint-11600/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8137c1eda8ebedf087f23f1f39ad57fa24e9b35b3fc584acc19896ff81984421
3
  size 3253104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f00e5b5ef7c8c59c34d43215747d202327ad21663a0e47924385bd19415ee39e
3
  size 3253104
checkpoint-11600/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6698b23ae4afc12ca05cd227dc927d2c778fd5abc3ce6f3c2c02a8a7d3b3794
3
  size 6548858
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7799f81ecc00926b24d6f4d7dedc2927e3d6e77ff412d06b4e8c581553ebab4
3
  size 6548858
checkpoint-11600/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:684a2d80447af9cc76005199cfec8e7dbd71d967ea01cae183e7fcace028e157
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:693f99554dfc41be899ee174a0a6d027f3a87ebe648db0a6dc8d08f9d87f4282
3
  size 988
checkpoint-11600/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6a219de3badb6cb2e9c9c7b282785dd34d720128250f541f986799be86e14e5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec67c7fac204597b08f4f6cf2351c31722eaeb3a2200ef62ea451ed7b01b543a
3
  size 1064
checkpoint-11600/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 11600,
3
- "best_metric": 2.4329476356506348,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11600",
5
  "epoch": 2.936886750648857,
6
  "eval_steps": 200,
@@ -11,2090 +11,2090 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
- "grad_norm": 0.4274106025695801,
15
  "learning_rate": 2.067510548523207e-06,
16
- "loss": 3.4405,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
- "grad_norm": 0.5292551517486572,
22
  "learning_rate": 4.177215189873418e-06,
23
- "loss": 3.4567,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
- "grad_norm": 0.7541739344596863,
29
  "learning_rate": 6.28691983122363e-06,
30
- "loss": 3.4683,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
- "grad_norm": 0.8833445906639099,
36
  "learning_rate": 8.39662447257384e-06,
37
- "loss": 3.5084,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
- "eval_loss": 3.5248923301696777,
43
- "eval_runtime": 39.9384,
44
- "eval_samples_per_second": 43.968,
45
- "eval_steps_per_second": 43.968,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
- "grad_norm": 0.9998921155929565,
51
  "learning_rate": 1.0506329113924052e-05,
52
- "loss": 3.359,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
- "grad_norm": 0.8041885495185852,
58
  "learning_rate": 1.2616033755274262e-05,
59
- "loss": 3.351,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
- "grad_norm": 0.9213416576385498,
65
  "learning_rate": 1.4725738396624473e-05,
66
- "loss": 3.2244,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
- "grad_norm": 1.0922213792800903,
72
  "learning_rate": 1.6835443037974685e-05,
73
- "loss": 3.1565,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
- "eval_loss": 3.151216983795166,
79
- "eval_runtime": 39.7515,
80
- "eval_samples_per_second": 44.174,
81
- "eval_steps_per_second": 44.174,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
- "grad_norm": 1.4199283123016357,
87
  "learning_rate": 1.8945147679324897e-05,
88
- "loss": 3.0154,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
- "grad_norm": 1.077143907546997,
94
  "learning_rate": 2.1054852320675106e-05,
95
- "loss": 3.0456,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
- "grad_norm": 1.5466052293777466,
101
  "learning_rate": 2.3164556962025318e-05,
102
- "loss": 2.9099,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
- "grad_norm": 1.2139467000961304,
108
  "learning_rate": 2.5274261603375527e-05,
109
- "loss": 2.8839,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
- "eval_loss": 2.793567657470703,
115
- "eval_runtime": 40.2573,
116
- "eval_samples_per_second": 43.619,
117
- "eval_steps_per_second": 43.619,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
- "grad_norm": 1.0270315408706665,
123
  "learning_rate": 2.738396624472574e-05,
124
- "loss": 2.8389,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
- "grad_norm": 1.5865377187728882,
130
  "learning_rate": 2.949367088607595e-05,
131
- "loss": 2.8228,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
- "grad_norm": 1.076073408126831,
137
  "learning_rate": 3.160337552742616e-05,
138
- "loss": 2.9255,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
- "grad_norm": 1.4510694742202759,
144
  "learning_rate": 3.3713080168776376e-05,
145
- "loss": 2.8165,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
- "eval_loss": 2.667301893234253,
151
- "eval_runtime": 40.0906,
152
- "eval_samples_per_second": 43.801,
153
- "eval_steps_per_second": 43.801,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
- "grad_norm": 1.5206592082977295,
159
  "learning_rate": 3.5822784810126585e-05,
160
- "loss": 2.8022,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
- "grad_norm": 1.173909068107605,
166
  "learning_rate": 3.7932489451476794e-05,
167
- "loss": 2.8034,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
- "grad_norm": 1.4551103115081787,
173
  "learning_rate": 4.004219409282701e-05,
174
- "loss": 2.774,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
- "grad_norm": 1.509749412536621,
180
  "learning_rate": 4.215189873417722e-05,
181
- "loss": 2.7978,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
- "eval_loss": 2.603116273880005,
187
- "eval_runtime": 39.9617,
188
- "eval_samples_per_second": 43.942,
189
- "eval_steps_per_second": 43.942,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
- "grad_norm": 1.745764136314392,
195
  "learning_rate": 4.426160337552743e-05,
196
- "loss": 2.7249,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
- "grad_norm": 1.6712589263916016,
202
  "learning_rate": 4.637130801687764e-05,
203
- "loss": 2.7618,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
- "grad_norm": 2.256267786026001,
209
  "learning_rate": 4.8481012658227845e-05,
210
- "loss": 2.7431,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
- "grad_norm": 1.5181586742401123,
216
  "learning_rate": 4.993434627649597e-05,
217
- "loss": 2.778,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
- "eval_loss": 2.5704379081726074,
223
- "eval_runtime": 40.1791,
224
- "eval_samples_per_second": 43.704,
225
- "eval_steps_per_second": 43.704,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
- "grad_norm": 1.1885608434677124,
231
  "learning_rate": 4.969986869255299e-05,
232
- "loss": 2.7224,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
- "grad_norm": 1.2136404514312744,
238
  "learning_rate": 4.946539110861002e-05,
239
- "loss": 2.6823,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
- "grad_norm": 0.8780732750892639,
245
  "learning_rate": 4.9230913524667046e-05,
246
- "loss": 2.6961,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
- "grad_norm": 1.0844959020614624,
252
- "learning_rate": 4.899643594072407e-05,
253
- "loss": 2.7014,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
- "eval_loss": 2.54587721824646,
259
- "eval_runtime": 39.8803,
260
- "eval_samples_per_second": 44.032,
261
- "eval_steps_per_second": 44.032,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
- "grad_norm": 1.3518335819244385,
267
- "learning_rate": 4.8761958356781096e-05,
268
- "loss": 2.6393,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
- "grad_norm": 1.1389687061309814,
274
- "learning_rate": 4.852748077283812e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
- "grad_norm": 1.6295430660247803,
281
- "learning_rate": 4.829300318889514e-05,
282
- "loss": 2.6754,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
- "grad_norm": 1.387499451637268,
288
- "learning_rate": 4.8058525604952173e-05,
289
- "loss": 2.6853,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
- "eval_loss": 2.5297553539276123,
295
- "eval_runtime": 39.722,
296
- "eval_samples_per_second": 44.207,
297
- "eval_steps_per_second": 44.207,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
- "grad_norm": 1.014020323753357,
303
- "learning_rate": 4.7824048021009195e-05,
304
- "loss": 2.7275,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
- "grad_norm": 1.1505990028381348,
310
- "learning_rate": 4.7589570437066216e-05,
311
- "loss": 2.6651,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
- "grad_norm": 1.1389458179473877,
317
- "learning_rate": 4.7355092853123244e-05,
318
- "loss": 2.6993,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
- "grad_norm": 1.2159587144851685,
324
- "learning_rate": 4.7120615269180266e-05,
325
- "loss": 2.7239,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
- "eval_loss": 2.5201079845428467,
331
- "eval_runtime": 39.8313,
332
- "eval_samples_per_second": 44.086,
333
- "eval_steps_per_second": 44.086,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
- "grad_norm": 1.1873971223831177,
339
- "learning_rate": 4.6886137685237294e-05,
340
- "loss": 2.6368,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
- "grad_norm": 1.5109103918075562,
346
- "learning_rate": 4.665166010129432e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
- "grad_norm": 1.9981125593185425,
353
- "learning_rate": 4.641718251735134e-05,
354
- "loss": 2.6513,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
- "grad_norm": 1.4879294633865356,
360
- "learning_rate": 4.6182704933408365e-05,
361
- "loss": 2.6433,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
- "eval_loss": 2.5107176303863525,
367
- "eval_runtime": 40.0643,
368
- "eval_samples_per_second": 43.83,
369
- "eval_steps_per_second": 43.83,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
- "grad_norm": 1.2832767963409424,
375
  "learning_rate": 4.5952916901144253e-05,
376
- "loss": 2.6225,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
- "grad_norm": 1.2915899753570557,
382
  "learning_rate": 4.5718439317201275e-05,
383
- "loss": 2.6592,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
- "grad_norm": 1.229929804801941,
389
  "learning_rate": 4.54839617332583e-05,
390
- "loss": 2.6411,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
- "grad_norm": 1.2569608688354492,
396
  "learning_rate": 4.524948414931533e-05,
397
- "loss": 2.6436,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
- "eval_loss": 2.504101514816284,
403
- "eval_runtime": 39.8694,
404
- "eval_samples_per_second": 44.044,
405
- "eval_steps_per_second": 44.044,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
- "grad_norm": 1.3688510656356812,
411
  "learning_rate": 4.501500656537235e-05,
412
- "loss": 2.6819,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
- "grad_norm": 1.1405905485153198,
418
  "learning_rate": 4.4780528981429374e-05,
419
- "loss": 2.6116,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
- "grad_norm": 1.453338861465454,
425
  "learning_rate": 4.45460513974864e-05,
426
- "loss": 2.6154,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
- "grad_norm": 1.0401395559310913,
432
  "learning_rate": 4.431157381354343e-05,
433
- "loss": 2.6018,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
- "eval_loss": 2.498344898223877,
439
- "eval_runtime": 39.9496,
440
- "eval_samples_per_second": 43.955,
441
- "eval_steps_per_second": 43.955,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
- "grad_norm": 1.4646718502044678,
447
  "learning_rate": 4.407709622960045e-05,
448
- "loss": 2.5734,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
- "grad_norm": 1.3828164339065552,
454
  "learning_rate": 4.384261864565748e-05,
455
- "loss": 2.6445,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
- "grad_norm": 2.1768596172332764,
461
  "learning_rate": 4.36081410617145e-05,
462
- "loss": 2.6618,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
- "grad_norm": 1.6110296249389648,
468
  "learning_rate": 4.337366347777152e-05,
469
- "loss": 2.6509,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
- "eval_loss": 2.4937028884887695,
475
- "eval_runtime": 39.8698,
476
- "eval_samples_per_second": 44.043,
477
- "eval_steps_per_second": 44.043,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
- "grad_norm": 1.2363536357879639,
483
  "learning_rate": 4.313918589382856e-05,
484
- "loss": 2.6274,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
- "grad_norm": 2.192110538482666,
490
  "learning_rate": 4.290470830988558e-05,
491
- "loss": 2.6932,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
- "grad_norm": 1.2024074792861938,
497
  "learning_rate": 4.26702307259426e-05,
498
- "loss": 2.6221,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
- "grad_norm": 1.8665797710418701,
504
  "learning_rate": 4.243575314199963e-05,
505
- "loss": 2.6313,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
- "eval_loss": 2.4876773357391357,
511
- "eval_runtime": 40.026,
512
- "eval_samples_per_second": 43.871,
513
- "eval_steps_per_second": 43.871,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
- "grad_norm": 1.4088993072509766,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
- "grad_norm": 1.3225140571594238,
526
  "learning_rate": 4.196679797411368e-05,
527
- "loss": 2.56,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
- "grad_norm": 1.3416539430618286,
533
  "learning_rate": 4.1732320390170706e-05,
534
- "loss": 2.6517,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
- "grad_norm": 1.079567790031433,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
- "eval_loss": 2.4842560291290283,
547
- "eval_runtime": 39.7988,
548
- "eval_samples_per_second": 44.122,
549
- "eval_steps_per_second": 44.122,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
- "grad_norm": 1.4532116651535034,
555
  "learning_rate": 4.126336522228475e-05,
556
- "loss": 2.6232,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
- "grad_norm": 1.5380038022994995,
562
  "learning_rate": 4.102888763834178e-05,
563
- "loss": 2.6212,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
- "grad_norm": 1.3965916633605957,
569
  "learning_rate": 4.0794410054398805e-05,
570
- "loss": 2.5804,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
- "grad_norm": 1.4798463582992554,
576
  "learning_rate": 4.0559932470455826e-05,
577
- "loss": 2.6724,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
- "eval_loss": 2.480894088745117,
583
- "eval_runtime": 39.9604,
584
- "eval_samples_per_second": 43.943,
585
- "eval_steps_per_second": 43.943,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
- "grad_norm": 1.2598360776901245,
591
  "learning_rate": 4.0325454886512854e-05,
592
- "loss": 2.6993,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
- "grad_norm": 1.366295576095581,
598
  "learning_rate": 4.0090977302569876e-05,
599
- "loss": 2.551,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
- "grad_norm": 1.1827855110168457,
605
  "learning_rate": 3.98564997186269e-05,
606
- "loss": 2.6131,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
- "grad_norm": 1.2728627920150757,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
- "eval_loss": 2.477010726928711,
619
- "eval_runtime": 40.2504,
620
- "eval_samples_per_second": 43.627,
621
- "eval_steps_per_second": 43.627,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
- "grad_norm": 1.341917634010315,
627
  "learning_rate": 3.938754455074095e-05,
628
- "loss": 2.5748,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
- "grad_norm": 1.4114609956741333,
634
  "learning_rate": 3.9153066966797975e-05,
635
- "loss": 2.667,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
- "grad_norm": 1.1211490631103516,
641
  "learning_rate": 3.8918589382855e-05,
642
- "loss": 2.5671,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
- "grad_norm": 1.4166322946548462,
648
  "learning_rate": 3.8684111798912024e-05,
649
- "loss": 2.5945,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
- "eval_loss": 2.47322940826416,
655
- "eval_runtime": 40.2079,
656
- "eval_samples_per_second": 43.673,
657
- "eval_steps_per_second": 43.673,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
- "grad_norm": 0.9144394993782043,
663
  "learning_rate": 3.844963421496905e-05,
664
- "loss": 2.6148,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
- "grad_norm": 1.4106061458587646,
670
  "learning_rate": 3.821515663102608e-05,
671
- "loss": 2.6586,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
- "grad_norm": 1.414415717124939,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
- "grad_norm": 1.5448992252349854,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
- "eval_loss": 2.4701173305511475,
691
- "eval_runtime": 40.1267,
692
- "eval_samples_per_second": 43.761,
693
- "eval_steps_per_second": 43.761,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
- "grad_norm": 1.1959314346313477,
699
  "learning_rate": 3.751172387919715e-05,
700
- "loss": 2.6975,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
- "grad_norm": 0.9525274038314819,
706
  "learning_rate": 3.727724629525417e-05,
707
- "loss": 2.6675,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
- "grad_norm": 4.733253479003906,
713
  "learning_rate": 3.70427687113112e-05,
714
- "loss": 2.566,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
- "grad_norm": 1.2803192138671875,
720
  "learning_rate": 3.680829112736823e-05,
721
- "loss": 2.5659,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
- "eval_loss": 2.4702188968658447,
727
- "eval_runtime": 40.1387,
728
- "eval_samples_per_second": 43.748,
729
- "eval_steps_per_second": 43.748,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
- "grad_norm": 1.446990966796875,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
- "grad_norm": 1.3563008308410645,
742
  "learning_rate": 3.633933595948227e-05,
743
- "loss": 2.6252,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
- "grad_norm": 1.5763463973999023,
749
- "learning_rate": 3.61048583755393e-05,
750
- "loss": 2.6593,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
- "grad_norm": 1.0055335760116577,
756
- "learning_rate": 3.587038079159633e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
- "eval_loss": 2.4676930904388428,
763
- "eval_runtime": 40.0342,
764
- "eval_samples_per_second": 43.863,
765
- "eval_steps_per_second": 43.863,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
- "grad_norm": 1.7013343572616577,
771
- "learning_rate": 3.563590320765335e-05,
772
- "loss": 2.59,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
- "grad_norm": 1.541069507598877,
778
- "learning_rate": 3.540142562371038e-05,
779
- "loss": 2.6192,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
- "grad_norm": 1.2536805868148804,
785
- "learning_rate": 3.51669480397674e-05,
786
- "loss": 2.6225,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
- "grad_norm": 1.8328826427459717,
792
- "learning_rate": 3.493247045582442e-05,
793
- "loss": 2.6022,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
- "eval_loss": 2.465629816055298,
799
- "eval_runtime": 39.8532,
800
- "eval_samples_per_second": 44.062,
801
- "eval_steps_per_second": 44.062,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
- "grad_norm": 1.8557270765304565,
807
- "learning_rate": 3.469799287188145e-05,
808
- "loss": 2.6496,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
- "grad_norm": 1.3255618810653687,
814
  "learning_rate": 3.446820483961734e-05,
815
- "loss": 2.5315,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
- "grad_norm": 1.2192399501800537,
821
  "learning_rate": 3.423372725567436e-05,
822
- "loss": 2.5409,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
- "grad_norm": 1.2533234357833862,
828
  "learning_rate": 3.399924967173139e-05,
829
- "loss": 2.6457,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
- "eval_loss": 2.462162733078003,
835
- "eval_runtime": 40.0542,
836
- "eval_samples_per_second": 43.841,
837
- "eval_steps_per_second": 43.841,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
- "grad_norm": 1.8414678573608398,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
- "grad_norm": 1.568259596824646,
850
  "learning_rate": 3.3530294503845436e-05,
851
- "loss": 2.5771,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
- "grad_norm": 1.3547483682632446,
857
  "learning_rate": 3.3295816919902464e-05,
858
- "loss": 2.6525,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
- "grad_norm": 1.1655386686325073,
864
  "learning_rate": 3.3061339335959486e-05,
865
- "loss": 2.6421,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
- "eval_loss": 2.461489200592041,
871
- "eval_runtime": 39.9962,
872
- "eval_samples_per_second": 43.904,
873
- "eval_steps_per_second": 43.904,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
- "grad_norm": 1.798033595085144,
879
  "learning_rate": 3.282686175201651e-05,
880
- "loss": 2.6091,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
- "grad_norm": 3.2964117527008057,
886
  "learning_rate": 3.2592384168073535e-05,
887
- "loss": 2.5997,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
- "grad_norm": 1.0457675457000732,
893
  "learning_rate": 3.2357906584130557e-05,
894
- "loss": 2.6144,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
- "grad_norm": 0.9728056192398071,
900
  "learning_rate": 3.2123429000187585e-05,
901
- "loss": 2.5712,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
- "eval_loss": 2.460186719894409,
907
- "eval_runtime": 39.8386,
908
- "eval_samples_per_second": 44.078,
909
- "eval_steps_per_second": 44.078,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
- "grad_norm": 1.2350194454193115,
915
  "learning_rate": 3.188895141624461e-05,
916
- "loss": 2.5448,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
- "grad_norm": 1.4210622310638428,
922
  "learning_rate": 3.1654473832301634e-05,
923
- "loss": 2.6031,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
- "grad_norm": 2.226473093032837,
929
  "learning_rate": 3.1419996248358656e-05,
930
- "loss": 2.6597,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
- "grad_norm": 2.4525105953216553,
936
  "learning_rate": 3.1185518664415684e-05,
937
- "loss": 2.596,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
- "eval_loss": 2.454537868499756,
943
- "eval_runtime": 39.805,
944
- "eval_samples_per_second": 44.115,
945
- "eval_steps_per_second": 44.115,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
- "grad_norm": 1.265309453010559,
951
  "learning_rate": 3.095104108047271e-05,
952
- "loss": 2.5559,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
- "grad_norm": 2.1364307403564453,
958
  "learning_rate": 3.071656349652973e-05,
959
- "loss": 2.5859,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
- "grad_norm": 1.5945920944213867,
965
  "learning_rate": 3.048208591258676e-05,
966
- "loss": 2.5778,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
- "grad_norm": 1.2479759454727173,
972
  "learning_rate": 3.0247608328643783e-05,
973
- "loss": 2.6846,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
- "eval_loss": 2.4547293186187744,
979
- "eval_runtime": 39.7806,
980
- "eval_samples_per_second": 44.142,
981
- "eval_steps_per_second": 44.142,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
- "grad_norm": 1.4845050573349,
987
  "learning_rate": 3.0013130744700808e-05,
988
- "loss": 2.5661,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
- "grad_norm": 1.5581985712051392,
994
  "learning_rate": 2.9778653160757836e-05,
995
- "loss": 2.5441,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
- "grad_norm": 3.1663737297058105,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
- "loss": 2.5044,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
- "grad_norm": 1.2454484701156616,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
- "loss": 2.5747,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
- "eval_loss": 2.4544529914855957,
1015
- "eval_runtime": 39.9287,
1016
- "eval_samples_per_second": 43.978,
1017
- "eval_steps_per_second": 43.978,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
- "grad_norm": 1.662784457206726,
1023
  "learning_rate": 2.907522040892891e-05,
1024
- "loss": 2.6064,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
- "grad_norm": 1.618458867073059,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
- "loss": 2.5191,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
- "grad_norm": 1.3003348112106323,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
- "loss": 2.5339,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
- "grad_norm": 1.1443992853164673,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
- "loss": 2.5914,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
- "eval_loss": 2.453752279281616,
1051
- "eval_runtime": 39.8234,
1052
- "eval_samples_per_second": 44.095,
1053
- "eval_steps_per_second": 44.095,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
- "grad_norm": 1.2574009895324707,
1059
  "learning_rate": 2.813731007315701e-05,
1060
- "loss": 2.6109,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
- "grad_norm": 1.002815842628479,
1066
  "learning_rate": 2.790283248921403e-05,
1067
- "loss": 2.6075,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
- "grad_norm": 1.306024432182312,
1073
  "learning_rate": 2.766835490527106e-05,
1074
- "loss": 2.5733,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
- "grad_norm": 2.5023701190948486,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
- "loss": 2.6274,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
- "eval_loss": 2.449617862701416,
1087
- "eval_runtime": 39.9401,
1088
- "eval_samples_per_second": 43.966,
1089
- "eval_steps_per_second": 43.966,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
- "grad_norm": 1.9410326480865479,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
- "loss": 2.5312,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
- "grad_norm": 1.9793561697006226,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
- "grad_norm": 1.290531873703003,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
- "loss": 2.5817,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
- "grad_norm": 2.11389422416687,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
- "loss": 2.6287,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
- "eval_loss": 2.4490554332733154,
1123
- "eval_runtime": 39.7474,
1124
- "eval_samples_per_second": 44.179,
1125
- "eval_steps_per_second": 44.179,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
- "grad_norm": 1.6492938995361328,
1131
  "learning_rate": 2.626148940161321e-05,
1132
- "loss": 2.631,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
- "grad_norm": 1.3233673572540283,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
- "loss": 2.5654,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
- "grad_norm": 1.688264012336731,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
- "loss": 2.6096,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
- "grad_norm": 2.064823865890503,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
- "loss": 2.6275,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
- "eval_loss": 2.4488983154296875,
1159
- "eval_runtime": 39.6847,
1160
- "eval_samples_per_second": 44.249,
1161
- "eval_steps_per_second": 44.249,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
- "grad_norm": 1.5599696636199951,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
- "loss": 2.6334,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
- "grad_norm": 1.3142633438110352,
1174
  "learning_rate": 2.508910148189833e-05,
1175
- "loss": 2.5496,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
- "grad_norm": 1.474135160446167,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
- "grad_norm": 1.3737610578536987,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
- "loss": 2.5345,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
- "eval_loss": 2.447559356689453,
1195
- "eval_runtime": 39.7021,
1196
- "eval_samples_per_second": 44.229,
1197
- "eval_steps_per_second": 44.229,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
- "grad_norm": 1.2432060241699219,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
- "loss": 2.5977,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
- "grad_norm": 1.465063452720642,
1210
  "learning_rate": 2.415119114612643e-05,
1211
- "loss": 2.6118,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
- "grad_norm": 1.5186200141906738,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
- "loss": 2.6126,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
- "grad_norm": 1.6869078874588013,
1224
  "learning_rate": 2.368223597824048e-05,
1225
- "loss": 2.576,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
- "eval_loss": 2.4459502696990967,
1231
- "eval_runtime": 39.7653,
1232
- "eval_samples_per_second": 44.159,
1233
- "eval_steps_per_second": 44.159,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
- "grad_norm": 1.2578104734420776,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
- "loss": 2.6178,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
- "grad_norm": 1.7597213983535767,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
- "loss": 2.6358,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
- "grad_norm": 2.144465923309326,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
- "loss": 2.5597,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
- "grad_norm": 1.1808464527130127,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
- "loss": 2.6269,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
- "eval_loss": 2.4444611072540283,
1267
- "eval_runtime": 40.0709,
1268
- "eval_samples_per_second": 43.822,
1269
- "eval_steps_per_second": 43.822,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
- "grad_norm": 1.4550806283950806,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
- "loss": 2.6206,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
- "grad_norm": 1.2635902166366577,
1282
  "learning_rate": 2.227537047458263e-05,
1283
- "loss": 2.5722,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
- "grad_norm": 1.3835856914520264,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
- "loss": 2.535,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
- "grad_norm": 1.735004186630249,
1296
  "learning_rate": 2.180641530669668e-05,
1297
- "loss": 2.6086,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
- "eval_loss": 2.443899154663086,
1303
- "eval_runtime": 40.91,
1304
- "eval_samples_per_second": 42.924,
1305
- "eval_steps_per_second": 42.924,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
- "grad_norm": 1.263051986694336,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
- "loss": 2.5544,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
- "grad_norm": 1.0899442434310913,
1318
  "learning_rate": 2.133746013881073e-05,
1319
- "loss": 2.5603,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
- "grad_norm": 3.038811206817627,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
- "loss": 2.5688,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
- "grad_norm": 1.6385984420776367,
1332
  "learning_rate": 2.086850497092478e-05,
1333
- "loss": 2.6006,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
- "eval_loss": 2.443300724029541,
1339
- "eval_runtime": 40.6649,
1340
- "eval_samples_per_second": 43.182,
1341
- "eval_steps_per_second": 43.182,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
- "grad_norm": 1.2857129573822021,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
- "grad_norm": 1.0289497375488281,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
- "loss": 2.5671,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
- "grad_norm": 1.5041025876998901,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
- "grad_norm": 1.6611964702606201,
1368
- "learning_rate": 1.993528418683174e-05,
1369
- "loss": 2.5801,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
- "eval_loss": 2.443532943725586,
1375
- "eval_runtime": 39.931,
1376
- "eval_samples_per_second": 43.976,
1377
- "eval_steps_per_second": 43.976,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
- "grad_norm": 1.521170735359192,
1383
- "learning_rate": 1.9700806602888767e-05,
1384
- "loss": 2.5969,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
- "grad_norm": 1.3700034618377686,
1390
- "learning_rate": 1.946632901894579e-05,
1391
- "loss": 2.6306,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
- "grad_norm": 2.311443328857422,
1397
- "learning_rate": 1.9231851435002814e-05,
1398
- "loss": 2.5608,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
- "grad_norm": 1.6699820756912231,
1404
- "learning_rate": 1.8997373851059842e-05,
1405
- "loss": 2.5113,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
- "eval_loss": 2.4421675205230713,
1411
- "eval_runtime": 40.1783,
1412
- "eval_samples_per_second": 43.705,
1413
- "eval_steps_per_second": 43.705,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
- "grad_norm": 1.2560683488845825,
1419
- "learning_rate": 1.8762896267116863e-05,
1420
- "loss": 2.545,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
- "grad_norm": 2.176563262939453,
1426
- "learning_rate": 1.852841868317389e-05,
1427
- "loss": 2.5752,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
- "grad_norm": 1.2551178932189941,
1433
- "learning_rate": 1.8293941099230916e-05,
1434
- "loss": 2.5215,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
- "grad_norm": 1.5646872520446777,
1440
- "learning_rate": 1.8059463515287937e-05,
1441
- "loss": 2.5838,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
- "eval_loss": 2.441195249557495,
1447
- "eval_runtime": 39.701,
1448
- "eval_samples_per_second": 44.231,
1449
- "eval_steps_per_second": 44.231,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
- "grad_norm": 1.4227900505065918,
1455
- "learning_rate": 1.7824985931344966e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
- "grad_norm": 1.3013832569122314,
1462
- "learning_rate": 1.759050834740199e-05,
1463
- "loss": 2.7641,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
- "grad_norm": 1.1282143592834473,
1469
- "learning_rate": 1.7356030763459015e-05,
1470
- "loss": 2.5875,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
- "grad_norm": 2.079760789871216,
1476
- "learning_rate": 1.712155317951604e-05,
1477
- "loss": 2.4861,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
- "eval_loss": 2.440812826156616,
1483
- "eval_runtime": 40.0764,
1484
- "eval_samples_per_second": 43.816,
1485
- "eval_steps_per_second": 43.816,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
- "grad_norm": 1.0884991884231567,
1491
- "learning_rate": 1.6887075595573065e-05,
1492
- "loss": 2.5941,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
- "grad_norm": 1.9202015399932861,
1498
- "learning_rate": 1.665259801163009e-05,
1499
- "loss": 2.5929,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
- "grad_norm": 1.5925830602645874,
1505
- "learning_rate": 1.6418120427687114e-05,
1506
- "loss": 2.5046,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
- "grad_norm": 1.5219184160232544,
1512
- "learning_rate": 1.618364284374414e-05,
1513
- "loss": 2.5628,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
- "eval_loss": 2.4396440982818604,
1519
- "eval_runtime": 40.0053,
1520
- "eval_samples_per_second": 43.894,
1521
- "eval_steps_per_second": 43.894,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
- "grad_norm": 1.4882445335388184,
1527
- "learning_rate": 1.5949165259801164e-05,
1528
- "loss": 2.6268,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
- "grad_norm": 1.3513301610946655,
1534
- "learning_rate": 1.571468767585819e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
- "grad_norm": 1.690974473953247,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
- "loss": 2.5631,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
- "grad_norm": 1.5311528444290161,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
- "loss": 2.5454,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
- "eval_loss": 2.4388718605041504,
1555
- "eval_runtime": 40.0289,
1556
- "eval_samples_per_second": 43.868,
1557
- "eval_steps_per_second": 43.868,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
- "grad_norm": 2.1171281337738037,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
- "grad_norm": 1.9706814289093018,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
- "loss": 2.588,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
- "grad_norm": 1.8991297483444214,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
- "loss": 2.5655,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
- "grad_norm": 1.5568820238113403,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
- "loss": 2.5312,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
- "eval_loss": 2.438715696334839,
1591
- "eval_runtime": 40.1748,
1592
- "eval_samples_per_second": 43.709,
1593
- "eval_steps_per_second": 43.709,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
- "grad_norm": 1.277051329612732,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
- "loss": 2.5818,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
- "grad_norm": 1.8890128135681152,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
- "loss": 2.5211,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
- "grad_norm": 1.8824830055236816,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
- "loss": 2.53,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
- "grad_norm": 1.239490032196045,
1620
  "learning_rate": 1.336991183642844e-05,
1621
- "loss": 2.5889,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
- "eval_loss": 2.437577962875366,
1627
- "eval_runtime": 40.1654,
1628
- "eval_samples_per_second": 43.719,
1629
- "eval_steps_per_second": 43.719,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
- "grad_norm": 1.7253328561782837,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
- "loss": 2.5426,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
- "grad_norm": 1.6971838474273682,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
- "loss": 2.4953,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
- "grad_norm": 1.4906270503997803,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
- "grad_norm": 1.658526062965393,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
- "loss": 2.5483,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
- "eval_loss": 2.437896490097046,
1663
- "eval_runtime": 40.7008,
1664
- "eval_samples_per_second": 43.144,
1665
- "eval_steps_per_second": 43.144,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
- "grad_norm": 1.0781177282333374,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
- "loss": 2.5449,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
- "grad_norm": 2.1414873600006104,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
- "loss": 2.5303,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
- "grad_norm": 2.063297986984253,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
- "loss": 2.5837,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
- "grad_norm": 1.2153489589691162,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
- "loss": 2.6384,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
- "eval_loss": 2.4365696907043457,
1699
- "eval_runtime": 40.2398,
1700
- "eval_samples_per_second": 43.638,
1701
- "eval_steps_per_second": 43.638,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
- "grad_norm": 1.2976094484329224,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
- "grad_norm": 1.2775920629501343,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
- "loss": 2.6083,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
- "grad_norm": 1.358311653137207,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
- "loss": 2.5206,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
- "grad_norm": 1.3438369035720825,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
- "loss": 2.4967,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
- "eval_loss": 2.4359662532806396,
1735
- "eval_runtime": 40.0802,
1736
- "eval_samples_per_second": 43.812,
1737
- "eval_steps_per_second": 43.812,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
- "grad_norm": 1.2618831396102905,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
- "loss": 2.6169,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
- "grad_norm": 1.3764727115631104,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
- "loss": 2.5444,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
- "grad_norm": 1.604864478111267,
1757
  "learning_rate": 9.852748077283812e-06,
1758
- "loss": 2.5343,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
- "grad_norm": 1.390496850013733,
1764
  "learning_rate": 9.618270493340837e-06,
1765
- "loss": 2.5051,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
- "eval_loss": 2.4353232383728027,
1771
- "eval_runtime": 40.1607,
1772
- "eval_samples_per_second": 43.724,
1773
- "eval_steps_per_second": 43.724,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
- "grad_norm": 2.1982169151306152,
1779
  "learning_rate": 9.383792909397862e-06,
1780
- "loss": 2.5036,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
- "grad_norm": 1.3033822774887085,
1786
  "learning_rate": 9.149315325454887e-06,
1787
- "loss": 2.5205,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
- "grad_norm": 1.682586431503296,
1793
- "learning_rate": 8.919527293190772e-06,
1794
- "loss": 2.6083,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
- "grad_norm": 3.184382200241089,
1800
- "learning_rate": 8.685049709247797e-06,
1801
- "loss": 2.5314,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
- "eval_loss": 2.434755802154541,
1807
- "eval_runtime": 40.2877,
1808
- "eval_samples_per_second": 43.587,
1809
- "eval_steps_per_second": 43.587,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
- "grad_norm": 2.0026867389678955,
1815
- "learning_rate": 8.450572125304821e-06,
1816
- "loss": 2.5109,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
- "grad_norm": 1.3833885192871094,
1822
- "learning_rate": 8.216094541361846e-06,
1823
- "loss": 2.5362,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
- "grad_norm": 2.157984495162964,
1829
- "learning_rate": 7.981616957418871e-06,
1830
- "loss": 2.5423,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
- "grad_norm": 1.682053565979004,
1836
- "learning_rate": 7.747139373475897e-06,
1837
- "loss": 2.5133,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
- "eval_loss": 2.435208559036255,
1843
- "eval_runtime": 40.4768,
1844
- "eval_samples_per_second": 43.383,
1845
- "eval_steps_per_second": 43.383,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
- "grad_norm": 1.9720139503479004,
1851
- "learning_rate": 7.512661789532921e-06,
1852
- "loss": 2.6372,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
- "grad_norm": 1.6906607151031494,
1858
- "learning_rate": 7.278184205589945e-06,
1859
- "loss": 2.5505,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
- "grad_norm": 1.484045147895813,
1865
- "learning_rate": 7.043706621646972e-06,
1866
- "loss": 2.5095,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
- "grad_norm": 1.6676850318908691,
1872
- "learning_rate": 6.8092290377039955e-06,
1873
- "loss": 2.6487,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
- "eval_loss": 2.4344091415405273,
1879
- "eval_runtime": 39.869,
1880
- "eval_samples_per_second": 44.044,
1881
- "eval_steps_per_second": 44.044,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
- "grad_norm": 1.5012388229370117,
1887
- "learning_rate": 6.57475145376102e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
- "grad_norm": 1.043954849243164,
1894
- "learning_rate": 6.340273869818046e-06,
1895
- "loss": 2.5843,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
- "grad_norm": 1.0455141067504883,
1901
- "learning_rate": 6.105796285875071e-06,
1902
- "loss": 2.5248,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
- "grad_norm": 1.39467453956604,
1908
- "learning_rate": 5.871318701932095e-06,
1909
- "loss": 2.5091,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
- "eval_loss": 2.4331610202789307,
1915
- "eval_runtime": 39.923,
1916
- "eval_samples_per_second": 43.985,
1917
- "eval_steps_per_second": 43.985,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
- "grad_norm": 1.1417715549468994,
1923
- "learning_rate": 5.63684111798912e-06,
1924
- "loss": 2.5853,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
- "grad_norm": 1.133244514465332,
1930
- "learning_rate": 5.402363534046146e-06,
1931
- "loss": 2.5457,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
- "grad_norm": 1.2331452369689941,
1937
- "learning_rate": 5.1678859501031705e-06,
1938
- "loss": 2.5576,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
- "grad_norm": 1.7164263725280762,
1944
- "learning_rate": 4.933408366160195e-06,
1945
- "loss": 2.5471,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
- "eval_loss": 2.4340403079986572,
1951
- "eval_runtime": 40.3849,
1952
- "eval_samples_per_second": 43.482,
1953
- "eval_steps_per_second": 43.482,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
- "grad_norm": 1.3680106401443481,
1959
- "learning_rate": 4.69893078221722e-06,
1960
- "loss": 2.5562,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
- "grad_norm": 1.0978279113769531,
1966
- "learning_rate": 4.464453198274246e-06,
1967
- "loss": 2.5185,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
- "grad_norm": 1.2212647199630737,
1973
- "learning_rate": 4.2299756143312695e-06,
1974
- "loss": 2.6371,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
- "grad_norm": 1.6452165842056274,
1980
- "learning_rate": 3.995498030388295e-06,
1981
- "loss": 2.681,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
- "eval_loss": 2.4337143898010254,
1987
- "eval_runtime": 40.4235,
1988
- "eval_samples_per_second": 43.44,
1989
- "eval_steps_per_second": 43.44,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
- "grad_norm": 1.7757978439331055,
1995
- "learning_rate": 3.7610204464453203e-06,
1996
- "loss": 2.5746,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
- "grad_norm": 1.2373579740524292,
2002
- "learning_rate": 3.5265428625023455e-06,
2003
- "loss": 2.5412,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
- "grad_norm": 1.1407558917999268,
2009
- "learning_rate": 3.29206527855937e-06,
2010
- "loss": 2.5973,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
- "grad_norm": 2.399686813354492,
2016
- "learning_rate": 3.057587694616395e-06,
2017
- "loss": 2.5566,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
- "eval_loss": 2.4338231086730957,
2023
- "eval_runtime": 40.4877,
2024
- "eval_samples_per_second": 43.371,
2025
- "eval_steps_per_second": 43.371,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
- "grad_norm": 1.7053141593933105,
2031
- "learning_rate": 2.8231101106734197e-06,
2032
- "loss": 2.6224,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
- "grad_norm": 1.8215903043746948,
2038
- "learning_rate": 2.5886325267304445e-06,
2039
- "loss": 2.5108,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
- "grad_norm": 1.1648200750350952,
2045
- "learning_rate": 2.3541549427874697e-06,
2046
- "loss": 2.557,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
- "grad_norm": 1.5225868225097656,
2052
- "learning_rate": 2.1196773588444944e-06,
2053
- "loss": 2.6285,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
- "eval_loss": 2.4334514141082764,
2059
- "eval_runtime": 40.3985,
2060
- "eval_samples_per_second": 43.467,
2061
- "eval_steps_per_second": 43.467,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
- "grad_norm": 1.4937622547149658,
2067
- "learning_rate": 1.8851997749015194e-06,
2068
- "loss": 2.481,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
- "grad_norm": 1.9169902801513672,
2074
- "learning_rate": 1.6507221909585446e-06,
2075
- "loss": 2.5412,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
- "grad_norm": 1.6611114740371704,
2081
- "learning_rate": 1.4162446070155693e-06,
2082
- "loss": 2.5086,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
- "grad_norm": 1.3464007377624512,
2088
- "learning_rate": 1.1817670230725943e-06,
2089
- "loss": 2.6063,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
- "eval_loss": 2.4329476356506348,
2095
- "eval_runtime": 40.4334,
2096
- "eval_samples_per_second": 43.429,
2097
- "eval_steps_per_second": 43.429,
2098
  "step": 11600
2099
  }
2100
  ],
 
1
  {
2
  "best_global_step": 11600,
3
+ "best_metric": 2.4339404106140137,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11600",
5
  "epoch": 2.936886750648857,
6
  "eval_steps": 200,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4365496337413788,
15
  "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4408,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5391681790351868,
22
  "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4568,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7692704796791077,
29
  "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4686,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.9071826934814453,
36
  "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5075,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5223069190979004,
43
+ "eval_runtime": 41.5235,
44
+ "eval_samples_per_second": 42.289,
45
+ "eval_steps_per_second": 42.289,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.976381778717041,
51
  "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.3576,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.81010901927948,
58
  "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.3492,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9288440942764282,
65
  "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2229,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.110400676727295,
72
  "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1555,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.1509861946105957,
79
+ "eval_runtime": 40.9241,
80
+ "eval_samples_per_second": 42.909,
81
+ "eval_steps_per_second": 42.909,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4328745603561401,
87
  "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0152,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
+ "grad_norm": 1.094860553741455,
94
  "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0463,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5432164669036865,
101
  "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9119,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2089171409606934,
108
  "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.885,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.7955658435821533,
115
+ "eval_runtime": 41.2266,
116
+ "eval_samples_per_second": 42.594,
117
+ "eval_steps_per_second": 42.594,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0353807210922241,
123
  "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8395,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.6014362573623657,
130
  "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8229,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.0306800603866577,
137
  "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9251,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.7377468347549438,
144
  "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.816,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.6652894020080566,
151
+ "eval_runtime": 41.6497,
152
+ "eval_samples_per_second": 42.161,
153
+ "eval_steps_per_second": 42.161,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.550484299659729,
159
  "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8018,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.1680374145507812,
166
  "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8037,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4538466930389404,
173
  "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.7734,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
+ "grad_norm": 1.5268754959106445,
180
  "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7975,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
+ "eval_loss": 2.6034388542175293,
187
+ "eval_runtime": 41.1034,
188
+ "eval_samples_per_second": 42.722,
189
+ "eval_steps_per_second": 42.722,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.813955545425415,
195
  "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7258,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.664474368095398,
202
  "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7607,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.343366861343384,
209
  "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7418,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5289666652679443,
216
  "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.7779,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.571211814880371,
223
+ "eval_runtime": 41.5835,
224
+ "eval_samples_per_second": 42.228,
225
+ "eval_steps_per_second": 42.228,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.2860196828842163,
231
  "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7216,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2128461599349976,
238
  "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6822,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8949031233787537,
245
  "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.696,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.1098757982254028,
252
+ "learning_rate": 4.900112549240293e-05,
253
+ "loss": 2.7004,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.5462076663970947,
259
+ "eval_runtime": 41.8547,
260
+ "eval_samples_per_second": 41.955,
261
+ "eval_steps_per_second": 41.955,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3600550889968872,
267
+ "learning_rate": 4.8766647908459956e-05,
268
+ "loss": 2.6385,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1471352577209473,
274
+ "learning_rate": 4.853217032451698e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.666717767715454,
281
+ "learning_rate": 4.8297692740574e-05,
282
+ "loss": 2.6755,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.4194293022155762,
288
+ "learning_rate": 4.8063215156631034e-05,
289
+ "loss": 2.6852,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5299158096313477,
295
+ "eval_runtime": 41.8943,
296
+ "eval_samples_per_second": 41.915,
297
+ "eval_steps_per_second": 41.915,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
+ "grad_norm": 0.9786908626556396,
303
+ "learning_rate": 4.7828737572688055e-05,
304
+ "loss": 2.7279,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1531881093978882,
310
+ "learning_rate": 4.759425998874508e-05,
311
+ "loss": 2.6643,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1486274003982544,
317
+ "learning_rate": 4.7359782404802105e-05,
318
+ "loss": 2.6994,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2005631923675537,
324
+ "learning_rate": 4.7125304820859126e-05,
325
+ "loss": 2.7233,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.520224094390869,
331
+ "eval_runtime": 41.9149,
332
+ "eval_samples_per_second": 41.894,
333
+ "eval_steps_per_second": 41.894,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.2023993730545044,
339
+ "learning_rate": 4.6890827236916154e-05,
340
+ "loss": 2.6352,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5334537029266357,
346
+ "learning_rate": 4.665634965297318e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9828767776489258,
353
+ "learning_rate": 4.6421872069030204e-05,
354
+ "loss": 2.6507,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
+ "grad_norm": 1.5031970739364624,
360
+ "learning_rate": 4.6187394485087225e-05,
361
+ "loss": 2.6432,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
+ "eval_loss": 2.511120319366455,
367
+ "eval_runtime": 41.0032,
368
+ "eval_samples_per_second": 42.826,
369
+ "eval_steps_per_second": 42.826,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2630196809768677,
375
  "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6226,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.3230444192886353,
382
  "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6596,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.2275980710983276,
389
  "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6419,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2460874319076538,
396
  "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6432,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.5041472911834717,
403
+ "eval_runtime": 40.9893,
404
+ "eval_samples_per_second": 42.84,
405
+ "eval_steps_per_second": 42.84,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.4000052213668823,
411
  "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.681,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.139631748199463,
418
  "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6119,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.4779937267303467,
425
  "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.615,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0678008794784546,
432
  "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6014,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.4982025623321533,
439
+ "eval_runtime": 41.0253,
440
+ "eval_samples_per_second": 42.803,
441
+ "eval_steps_per_second": 42.803,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4893417358398438,
447
  "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.574,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3910084962844849,
454
  "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6452,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1891777515411377,
461
  "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6627,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6157493591308594,
468
  "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6508,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4939932823181152,
475
+ "eval_runtime": 41.1171,
476
+ "eval_samples_per_second": 42.707,
477
+ "eval_steps_per_second": 42.707,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2457774877548218,
483
  "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6282,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.1914823055267334,
490
  "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6931,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.186735987663269,
497
  "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6229,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.868569016456604,
504
  "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6312,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.487884283065796,
511
+ "eval_runtime": 41.4603,
512
+ "eval_samples_per_second": 42.354,
513
+ "eval_steps_per_second": 42.354,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.3528209924697876,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.319753646850586,
526
  "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.5599,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.338115930557251,
533
  "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6528,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.2844877243041992,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4847159385681152,
547
+ "eval_runtime": 41.1324,
548
+ "eval_samples_per_second": 42.691,
549
+ "eval_steps_per_second": 42.691,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4525926113128662,
555
  "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.622,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5551460981369019,
562
  "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6219,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.39869225025177,
569
  "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5807,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4835882186889648,
576
  "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6733,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.4816081523895264,
583
+ "eval_runtime": 40.9609,
584
+ "eval_samples_per_second": 42.87,
585
+ "eval_steps_per_second": 42.87,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2404175996780396,
591
  "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6991,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.3770995140075684,
598
  "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.5512,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1706722974777222,
605
  "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6126,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.290719985961914,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.4776341915130615,
619
+ "eval_runtime": 41.0702,
620
+ "eval_samples_per_second": 42.756,
621
+ "eval_steps_per_second": 42.756,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.32352614402771,
627
  "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5755,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4078598022460938,
634
  "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.6678,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1207985877990723,
641
  "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5674,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4133316278457642,
648
  "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5949,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.473680019378662,
655
+ "eval_runtime": 42.1618,
656
+ "eval_samples_per_second": 41.649,
657
+ "eval_steps_per_second": 41.649,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9091076254844666,
663
  "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6154,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.3824701309204102,
670
  "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6569,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.3944271802902222,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.504271388053894,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4708938598632812,
691
+ "eval_runtime": 41.7484,
692
+ "eval_samples_per_second": 42.061,
693
+ "eval_steps_per_second": 42.061,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1897855997085571,
699
  "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6979,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9344286918640137,
706
  "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6678,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
+ "grad_norm": 4.620224475860596,
713
  "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.5652,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.275289535522461,
720
  "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5655,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4711084365844727,
727
+ "eval_runtime": 40.8651,
728
+ "eval_samples_per_second": 42.971,
729
+ "eval_steps_per_second": 42.971,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.460325837135315,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.2776564359664917,
742
  "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.626,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5591661930084229,
749
+ "learning_rate": 3.6109547927218154e-05,
750
+ "loss": 2.6603,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0031243562698364,
756
+ "learning_rate": 3.587507034327519e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4686498641967773,
763
+ "eval_runtime": 41.059,
764
+ "eval_samples_per_second": 42.768,
765
+ "eval_steps_per_second": 42.768,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.662988543510437,
771
+ "learning_rate": 3.564059275933221e-05,
772
+ "loss": 2.5906,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.5336205959320068,
778
+ "learning_rate": 3.540611517538923e-05,
779
+ "loss": 2.62,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2656798362731934,
785
+ "learning_rate": 3.517163759144626e-05,
786
+ "loss": 2.6229,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.5082098245620728,
792
+ "learning_rate": 3.493716000750328e-05,
793
+ "loss": 2.6015,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.466660737991333,
799
+ "eval_runtime": 40.8118,
800
+ "eval_samples_per_second": 43.027,
801
+ "eval_steps_per_second": 43.027,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8201966285705566,
807
+ "learning_rate": 3.470268242356031e-05,
808
+ "loss": 2.6495,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3035717010498047,
814
  "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.531,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2087314128875732,
821
  "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5412,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2561825513839722,
828
  "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6465,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.4628918170928955,
835
+ "eval_runtime": 40.8309,
836
+ "eval_samples_per_second": 43.007,
837
+ "eval_steps_per_second": 43.007,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.7700440883636475,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.4953458309173584,
850
  "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.577,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3659100532531738,
857
  "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6531,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.156020998954773,
864
  "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6418,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.462512969970703,
871
+ "eval_runtime": 40.8077,
872
+ "eval_samples_per_second": 43.031,
873
+ "eval_steps_per_second": 43.031,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.7687715291976929,
879
  "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6085,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.3047523498535156,
886
  "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.6002,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
+ "grad_norm": 1.040693998336792,
893
  "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6145,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9686591029167175,
900
  "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5709,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460991621017456,
907
+ "eval_runtime": 40.9408,
908
+ "eval_samples_per_second": 42.891,
909
+ "eval_steps_per_second": 42.891,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2371070384979248,
915
  "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5449,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.422345757484436,
922
  "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6032,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.229543447494507,
929
  "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6611,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4649646282196045,
936
  "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.5963,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.455350637435913,
943
+ "eval_runtime": 40.876,
944
+ "eval_samples_per_second": 42.959,
945
+ "eval_steps_per_second": 42.959,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.2330511808395386,
951
  "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5561,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1780569553375244,
958
  "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5878,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5878489017486572,
965
  "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5788,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2362117767333984,
972
  "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.685,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4557485580444336,
979
+ "eval_runtime": 40.7838,
980
+ "eval_samples_per_second": 43.056,
981
+ "eval_steps_per_second": 43.056,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4540385007858276,
987
  "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5653,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
+ "grad_norm": 1.560059905052185,
994
  "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5448,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.153442144393921,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5042,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.250948429107666,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.575,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4553444385528564,
1015
+ "eval_runtime": 40.9798,
1016
+ "eval_samples_per_second": 42.85,
1017
+ "eval_steps_per_second": 42.85,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.6559193134307861,
1023
  "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6065,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.6024394035339355,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5194,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3071702718734741,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5348,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1332521438598633,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5913,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.454563617706299,
1051
+ "eval_runtime": 40.8474,
1052
+ "eval_samples_per_second": 42.989,
1053
+ "eval_steps_per_second": 42.989,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.260486364364624,
1059
  "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.612,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.009621500968933,
1066
  "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6078,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.3116769790649414,
1073
  "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5739,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.485499143600464,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6272,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.450514316558838,
1087
+ "eval_runtime": 40.6819,
1088
+ "eval_samples_per_second": 43.164,
1089
+ "eval_steps_per_second": 43.164,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.934110164642334,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5319,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9517920017242432,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.3010960817337036,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5811,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.7256052494049072,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6294,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4498414993286133,
1123
+ "eval_runtime": 41.0253,
1124
+ "eval_samples_per_second": 42.803,
1125
+ "eval_steps_per_second": 42.803,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6172245740890503,
1131
  "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.6309,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3149018287658691,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5658,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.6285394430160522,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.611,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.0910215377807617,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6277,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.449920892715454,
1159
+ "eval_runtime": 40.7694,
1160
+ "eval_samples_per_second": 43.071,
1161
+ "eval_steps_per_second": 43.071,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.497223138809204,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6336,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3010990619659424,
1174
  "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5497,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.4681612253189087,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3477168083190918,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5352,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.4485294818878174,
1195
+ "eval_runtime": 40.856,
1196
+ "eval_samples_per_second": 42.98,
1197
+ "eval_steps_per_second": 42.98,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2609894275665283,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5984,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.498071312904358,
1210
  "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6117,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5235400199890137,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6127,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.7103843688964844,
1224
  "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.5761,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4469785690307617,
1231
+ "eval_runtime": 40.7827,
1232
+ "eval_samples_per_second": 43.058,
1233
+ "eval_steps_per_second": 43.058,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2467267513275146,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6174,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.8229267597198486,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6364,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.1323461532592773,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5595,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.150225043296814,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6266,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.445380926132202,
1267
+ "eval_runtime": 40.7346,
1268
+ "eval_samples_per_second": 43.108,
1269
+ "eval_steps_per_second": 43.108,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.36672842502594,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6212,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.244776725769043,
1282
  "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5734,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3731275796890259,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.536,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
+ "grad_norm": 2.2051963806152344,
1296
  "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6097,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.4447412490844727,
1303
+ "eval_runtime": 40.7537,
1304
+ "eval_samples_per_second": 43.088,
1305
+ "eval_steps_per_second": 43.088,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.2323483228683472,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.555,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0700924396514893,
1318
  "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5598,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
+ "grad_norm": 2.785604238510132,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5682,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6302391290664673,
1332
  "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6002,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.4443070888519287,
1339
+ "eval_runtime": 40.7834,
1340
+ "eval_samples_per_second": 43.057,
1341
+ "eval_steps_per_second": 43.057,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.270948886871338,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0166101455688477,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5687,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.4803165197372437,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.66029953956604,
1368
+ "learning_rate": 1.993059463515288e-05,
1369
+ "loss": 2.5815,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.4443864822387695,
1375
+ "eval_runtime": 40.8884,
1376
+ "eval_samples_per_second": 42.946,
1377
+ "eval_steps_per_second": 42.946,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.5316967964172363,
1383
+ "learning_rate": 1.9696117051209907e-05,
1384
+ "loss": 2.5979,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3586021661758423,
1390
+ "learning_rate": 1.946163946726693e-05,
1391
+ "loss": 2.6304,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.293283462524414,
1397
+ "learning_rate": 1.9227161883323953e-05,
1398
+ "loss": 2.5601,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6579082012176514,
1404
+ "learning_rate": 1.899268429938098e-05,
1405
+ "loss": 2.5124,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.443239688873291,
1411
+ "eval_runtime": 41.5253,
1412
+ "eval_samples_per_second": 42.288,
1413
+ "eval_steps_per_second": 42.288,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2292983531951904,
1419
+ "learning_rate": 1.8758206715438003e-05,
1420
+ "loss": 2.5449,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
+ "grad_norm": 2.1584088802337646,
1426
+ "learning_rate": 1.852372913149503e-05,
1427
+ "loss": 2.576,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.248931646347046,
1433
+ "learning_rate": 1.8289251547552055e-05,
1434
+ "loss": 2.5218,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5526643991470337,
1440
+ "learning_rate": 1.8054773963609077e-05,
1441
+ "loss": 2.5839,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.4422366619110107,
1447
+ "eval_runtime": 41.5286,
1448
+ "eval_samples_per_second": 42.284,
1449
+ "eval_steps_per_second": 42.284,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4182465076446533,
1455
+ "learning_rate": 1.7820296379666105e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.2547794580459595,
1462
+ "learning_rate": 1.758581879572313e-05,
1463
+ "loss": 2.7643,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.093676209449768,
1469
+ "learning_rate": 1.7351341211780155e-05,
1470
+ "loss": 2.5877,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.055103302001953,
1476
+ "learning_rate": 1.711686362783718e-05,
1477
+ "loss": 2.4874,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.4419164657592773,
1483
+ "eval_runtime": 40.7627,
1484
+ "eval_samples_per_second": 43.079,
1485
+ "eval_steps_per_second": 43.079,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0890482664108276,
1491
+ "learning_rate": 1.6882386043894204e-05,
1492
+ "loss": 2.5942,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.8730581998825073,
1498
+ "learning_rate": 1.6647908459951232e-05,
1499
+ "loss": 2.592,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.6372568607330322,
1505
+ "learning_rate": 1.6413430876008254e-05,
1506
+ "loss": 2.5051,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.4793121814727783,
1512
+ "learning_rate": 1.6178953292065278e-05,
1513
+ "loss": 2.5644,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.44052791595459,
1519
+ "eval_runtime": 40.8302,
1520
+ "eval_samples_per_second": 43.007,
1521
+ "eval_steps_per_second": 43.007,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4595574140548706,
1527
+ "learning_rate": 1.5944475708122306e-05,
1528
+ "loss": 2.6267,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3399115800857544,
1534
+ "learning_rate": 1.5709998124179328e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.6734541654586792,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5633,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5579371452331543,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5467,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4398272037506104,
1555
+ "eval_runtime": 40.8947,
1556
+ "eval_samples_per_second": 42.94,
1557
+ "eval_steps_per_second": 42.94,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
+ "grad_norm": 1.932307243347168,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9798572063446045,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.5891,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8812506198883057,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5659,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5422954559326172,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5315,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.439927816390991,
1591
+ "eval_runtime": 40.7058,
1592
+ "eval_samples_per_second": 43.139,
1593
+ "eval_steps_per_second": 43.139,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.2686810493469238,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5809,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.905816674232483,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5225,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.9044383764266968,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.5301,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.2211689949035645,
1620
  "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5885,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.4387078285217285,
1627
+ "eval_runtime": 40.6013,
1628
+ "eval_samples_per_second": 43.25,
1629
+ "eval_steps_per_second": 43.25,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7181427478790283,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5422,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.714859127998352,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4957,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.473822832107544,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.6518057584762573,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5488,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.438912868499756,
1663
+ "eval_runtime": 40.9773,
1664
+ "eval_samples_per_second": 42.853,
1665
+ "eval_steps_per_second": 42.853,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0921835899353027,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5456,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.0887908935546875,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5298,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.09403133392334,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5843,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2155842781066895,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.639,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4376986026763916,
1699
+ "eval_runtime": 40.7687,
1700
+ "eval_samples_per_second": 43.072,
1701
+ "eval_steps_per_second": 43.072,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2745308876037598,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.243294358253479,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6086,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.3740507364273071,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5203,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3419544696807861,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4968,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4370713233947754,
1735
+ "eval_runtime": 40.6311,
1736
+ "eval_samples_per_second": 43.218,
1737
+ "eval_steps_per_second": 43.218,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2722185850143433,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.617,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.336860179901123,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5443,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.5844101905822754,
1757
  "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5358,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.376717209815979,
1764
  "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5065,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4364349842071533,
1771
+ "eval_runtime": 40.68,
1772
+ "eval_samples_per_second": 43.166,
1773
+ "eval_steps_per_second": 43.166,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.2268385887145996,
1779
  "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5043,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.304364800453186,
1786
  "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5213,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.662419319152832,
1793
+ "learning_rate": 8.914837741511913e-06,
1794
+ "loss": 2.6088,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.155359983444214,
1800
+ "learning_rate": 8.680360157568938e-06,
1801
+ "loss": 2.5311,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.4357810020446777,
1807
+ "eval_runtime": 40.7194,
1808
+ "eval_samples_per_second": 43.124,
1809
+ "eval_steps_per_second": 43.124,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
+ "grad_norm": 1.9857499599456787,
1815
+ "learning_rate": 8.44588257362596e-06,
1816
+ "loss": 2.5105,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.383115530014038,
1822
+ "learning_rate": 8.211404989682987e-06,
1823
+ "loss": 2.5388,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.1307530403137207,
1829
+ "learning_rate": 7.976927405740012e-06,
1830
+ "loss": 2.5422,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.7428008317947388,
1836
+ "learning_rate": 7.742449821797037e-06,
1837
+ "loss": 2.5139,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.4361467361450195,
1843
+ "eval_runtime": 40.6659,
1844
+ "eval_samples_per_second": 43.181,
1845
+ "eval_steps_per_second": 43.181,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9510554075241089,
1851
+ "learning_rate": 7.507972237854062e-06,
1852
+ "loss": 2.6383,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.658544898033142,
1858
+ "learning_rate": 7.273494653911086e-06,
1859
+ "loss": 2.5507,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.4996962547302246,
1865
+ "learning_rate": 7.039017069968111e-06,
1866
+ "loss": 2.5093,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6710158586502075,
1872
+ "learning_rate": 6.804539486025137e-06,
1873
+ "loss": 2.6495,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.435485601425171,
1879
+ "eval_runtime": 40.7734,
1880
+ "eval_samples_per_second": 43.067,
1881
+ "eval_steps_per_second": 43.067,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.3965766429901123,
1887
+ "learning_rate": 6.570061902082161e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.1116695404052734,
1894
+ "learning_rate": 6.335584318139186e-06,
1895
+ "loss": 2.584,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0421807765960693,
1901
+ "learning_rate": 6.101106734196211e-06,
1902
+ "loss": 2.5244,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.374508023262024,
1908
+ "learning_rate": 5.8666291502532365e-06,
1909
+ "loss": 2.5097,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4343748092651367,
1915
+ "eval_runtime": 40.8106,
1916
+ "eval_samples_per_second": 43.028,
1917
+ "eval_steps_per_second": 43.028,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.139459252357483,
1923
+ "learning_rate": 5.632151566310261e-06,
1924
+ "loss": 2.586,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.1283456087112427,
1930
+ "learning_rate": 5.397673982367286e-06,
1931
+ "loss": 2.5461,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2529475688934326,
1937
+ "learning_rate": 5.163196398424311e-06,
1938
+ "loss": 2.5577,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7139452695846558,
1944
+ "learning_rate": 4.928718814481336e-06,
1945
+ "loss": 2.5476,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.435107707977295,
1951
+ "eval_runtime": 40.9961,
1952
+ "eval_samples_per_second": 42.833,
1953
+ "eval_steps_per_second": 42.833,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3778159618377686,
1959
+ "learning_rate": 4.69424123053836e-06,
1960
+ "loss": 2.5561,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0923840999603271,
1966
+ "learning_rate": 4.459763646595386e-06,
1967
+ "loss": 2.5184,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2169151306152344,
1973
+ "learning_rate": 4.225286062652411e-06,
1974
+ "loss": 2.6378,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.5901875495910645,
1980
+ "learning_rate": 3.9908084787094354e-06,
1981
+ "loss": 2.6822,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.434779644012451,
1987
+ "eval_runtime": 40.6453,
1988
+ "eval_samples_per_second": 43.203,
1989
+ "eval_steps_per_second": 43.203,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7463274002075195,
1995
+ "learning_rate": 3.7563308947664606e-06,
1996
+ "loss": 2.5749,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.236441731452942,
2002
+ "learning_rate": 3.521853310823486e-06,
2003
+ "loss": 2.5416,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.132720708847046,
2009
+ "learning_rate": 3.28737572688051e-06,
2010
+ "loss": 2.597,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.339376926422119,
2016
+ "learning_rate": 3.0528981429375353e-06,
2017
+ "loss": 2.5564,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4349372386932373,
2023
+ "eval_runtime": 40.7164,
2024
+ "eval_samples_per_second": 43.128,
2025
+ "eval_steps_per_second": 43.128,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.878458857536316,
2031
+ "learning_rate": 2.81842055899456e-06,
2032
+ "loss": 2.6238,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8116399049758911,
2038
+ "learning_rate": 2.5839429750515852e-06,
2039
+ "loss": 2.5114,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.155181884765625,
2045
+ "learning_rate": 2.34946539110861e-06,
2046
+ "loss": 2.5582,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.505588173866272,
2052
+ "learning_rate": 2.1149878071656348e-06,
2053
+ "loss": 2.6288,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.434521436691284,
2059
+ "eval_runtime": 41.0956,
2060
+ "eval_samples_per_second": 42.73,
2061
+ "eval_steps_per_second": 42.73,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4831116199493408,
2067
+ "learning_rate": 1.8805102232226601e-06,
2068
+ "loss": 2.4811,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.931284785270691,
2074
+ "learning_rate": 1.646032639279685e-06,
2075
+ "loss": 2.5426,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6025974750518799,
2081
+ "learning_rate": 1.4115550553367099e-06,
2082
+ "loss": 2.509,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3426520824432373,
2088
+ "learning_rate": 1.1770774713937348e-06,
2089
+ "loss": 2.6057,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4339404106140137,
2095
+ "eval_runtime": 41.3931,
2096
+ "eval_samples_per_second": 42.423,
2097
+ "eval_steps_per_second": 42.423,
2098
  "step": 11600
2099
  }
2100
  ],
checkpoint-11600/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8421dc43c44b3cc68cec62a0d36963570aa58f934fcd2e92f9f288f7caa6d69
3
  size 5304
checkpoint-11800/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cbeee3dfc268cbdbc22f727c37945ebd4275a3ea49512bd0ee049c583c3112b
3
  size 3253104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c31c1fc80a43fba4f08f077ad1ede5d3c76eed5f71dd49542cc83bc90eb9efcd
3
  size 3253104
checkpoint-11800/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2508e6c71eefe4bf3e6d34287d87494c23ed52ffdc88e1f2b4e1870ad89b4ce2
3
  size 6548858
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6171329dc9f9a52ddf47d1afcff06a552cf2c7862c05b4ddc69008d479f164e8
3
  size 6548858
checkpoint-11800/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:573e7fd5dc49ed59f25d4becc07e4193e7f82ea7e1f2e73baca290d86a26a80a
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1253846e6ca4acd393094e5d9b5c0e97b2eb9bf1590fbc0e021d2dc8e0cad935
3
  size 988
checkpoint-11800/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:779c8896e02a376108997a6c5096158c8cea4a1d22071fa3e06e889ac960dfa8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea12dc38b50429670bad53db4a1a2b3f4a755e3269116c3289e1458e8f9428c0
3
  size 1064
checkpoint-11800/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 11800,
3
- "best_metric": 2.432849645614624,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
  "epoch": 2.9875292777109577,
6
  "eval_steps": 200,
@@ -11,2126 +11,2126 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
- "grad_norm": 0.4274106025695801,
15
  "learning_rate": 2.067510548523207e-06,
16
- "loss": 3.4405,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
- "grad_norm": 0.5292551517486572,
22
  "learning_rate": 4.177215189873418e-06,
23
- "loss": 3.4567,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
- "grad_norm": 0.7541739344596863,
29
  "learning_rate": 6.28691983122363e-06,
30
- "loss": 3.4683,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
- "grad_norm": 0.8833445906639099,
36
  "learning_rate": 8.39662447257384e-06,
37
- "loss": 3.5084,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
- "eval_loss": 3.5248923301696777,
43
- "eval_runtime": 39.9384,
44
- "eval_samples_per_second": 43.968,
45
- "eval_steps_per_second": 43.968,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
- "grad_norm": 0.9998921155929565,
51
  "learning_rate": 1.0506329113924052e-05,
52
- "loss": 3.359,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
- "grad_norm": 0.8041885495185852,
58
  "learning_rate": 1.2616033755274262e-05,
59
- "loss": 3.351,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
- "grad_norm": 0.9213416576385498,
65
  "learning_rate": 1.4725738396624473e-05,
66
- "loss": 3.2244,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
- "grad_norm": 1.0922213792800903,
72
  "learning_rate": 1.6835443037974685e-05,
73
- "loss": 3.1565,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
- "eval_loss": 3.151216983795166,
79
- "eval_runtime": 39.7515,
80
- "eval_samples_per_second": 44.174,
81
- "eval_steps_per_second": 44.174,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
- "grad_norm": 1.4199283123016357,
87
  "learning_rate": 1.8945147679324897e-05,
88
- "loss": 3.0154,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
- "grad_norm": 1.077143907546997,
94
  "learning_rate": 2.1054852320675106e-05,
95
- "loss": 3.0456,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
- "grad_norm": 1.5466052293777466,
101
  "learning_rate": 2.3164556962025318e-05,
102
- "loss": 2.9099,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
- "grad_norm": 1.2139467000961304,
108
  "learning_rate": 2.5274261603375527e-05,
109
- "loss": 2.8839,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
- "eval_loss": 2.793567657470703,
115
- "eval_runtime": 40.2573,
116
- "eval_samples_per_second": 43.619,
117
- "eval_steps_per_second": 43.619,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
- "grad_norm": 1.0270315408706665,
123
  "learning_rate": 2.738396624472574e-05,
124
- "loss": 2.8389,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
- "grad_norm": 1.5865377187728882,
130
  "learning_rate": 2.949367088607595e-05,
131
- "loss": 2.8228,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
- "grad_norm": 1.076073408126831,
137
  "learning_rate": 3.160337552742616e-05,
138
- "loss": 2.9255,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
- "grad_norm": 1.4510694742202759,
144
  "learning_rate": 3.3713080168776376e-05,
145
- "loss": 2.8165,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
- "eval_loss": 2.667301893234253,
151
- "eval_runtime": 40.0906,
152
- "eval_samples_per_second": 43.801,
153
- "eval_steps_per_second": 43.801,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
- "grad_norm": 1.5206592082977295,
159
  "learning_rate": 3.5822784810126585e-05,
160
- "loss": 2.8022,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
- "grad_norm": 1.173909068107605,
166
  "learning_rate": 3.7932489451476794e-05,
167
- "loss": 2.8034,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
- "grad_norm": 1.4551103115081787,
173
  "learning_rate": 4.004219409282701e-05,
174
- "loss": 2.774,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
- "grad_norm": 1.509749412536621,
180
  "learning_rate": 4.215189873417722e-05,
181
- "loss": 2.7978,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
- "eval_loss": 2.603116273880005,
187
- "eval_runtime": 39.9617,
188
- "eval_samples_per_second": 43.942,
189
- "eval_steps_per_second": 43.942,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
- "grad_norm": 1.745764136314392,
195
  "learning_rate": 4.426160337552743e-05,
196
- "loss": 2.7249,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
- "grad_norm": 1.6712589263916016,
202
  "learning_rate": 4.637130801687764e-05,
203
- "loss": 2.7618,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
- "grad_norm": 2.256267786026001,
209
  "learning_rate": 4.8481012658227845e-05,
210
- "loss": 2.7431,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
- "grad_norm": 1.5181586742401123,
216
  "learning_rate": 4.993434627649597e-05,
217
- "loss": 2.778,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
- "eval_loss": 2.5704379081726074,
223
- "eval_runtime": 40.1791,
224
- "eval_samples_per_second": 43.704,
225
- "eval_steps_per_second": 43.704,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
- "grad_norm": 1.1885608434677124,
231
  "learning_rate": 4.969986869255299e-05,
232
- "loss": 2.7224,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
- "grad_norm": 1.2136404514312744,
238
  "learning_rate": 4.946539110861002e-05,
239
- "loss": 2.6823,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
- "grad_norm": 0.8780732750892639,
245
  "learning_rate": 4.9230913524667046e-05,
246
- "loss": 2.6961,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
- "grad_norm": 1.0844959020614624,
252
- "learning_rate": 4.899643594072407e-05,
253
- "loss": 2.7014,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
- "eval_loss": 2.54587721824646,
259
- "eval_runtime": 39.8803,
260
- "eval_samples_per_second": 44.032,
261
- "eval_steps_per_second": 44.032,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
- "grad_norm": 1.3518335819244385,
267
- "learning_rate": 4.8761958356781096e-05,
268
- "loss": 2.6393,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
- "grad_norm": 1.1389687061309814,
274
- "learning_rate": 4.852748077283812e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
- "grad_norm": 1.6295430660247803,
281
- "learning_rate": 4.829300318889514e-05,
282
- "loss": 2.6754,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
- "grad_norm": 1.387499451637268,
288
- "learning_rate": 4.8058525604952173e-05,
289
- "loss": 2.6853,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
- "eval_loss": 2.5297553539276123,
295
- "eval_runtime": 39.722,
296
- "eval_samples_per_second": 44.207,
297
- "eval_steps_per_second": 44.207,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
- "grad_norm": 1.014020323753357,
303
- "learning_rate": 4.7824048021009195e-05,
304
- "loss": 2.7275,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
- "grad_norm": 1.1505990028381348,
310
- "learning_rate": 4.7589570437066216e-05,
311
- "loss": 2.6651,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
- "grad_norm": 1.1389458179473877,
317
- "learning_rate": 4.7355092853123244e-05,
318
- "loss": 2.6993,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
- "grad_norm": 1.2159587144851685,
324
- "learning_rate": 4.7120615269180266e-05,
325
- "loss": 2.7239,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
- "eval_loss": 2.5201079845428467,
331
- "eval_runtime": 39.8313,
332
- "eval_samples_per_second": 44.086,
333
- "eval_steps_per_second": 44.086,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
- "grad_norm": 1.1873971223831177,
339
- "learning_rate": 4.6886137685237294e-05,
340
- "loss": 2.6368,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
- "grad_norm": 1.5109103918075562,
346
- "learning_rate": 4.665166010129432e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
- "grad_norm": 1.9981125593185425,
353
- "learning_rate": 4.641718251735134e-05,
354
- "loss": 2.6513,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
- "grad_norm": 1.4879294633865356,
360
- "learning_rate": 4.6182704933408365e-05,
361
- "loss": 2.6433,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
- "eval_loss": 2.5107176303863525,
367
- "eval_runtime": 40.0643,
368
- "eval_samples_per_second": 43.83,
369
- "eval_steps_per_second": 43.83,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
- "grad_norm": 1.2832767963409424,
375
  "learning_rate": 4.5952916901144253e-05,
376
- "loss": 2.6225,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
- "grad_norm": 1.2915899753570557,
382
  "learning_rate": 4.5718439317201275e-05,
383
- "loss": 2.6592,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
- "grad_norm": 1.229929804801941,
389
  "learning_rate": 4.54839617332583e-05,
390
- "loss": 2.6411,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
- "grad_norm": 1.2569608688354492,
396
  "learning_rate": 4.524948414931533e-05,
397
- "loss": 2.6436,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
- "eval_loss": 2.504101514816284,
403
- "eval_runtime": 39.8694,
404
- "eval_samples_per_second": 44.044,
405
- "eval_steps_per_second": 44.044,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
- "grad_norm": 1.3688510656356812,
411
  "learning_rate": 4.501500656537235e-05,
412
- "loss": 2.6819,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
- "grad_norm": 1.1405905485153198,
418
  "learning_rate": 4.4780528981429374e-05,
419
- "loss": 2.6116,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
- "grad_norm": 1.453338861465454,
425
  "learning_rate": 4.45460513974864e-05,
426
- "loss": 2.6154,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
- "grad_norm": 1.0401395559310913,
432
  "learning_rate": 4.431157381354343e-05,
433
- "loss": 2.6018,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
- "eval_loss": 2.498344898223877,
439
- "eval_runtime": 39.9496,
440
- "eval_samples_per_second": 43.955,
441
- "eval_steps_per_second": 43.955,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
- "grad_norm": 1.4646718502044678,
447
  "learning_rate": 4.407709622960045e-05,
448
- "loss": 2.5734,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
- "grad_norm": 1.3828164339065552,
454
  "learning_rate": 4.384261864565748e-05,
455
- "loss": 2.6445,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
- "grad_norm": 2.1768596172332764,
461
  "learning_rate": 4.36081410617145e-05,
462
- "loss": 2.6618,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
- "grad_norm": 1.6110296249389648,
468
  "learning_rate": 4.337366347777152e-05,
469
- "loss": 2.6509,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
- "eval_loss": 2.4937028884887695,
475
- "eval_runtime": 39.8698,
476
- "eval_samples_per_second": 44.043,
477
- "eval_steps_per_second": 44.043,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
- "grad_norm": 1.2363536357879639,
483
  "learning_rate": 4.313918589382856e-05,
484
- "loss": 2.6274,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
- "grad_norm": 2.192110538482666,
490
  "learning_rate": 4.290470830988558e-05,
491
- "loss": 2.6932,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
- "grad_norm": 1.2024074792861938,
497
  "learning_rate": 4.26702307259426e-05,
498
- "loss": 2.6221,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
- "grad_norm": 1.8665797710418701,
504
  "learning_rate": 4.243575314199963e-05,
505
- "loss": 2.6313,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
- "eval_loss": 2.4876773357391357,
511
- "eval_runtime": 40.026,
512
- "eval_samples_per_second": 43.871,
513
- "eval_steps_per_second": 43.871,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
- "grad_norm": 1.4088993072509766,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
- "grad_norm": 1.3225140571594238,
526
  "learning_rate": 4.196679797411368e-05,
527
- "loss": 2.56,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
- "grad_norm": 1.3416539430618286,
533
  "learning_rate": 4.1732320390170706e-05,
534
- "loss": 2.6517,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
- "grad_norm": 1.079567790031433,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
- "eval_loss": 2.4842560291290283,
547
- "eval_runtime": 39.7988,
548
- "eval_samples_per_second": 44.122,
549
- "eval_steps_per_second": 44.122,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
- "grad_norm": 1.4532116651535034,
555
  "learning_rate": 4.126336522228475e-05,
556
- "loss": 2.6232,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
- "grad_norm": 1.5380038022994995,
562
  "learning_rate": 4.102888763834178e-05,
563
- "loss": 2.6212,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
- "grad_norm": 1.3965916633605957,
569
  "learning_rate": 4.0794410054398805e-05,
570
- "loss": 2.5804,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
- "grad_norm": 1.4798463582992554,
576
  "learning_rate": 4.0559932470455826e-05,
577
- "loss": 2.6724,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
- "eval_loss": 2.480894088745117,
583
- "eval_runtime": 39.9604,
584
- "eval_samples_per_second": 43.943,
585
- "eval_steps_per_second": 43.943,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
- "grad_norm": 1.2598360776901245,
591
  "learning_rate": 4.0325454886512854e-05,
592
- "loss": 2.6993,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
- "grad_norm": 1.366295576095581,
598
  "learning_rate": 4.0090977302569876e-05,
599
- "loss": 2.551,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
- "grad_norm": 1.1827855110168457,
605
  "learning_rate": 3.98564997186269e-05,
606
- "loss": 2.6131,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
- "grad_norm": 1.2728627920150757,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
- "eval_loss": 2.477010726928711,
619
- "eval_runtime": 40.2504,
620
- "eval_samples_per_second": 43.627,
621
- "eval_steps_per_second": 43.627,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
- "grad_norm": 1.341917634010315,
627
  "learning_rate": 3.938754455074095e-05,
628
- "loss": 2.5748,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
- "grad_norm": 1.4114609956741333,
634
  "learning_rate": 3.9153066966797975e-05,
635
- "loss": 2.667,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
- "grad_norm": 1.1211490631103516,
641
  "learning_rate": 3.8918589382855e-05,
642
- "loss": 2.5671,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
- "grad_norm": 1.4166322946548462,
648
  "learning_rate": 3.8684111798912024e-05,
649
- "loss": 2.5945,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
- "eval_loss": 2.47322940826416,
655
- "eval_runtime": 40.2079,
656
- "eval_samples_per_second": 43.673,
657
- "eval_steps_per_second": 43.673,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
- "grad_norm": 0.9144394993782043,
663
  "learning_rate": 3.844963421496905e-05,
664
- "loss": 2.6148,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
- "grad_norm": 1.4106061458587646,
670
  "learning_rate": 3.821515663102608e-05,
671
- "loss": 2.6586,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
- "grad_norm": 1.414415717124939,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
- "grad_norm": 1.5448992252349854,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
- "eval_loss": 2.4701173305511475,
691
- "eval_runtime": 40.1267,
692
- "eval_samples_per_second": 43.761,
693
- "eval_steps_per_second": 43.761,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
- "grad_norm": 1.1959314346313477,
699
  "learning_rate": 3.751172387919715e-05,
700
- "loss": 2.6975,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
- "grad_norm": 0.9525274038314819,
706
  "learning_rate": 3.727724629525417e-05,
707
- "loss": 2.6675,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
- "grad_norm": 4.733253479003906,
713
  "learning_rate": 3.70427687113112e-05,
714
- "loss": 2.566,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
- "grad_norm": 1.2803192138671875,
720
  "learning_rate": 3.680829112736823e-05,
721
- "loss": 2.5659,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
- "eval_loss": 2.4702188968658447,
727
- "eval_runtime": 40.1387,
728
- "eval_samples_per_second": 43.748,
729
- "eval_steps_per_second": 43.748,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
- "grad_norm": 1.446990966796875,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
- "grad_norm": 1.3563008308410645,
742
  "learning_rate": 3.633933595948227e-05,
743
- "loss": 2.6252,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
- "grad_norm": 1.5763463973999023,
749
- "learning_rate": 3.61048583755393e-05,
750
- "loss": 2.6593,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
- "grad_norm": 1.0055335760116577,
756
- "learning_rate": 3.587038079159633e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
- "eval_loss": 2.4676930904388428,
763
- "eval_runtime": 40.0342,
764
- "eval_samples_per_second": 43.863,
765
- "eval_steps_per_second": 43.863,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
- "grad_norm": 1.7013343572616577,
771
- "learning_rate": 3.563590320765335e-05,
772
- "loss": 2.59,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
- "grad_norm": 1.541069507598877,
778
- "learning_rate": 3.540142562371038e-05,
779
- "loss": 2.6192,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
- "grad_norm": 1.2536805868148804,
785
- "learning_rate": 3.51669480397674e-05,
786
- "loss": 2.6225,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
- "grad_norm": 1.8328826427459717,
792
- "learning_rate": 3.493247045582442e-05,
793
- "loss": 2.6022,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
- "eval_loss": 2.465629816055298,
799
- "eval_runtime": 39.8532,
800
- "eval_samples_per_second": 44.062,
801
- "eval_steps_per_second": 44.062,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
- "grad_norm": 1.8557270765304565,
807
- "learning_rate": 3.469799287188145e-05,
808
- "loss": 2.6496,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
- "grad_norm": 1.3255618810653687,
814
  "learning_rate": 3.446820483961734e-05,
815
- "loss": 2.5315,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
- "grad_norm": 1.2192399501800537,
821
  "learning_rate": 3.423372725567436e-05,
822
- "loss": 2.5409,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
- "grad_norm": 1.2533234357833862,
828
  "learning_rate": 3.399924967173139e-05,
829
- "loss": 2.6457,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
- "eval_loss": 2.462162733078003,
835
- "eval_runtime": 40.0542,
836
- "eval_samples_per_second": 43.841,
837
- "eval_steps_per_second": 43.841,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
- "grad_norm": 1.8414678573608398,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
- "grad_norm": 1.568259596824646,
850
  "learning_rate": 3.3530294503845436e-05,
851
- "loss": 2.5771,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
- "grad_norm": 1.3547483682632446,
857
  "learning_rate": 3.3295816919902464e-05,
858
- "loss": 2.6525,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
- "grad_norm": 1.1655386686325073,
864
  "learning_rate": 3.3061339335959486e-05,
865
- "loss": 2.6421,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
- "eval_loss": 2.461489200592041,
871
- "eval_runtime": 39.9962,
872
- "eval_samples_per_second": 43.904,
873
- "eval_steps_per_second": 43.904,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
- "grad_norm": 1.798033595085144,
879
  "learning_rate": 3.282686175201651e-05,
880
- "loss": 2.6091,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
- "grad_norm": 3.2964117527008057,
886
  "learning_rate": 3.2592384168073535e-05,
887
- "loss": 2.5997,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
- "grad_norm": 1.0457675457000732,
893
  "learning_rate": 3.2357906584130557e-05,
894
- "loss": 2.6144,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
- "grad_norm": 0.9728056192398071,
900
  "learning_rate": 3.2123429000187585e-05,
901
- "loss": 2.5712,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
- "eval_loss": 2.460186719894409,
907
- "eval_runtime": 39.8386,
908
- "eval_samples_per_second": 44.078,
909
- "eval_steps_per_second": 44.078,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
- "grad_norm": 1.2350194454193115,
915
  "learning_rate": 3.188895141624461e-05,
916
- "loss": 2.5448,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
- "grad_norm": 1.4210622310638428,
922
  "learning_rate": 3.1654473832301634e-05,
923
- "loss": 2.6031,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
- "grad_norm": 2.226473093032837,
929
  "learning_rate": 3.1419996248358656e-05,
930
- "loss": 2.6597,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
- "grad_norm": 2.4525105953216553,
936
  "learning_rate": 3.1185518664415684e-05,
937
- "loss": 2.596,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
- "eval_loss": 2.454537868499756,
943
- "eval_runtime": 39.805,
944
- "eval_samples_per_second": 44.115,
945
- "eval_steps_per_second": 44.115,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
- "grad_norm": 1.265309453010559,
951
  "learning_rate": 3.095104108047271e-05,
952
- "loss": 2.5559,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
- "grad_norm": 2.1364307403564453,
958
  "learning_rate": 3.071656349652973e-05,
959
- "loss": 2.5859,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
- "grad_norm": 1.5945920944213867,
965
  "learning_rate": 3.048208591258676e-05,
966
- "loss": 2.5778,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
- "grad_norm": 1.2479759454727173,
972
  "learning_rate": 3.0247608328643783e-05,
973
- "loss": 2.6846,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
- "eval_loss": 2.4547293186187744,
979
- "eval_runtime": 39.7806,
980
- "eval_samples_per_second": 44.142,
981
- "eval_steps_per_second": 44.142,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
- "grad_norm": 1.4845050573349,
987
  "learning_rate": 3.0013130744700808e-05,
988
- "loss": 2.5661,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
- "grad_norm": 1.5581985712051392,
994
  "learning_rate": 2.9778653160757836e-05,
995
- "loss": 2.5441,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
- "grad_norm": 3.1663737297058105,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
- "loss": 2.5044,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
- "grad_norm": 1.2454484701156616,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
- "loss": 2.5747,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
- "eval_loss": 2.4544529914855957,
1015
- "eval_runtime": 39.9287,
1016
- "eval_samples_per_second": 43.978,
1017
- "eval_steps_per_second": 43.978,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
- "grad_norm": 1.662784457206726,
1023
  "learning_rate": 2.907522040892891e-05,
1024
- "loss": 2.6064,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
- "grad_norm": 1.618458867073059,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
- "loss": 2.5191,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
- "grad_norm": 1.3003348112106323,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
- "loss": 2.5339,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
- "grad_norm": 1.1443992853164673,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
- "loss": 2.5914,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
- "eval_loss": 2.453752279281616,
1051
- "eval_runtime": 39.8234,
1052
- "eval_samples_per_second": 44.095,
1053
- "eval_steps_per_second": 44.095,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
- "grad_norm": 1.2574009895324707,
1059
  "learning_rate": 2.813731007315701e-05,
1060
- "loss": 2.6109,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
- "grad_norm": 1.002815842628479,
1066
  "learning_rate": 2.790283248921403e-05,
1067
- "loss": 2.6075,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
- "grad_norm": 1.306024432182312,
1073
  "learning_rate": 2.766835490527106e-05,
1074
- "loss": 2.5733,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
- "grad_norm": 2.5023701190948486,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
- "loss": 2.6274,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
- "eval_loss": 2.449617862701416,
1087
- "eval_runtime": 39.9401,
1088
- "eval_samples_per_second": 43.966,
1089
- "eval_steps_per_second": 43.966,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
- "grad_norm": 1.9410326480865479,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
- "loss": 2.5312,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
- "grad_norm": 1.9793561697006226,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
- "grad_norm": 1.290531873703003,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
- "loss": 2.5817,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
- "grad_norm": 2.11389422416687,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
- "loss": 2.6287,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
- "eval_loss": 2.4490554332733154,
1123
- "eval_runtime": 39.7474,
1124
- "eval_samples_per_second": 44.179,
1125
- "eval_steps_per_second": 44.179,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
- "grad_norm": 1.6492938995361328,
1131
  "learning_rate": 2.626148940161321e-05,
1132
- "loss": 2.631,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
- "grad_norm": 1.3233673572540283,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
- "loss": 2.5654,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
- "grad_norm": 1.688264012336731,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
- "loss": 2.6096,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
- "grad_norm": 2.064823865890503,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
- "loss": 2.6275,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
- "eval_loss": 2.4488983154296875,
1159
- "eval_runtime": 39.6847,
1160
- "eval_samples_per_second": 44.249,
1161
- "eval_steps_per_second": 44.249,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
- "grad_norm": 1.5599696636199951,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
- "loss": 2.6334,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
- "grad_norm": 1.3142633438110352,
1174
  "learning_rate": 2.508910148189833e-05,
1175
- "loss": 2.5496,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
- "grad_norm": 1.474135160446167,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
- "grad_norm": 1.3737610578536987,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
- "loss": 2.5345,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
- "eval_loss": 2.447559356689453,
1195
- "eval_runtime": 39.7021,
1196
- "eval_samples_per_second": 44.229,
1197
- "eval_steps_per_second": 44.229,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
- "grad_norm": 1.2432060241699219,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
- "loss": 2.5977,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
- "grad_norm": 1.465063452720642,
1210
  "learning_rate": 2.415119114612643e-05,
1211
- "loss": 2.6118,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
- "grad_norm": 1.5186200141906738,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
- "loss": 2.6126,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
- "grad_norm": 1.6869078874588013,
1224
  "learning_rate": 2.368223597824048e-05,
1225
- "loss": 2.576,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
- "eval_loss": 2.4459502696990967,
1231
- "eval_runtime": 39.7653,
1232
- "eval_samples_per_second": 44.159,
1233
- "eval_steps_per_second": 44.159,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
- "grad_norm": 1.2578104734420776,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
- "loss": 2.6178,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
- "grad_norm": 1.7597213983535767,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
- "loss": 2.6358,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
- "grad_norm": 2.144465923309326,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
- "loss": 2.5597,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
- "grad_norm": 1.1808464527130127,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
- "loss": 2.6269,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
- "eval_loss": 2.4444611072540283,
1267
- "eval_runtime": 40.0709,
1268
- "eval_samples_per_second": 43.822,
1269
- "eval_steps_per_second": 43.822,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
- "grad_norm": 1.4550806283950806,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
- "loss": 2.6206,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
- "grad_norm": 1.2635902166366577,
1282
  "learning_rate": 2.227537047458263e-05,
1283
- "loss": 2.5722,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
- "grad_norm": 1.3835856914520264,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
- "loss": 2.535,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
- "grad_norm": 1.735004186630249,
1296
  "learning_rate": 2.180641530669668e-05,
1297
- "loss": 2.6086,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
- "eval_loss": 2.443899154663086,
1303
- "eval_runtime": 40.91,
1304
- "eval_samples_per_second": 42.924,
1305
- "eval_steps_per_second": 42.924,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
- "grad_norm": 1.263051986694336,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
- "loss": 2.5544,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
- "grad_norm": 1.0899442434310913,
1318
  "learning_rate": 2.133746013881073e-05,
1319
- "loss": 2.5603,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
- "grad_norm": 3.038811206817627,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
- "loss": 2.5688,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
- "grad_norm": 1.6385984420776367,
1332
  "learning_rate": 2.086850497092478e-05,
1333
- "loss": 2.6006,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
- "eval_loss": 2.443300724029541,
1339
- "eval_runtime": 40.6649,
1340
- "eval_samples_per_second": 43.182,
1341
- "eval_steps_per_second": 43.182,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
- "grad_norm": 1.2857129573822021,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
- "grad_norm": 1.0289497375488281,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
- "loss": 2.5671,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
- "grad_norm": 1.5041025876998901,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
- "grad_norm": 1.6611964702606201,
1368
- "learning_rate": 1.993528418683174e-05,
1369
- "loss": 2.5801,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
- "eval_loss": 2.443532943725586,
1375
- "eval_runtime": 39.931,
1376
- "eval_samples_per_second": 43.976,
1377
- "eval_steps_per_second": 43.976,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
- "grad_norm": 1.521170735359192,
1383
- "learning_rate": 1.9700806602888767e-05,
1384
- "loss": 2.5969,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
- "grad_norm": 1.3700034618377686,
1390
- "learning_rate": 1.946632901894579e-05,
1391
- "loss": 2.6306,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
- "grad_norm": 2.311443328857422,
1397
- "learning_rate": 1.9231851435002814e-05,
1398
- "loss": 2.5608,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
- "grad_norm": 1.6699820756912231,
1404
- "learning_rate": 1.8997373851059842e-05,
1405
- "loss": 2.5113,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
- "eval_loss": 2.4421675205230713,
1411
- "eval_runtime": 40.1783,
1412
- "eval_samples_per_second": 43.705,
1413
- "eval_steps_per_second": 43.705,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
- "grad_norm": 1.2560683488845825,
1419
- "learning_rate": 1.8762896267116863e-05,
1420
- "loss": 2.545,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
- "grad_norm": 2.176563262939453,
1426
- "learning_rate": 1.852841868317389e-05,
1427
- "loss": 2.5752,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
- "grad_norm": 1.2551178932189941,
1433
- "learning_rate": 1.8293941099230916e-05,
1434
- "loss": 2.5215,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
- "grad_norm": 1.5646872520446777,
1440
- "learning_rate": 1.8059463515287937e-05,
1441
- "loss": 2.5838,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
- "eval_loss": 2.441195249557495,
1447
- "eval_runtime": 39.701,
1448
- "eval_samples_per_second": 44.231,
1449
- "eval_steps_per_second": 44.231,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
- "grad_norm": 1.4227900505065918,
1455
- "learning_rate": 1.7824985931344966e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
- "grad_norm": 1.3013832569122314,
1462
- "learning_rate": 1.759050834740199e-05,
1463
- "loss": 2.7641,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
- "grad_norm": 1.1282143592834473,
1469
- "learning_rate": 1.7356030763459015e-05,
1470
- "loss": 2.5875,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
- "grad_norm": 2.079760789871216,
1476
- "learning_rate": 1.712155317951604e-05,
1477
- "loss": 2.4861,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
- "eval_loss": 2.440812826156616,
1483
- "eval_runtime": 40.0764,
1484
- "eval_samples_per_second": 43.816,
1485
- "eval_steps_per_second": 43.816,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
- "grad_norm": 1.0884991884231567,
1491
- "learning_rate": 1.6887075595573065e-05,
1492
- "loss": 2.5941,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
- "grad_norm": 1.9202015399932861,
1498
- "learning_rate": 1.665259801163009e-05,
1499
- "loss": 2.5929,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
- "grad_norm": 1.5925830602645874,
1505
- "learning_rate": 1.6418120427687114e-05,
1506
- "loss": 2.5046,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
- "grad_norm": 1.5219184160232544,
1512
- "learning_rate": 1.618364284374414e-05,
1513
- "loss": 2.5628,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
- "eval_loss": 2.4396440982818604,
1519
- "eval_runtime": 40.0053,
1520
- "eval_samples_per_second": 43.894,
1521
- "eval_steps_per_second": 43.894,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
- "grad_norm": 1.4882445335388184,
1527
- "learning_rate": 1.5949165259801164e-05,
1528
- "loss": 2.6268,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
- "grad_norm": 1.3513301610946655,
1534
- "learning_rate": 1.571468767585819e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
- "grad_norm": 1.690974473953247,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
- "loss": 2.5631,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
- "grad_norm": 1.5311528444290161,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
- "loss": 2.5454,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
- "eval_loss": 2.4388718605041504,
1555
- "eval_runtime": 40.0289,
1556
- "eval_samples_per_second": 43.868,
1557
- "eval_steps_per_second": 43.868,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
- "grad_norm": 2.1171281337738037,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
- "grad_norm": 1.9706814289093018,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
- "loss": 2.588,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
- "grad_norm": 1.8991297483444214,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
- "loss": 2.5655,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
- "grad_norm": 1.5568820238113403,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
- "loss": 2.5312,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
- "eval_loss": 2.438715696334839,
1591
- "eval_runtime": 40.1748,
1592
- "eval_samples_per_second": 43.709,
1593
- "eval_steps_per_second": 43.709,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
- "grad_norm": 1.277051329612732,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
- "loss": 2.5818,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
- "grad_norm": 1.8890128135681152,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
- "loss": 2.5211,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
- "grad_norm": 1.8824830055236816,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
- "loss": 2.53,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
- "grad_norm": 1.239490032196045,
1620
  "learning_rate": 1.336991183642844e-05,
1621
- "loss": 2.5889,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
- "eval_loss": 2.437577962875366,
1627
- "eval_runtime": 40.1654,
1628
- "eval_samples_per_second": 43.719,
1629
- "eval_steps_per_second": 43.719,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
- "grad_norm": 1.7253328561782837,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
- "loss": 2.5426,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
- "grad_norm": 1.6971838474273682,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
- "loss": 2.4953,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
- "grad_norm": 1.4906270503997803,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
- "grad_norm": 1.658526062965393,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
- "loss": 2.5483,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
- "eval_loss": 2.437896490097046,
1663
- "eval_runtime": 40.7008,
1664
- "eval_samples_per_second": 43.144,
1665
- "eval_steps_per_second": 43.144,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
- "grad_norm": 1.0781177282333374,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
- "loss": 2.5449,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
- "grad_norm": 2.1414873600006104,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
- "loss": 2.5303,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
- "grad_norm": 2.063297986984253,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
- "loss": 2.5837,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
- "grad_norm": 1.2153489589691162,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
- "loss": 2.6384,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
- "eval_loss": 2.4365696907043457,
1699
- "eval_runtime": 40.2398,
1700
- "eval_samples_per_second": 43.638,
1701
- "eval_steps_per_second": 43.638,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
- "grad_norm": 1.2976094484329224,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
- "grad_norm": 1.2775920629501343,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
- "loss": 2.6083,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
- "grad_norm": 1.358311653137207,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
- "loss": 2.5206,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
- "grad_norm": 1.3438369035720825,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
- "loss": 2.4967,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
- "eval_loss": 2.4359662532806396,
1735
- "eval_runtime": 40.0802,
1736
- "eval_samples_per_second": 43.812,
1737
- "eval_steps_per_second": 43.812,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
- "grad_norm": 1.2618831396102905,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
- "loss": 2.6169,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
- "grad_norm": 1.3764727115631104,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
- "loss": 2.5444,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
- "grad_norm": 1.604864478111267,
1757
  "learning_rate": 9.852748077283812e-06,
1758
- "loss": 2.5343,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
- "grad_norm": 1.390496850013733,
1764
  "learning_rate": 9.618270493340837e-06,
1765
- "loss": 2.5051,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
- "eval_loss": 2.4353232383728027,
1771
- "eval_runtime": 40.1607,
1772
- "eval_samples_per_second": 43.724,
1773
- "eval_steps_per_second": 43.724,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
- "grad_norm": 2.1982169151306152,
1779
  "learning_rate": 9.383792909397862e-06,
1780
- "loss": 2.5036,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
- "grad_norm": 1.3033822774887085,
1786
  "learning_rate": 9.149315325454887e-06,
1787
- "loss": 2.5205,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
- "grad_norm": 1.682586431503296,
1793
- "learning_rate": 8.919527293190772e-06,
1794
- "loss": 2.6083,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
- "grad_norm": 3.184382200241089,
1800
- "learning_rate": 8.685049709247797e-06,
1801
- "loss": 2.5314,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
- "eval_loss": 2.434755802154541,
1807
- "eval_runtime": 40.2877,
1808
- "eval_samples_per_second": 43.587,
1809
- "eval_steps_per_second": 43.587,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
- "grad_norm": 2.0026867389678955,
1815
- "learning_rate": 8.450572125304821e-06,
1816
- "loss": 2.5109,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
- "grad_norm": 1.3833885192871094,
1822
- "learning_rate": 8.216094541361846e-06,
1823
- "loss": 2.5362,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
- "grad_norm": 2.157984495162964,
1829
- "learning_rate": 7.981616957418871e-06,
1830
- "loss": 2.5423,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
- "grad_norm": 1.682053565979004,
1836
- "learning_rate": 7.747139373475897e-06,
1837
- "loss": 2.5133,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
- "eval_loss": 2.435208559036255,
1843
- "eval_runtime": 40.4768,
1844
- "eval_samples_per_second": 43.383,
1845
- "eval_steps_per_second": 43.383,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
- "grad_norm": 1.9720139503479004,
1851
- "learning_rate": 7.512661789532921e-06,
1852
- "loss": 2.6372,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
- "grad_norm": 1.6906607151031494,
1858
- "learning_rate": 7.278184205589945e-06,
1859
- "loss": 2.5505,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
- "grad_norm": 1.484045147895813,
1865
- "learning_rate": 7.043706621646972e-06,
1866
- "loss": 2.5095,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
- "grad_norm": 1.6676850318908691,
1872
- "learning_rate": 6.8092290377039955e-06,
1873
- "loss": 2.6487,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
- "eval_loss": 2.4344091415405273,
1879
- "eval_runtime": 39.869,
1880
- "eval_samples_per_second": 44.044,
1881
- "eval_steps_per_second": 44.044,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
- "grad_norm": 1.5012388229370117,
1887
- "learning_rate": 6.57475145376102e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
- "grad_norm": 1.043954849243164,
1894
- "learning_rate": 6.340273869818046e-06,
1895
- "loss": 2.5843,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
- "grad_norm": 1.0455141067504883,
1901
- "learning_rate": 6.105796285875071e-06,
1902
- "loss": 2.5248,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
- "grad_norm": 1.39467453956604,
1908
- "learning_rate": 5.871318701932095e-06,
1909
- "loss": 2.5091,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
- "eval_loss": 2.4331610202789307,
1915
- "eval_runtime": 39.923,
1916
- "eval_samples_per_second": 43.985,
1917
- "eval_steps_per_second": 43.985,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
- "grad_norm": 1.1417715549468994,
1923
- "learning_rate": 5.63684111798912e-06,
1924
- "loss": 2.5853,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
- "grad_norm": 1.133244514465332,
1930
- "learning_rate": 5.402363534046146e-06,
1931
- "loss": 2.5457,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
- "grad_norm": 1.2331452369689941,
1937
- "learning_rate": 5.1678859501031705e-06,
1938
- "loss": 2.5576,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
- "grad_norm": 1.7164263725280762,
1944
- "learning_rate": 4.933408366160195e-06,
1945
- "loss": 2.5471,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
- "eval_loss": 2.4340403079986572,
1951
- "eval_runtime": 40.3849,
1952
- "eval_samples_per_second": 43.482,
1953
- "eval_steps_per_second": 43.482,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
- "grad_norm": 1.3680106401443481,
1959
- "learning_rate": 4.69893078221722e-06,
1960
- "loss": 2.5562,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
- "grad_norm": 1.0978279113769531,
1966
- "learning_rate": 4.464453198274246e-06,
1967
- "loss": 2.5185,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
- "grad_norm": 1.2212647199630737,
1973
- "learning_rate": 4.2299756143312695e-06,
1974
- "loss": 2.6371,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
- "grad_norm": 1.6452165842056274,
1980
- "learning_rate": 3.995498030388295e-06,
1981
- "loss": 2.681,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
- "eval_loss": 2.4337143898010254,
1987
- "eval_runtime": 40.4235,
1988
- "eval_samples_per_second": 43.44,
1989
- "eval_steps_per_second": 43.44,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
- "grad_norm": 1.7757978439331055,
1995
- "learning_rate": 3.7610204464453203e-06,
1996
- "loss": 2.5746,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
- "grad_norm": 1.2373579740524292,
2002
- "learning_rate": 3.5265428625023455e-06,
2003
- "loss": 2.5412,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
- "grad_norm": 1.1407558917999268,
2009
- "learning_rate": 3.29206527855937e-06,
2010
- "loss": 2.5973,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
- "grad_norm": 2.399686813354492,
2016
- "learning_rate": 3.057587694616395e-06,
2017
- "loss": 2.5566,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
- "eval_loss": 2.4338231086730957,
2023
- "eval_runtime": 40.4877,
2024
- "eval_samples_per_second": 43.371,
2025
- "eval_steps_per_second": 43.371,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
- "grad_norm": 1.7053141593933105,
2031
- "learning_rate": 2.8231101106734197e-06,
2032
- "loss": 2.6224,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
- "grad_norm": 1.8215903043746948,
2038
- "learning_rate": 2.5886325267304445e-06,
2039
- "loss": 2.5108,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
- "grad_norm": 1.1648200750350952,
2045
- "learning_rate": 2.3541549427874697e-06,
2046
- "loss": 2.557,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
- "grad_norm": 1.5225868225097656,
2052
- "learning_rate": 2.1196773588444944e-06,
2053
- "loss": 2.6285,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
- "eval_loss": 2.4334514141082764,
2059
- "eval_runtime": 40.3985,
2060
- "eval_samples_per_second": 43.467,
2061
- "eval_steps_per_second": 43.467,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
- "grad_norm": 1.4937622547149658,
2067
- "learning_rate": 1.8851997749015194e-06,
2068
- "loss": 2.481,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
- "grad_norm": 1.9169902801513672,
2074
- "learning_rate": 1.6507221909585446e-06,
2075
- "loss": 2.5412,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
- "grad_norm": 1.6611114740371704,
2081
- "learning_rate": 1.4162446070155693e-06,
2082
- "loss": 2.5086,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
- "grad_norm": 1.3464007377624512,
2088
- "learning_rate": 1.1817670230725943e-06,
2089
- "loss": 2.6063,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
- "eval_loss": 2.4329476356506348,
2095
- "eval_runtime": 40.4334,
2096
- "eval_samples_per_second": 43.429,
2097
- "eval_steps_per_second": 43.429,
2098
  "step": 11600
2099
  },
2100
  {
2101
  "epoch": 2.9495473824143827,
2102
- "grad_norm": 1.453385829925537,
2103
- "learning_rate": 9.472894391296193e-07,
2104
- "loss": 2.5012,
2105
  "step": 11650
2106
  },
2107
  {
2108
  "epoch": 2.9622080141799074,
2109
- "grad_norm": 1.6921356916427612,
2110
- "learning_rate": 7.128118551866442e-07,
2111
- "loss": 2.5589,
2112
  "step": 11700
2113
  },
2114
  {
2115
  "epoch": 2.9748686459454325,
2116
- "grad_norm": 1.0562982559204102,
2117
- "learning_rate": 4.783342712436691e-07,
2118
- "loss": 2.6015,
2119
  "step": 11750
2120
  },
2121
  {
2122
  "epoch": 2.9875292777109577,
2123
- "grad_norm": 1.457960844039917,
2124
- "learning_rate": 2.4385668730069406e-07,
2125
- "loss": 2.6224,
2126
  "step": 11800
2127
  },
2128
  {
2129
  "epoch": 2.9875292777109577,
2130
- "eval_loss": 2.432849645614624,
2131
- "eval_runtime": 40.4518,
2132
- "eval_samples_per_second": 43.41,
2133
- "eval_steps_per_second": 43.41,
2134
  "step": 11800
2135
  }
2136
  ],
 
1
  {
2
  "best_global_step": 11800,
3
+ "best_metric": 2.4338691234588623,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
  "epoch": 2.9875292777109577,
6
  "eval_steps": 200,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4365496337413788,
15
  "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4408,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5391681790351868,
22
  "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4568,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7692704796791077,
29
  "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4686,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.9071826934814453,
36
  "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5075,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5223069190979004,
43
+ "eval_runtime": 41.5235,
44
+ "eval_samples_per_second": 42.289,
45
+ "eval_steps_per_second": 42.289,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.976381778717041,
51
  "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.3576,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.81010901927948,
58
  "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.3492,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9288440942764282,
65
  "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2229,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.110400676727295,
72
  "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1555,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.1509861946105957,
79
+ "eval_runtime": 40.9241,
80
+ "eval_samples_per_second": 42.909,
81
+ "eval_steps_per_second": 42.909,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4328745603561401,
87
  "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0152,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
+ "grad_norm": 1.094860553741455,
94
  "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0463,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5432164669036865,
101
  "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9119,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2089171409606934,
108
  "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.885,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.7955658435821533,
115
+ "eval_runtime": 41.2266,
116
+ "eval_samples_per_second": 42.594,
117
+ "eval_steps_per_second": 42.594,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0353807210922241,
123
  "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8395,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.6014362573623657,
130
  "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8229,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.0306800603866577,
137
  "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9251,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.7377468347549438,
144
  "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.816,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.6652894020080566,
151
+ "eval_runtime": 41.6497,
152
+ "eval_samples_per_second": 42.161,
153
+ "eval_steps_per_second": 42.161,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.550484299659729,
159
  "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8018,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.1680374145507812,
166
  "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8037,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4538466930389404,
173
  "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.7734,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
+ "grad_norm": 1.5268754959106445,
180
  "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7975,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
+ "eval_loss": 2.6034388542175293,
187
+ "eval_runtime": 41.1034,
188
+ "eval_samples_per_second": 42.722,
189
+ "eval_steps_per_second": 42.722,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.813955545425415,
195
  "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7258,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.664474368095398,
202
  "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7607,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.343366861343384,
209
  "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7418,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5289666652679443,
216
  "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.7779,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.571211814880371,
223
+ "eval_runtime": 41.5835,
224
+ "eval_samples_per_second": 42.228,
225
+ "eval_steps_per_second": 42.228,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.2860196828842163,
231
  "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7216,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2128461599349976,
238
  "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6822,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8949031233787537,
245
  "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.696,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.1098757982254028,
252
+ "learning_rate": 4.900112549240293e-05,
253
+ "loss": 2.7004,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.5462076663970947,
259
+ "eval_runtime": 41.8547,
260
+ "eval_samples_per_second": 41.955,
261
+ "eval_steps_per_second": 41.955,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3600550889968872,
267
+ "learning_rate": 4.8766647908459956e-05,
268
+ "loss": 2.6385,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1471352577209473,
274
+ "learning_rate": 4.853217032451698e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.666717767715454,
281
+ "learning_rate": 4.8297692740574e-05,
282
+ "loss": 2.6755,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.4194293022155762,
288
+ "learning_rate": 4.8063215156631034e-05,
289
+ "loss": 2.6852,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5299158096313477,
295
+ "eval_runtime": 41.8943,
296
+ "eval_samples_per_second": 41.915,
297
+ "eval_steps_per_second": 41.915,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
+ "grad_norm": 0.9786908626556396,
303
+ "learning_rate": 4.7828737572688055e-05,
304
+ "loss": 2.7279,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1531881093978882,
310
+ "learning_rate": 4.759425998874508e-05,
311
+ "loss": 2.6643,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1486274003982544,
317
+ "learning_rate": 4.7359782404802105e-05,
318
+ "loss": 2.6994,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2005631923675537,
324
+ "learning_rate": 4.7125304820859126e-05,
325
+ "loss": 2.7233,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.520224094390869,
331
+ "eval_runtime": 41.9149,
332
+ "eval_samples_per_second": 41.894,
333
+ "eval_steps_per_second": 41.894,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.2023993730545044,
339
+ "learning_rate": 4.6890827236916154e-05,
340
+ "loss": 2.6352,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5334537029266357,
346
+ "learning_rate": 4.665634965297318e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9828767776489258,
353
+ "learning_rate": 4.6421872069030204e-05,
354
+ "loss": 2.6507,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
+ "grad_norm": 1.5031970739364624,
360
+ "learning_rate": 4.6187394485087225e-05,
361
+ "loss": 2.6432,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
+ "eval_loss": 2.511120319366455,
367
+ "eval_runtime": 41.0032,
368
+ "eval_samples_per_second": 42.826,
369
+ "eval_steps_per_second": 42.826,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2630196809768677,
375
  "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6226,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.3230444192886353,
382
  "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6596,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.2275980710983276,
389
  "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6419,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2460874319076538,
396
  "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6432,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.5041472911834717,
403
+ "eval_runtime": 40.9893,
404
+ "eval_samples_per_second": 42.84,
405
+ "eval_steps_per_second": 42.84,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.4000052213668823,
411
  "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.681,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.139631748199463,
418
  "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6119,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.4779937267303467,
425
  "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.615,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0678008794784546,
432
  "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6014,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.4982025623321533,
439
+ "eval_runtime": 41.0253,
440
+ "eval_samples_per_second": 42.803,
441
+ "eval_steps_per_second": 42.803,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4893417358398438,
447
  "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.574,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3910084962844849,
454
  "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6452,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1891777515411377,
461
  "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6627,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6157493591308594,
468
  "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6508,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4939932823181152,
475
+ "eval_runtime": 41.1171,
476
+ "eval_samples_per_second": 42.707,
477
+ "eval_steps_per_second": 42.707,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2457774877548218,
483
  "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6282,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.1914823055267334,
490
  "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6931,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.186735987663269,
497
  "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6229,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.868569016456604,
504
  "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6312,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.487884283065796,
511
+ "eval_runtime": 41.4603,
512
+ "eval_samples_per_second": 42.354,
513
+ "eval_steps_per_second": 42.354,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.3528209924697876,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.319753646850586,
526
  "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.5599,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.338115930557251,
533
  "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6528,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.2844877243041992,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4847159385681152,
547
+ "eval_runtime": 41.1324,
548
+ "eval_samples_per_second": 42.691,
549
+ "eval_steps_per_second": 42.691,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4525926113128662,
555
  "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.622,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5551460981369019,
562
  "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6219,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.39869225025177,
569
  "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5807,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4835882186889648,
576
  "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6733,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.4816081523895264,
583
+ "eval_runtime": 40.9609,
584
+ "eval_samples_per_second": 42.87,
585
+ "eval_steps_per_second": 42.87,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2404175996780396,
591
  "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6991,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.3770995140075684,
598
  "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.5512,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1706722974777222,
605
  "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6126,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.290719985961914,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.4776341915130615,
619
+ "eval_runtime": 41.0702,
620
+ "eval_samples_per_second": 42.756,
621
+ "eval_steps_per_second": 42.756,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.32352614402771,
627
  "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5755,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4078598022460938,
634
  "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.6678,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1207985877990723,
641
  "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5674,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4133316278457642,
648
  "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5949,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.473680019378662,
655
+ "eval_runtime": 42.1618,
656
+ "eval_samples_per_second": 41.649,
657
+ "eval_steps_per_second": 41.649,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9091076254844666,
663
  "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6154,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.3824701309204102,
670
  "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6569,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.3944271802902222,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.504271388053894,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4708938598632812,
691
+ "eval_runtime": 41.7484,
692
+ "eval_samples_per_second": 42.061,
693
+ "eval_steps_per_second": 42.061,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1897855997085571,
699
  "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6979,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9344286918640137,
706
  "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6678,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
+ "grad_norm": 4.620224475860596,
713
  "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.5652,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.275289535522461,
720
  "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5655,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4711084365844727,
727
+ "eval_runtime": 40.8651,
728
+ "eval_samples_per_second": 42.971,
729
+ "eval_steps_per_second": 42.971,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.460325837135315,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.2776564359664917,
742
  "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.626,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5591661930084229,
749
+ "learning_rate": 3.6109547927218154e-05,
750
+ "loss": 2.6603,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0031243562698364,
756
+ "learning_rate": 3.587507034327519e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4686498641967773,
763
+ "eval_runtime": 41.059,
764
+ "eval_samples_per_second": 42.768,
765
+ "eval_steps_per_second": 42.768,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.662988543510437,
771
+ "learning_rate": 3.564059275933221e-05,
772
+ "loss": 2.5906,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.5336205959320068,
778
+ "learning_rate": 3.540611517538923e-05,
779
+ "loss": 2.62,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2656798362731934,
785
+ "learning_rate": 3.517163759144626e-05,
786
+ "loss": 2.6229,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.5082098245620728,
792
+ "learning_rate": 3.493716000750328e-05,
793
+ "loss": 2.6015,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.466660737991333,
799
+ "eval_runtime": 40.8118,
800
+ "eval_samples_per_second": 43.027,
801
+ "eval_steps_per_second": 43.027,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8201966285705566,
807
+ "learning_rate": 3.470268242356031e-05,
808
+ "loss": 2.6495,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3035717010498047,
814
  "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.531,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2087314128875732,
821
  "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5412,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2561825513839722,
828
  "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6465,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.4628918170928955,
835
+ "eval_runtime": 40.8309,
836
+ "eval_samples_per_second": 43.007,
837
+ "eval_steps_per_second": 43.007,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.7700440883636475,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.4953458309173584,
850
  "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.577,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3659100532531738,
857
  "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6531,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.156020998954773,
864
  "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6418,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.462512969970703,
871
+ "eval_runtime": 40.8077,
872
+ "eval_samples_per_second": 43.031,
873
+ "eval_steps_per_second": 43.031,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.7687715291976929,
879
  "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6085,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.3047523498535156,
886
  "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.6002,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
+ "grad_norm": 1.040693998336792,
893
  "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6145,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9686591029167175,
900
  "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5709,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460991621017456,
907
+ "eval_runtime": 40.9408,
908
+ "eval_samples_per_second": 42.891,
909
+ "eval_steps_per_second": 42.891,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2371070384979248,
915
  "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5449,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.422345757484436,
922
  "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6032,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.229543447494507,
929
  "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6611,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4649646282196045,
936
  "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.5963,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.455350637435913,
943
+ "eval_runtime": 40.876,
944
+ "eval_samples_per_second": 42.959,
945
+ "eval_steps_per_second": 42.959,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.2330511808395386,
951
  "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5561,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1780569553375244,
958
  "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5878,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5878489017486572,
965
  "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5788,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2362117767333984,
972
  "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.685,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4557485580444336,
979
+ "eval_runtime": 40.7838,
980
+ "eval_samples_per_second": 43.056,
981
+ "eval_steps_per_second": 43.056,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4540385007858276,
987
  "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5653,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
+ "grad_norm": 1.560059905052185,
994
  "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5448,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.153442144393921,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5042,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.250948429107666,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.575,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4553444385528564,
1015
+ "eval_runtime": 40.9798,
1016
+ "eval_samples_per_second": 42.85,
1017
+ "eval_steps_per_second": 42.85,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.6559193134307861,
1023
  "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6065,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.6024394035339355,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5194,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3071702718734741,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5348,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1332521438598633,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5913,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.454563617706299,
1051
+ "eval_runtime": 40.8474,
1052
+ "eval_samples_per_second": 42.989,
1053
+ "eval_steps_per_second": 42.989,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.260486364364624,
1059
  "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.612,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.009621500968933,
1066
  "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6078,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.3116769790649414,
1073
  "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5739,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.485499143600464,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6272,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.450514316558838,
1087
+ "eval_runtime": 40.6819,
1088
+ "eval_samples_per_second": 43.164,
1089
+ "eval_steps_per_second": 43.164,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.934110164642334,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5319,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9517920017242432,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.3010960817337036,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5811,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.7256052494049072,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6294,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4498414993286133,
1123
+ "eval_runtime": 41.0253,
1124
+ "eval_samples_per_second": 42.803,
1125
+ "eval_steps_per_second": 42.803,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6172245740890503,
1131
  "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.6309,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3149018287658691,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5658,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.6285394430160522,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.611,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.0910215377807617,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6277,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.449920892715454,
1159
+ "eval_runtime": 40.7694,
1160
+ "eval_samples_per_second": 43.071,
1161
+ "eval_steps_per_second": 43.071,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.497223138809204,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6336,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3010990619659424,
1174
  "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5497,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.4681612253189087,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3477168083190918,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5352,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.4485294818878174,
1195
+ "eval_runtime": 40.856,
1196
+ "eval_samples_per_second": 42.98,
1197
+ "eval_steps_per_second": 42.98,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2609894275665283,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5984,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.498071312904358,
1210
  "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6117,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5235400199890137,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6127,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.7103843688964844,
1224
  "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.5761,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4469785690307617,
1231
+ "eval_runtime": 40.7827,
1232
+ "eval_samples_per_second": 43.058,
1233
+ "eval_steps_per_second": 43.058,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2467267513275146,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6174,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.8229267597198486,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6364,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.1323461532592773,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5595,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.150225043296814,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6266,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.445380926132202,
1267
+ "eval_runtime": 40.7346,
1268
+ "eval_samples_per_second": 43.108,
1269
+ "eval_steps_per_second": 43.108,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.36672842502594,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6212,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.244776725769043,
1282
  "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5734,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3731275796890259,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.536,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
+ "grad_norm": 2.2051963806152344,
1296
  "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6097,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.4447412490844727,
1303
+ "eval_runtime": 40.7537,
1304
+ "eval_samples_per_second": 43.088,
1305
+ "eval_steps_per_second": 43.088,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.2323483228683472,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.555,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0700924396514893,
1318
  "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5598,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
+ "grad_norm": 2.785604238510132,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5682,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6302391290664673,
1332
  "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6002,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.4443070888519287,
1339
+ "eval_runtime": 40.7834,
1340
+ "eval_samples_per_second": 43.057,
1341
+ "eval_steps_per_second": 43.057,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.270948886871338,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0166101455688477,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5687,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.4803165197372437,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.66029953956604,
1368
+ "learning_rate": 1.993059463515288e-05,
1369
+ "loss": 2.5815,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.4443864822387695,
1375
+ "eval_runtime": 40.8884,
1376
+ "eval_samples_per_second": 42.946,
1377
+ "eval_steps_per_second": 42.946,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.5316967964172363,
1383
+ "learning_rate": 1.9696117051209907e-05,
1384
+ "loss": 2.5979,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3586021661758423,
1390
+ "learning_rate": 1.946163946726693e-05,
1391
+ "loss": 2.6304,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.293283462524414,
1397
+ "learning_rate": 1.9227161883323953e-05,
1398
+ "loss": 2.5601,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6579082012176514,
1404
+ "learning_rate": 1.899268429938098e-05,
1405
+ "loss": 2.5124,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.443239688873291,
1411
+ "eval_runtime": 41.5253,
1412
+ "eval_samples_per_second": 42.288,
1413
+ "eval_steps_per_second": 42.288,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2292983531951904,
1419
+ "learning_rate": 1.8758206715438003e-05,
1420
+ "loss": 2.5449,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
+ "grad_norm": 2.1584088802337646,
1426
+ "learning_rate": 1.852372913149503e-05,
1427
+ "loss": 2.576,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.248931646347046,
1433
+ "learning_rate": 1.8289251547552055e-05,
1434
+ "loss": 2.5218,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5526643991470337,
1440
+ "learning_rate": 1.8054773963609077e-05,
1441
+ "loss": 2.5839,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.4422366619110107,
1447
+ "eval_runtime": 41.5286,
1448
+ "eval_samples_per_second": 42.284,
1449
+ "eval_steps_per_second": 42.284,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4182465076446533,
1455
+ "learning_rate": 1.7820296379666105e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.2547794580459595,
1462
+ "learning_rate": 1.758581879572313e-05,
1463
+ "loss": 2.7643,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.093676209449768,
1469
+ "learning_rate": 1.7351341211780155e-05,
1470
+ "loss": 2.5877,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.055103302001953,
1476
+ "learning_rate": 1.711686362783718e-05,
1477
+ "loss": 2.4874,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.4419164657592773,
1483
+ "eval_runtime": 40.7627,
1484
+ "eval_samples_per_second": 43.079,
1485
+ "eval_steps_per_second": 43.079,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0890482664108276,
1491
+ "learning_rate": 1.6882386043894204e-05,
1492
+ "loss": 2.5942,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.8730581998825073,
1498
+ "learning_rate": 1.6647908459951232e-05,
1499
+ "loss": 2.592,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.6372568607330322,
1505
+ "learning_rate": 1.6413430876008254e-05,
1506
+ "loss": 2.5051,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.4793121814727783,
1512
+ "learning_rate": 1.6178953292065278e-05,
1513
+ "loss": 2.5644,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.44052791595459,
1519
+ "eval_runtime": 40.8302,
1520
+ "eval_samples_per_second": 43.007,
1521
+ "eval_steps_per_second": 43.007,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4595574140548706,
1527
+ "learning_rate": 1.5944475708122306e-05,
1528
+ "loss": 2.6267,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3399115800857544,
1534
+ "learning_rate": 1.5709998124179328e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.6734541654586792,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5633,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5579371452331543,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5467,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4398272037506104,
1555
+ "eval_runtime": 40.8947,
1556
+ "eval_samples_per_second": 42.94,
1557
+ "eval_steps_per_second": 42.94,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
+ "grad_norm": 1.932307243347168,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9798572063446045,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.5891,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8812506198883057,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5659,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5422954559326172,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5315,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.439927816390991,
1591
+ "eval_runtime": 40.7058,
1592
+ "eval_samples_per_second": 43.139,
1593
+ "eval_steps_per_second": 43.139,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.2686810493469238,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5809,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.905816674232483,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5225,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.9044383764266968,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.5301,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.2211689949035645,
1620
  "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5885,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.4387078285217285,
1627
+ "eval_runtime": 40.6013,
1628
+ "eval_samples_per_second": 43.25,
1629
+ "eval_steps_per_second": 43.25,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7181427478790283,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5422,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.714859127998352,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4957,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.473822832107544,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.6518057584762573,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5488,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.438912868499756,
1663
+ "eval_runtime": 40.9773,
1664
+ "eval_samples_per_second": 42.853,
1665
+ "eval_steps_per_second": 42.853,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0921835899353027,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5456,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.0887908935546875,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5298,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.09403133392334,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5843,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2155842781066895,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.639,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4376986026763916,
1699
+ "eval_runtime": 40.7687,
1700
+ "eval_samples_per_second": 43.072,
1701
+ "eval_steps_per_second": 43.072,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2745308876037598,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.243294358253479,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6086,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.3740507364273071,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5203,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3419544696807861,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4968,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4370713233947754,
1735
+ "eval_runtime": 40.6311,
1736
+ "eval_samples_per_second": 43.218,
1737
+ "eval_steps_per_second": 43.218,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2722185850143433,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.617,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.336860179901123,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5443,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.5844101905822754,
1757
  "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5358,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.376717209815979,
1764
  "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5065,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4364349842071533,
1771
+ "eval_runtime": 40.68,
1772
+ "eval_samples_per_second": 43.166,
1773
+ "eval_steps_per_second": 43.166,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.2268385887145996,
1779
  "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5043,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.304364800453186,
1786
  "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5213,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.662419319152832,
1793
+ "learning_rate": 8.914837741511913e-06,
1794
+ "loss": 2.6088,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.155359983444214,
1800
+ "learning_rate": 8.680360157568938e-06,
1801
+ "loss": 2.5311,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.4357810020446777,
1807
+ "eval_runtime": 40.7194,
1808
+ "eval_samples_per_second": 43.124,
1809
+ "eval_steps_per_second": 43.124,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
+ "grad_norm": 1.9857499599456787,
1815
+ "learning_rate": 8.44588257362596e-06,
1816
+ "loss": 2.5105,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.383115530014038,
1822
+ "learning_rate": 8.211404989682987e-06,
1823
+ "loss": 2.5388,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.1307530403137207,
1829
+ "learning_rate": 7.976927405740012e-06,
1830
+ "loss": 2.5422,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.7428008317947388,
1836
+ "learning_rate": 7.742449821797037e-06,
1837
+ "loss": 2.5139,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.4361467361450195,
1843
+ "eval_runtime": 40.6659,
1844
+ "eval_samples_per_second": 43.181,
1845
+ "eval_steps_per_second": 43.181,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9510554075241089,
1851
+ "learning_rate": 7.507972237854062e-06,
1852
+ "loss": 2.6383,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.658544898033142,
1858
+ "learning_rate": 7.273494653911086e-06,
1859
+ "loss": 2.5507,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.4996962547302246,
1865
+ "learning_rate": 7.039017069968111e-06,
1866
+ "loss": 2.5093,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6710158586502075,
1872
+ "learning_rate": 6.804539486025137e-06,
1873
+ "loss": 2.6495,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.435485601425171,
1879
+ "eval_runtime": 40.7734,
1880
+ "eval_samples_per_second": 43.067,
1881
+ "eval_steps_per_second": 43.067,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.3965766429901123,
1887
+ "learning_rate": 6.570061902082161e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.1116695404052734,
1894
+ "learning_rate": 6.335584318139186e-06,
1895
+ "loss": 2.584,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0421807765960693,
1901
+ "learning_rate": 6.101106734196211e-06,
1902
+ "loss": 2.5244,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.374508023262024,
1908
+ "learning_rate": 5.8666291502532365e-06,
1909
+ "loss": 2.5097,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4343748092651367,
1915
+ "eval_runtime": 40.8106,
1916
+ "eval_samples_per_second": 43.028,
1917
+ "eval_steps_per_second": 43.028,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.139459252357483,
1923
+ "learning_rate": 5.632151566310261e-06,
1924
+ "loss": 2.586,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.1283456087112427,
1930
+ "learning_rate": 5.397673982367286e-06,
1931
+ "loss": 2.5461,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2529475688934326,
1937
+ "learning_rate": 5.163196398424311e-06,
1938
+ "loss": 2.5577,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7139452695846558,
1944
+ "learning_rate": 4.928718814481336e-06,
1945
+ "loss": 2.5476,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.435107707977295,
1951
+ "eval_runtime": 40.9961,
1952
+ "eval_samples_per_second": 42.833,
1953
+ "eval_steps_per_second": 42.833,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3778159618377686,
1959
+ "learning_rate": 4.69424123053836e-06,
1960
+ "loss": 2.5561,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0923840999603271,
1966
+ "learning_rate": 4.459763646595386e-06,
1967
+ "loss": 2.5184,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2169151306152344,
1973
+ "learning_rate": 4.225286062652411e-06,
1974
+ "loss": 2.6378,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.5901875495910645,
1980
+ "learning_rate": 3.9908084787094354e-06,
1981
+ "loss": 2.6822,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.434779644012451,
1987
+ "eval_runtime": 40.6453,
1988
+ "eval_samples_per_second": 43.203,
1989
+ "eval_steps_per_second": 43.203,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7463274002075195,
1995
+ "learning_rate": 3.7563308947664606e-06,
1996
+ "loss": 2.5749,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.236441731452942,
2002
+ "learning_rate": 3.521853310823486e-06,
2003
+ "loss": 2.5416,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.132720708847046,
2009
+ "learning_rate": 3.28737572688051e-06,
2010
+ "loss": 2.597,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.339376926422119,
2016
+ "learning_rate": 3.0528981429375353e-06,
2017
+ "loss": 2.5564,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4349372386932373,
2023
+ "eval_runtime": 40.7164,
2024
+ "eval_samples_per_second": 43.128,
2025
+ "eval_steps_per_second": 43.128,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.878458857536316,
2031
+ "learning_rate": 2.81842055899456e-06,
2032
+ "loss": 2.6238,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8116399049758911,
2038
+ "learning_rate": 2.5839429750515852e-06,
2039
+ "loss": 2.5114,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.155181884765625,
2045
+ "learning_rate": 2.34946539110861e-06,
2046
+ "loss": 2.5582,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.505588173866272,
2052
+ "learning_rate": 2.1149878071656348e-06,
2053
+ "loss": 2.6288,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.434521436691284,
2059
+ "eval_runtime": 41.0956,
2060
+ "eval_samples_per_second": 42.73,
2061
+ "eval_steps_per_second": 42.73,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4831116199493408,
2067
+ "learning_rate": 1.8805102232226601e-06,
2068
+ "loss": 2.4811,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.931284785270691,
2074
+ "learning_rate": 1.646032639279685e-06,
2075
+ "loss": 2.5426,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6025974750518799,
2081
+ "learning_rate": 1.4115550553367099e-06,
2082
+ "loss": 2.509,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3426520824432373,
2088
+ "learning_rate": 1.1770774713937348e-06,
2089
+ "loss": 2.6057,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4339404106140137,
2095
+ "eval_runtime": 41.3931,
2096
+ "eval_samples_per_second": 42.423,
2097
+ "eval_steps_per_second": 42.423,
2098
  "step": 11600
2099
  },
2100
  {
2101
  "epoch": 2.9495473824143827,
2102
+ "grad_norm": 1.4393337965011597,
2103
+ "learning_rate": 9.425998874507597e-07,
2104
+ "loss": 2.5014,
2105
  "step": 11650
2106
  },
2107
  {
2108
  "epoch": 2.9622080141799074,
2109
+ "grad_norm": 1.692421317100525,
2110
+ "learning_rate": 7.081223035077847e-07,
2111
+ "loss": 2.5586,
2112
  "step": 11700
2113
  },
2114
  {
2115
  "epoch": 2.9748686459454325,
2116
+ "grad_norm": 1.0518131256103516,
2117
+ "learning_rate": 4.7364471956480963e-07,
2118
+ "loss": 2.602,
2119
  "step": 11750
2120
  },
2121
  {
2122
  "epoch": 2.9875292777109577,
2123
+ "grad_norm": 1.4363523721694946,
2124
+ "learning_rate": 2.3916713562183455e-07,
2125
+ "loss": 2.6237,
2126
  "step": 11800
2127
  },
2128
  {
2129
  "epoch": 2.9875292777109577,
2130
+ "eval_loss": 2.4338691234588623,
2131
+ "eval_runtime": 41.308,
2132
+ "eval_samples_per_second": 42.51,
2133
+ "eval_steps_per_second": 42.51,
2134
  "step": 11800
2135
  }
2136
  ],
checkpoint-11800/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8421dc43c44b3cc68cec62a0d36963570aa58f934fcd2e92f9f288f7caa6d69
3
  size 5304
checkpoint-11847/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2096525bcf5c5b06858aea39ffe9f1d86e5e5f698c630fe8a7b6968326c4147d
3
  size 3253104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa2f1df60554ee685cde47575ed40f6a8297745be6f1056960fb1d70e9a86729
3
  size 3253104
checkpoint-11847/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d317fa4b44b54d0a2990dc5b855d3892cbb408e7299889f7dfde8d83f90dec55
3
  size 6548858
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d331ba0a6b9afa0a81d2bdaa8396465259a7d849ce79f88209d45f71dce1e32d
3
  size 6548858
checkpoint-11847/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ace690d17ecc0bd723919a7caf76c676aa6af99ac2aad28a489002b232ed59bd
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34a3be3a0c0b571fefcced99447d767044a403734db5c8947ba5dc5655d95959
3
  size 988
checkpoint-11847/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31739f88076d66774095d513dd74fc5f8222fb41697cf1f0e9e3ad7cd14e52b4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb45fc26f3d2b8d576e1bce346ec7cdad773d5657c24e11a5e0c16bd12c33a15
3
  size 1064
checkpoint-11847/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 11800,
3
- "best_metric": 2.432849645614624,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
  "epoch": 2.999430271570551,
6
  "eval_steps": 200,
@@ -11,2126 +11,2126 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
- "grad_norm": 0.4274106025695801,
15
  "learning_rate": 2.067510548523207e-06,
16
- "loss": 3.4405,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
- "grad_norm": 0.5292551517486572,
22
  "learning_rate": 4.177215189873418e-06,
23
- "loss": 3.4567,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
- "grad_norm": 0.7541739344596863,
29
  "learning_rate": 6.28691983122363e-06,
30
- "loss": 3.4683,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
- "grad_norm": 0.8833445906639099,
36
  "learning_rate": 8.39662447257384e-06,
37
- "loss": 3.5084,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
- "eval_loss": 3.5248923301696777,
43
- "eval_runtime": 39.9384,
44
- "eval_samples_per_second": 43.968,
45
- "eval_steps_per_second": 43.968,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
- "grad_norm": 0.9998921155929565,
51
  "learning_rate": 1.0506329113924052e-05,
52
- "loss": 3.359,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
- "grad_norm": 0.8041885495185852,
58
  "learning_rate": 1.2616033755274262e-05,
59
- "loss": 3.351,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
- "grad_norm": 0.9213416576385498,
65
  "learning_rate": 1.4725738396624473e-05,
66
- "loss": 3.2244,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
- "grad_norm": 1.0922213792800903,
72
  "learning_rate": 1.6835443037974685e-05,
73
- "loss": 3.1565,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
- "eval_loss": 3.151216983795166,
79
- "eval_runtime": 39.7515,
80
- "eval_samples_per_second": 44.174,
81
- "eval_steps_per_second": 44.174,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
- "grad_norm": 1.4199283123016357,
87
  "learning_rate": 1.8945147679324897e-05,
88
- "loss": 3.0154,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
- "grad_norm": 1.077143907546997,
94
  "learning_rate": 2.1054852320675106e-05,
95
- "loss": 3.0456,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
- "grad_norm": 1.5466052293777466,
101
  "learning_rate": 2.3164556962025318e-05,
102
- "loss": 2.9099,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
- "grad_norm": 1.2139467000961304,
108
  "learning_rate": 2.5274261603375527e-05,
109
- "loss": 2.8839,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
- "eval_loss": 2.793567657470703,
115
- "eval_runtime": 40.2573,
116
- "eval_samples_per_second": 43.619,
117
- "eval_steps_per_second": 43.619,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
- "grad_norm": 1.0270315408706665,
123
  "learning_rate": 2.738396624472574e-05,
124
- "loss": 2.8389,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
- "grad_norm": 1.5865377187728882,
130
  "learning_rate": 2.949367088607595e-05,
131
- "loss": 2.8228,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
- "grad_norm": 1.076073408126831,
137
  "learning_rate": 3.160337552742616e-05,
138
- "loss": 2.9255,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
- "grad_norm": 1.4510694742202759,
144
  "learning_rate": 3.3713080168776376e-05,
145
- "loss": 2.8165,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
- "eval_loss": 2.667301893234253,
151
- "eval_runtime": 40.0906,
152
- "eval_samples_per_second": 43.801,
153
- "eval_steps_per_second": 43.801,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
- "grad_norm": 1.5206592082977295,
159
  "learning_rate": 3.5822784810126585e-05,
160
- "loss": 2.8022,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
- "grad_norm": 1.173909068107605,
166
  "learning_rate": 3.7932489451476794e-05,
167
- "loss": 2.8034,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
- "grad_norm": 1.4551103115081787,
173
  "learning_rate": 4.004219409282701e-05,
174
- "loss": 2.774,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
- "grad_norm": 1.509749412536621,
180
  "learning_rate": 4.215189873417722e-05,
181
- "loss": 2.7978,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
- "eval_loss": 2.603116273880005,
187
- "eval_runtime": 39.9617,
188
- "eval_samples_per_second": 43.942,
189
- "eval_steps_per_second": 43.942,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
- "grad_norm": 1.745764136314392,
195
  "learning_rate": 4.426160337552743e-05,
196
- "loss": 2.7249,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
- "grad_norm": 1.6712589263916016,
202
  "learning_rate": 4.637130801687764e-05,
203
- "loss": 2.7618,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
- "grad_norm": 2.256267786026001,
209
  "learning_rate": 4.8481012658227845e-05,
210
- "loss": 2.7431,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
- "grad_norm": 1.5181586742401123,
216
  "learning_rate": 4.993434627649597e-05,
217
- "loss": 2.778,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
- "eval_loss": 2.5704379081726074,
223
- "eval_runtime": 40.1791,
224
- "eval_samples_per_second": 43.704,
225
- "eval_steps_per_second": 43.704,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
- "grad_norm": 1.1885608434677124,
231
  "learning_rate": 4.969986869255299e-05,
232
- "loss": 2.7224,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
- "grad_norm": 1.2136404514312744,
238
  "learning_rate": 4.946539110861002e-05,
239
- "loss": 2.6823,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
- "grad_norm": 0.8780732750892639,
245
  "learning_rate": 4.9230913524667046e-05,
246
- "loss": 2.6961,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
- "grad_norm": 1.0844959020614624,
252
- "learning_rate": 4.899643594072407e-05,
253
- "loss": 2.7014,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
- "eval_loss": 2.54587721824646,
259
- "eval_runtime": 39.8803,
260
- "eval_samples_per_second": 44.032,
261
- "eval_steps_per_second": 44.032,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
- "grad_norm": 1.3518335819244385,
267
- "learning_rate": 4.8761958356781096e-05,
268
- "loss": 2.6393,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
- "grad_norm": 1.1389687061309814,
274
- "learning_rate": 4.852748077283812e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
- "grad_norm": 1.6295430660247803,
281
- "learning_rate": 4.829300318889514e-05,
282
- "loss": 2.6754,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
- "grad_norm": 1.387499451637268,
288
- "learning_rate": 4.8058525604952173e-05,
289
- "loss": 2.6853,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
- "eval_loss": 2.5297553539276123,
295
- "eval_runtime": 39.722,
296
- "eval_samples_per_second": 44.207,
297
- "eval_steps_per_second": 44.207,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
- "grad_norm": 1.014020323753357,
303
- "learning_rate": 4.7824048021009195e-05,
304
- "loss": 2.7275,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
- "grad_norm": 1.1505990028381348,
310
- "learning_rate": 4.7589570437066216e-05,
311
- "loss": 2.6651,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
- "grad_norm": 1.1389458179473877,
317
- "learning_rate": 4.7355092853123244e-05,
318
- "loss": 2.6993,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
- "grad_norm": 1.2159587144851685,
324
- "learning_rate": 4.7120615269180266e-05,
325
- "loss": 2.7239,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
- "eval_loss": 2.5201079845428467,
331
- "eval_runtime": 39.8313,
332
- "eval_samples_per_second": 44.086,
333
- "eval_steps_per_second": 44.086,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
- "grad_norm": 1.1873971223831177,
339
- "learning_rate": 4.6886137685237294e-05,
340
- "loss": 2.6368,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
- "grad_norm": 1.5109103918075562,
346
- "learning_rate": 4.665166010129432e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
- "grad_norm": 1.9981125593185425,
353
- "learning_rate": 4.641718251735134e-05,
354
- "loss": 2.6513,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
- "grad_norm": 1.4879294633865356,
360
- "learning_rate": 4.6182704933408365e-05,
361
- "loss": 2.6433,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
- "eval_loss": 2.5107176303863525,
367
- "eval_runtime": 40.0643,
368
- "eval_samples_per_second": 43.83,
369
- "eval_steps_per_second": 43.83,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
- "grad_norm": 1.2832767963409424,
375
  "learning_rate": 4.5952916901144253e-05,
376
- "loss": 2.6225,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
- "grad_norm": 1.2915899753570557,
382
  "learning_rate": 4.5718439317201275e-05,
383
- "loss": 2.6592,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
- "grad_norm": 1.229929804801941,
389
  "learning_rate": 4.54839617332583e-05,
390
- "loss": 2.6411,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
- "grad_norm": 1.2569608688354492,
396
  "learning_rate": 4.524948414931533e-05,
397
- "loss": 2.6436,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
- "eval_loss": 2.504101514816284,
403
- "eval_runtime": 39.8694,
404
- "eval_samples_per_second": 44.044,
405
- "eval_steps_per_second": 44.044,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
- "grad_norm": 1.3688510656356812,
411
  "learning_rate": 4.501500656537235e-05,
412
- "loss": 2.6819,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
- "grad_norm": 1.1405905485153198,
418
  "learning_rate": 4.4780528981429374e-05,
419
- "loss": 2.6116,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
- "grad_norm": 1.453338861465454,
425
  "learning_rate": 4.45460513974864e-05,
426
- "loss": 2.6154,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
- "grad_norm": 1.0401395559310913,
432
  "learning_rate": 4.431157381354343e-05,
433
- "loss": 2.6018,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
- "eval_loss": 2.498344898223877,
439
- "eval_runtime": 39.9496,
440
- "eval_samples_per_second": 43.955,
441
- "eval_steps_per_second": 43.955,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
- "grad_norm": 1.4646718502044678,
447
  "learning_rate": 4.407709622960045e-05,
448
- "loss": 2.5734,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
- "grad_norm": 1.3828164339065552,
454
  "learning_rate": 4.384261864565748e-05,
455
- "loss": 2.6445,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
- "grad_norm": 2.1768596172332764,
461
  "learning_rate": 4.36081410617145e-05,
462
- "loss": 2.6618,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
- "grad_norm": 1.6110296249389648,
468
  "learning_rate": 4.337366347777152e-05,
469
- "loss": 2.6509,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
- "eval_loss": 2.4937028884887695,
475
- "eval_runtime": 39.8698,
476
- "eval_samples_per_second": 44.043,
477
- "eval_steps_per_second": 44.043,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
- "grad_norm": 1.2363536357879639,
483
  "learning_rate": 4.313918589382856e-05,
484
- "loss": 2.6274,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
- "grad_norm": 2.192110538482666,
490
  "learning_rate": 4.290470830988558e-05,
491
- "loss": 2.6932,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
- "grad_norm": 1.2024074792861938,
497
  "learning_rate": 4.26702307259426e-05,
498
- "loss": 2.6221,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
- "grad_norm": 1.8665797710418701,
504
  "learning_rate": 4.243575314199963e-05,
505
- "loss": 2.6313,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
- "eval_loss": 2.4876773357391357,
511
- "eval_runtime": 40.026,
512
- "eval_samples_per_second": 43.871,
513
- "eval_steps_per_second": 43.871,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
- "grad_norm": 1.4088993072509766,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
- "grad_norm": 1.3225140571594238,
526
  "learning_rate": 4.196679797411368e-05,
527
- "loss": 2.56,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
- "grad_norm": 1.3416539430618286,
533
  "learning_rate": 4.1732320390170706e-05,
534
- "loss": 2.6517,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
- "grad_norm": 1.079567790031433,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
- "eval_loss": 2.4842560291290283,
547
- "eval_runtime": 39.7988,
548
- "eval_samples_per_second": 44.122,
549
- "eval_steps_per_second": 44.122,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
- "grad_norm": 1.4532116651535034,
555
  "learning_rate": 4.126336522228475e-05,
556
- "loss": 2.6232,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
- "grad_norm": 1.5380038022994995,
562
  "learning_rate": 4.102888763834178e-05,
563
- "loss": 2.6212,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
- "grad_norm": 1.3965916633605957,
569
  "learning_rate": 4.0794410054398805e-05,
570
- "loss": 2.5804,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
- "grad_norm": 1.4798463582992554,
576
  "learning_rate": 4.0559932470455826e-05,
577
- "loss": 2.6724,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
- "eval_loss": 2.480894088745117,
583
- "eval_runtime": 39.9604,
584
- "eval_samples_per_second": 43.943,
585
- "eval_steps_per_second": 43.943,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
- "grad_norm": 1.2598360776901245,
591
  "learning_rate": 4.0325454886512854e-05,
592
- "loss": 2.6993,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
- "grad_norm": 1.366295576095581,
598
  "learning_rate": 4.0090977302569876e-05,
599
- "loss": 2.551,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
- "grad_norm": 1.1827855110168457,
605
  "learning_rate": 3.98564997186269e-05,
606
- "loss": 2.6131,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
- "grad_norm": 1.2728627920150757,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
- "eval_loss": 2.477010726928711,
619
- "eval_runtime": 40.2504,
620
- "eval_samples_per_second": 43.627,
621
- "eval_steps_per_second": 43.627,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
- "grad_norm": 1.341917634010315,
627
  "learning_rate": 3.938754455074095e-05,
628
- "loss": 2.5748,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
- "grad_norm": 1.4114609956741333,
634
  "learning_rate": 3.9153066966797975e-05,
635
- "loss": 2.667,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
- "grad_norm": 1.1211490631103516,
641
  "learning_rate": 3.8918589382855e-05,
642
- "loss": 2.5671,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
- "grad_norm": 1.4166322946548462,
648
  "learning_rate": 3.8684111798912024e-05,
649
- "loss": 2.5945,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
- "eval_loss": 2.47322940826416,
655
- "eval_runtime": 40.2079,
656
- "eval_samples_per_second": 43.673,
657
- "eval_steps_per_second": 43.673,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
- "grad_norm": 0.9144394993782043,
663
  "learning_rate": 3.844963421496905e-05,
664
- "loss": 2.6148,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
- "grad_norm": 1.4106061458587646,
670
  "learning_rate": 3.821515663102608e-05,
671
- "loss": 2.6586,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
- "grad_norm": 1.414415717124939,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
- "grad_norm": 1.5448992252349854,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
- "eval_loss": 2.4701173305511475,
691
- "eval_runtime": 40.1267,
692
- "eval_samples_per_second": 43.761,
693
- "eval_steps_per_second": 43.761,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
- "grad_norm": 1.1959314346313477,
699
  "learning_rate": 3.751172387919715e-05,
700
- "loss": 2.6975,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
- "grad_norm": 0.9525274038314819,
706
  "learning_rate": 3.727724629525417e-05,
707
- "loss": 2.6675,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
- "grad_norm": 4.733253479003906,
713
  "learning_rate": 3.70427687113112e-05,
714
- "loss": 2.566,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
- "grad_norm": 1.2803192138671875,
720
  "learning_rate": 3.680829112736823e-05,
721
- "loss": 2.5659,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
- "eval_loss": 2.4702188968658447,
727
- "eval_runtime": 40.1387,
728
- "eval_samples_per_second": 43.748,
729
- "eval_steps_per_second": 43.748,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
- "grad_norm": 1.446990966796875,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
- "grad_norm": 1.3563008308410645,
742
  "learning_rate": 3.633933595948227e-05,
743
- "loss": 2.6252,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
- "grad_norm": 1.5763463973999023,
749
- "learning_rate": 3.61048583755393e-05,
750
- "loss": 2.6593,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
- "grad_norm": 1.0055335760116577,
756
- "learning_rate": 3.587038079159633e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
- "eval_loss": 2.4676930904388428,
763
- "eval_runtime": 40.0342,
764
- "eval_samples_per_second": 43.863,
765
- "eval_steps_per_second": 43.863,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
- "grad_norm": 1.7013343572616577,
771
- "learning_rate": 3.563590320765335e-05,
772
- "loss": 2.59,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
- "grad_norm": 1.541069507598877,
778
- "learning_rate": 3.540142562371038e-05,
779
- "loss": 2.6192,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
- "grad_norm": 1.2536805868148804,
785
- "learning_rate": 3.51669480397674e-05,
786
- "loss": 2.6225,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
- "grad_norm": 1.8328826427459717,
792
- "learning_rate": 3.493247045582442e-05,
793
- "loss": 2.6022,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
- "eval_loss": 2.465629816055298,
799
- "eval_runtime": 39.8532,
800
- "eval_samples_per_second": 44.062,
801
- "eval_steps_per_second": 44.062,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
- "grad_norm": 1.8557270765304565,
807
- "learning_rate": 3.469799287188145e-05,
808
- "loss": 2.6496,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
- "grad_norm": 1.3255618810653687,
814
  "learning_rate": 3.446820483961734e-05,
815
- "loss": 2.5315,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
- "grad_norm": 1.2192399501800537,
821
  "learning_rate": 3.423372725567436e-05,
822
- "loss": 2.5409,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
- "grad_norm": 1.2533234357833862,
828
  "learning_rate": 3.399924967173139e-05,
829
- "loss": 2.6457,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
- "eval_loss": 2.462162733078003,
835
- "eval_runtime": 40.0542,
836
- "eval_samples_per_second": 43.841,
837
- "eval_steps_per_second": 43.841,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
- "grad_norm": 1.8414678573608398,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
- "grad_norm": 1.568259596824646,
850
  "learning_rate": 3.3530294503845436e-05,
851
- "loss": 2.5771,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
- "grad_norm": 1.3547483682632446,
857
  "learning_rate": 3.3295816919902464e-05,
858
- "loss": 2.6525,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
- "grad_norm": 1.1655386686325073,
864
  "learning_rate": 3.3061339335959486e-05,
865
- "loss": 2.6421,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
- "eval_loss": 2.461489200592041,
871
- "eval_runtime": 39.9962,
872
- "eval_samples_per_second": 43.904,
873
- "eval_steps_per_second": 43.904,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
- "grad_norm": 1.798033595085144,
879
  "learning_rate": 3.282686175201651e-05,
880
- "loss": 2.6091,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
- "grad_norm": 3.2964117527008057,
886
  "learning_rate": 3.2592384168073535e-05,
887
- "loss": 2.5997,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
- "grad_norm": 1.0457675457000732,
893
  "learning_rate": 3.2357906584130557e-05,
894
- "loss": 2.6144,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
- "grad_norm": 0.9728056192398071,
900
  "learning_rate": 3.2123429000187585e-05,
901
- "loss": 2.5712,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
- "eval_loss": 2.460186719894409,
907
- "eval_runtime": 39.8386,
908
- "eval_samples_per_second": 44.078,
909
- "eval_steps_per_second": 44.078,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
- "grad_norm": 1.2350194454193115,
915
  "learning_rate": 3.188895141624461e-05,
916
- "loss": 2.5448,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
- "grad_norm": 1.4210622310638428,
922
  "learning_rate": 3.1654473832301634e-05,
923
- "loss": 2.6031,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
- "grad_norm": 2.226473093032837,
929
  "learning_rate": 3.1419996248358656e-05,
930
- "loss": 2.6597,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
- "grad_norm": 2.4525105953216553,
936
  "learning_rate": 3.1185518664415684e-05,
937
- "loss": 2.596,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
- "eval_loss": 2.454537868499756,
943
- "eval_runtime": 39.805,
944
- "eval_samples_per_second": 44.115,
945
- "eval_steps_per_second": 44.115,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
- "grad_norm": 1.265309453010559,
951
  "learning_rate": 3.095104108047271e-05,
952
- "loss": 2.5559,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
- "grad_norm": 2.1364307403564453,
958
  "learning_rate": 3.071656349652973e-05,
959
- "loss": 2.5859,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
- "grad_norm": 1.5945920944213867,
965
  "learning_rate": 3.048208591258676e-05,
966
- "loss": 2.5778,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
- "grad_norm": 1.2479759454727173,
972
  "learning_rate": 3.0247608328643783e-05,
973
- "loss": 2.6846,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
- "eval_loss": 2.4547293186187744,
979
- "eval_runtime": 39.7806,
980
- "eval_samples_per_second": 44.142,
981
- "eval_steps_per_second": 44.142,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
- "grad_norm": 1.4845050573349,
987
  "learning_rate": 3.0013130744700808e-05,
988
- "loss": 2.5661,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
- "grad_norm": 1.5581985712051392,
994
  "learning_rate": 2.9778653160757836e-05,
995
- "loss": 2.5441,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
- "grad_norm": 3.1663737297058105,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
- "loss": 2.5044,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
- "grad_norm": 1.2454484701156616,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
- "loss": 2.5747,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
- "eval_loss": 2.4544529914855957,
1015
- "eval_runtime": 39.9287,
1016
- "eval_samples_per_second": 43.978,
1017
- "eval_steps_per_second": 43.978,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
- "grad_norm": 1.662784457206726,
1023
  "learning_rate": 2.907522040892891e-05,
1024
- "loss": 2.6064,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
- "grad_norm": 1.618458867073059,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
- "loss": 2.5191,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
- "grad_norm": 1.3003348112106323,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
- "loss": 2.5339,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
- "grad_norm": 1.1443992853164673,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
- "loss": 2.5914,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
- "eval_loss": 2.453752279281616,
1051
- "eval_runtime": 39.8234,
1052
- "eval_samples_per_second": 44.095,
1053
- "eval_steps_per_second": 44.095,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
- "grad_norm": 1.2574009895324707,
1059
  "learning_rate": 2.813731007315701e-05,
1060
- "loss": 2.6109,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
- "grad_norm": 1.002815842628479,
1066
  "learning_rate": 2.790283248921403e-05,
1067
- "loss": 2.6075,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
- "grad_norm": 1.306024432182312,
1073
  "learning_rate": 2.766835490527106e-05,
1074
- "loss": 2.5733,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
- "grad_norm": 2.5023701190948486,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
- "loss": 2.6274,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
- "eval_loss": 2.449617862701416,
1087
- "eval_runtime": 39.9401,
1088
- "eval_samples_per_second": 43.966,
1089
- "eval_steps_per_second": 43.966,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
- "grad_norm": 1.9410326480865479,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
- "loss": 2.5312,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
- "grad_norm": 1.9793561697006226,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
- "grad_norm": 1.290531873703003,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
- "loss": 2.5817,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
- "grad_norm": 2.11389422416687,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
- "loss": 2.6287,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
- "eval_loss": 2.4490554332733154,
1123
- "eval_runtime": 39.7474,
1124
- "eval_samples_per_second": 44.179,
1125
- "eval_steps_per_second": 44.179,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
- "grad_norm": 1.6492938995361328,
1131
  "learning_rate": 2.626148940161321e-05,
1132
- "loss": 2.631,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
- "grad_norm": 1.3233673572540283,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
- "loss": 2.5654,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
- "grad_norm": 1.688264012336731,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
- "loss": 2.6096,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
- "grad_norm": 2.064823865890503,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
- "loss": 2.6275,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
- "eval_loss": 2.4488983154296875,
1159
- "eval_runtime": 39.6847,
1160
- "eval_samples_per_second": 44.249,
1161
- "eval_steps_per_second": 44.249,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
- "grad_norm": 1.5599696636199951,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
- "loss": 2.6334,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
- "grad_norm": 1.3142633438110352,
1174
  "learning_rate": 2.508910148189833e-05,
1175
- "loss": 2.5496,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
- "grad_norm": 1.474135160446167,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
- "grad_norm": 1.3737610578536987,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
- "loss": 2.5345,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
- "eval_loss": 2.447559356689453,
1195
- "eval_runtime": 39.7021,
1196
- "eval_samples_per_second": 44.229,
1197
- "eval_steps_per_second": 44.229,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
- "grad_norm": 1.2432060241699219,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
- "loss": 2.5977,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
- "grad_norm": 1.465063452720642,
1210
  "learning_rate": 2.415119114612643e-05,
1211
- "loss": 2.6118,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
- "grad_norm": 1.5186200141906738,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
- "loss": 2.6126,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
- "grad_norm": 1.6869078874588013,
1224
  "learning_rate": 2.368223597824048e-05,
1225
- "loss": 2.576,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
- "eval_loss": 2.4459502696990967,
1231
- "eval_runtime": 39.7653,
1232
- "eval_samples_per_second": 44.159,
1233
- "eval_steps_per_second": 44.159,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
- "grad_norm": 1.2578104734420776,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
- "loss": 2.6178,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
- "grad_norm": 1.7597213983535767,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
- "loss": 2.6358,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
- "grad_norm": 2.144465923309326,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
- "loss": 2.5597,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
- "grad_norm": 1.1808464527130127,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
- "loss": 2.6269,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
- "eval_loss": 2.4444611072540283,
1267
- "eval_runtime": 40.0709,
1268
- "eval_samples_per_second": 43.822,
1269
- "eval_steps_per_second": 43.822,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
- "grad_norm": 1.4550806283950806,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
- "loss": 2.6206,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
- "grad_norm": 1.2635902166366577,
1282
  "learning_rate": 2.227537047458263e-05,
1283
- "loss": 2.5722,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
- "grad_norm": 1.3835856914520264,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
- "loss": 2.535,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
- "grad_norm": 1.735004186630249,
1296
  "learning_rate": 2.180641530669668e-05,
1297
- "loss": 2.6086,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
- "eval_loss": 2.443899154663086,
1303
- "eval_runtime": 40.91,
1304
- "eval_samples_per_second": 42.924,
1305
- "eval_steps_per_second": 42.924,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
- "grad_norm": 1.263051986694336,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
- "loss": 2.5544,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
- "grad_norm": 1.0899442434310913,
1318
  "learning_rate": 2.133746013881073e-05,
1319
- "loss": 2.5603,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
- "grad_norm": 3.038811206817627,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
- "loss": 2.5688,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
- "grad_norm": 1.6385984420776367,
1332
  "learning_rate": 2.086850497092478e-05,
1333
- "loss": 2.6006,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
- "eval_loss": 2.443300724029541,
1339
- "eval_runtime": 40.6649,
1340
- "eval_samples_per_second": 43.182,
1341
- "eval_steps_per_second": 43.182,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
- "grad_norm": 1.2857129573822021,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
- "grad_norm": 1.0289497375488281,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
- "loss": 2.5671,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
- "grad_norm": 1.5041025876998901,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
- "grad_norm": 1.6611964702606201,
1368
- "learning_rate": 1.993528418683174e-05,
1369
- "loss": 2.5801,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
- "eval_loss": 2.443532943725586,
1375
- "eval_runtime": 39.931,
1376
- "eval_samples_per_second": 43.976,
1377
- "eval_steps_per_second": 43.976,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
- "grad_norm": 1.521170735359192,
1383
- "learning_rate": 1.9700806602888767e-05,
1384
- "loss": 2.5969,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
- "grad_norm": 1.3700034618377686,
1390
- "learning_rate": 1.946632901894579e-05,
1391
- "loss": 2.6306,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
- "grad_norm": 2.311443328857422,
1397
- "learning_rate": 1.9231851435002814e-05,
1398
- "loss": 2.5608,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
- "grad_norm": 1.6699820756912231,
1404
- "learning_rate": 1.8997373851059842e-05,
1405
- "loss": 2.5113,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
- "eval_loss": 2.4421675205230713,
1411
- "eval_runtime": 40.1783,
1412
- "eval_samples_per_second": 43.705,
1413
- "eval_steps_per_second": 43.705,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
- "grad_norm": 1.2560683488845825,
1419
- "learning_rate": 1.8762896267116863e-05,
1420
- "loss": 2.545,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
- "grad_norm": 2.176563262939453,
1426
- "learning_rate": 1.852841868317389e-05,
1427
- "loss": 2.5752,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
- "grad_norm": 1.2551178932189941,
1433
- "learning_rate": 1.8293941099230916e-05,
1434
- "loss": 2.5215,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
- "grad_norm": 1.5646872520446777,
1440
- "learning_rate": 1.8059463515287937e-05,
1441
- "loss": 2.5838,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
- "eval_loss": 2.441195249557495,
1447
- "eval_runtime": 39.701,
1448
- "eval_samples_per_second": 44.231,
1449
- "eval_steps_per_second": 44.231,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
- "grad_norm": 1.4227900505065918,
1455
- "learning_rate": 1.7824985931344966e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
- "grad_norm": 1.3013832569122314,
1462
- "learning_rate": 1.759050834740199e-05,
1463
- "loss": 2.7641,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
- "grad_norm": 1.1282143592834473,
1469
- "learning_rate": 1.7356030763459015e-05,
1470
- "loss": 2.5875,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
- "grad_norm": 2.079760789871216,
1476
- "learning_rate": 1.712155317951604e-05,
1477
- "loss": 2.4861,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
- "eval_loss": 2.440812826156616,
1483
- "eval_runtime": 40.0764,
1484
- "eval_samples_per_second": 43.816,
1485
- "eval_steps_per_second": 43.816,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
- "grad_norm": 1.0884991884231567,
1491
- "learning_rate": 1.6887075595573065e-05,
1492
- "loss": 2.5941,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
- "grad_norm": 1.9202015399932861,
1498
- "learning_rate": 1.665259801163009e-05,
1499
- "loss": 2.5929,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
- "grad_norm": 1.5925830602645874,
1505
- "learning_rate": 1.6418120427687114e-05,
1506
- "loss": 2.5046,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
- "grad_norm": 1.5219184160232544,
1512
- "learning_rate": 1.618364284374414e-05,
1513
- "loss": 2.5628,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
- "eval_loss": 2.4396440982818604,
1519
- "eval_runtime": 40.0053,
1520
- "eval_samples_per_second": 43.894,
1521
- "eval_steps_per_second": 43.894,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
- "grad_norm": 1.4882445335388184,
1527
- "learning_rate": 1.5949165259801164e-05,
1528
- "loss": 2.6268,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
- "grad_norm": 1.3513301610946655,
1534
- "learning_rate": 1.571468767585819e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
- "grad_norm": 1.690974473953247,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
- "loss": 2.5631,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
- "grad_norm": 1.5311528444290161,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
- "loss": 2.5454,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
- "eval_loss": 2.4388718605041504,
1555
- "eval_runtime": 40.0289,
1556
- "eval_samples_per_second": 43.868,
1557
- "eval_steps_per_second": 43.868,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
- "grad_norm": 2.1171281337738037,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
- "grad_norm": 1.9706814289093018,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
- "loss": 2.588,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
- "grad_norm": 1.8991297483444214,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
- "loss": 2.5655,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
- "grad_norm": 1.5568820238113403,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
- "loss": 2.5312,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
- "eval_loss": 2.438715696334839,
1591
- "eval_runtime": 40.1748,
1592
- "eval_samples_per_second": 43.709,
1593
- "eval_steps_per_second": 43.709,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
- "grad_norm": 1.277051329612732,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
- "loss": 2.5818,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
- "grad_norm": 1.8890128135681152,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
- "loss": 2.5211,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
- "grad_norm": 1.8824830055236816,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
- "loss": 2.53,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
- "grad_norm": 1.239490032196045,
1620
  "learning_rate": 1.336991183642844e-05,
1621
- "loss": 2.5889,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
- "eval_loss": 2.437577962875366,
1627
- "eval_runtime": 40.1654,
1628
- "eval_samples_per_second": 43.719,
1629
- "eval_steps_per_second": 43.719,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
- "grad_norm": 1.7253328561782837,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
- "loss": 2.5426,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
- "grad_norm": 1.6971838474273682,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
- "loss": 2.4953,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
- "grad_norm": 1.4906270503997803,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
- "grad_norm": 1.658526062965393,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
- "loss": 2.5483,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
- "eval_loss": 2.437896490097046,
1663
- "eval_runtime": 40.7008,
1664
- "eval_samples_per_second": 43.144,
1665
- "eval_steps_per_second": 43.144,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
- "grad_norm": 1.0781177282333374,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
- "loss": 2.5449,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
- "grad_norm": 2.1414873600006104,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
- "loss": 2.5303,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
- "grad_norm": 2.063297986984253,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
- "loss": 2.5837,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
- "grad_norm": 1.2153489589691162,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
- "loss": 2.6384,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
- "eval_loss": 2.4365696907043457,
1699
- "eval_runtime": 40.2398,
1700
- "eval_samples_per_second": 43.638,
1701
- "eval_steps_per_second": 43.638,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
- "grad_norm": 1.2976094484329224,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
- "grad_norm": 1.2775920629501343,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
- "loss": 2.6083,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
- "grad_norm": 1.358311653137207,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
- "loss": 2.5206,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
- "grad_norm": 1.3438369035720825,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
- "loss": 2.4967,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
- "eval_loss": 2.4359662532806396,
1735
- "eval_runtime": 40.0802,
1736
- "eval_samples_per_second": 43.812,
1737
- "eval_steps_per_second": 43.812,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
- "grad_norm": 1.2618831396102905,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
- "loss": 2.6169,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
- "grad_norm": 1.3764727115631104,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
- "loss": 2.5444,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
- "grad_norm": 1.604864478111267,
1757
  "learning_rate": 9.852748077283812e-06,
1758
- "loss": 2.5343,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
- "grad_norm": 1.390496850013733,
1764
  "learning_rate": 9.618270493340837e-06,
1765
- "loss": 2.5051,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
- "eval_loss": 2.4353232383728027,
1771
- "eval_runtime": 40.1607,
1772
- "eval_samples_per_second": 43.724,
1773
- "eval_steps_per_second": 43.724,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
- "grad_norm": 2.1982169151306152,
1779
  "learning_rate": 9.383792909397862e-06,
1780
- "loss": 2.5036,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
- "grad_norm": 1.3033822774887085,
1786
  "learning_rate": 9.149315325454887e-06,
1787
- "loss": 2.5205,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
- "grad_norm": 1.682586431503296,
1793
- "learning_rate": 8.919527293190772e-06,
1794
- "loss": 2.6083,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
- "grad_norm": 3.184382200241089,
1800
- "learning_rate": 8.685049709247797e-06,
1801
- "loss": 2.5314,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
- "eval_loss": 2.434755802154541,
1807
- "eval_runtime": 40.2877,
1808
- "eval_samples_per_second": 43.587,
1809
- "eval_steps_per_second": 43.587,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
- "grad_norm": 2.0026867389678955,
1815
- "learning_rate": 8.450572125304821e-06,
1816
- "loss": 2.5109,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
- "grad_norm": 1.3833885192871094,
1822
- "learning_rate": 8.216094541361846e-06,
1823
- "loss": 2.5362,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
- "grad_norm": 2.157984495162964,
1829
- "learning_rate": 7.981616957418871e-06,
1830
- "loss": 2.5423,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
- "grad_norm": 1.682053565979004,
1836
- "learning_rate": 7.747139373475897e-06,
1837
- "loss": 2.5133,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
- "eval_loss": 2.435208559036255,
1843
- "eval_runtime": 40.4768,
1844
- "eval_samples_per_second": 43.383,
1845
- "eval_steps_per_second": 43.383,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
- "grad_norm": 1.9720139503479004,
1851
- "learning_rate": 7.512661789532921e-06,
1852
- "loss": 2.6372,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
- "grad_norm": 1.6906607151031494,
1858
- "learning_rate": 7.278184205589945e-06,
1859
- "loss": 2.5505,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
- "grad_norm": 1.484045147895813,
1865
- "learning_rate": 7.043706621646972e-06,
1866
- "loss": 2.5095,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
- "grad_norm": 1.6676850318908691,
1872
- "learning_rate": 6.8092290377039955e-06,
1873
- "loss": 2.6487,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
- "eval_loss": 2.4344091415405273,
1879
- "eval_runtime": 39.869,
1880
- "eval_samples_per_second": 44.044,
1881
- "eval_steps_per_second": 44.044,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
- "grad_norm": 1.5012388229370117,
1887
- "learning_rate": 6.57475145376102e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
- "grad_norm": 1.043954849243164,
1894
- "learning_rate": 6.340273869818046e-06,
1895
- "loss": 2.5843,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
- "grad_norm": 1.0455141067504883,
1901
- "learning_rate": 6.105796285875071e-06,
1902
- "loss": 2.5248,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
- "grad_norm": 1.39467453956604,
1908
- "learning_rate": 5.871318701932095e-06,
1909
- "loss": 2.5091,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
- "eval_loss": 2.4331610202789307,
1915
- "eval_runtime": 39.923,
1916
- "eval_samples_per_second": 43.985,
1917
- "eval_steps_per_second": 43.985,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
- "grad_norm": 1.1417715549468994,
1923
- "learning_rate": 5.63684111798912e-06,
1924
- "loss": 2.5853,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
- "grad_norm": 1.133244514465332,
1930
- "learning_rate": 5.402363534046146e-06,
1931
- "loss": 2.5457,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
- "grad_norm": 1.2331452369689941,
1937
- "learning_rate": 5.1678859501031705e-06,
1938
- "loss": 2.5576,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
- "grad_norm": 1.7164263725280762,
1944
- "learning_rate": 4.933408366160195e-06,
1945
- "loss": 2.5471,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
- "eval_loss": 2.4340403079986572,
1951
- "eval_runtime": 40.3849,
1952
- "eval_samples_per_second": 43.482,
1953
- "eval_steps_per_second": 43.482,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
- "grad_norm": 1.3680106401443481,
1959
- "learning_rate": 4.69893078221722e-06,
1960
- "loss": 2.5562,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
- "grad_norm": 1.0978279113769531,
1966
- "learning_rate": 4.464453198274246e-06,
1967
- "loss": 2.5185,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
- "grad_norm": 1.2212647199630737,
1973
- "learning_rate": 4.2299756143312695e-06,
1974
- "loss": 2.6371,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
- "grad_norm": 1.6452165842056274,
1980
- "learning_rate": 3.995498030388295e-06,
1981
- "loss": 2.681,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
- "eval_loss": 2.4337143898010254,
1987
- "eval_runtime": 40.4235,
1988
- "eval_samples_per_second": 43.44,
1989
- "eval_steps_per_second": 43.44,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
- "grad_norm": 1.7757978439331055,
1995
- "learning_rate": 3.7610204464453203e-06,
1996
- "loss": 2.5746,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
- "grad_norm": 1.2373579740524292,
2002
- "learning_rate": 3.5265428625023455e-06,
2003
- "loss": 2.5412,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
- "grad_norm": 1.1407558917999268,
2009
- "learning_rate": 3.29206527855937e-06,
2010
- "loss": 2.5973,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
- "grad_norm": 2.399686813354492,
2016
- "learning_rate": 3.057587694616395e-06,
2017
- "loss": 2.5566,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
- "eval_loss": 2.4338231086730957,
2023
- "eval_runtime": 40.4877,
2024
- "eval_samples_per_second": 43.371,
2025
- "eval_steps_per_second": 43.371,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
- "grad_norm": 1.7053141593933105,
2031
- "learning_rate": 2.8231101106734197e-06,
2032
- "loss": 2.6224,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
- "grad_norm": 1.8215903043746948,
2038
- "learning_rate": 2.5886325267304445e-06,
2039
- "loss": 2.5108,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
- "grad_norm": 1.1648200750350952,
2045
- "learning_rate": 2.3541549427874697e-06,
2046
- "loss": 2.557,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
- "grad_norm": 1.5225868225097656,
2052
- "learning_rate": 2.1196773588444944e-06,
2053
- "loss": 2.6285,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
- "eval_loss": 2.4334514141082764,
2059
- "eval_runtime": 40.3985,
2060
- "eval_samples_per_second": 43.467,
2061
- "eval_steps_per_second": 43.467,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
- "grad_norm": 1.4937622547149658,
2067
- "learning_rate": 1.8851997749015194e-06,
2068
- "loss": 2.481,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
- "grad_norm": 1.9169902801513672,
2074
- "learning_rate": 1.6507221909585446e-06,
2075
- "loss": 2.5412,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
- "grad_norm": 1.6611114740371704,
2081
- "learning_rate": 1.4162446070155693e-06,
2082
- "loss": 2.5086,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
- "grad_norm": 1.3464007377624512,
2088
- "learning_rate": 1.1817670230725943e-06,
2089
- "loss": 2.6063,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
- "eval_loss": 2.4329476356506348,
2095
- "eval_runtime": 40.4334,
2096
- "eval_samples_per_second": 43.429,
2097
- "eval_steps_per_second": 43.429,
2098
  "step": 11600
2099
  },
2100
  {
2101
  "epoch": 2.9495473824143827,
2102
- "grad_norm": 1.453385829925537,
2103
- "learning_rate": 9.472894391296193e-07,
2104
- "loss": 2.5012,
2105
  "step": 11650
2106
  },
2107
  {
2108
  "epoch": 2.9622080141799074,
2109
- "grad_norm": 1.6921356916427612,
2110
- "learning_rate": 7.128118551866442e-07,
2111
- "loss": 2.5589,
2112
  "step": 11700
2113
  },
2114
  {
2115
  "epoch": 2.9748686459454325,
2116
- "grad_norm": 1.0562982559204102,
2117
- "learning_rate": 4.783342712436691e-07,
2118
- "loss": 2.6015,
2119
  "step": 11750
2120
  },
2121
  {
2122
  "epoch": 2.9875292777109577,
2123
- "grad_norm": 1.457960844039917,
2124
- "learning_rate": 2.4385668730069406e-07,
2125
- "loss": 2.6224,
2126
  "step": 11800
2127
  },
2128
  {
2129
  "epoch": 2.9875292777109577,
2130
- "eval_loss": 2.432849645614624,
2131
- "eval_runtime": 40.4518,
2132
- "eval_samples_per_second": 43.41,
2133
- "eval_steps_per_second": 43.41,
2134
  "step": 11800
2135
  }
2136
  ],
 
1
  {
2
  "best_global_step": 11800,
3
+ "best_metric": 2.4338691234588623,
4
  "best_model_checkpoint": "/kaggle/working/senko_adaptive/checkpoint-11800",
5
  "epoch": 2.999430271570551,
6
  "eval_steps": 200,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0126606317655251,
14
+ "grad_norm": 0.4365496337413788,
15
  "learning_rate": 2.067510548523207e-06,
16
+ "loss": 3.4408,
17
  "step": 50
18
  },
19
  {
20
  "epoch": 0.0253212635310502,
21
+ "grad_norm": 0.5391681790351868,
22
  "learning_rate": 4.177215189873418e-06,
23
+ "loss": 3.4568,
24
  "step": 100
25
  },
26
  {
27
  "epoch": 0.0379818952965753,
28
+ "grad_norm": 0.7692704796791077,
29
  "learning_rate": 6.28691983122363e-06,
30
+ "loss": 3.4686,
31
  "step": 150
32
  },
33
  {
34
  "epoch": 0.0506425270621004,
35
+ "grad_norm": 0.9071826934814453,
36
  "learning_rate": 8.39662447257384e-06,
37
+ "loss": 3.5075,
38
  "step": 200
39
  },
40
  {
41
  "epoch": 0.0506425270621004,
42
+ "eval_loss": 3.5223069190979004,
43
+ "eval_runtime": 41.5235,
44
+ "eval_samples_per_second": 42.289,
45
+ "eval_steps_per_second": 42.289,
46
  "step": 200
47
  },
48
  {
49
  "epoch": 0.0633031588276255,
50
+ "grad_norm": 0.976381778717041,
51
  "learning_rate": 1.0506329113924052e-05,
52
+ "loss": 3.3576,
53
  "step": 250
54
  },
55
  {
56
  "epoch": 0.0759637905931506,
57
+ "grad_norm": 0.81010901927948,
58
  "learning_rate": 1.2616033755274262e-05,
59
+ "loss": 3.3492,
60
  "step": 300
61
  },
62
  {
63
  "epoch": 0.0886244223586757,
64
+ "grad_norm": 0.9288440942764282,
65
  "learning_rate": 1.4725738396624473e-05,
66
+ "loss": 3.2229,
67
  "step": 350
68
  },
69
  {
70
  "epoch": 0.1012850541242008,
71
+ "grad_norm": 1.110400676727295,
72
  "learning_rate": 1.6835443037974685e-05,
73
+ "loss": 3.1555,
74
  "step": 400
75
  },
76
  {
77
  "epoch": 0.1012850541242008,
78
+ "eval_loss": 3.1509861946105957,
79
+ "eval_runtime": 40.9241,
80
+ "eval_samples_per_second": 42.909,
81
+ "eval_steps_per_second": 42.909,
82
  "step": 400
83
  },
84
  {
85
  "epoch": 0.1139456858897259,
86
+ "grad_norm": 1.4328745603561401,
87
  "learning_rate": 1.8945147679324897e-05,
88
+ "loss": 3.0152,
89
  "step": 450
90
  },
91
  {
92
  "epoch": 0.126606317655251,
93
+ "grad_norm": 1.094860553741455,
94
  "learning_rate": 2.1054852320675106e-05,
95
+ "loss": 3.0463,
96
  "step": 500
97
  },
98
  {
99
  "epoch": 0.1392669494207761,
100
+ "grad_norm": 1.5432164669036865,
101
  "learning_rate": 2.3164556962025318e-05,
102
+ "loss": 2.9119,
103
  "step": 550
104
  },
105
  {
106
  "epoch": 0.1519275811863012,
107
+ "grad_norm": 1.2089171409606934,
108
  "learning_rate": 2.5274261603375527e-05,
109
+ "loss": 2.885,
110
  "step": 600
111
  },
112
  {
113
  "epoch": 0.1519275811863012,
114
+ "eval_loss": 2.7955658435821533,
115
+ "eval_runtime": 41.2266,
116
+ "eval_samples_per_second": 42.594,
117
+ "eval_steps_per_second": 42.594,
118
  "step": 600
119
  },
120
  {
121
  "epoch": 0.1645882129518263,
122
+ "grad_norm": 1.0353807210922241,
123
  "learning_rate": 2.738396624472574e-05,
124
+ "loss": 2.8395,
125
  "step": 650
126
  },
127
  {
128
  "epoch": 0.1772488447173514,
129
+ "grad_norm": 1.6014362573623657,
130
  "learning_rate": 2.949367088607595e-05,
131
+ "loss": 2.8229,
132
  "step": 700
133
  },
134
  {
135
  "epoch": 0.18990947648287648,
136
+ "grad_norm": 1.0306800603866577,
137
  "learning_rate": 3.160337552742616e-05,
138
+ "loss": 2.9251,
139
  "step": 750
140
  },
141
  {
142
  "epoch": 0.2025701082484016,
143
+ "grad_norm": 1.7377468347549438,
144
  "learning_rate": 3.3713080168776376e-05,
145
+ "loss": 2.816,
146
  "step": 800
147
  },
148
  {
149
  "epoch": 0.2025701082484016,
150
+ "eval_loss": 2.6652894020080566,
151
+ "eval_runtime": 41.6497,
152
+ "eval_samples_per_second": 42.161,
153
+ "eval_steps_per_second": 42.161,
154
  "step": 800
155
  },
156
  {
157
  "epoch": 0.2152307400139267,
158
+ "grad_norm": 1.550484299659729,
159
  "learning_rate": 3.5822784810126585e-05,
160
+ "loss": 2.8018,
161
  "step": 850
162
  },
163
  {
164
  "epoch": 0.2278913717794518,
165
+ "grad_norm": 1.1680374145507812,
166
  "learning_rate": 3.7932489451476794e-05,
167
+ "loss": 2.8037,
168
  "step": 900
169
  },
170
  {
171
  "epoch": 0.24055200354497688,
172
+ "grad_norm": 1.4538466930389404,
173
  "learning_rate": 4.004219409282701e-05,
174
+ "loss": 2.7734,
175
  "step": 950
176
  },
177
  {
178
  "epoch": 0.253212635310502,
179
+ "grad_norm": 1.5268754959106445,
180
  "learning_rate": 4.215189873417722e-05,
181
+ "loss": 2.7975,
182
  "step": 1000
183
  },
184
  {
185
  "epoch": 0.253212635310502,
186
+ "eval_loss": 2.6034388542175293,
187
+ "eval_runtime": 41.1034,
188
+ "eval_samples_per_second": 42.722,
189
+ "eval_steps_per_second": 42.722,
190
  "step": 1000
191
  },
192
  {
193
  "epoch": 0.2658732670760271,
194
+ "grad_norm": 1.813955545425415,
195
  "learning_rate": 4.426160337552743e-05,
196
+ "loss": 2.7258,
197
  "step": 1050
198
  },
199
  {
200
  "epoch": 0.2785338988415522,
201
+ "grad_norm": 1.664474368095398,
202
  "learning_rate": 4.637130801687764e-05,
203
+ "loss": 2.7607,
204
  "step": 1100
205
  },
206
  {
207
  "epoch": 0.2911945306070773,
208
+ "grad_norm": 2.343366861343384,
209
  "learning_rate": 4.8481012658227845e-05,
210
+ "loss": 2.7418,
211
  "step": 1150
212
  },
213
  {
214
  "epoch": 0.3038551623726024,
215
+ "grad_norm": 1.5289666652679443,
216
  "learning_rate": 4.993434627649597e-05,
217
+ "loss": 2.7779,
218
  "step": 1200
219
  },
220
  {
221
  "epoch": 0.3038551623726024,
222
+ "eval_loss": 2.571211814880371,
223
+ "eval_runtime": 41.5835,
224
+ "eval_samples_per_second": 42.228,
225
+ "eval_steps_per_second": 42.228,
226
  "step": 1200
227
  },
228
  {
229
  "epoch": 0.31651579413812747,
230
+ "grad_norm": 1.2860196828842163,
231
  "learning_rate": 4.969986869255299e-05,
232
+ "loss": 2.7216,
233
  "step": 1250
234
  },
235
  {
236
  "epoch": 0.3291764259036526,
237
+ "grad_norm": 1.2128461599349976,
238
  "learning_rate": 4.946539110861002e-05,
239
+ "loss": 2.6822,
240
  "step": 1300
241
  },
242
  {
243
  "epoch": 0.3418370576691777,
244
+ "grad_norm": 0.8949031233787537,
245
  "learning_rate": 4.9230913524667046e-05,
246
+ "loss": 2.696,
247
  "step": 1350
248
  },
249
  {
250
  "epoch": 0.3544976894347028,
251
+ "grad_norm": 1.1098757982254028,
252
+ "learning_rate": 4.900112549240293e-05,
253
+ "loss": 2.7004,
254
  "step": 1400
255
  },
256
  {
257
  "epoch": 0.3544976894347028,
258
+ "eval_loss": 2.5462076663970947,
259
+ "eval_runtime": 41.8547,
260
+ "eval_samples_per_second": 41.955,
261
+ "eval_steps_per_second": 41.955,
262
  "step": 1400
263
  },
264
  {
265
  "epoch": 0.3671583212002279,
266
+ "grad_norm": 1.3600550889968872,
267
+ "learning_rate": 4.8766647908459956e-05,
268
+ "loss": 2.6385,
269
  "step": 1450
270
  },
271
  {
272
  "epoch": 0.37981895296575297,
273
+ "grad_norm": 1.1471352577209473,
274
+ "learning_rate": 4.853217032451698e-05,
275
  "loss": 2.7002,
276
  "step": 1500
277
  },
278
  {
279
  "epoch": 0.3924795847312781,
280
+ "grad_norm": 1.666717767715454,
281
+ "learning_rate": 4.8297692740574e-05,
282
+ "loss": 2.6755,
283
  "step": 1550
284
  },
285
  {
286
  "epoch": 0.4051402164968032,
287
+ "grad_norm": 1.4194293022155762,
288
+ "learning_rate": 4.8063215156631034e-05,
289
+ "loss": 2.6852,
290
  "step": 1600
291
  },
292
  {
293
  "epoch": 0.4051402164968032,
294
+ "eval_loss": 2.5299158096313477,
295
+ "eval_runtime": 41.8943,
296
+ "eval_samples_per_second": 41.915,
297
+ "eval_steps_per_second": 41.915,
298
  "step": 1600
299
  },
300
  {
301
  "epoch": 0.4178008482623283,
302
+ "grad_norm": 0.9786908626556396,
303
+ "learning_rate": 4.7828737572688055e-05,
304
+ "loss": 2.7279,
305
  "step": 1650
306
  },
307
  {
308
  "epoch": 0.4304614800278534,
309
+ "grad_norm": 1.1531881093978882,
310
+ "learning_rate": 4.759425998874508e-05,
311
+ "loss": 2.6643,
312
  "step": 1700
313
  },
314
  {
315
  "epoch": 0.4431221117933785,
316
+ "grad_norm": 1.1486274003982544,
317
+ "learning_rate": 4.7359782404802105e-05,
318
+ "loss": 2.6994,
319
  "step": 1750
320
  },
321
  {
322
  "epoch": 0.4557827435589036,
323
+ "grad_norm": 1.2005631923675537,
324
+ "learning_rate": 4.7125304820859126e-05,
325
+ "loss": 2.7233,
326
  "step": 1800
327
  },
328
  {
329
  "epoch": 0.4557827435589036,
330
+ "eval_loss": 2.520224094390869,
331
+ "eval_runtime": 41.9149,
332
+ "eval_samples_per_second": 41.894,
333
+ "eval_steps_per_second": 41.894,
334
  "step": 1800
335
  },
336
  {
337
  "epoch": 0.4684433753244287,
338
+ "grad_norm": 1.2023993730545044,
339
+ "learning_rate": 4.6890827236916154e-05,
340
+ "loss": 2.6352,
341
  "step": 1850
342
  },
343
  {
344
  "epoch": 0.48110400708995377,
345
+ "grad_norm": 1.5334537029266357,
346
+ "learning_rate": 4.665634965297318e-05,
347
  "loss": 2.6827,
348
  "step": 1900
349
  },
350
  {
351
  "epoch": 0.4937646388554789,
352
+ "grad_norm": 1.9828767776489258,
353
+ "learning_rate": 4.6421872069030204e-05,
354
+ "loss": 2.6507,
355
  "step": 1950
356
  },
357
  {
358
  "epoch": 0.506425270621004,
359
+ "grad_norm": 1.5031970739364624,
360
+ "learning_rate": 4.6187394485087225e-05,
361
+ "loss": 2.6432,
362
  "step": 2000
363
  },
364
  {
365
  "epoch": 0.506425270621004,
366
+ "eval_loss": 2.511120319366455,
367
+ "eval_runtime": 41.0032,
368
+ "eval_samples_per_second": 42.826,
369
+ "eval_steps_per_second": 42.826,
370
  "step": 2000
371
  },
372
  {
373
  "epoch": 0.5190859023865291,
374
+ "grad_norm": 1.2630196809768677,
375
  "learning_rate": 4.5952916901144253e-05,
376
+ "loss": 2.6226,
377
  "step": 2050
378
  },
379
  {
380
  "epoch": 0.5317465341520542,
381
+ "grad_norm": 1.3230444192886353,
382
  "learning_rate": 4.5718439317201275e-05,
383
+ "loss": 2.6596,
384
  "step": 2100
385
  },
386
  {
387
  "epoch": 0.5444071659175793,
388
+ "grad_norm": 1.2275980710983276,
389
  "learning_rate": 4.54839617332583e-05,
390
+ "loss": 2.6419,
391
  "step": 2150
392
  },
393
  {
394
  "epoch": 0.5570677976831044,
395
+ "grad_norm": 1.2460874319076538,
396
  "learning_rate": 4.524948414931533e-05,
397
+ "loss": 2.6432,
398
  "step": 2200
399
  },
400
  {
401
  "epoch": 0.5570677976831044,
402
+ "eval_loss": 2.5041472911834717,
403
+ "eval_runtime": 40.9893,
404
+ "eval_samples_per_second": 42.84,
405
+ "eval_steps_per_second": 42.84,
406
  "step": 2200
407
  },
408
  {
409
  "epoch": 0.5697284294486294,
410
+ "grad_norm": 1.4000052213668823,
411
  "learning_rate": 4.501500656537235e-05,
412
+ "loss": 2.681,
413
  "step": 2250
414
  },
415
  {
416
  "epoch": 0.5823890612141546,
417
+ "grad_norm": 1.139631748199463,
418
  "learning_rate": 4.4780528981429374e-05,
419
+ "loss": 2.6119,
420
  "step": 2300
421
  },
422
  {
423
  "epoch": 0.5950496929796797,
424
+ "grad_norm": 1.4779937267303467,
425
  "learning_rate": 4.45460513974864e-05,
426
+ "loss": 2.615,
427
  "step": 2350
428
  },
429
  {
430
  "epoch": 0.6077103247452048,
431
+ "grad_norm": 1.0678008794784546,
432
  "learning_rate": 4.431157381354343e-05,
433
+ "loss": 2.6014,
434
  "step": 2400
435
  },
436
  {
437
  "epoch": 0.6077103247452048,
438
+ "eval_loss": 2.4982025623321533,
439
+ "eval_runtime": 41.0253,
440
+ "eval_samples_per_second": 42.803,
441
+ "eval_steps_per_second": 42.803,
442
  "step": 2400
443
  },
444
  {
445
  "epoch": 0.6203709565107299,
446
+ "grad_norm": 1.4893417358398438,
447
  "learning_rate": 4.407709622960045e-05,
448
+ "loss": 2.574,
449
  "step": 2450
450
  },
451
  {
452
  "epoch": 0.6330315882762549,
453
+ "grad_norm": 1.3910084962844849,
454
  "learning_rate": 4.384261864565748e-05,
455
+ "loss": 2.6452,
456
  "step": 2500
457
  },
458
  {
459
  "epoch": 0.6456922200417801,
460
+ "grad_norm": 2.1891777515411377,
461
  "learning_rate": 4.36081410617145e-05,
462
+ "loss": 2.6627,
463
  "step": 2550
464
  },
465
  {
466
  "epoch": 0.6583528518073052,
467
+ "grad_norm": 1.6157493591308594,
468
  "learning_rate": 4.337366347777152e-05,
469
+ "loss": 2.6508,
470
  "step": 2600
471
  },
472
  {
473
  "epoch": 0.6583528518073052,
474
+ "eval_loss": 2.4939932823181152,
475
+ "eval_runtime": 41.1171,
476
+ "eval_samples_per_second": 42.707,
477
+ "eval_steps_per_second": 42.707,
478
  "step": 2600
479
  },
480
  {
481
  "epoch": 0.6710134835728303,
482
+ "grad_norm": 1.2457774877548218,
483
  "learning_rate": 4.313918589382856e-05,
484
+ "loss": 2.6282,
485
  "step": 2650
486
  },
487
  {
488
  "epoch": 0.6836741153383554,
489
+ "grad_norm": 2.1914823055267334,
490
  "learning_rate": 4.290470830988558e-05,
491
+ "loss": 2.6931,
492
  "step": 2700
493
  },
494
  {
495
  "epoch": 0.6963347471038804,
496
+ "grad_norm": 1.186735987663269,
497
  "learning_rate": 4.26702307259426e-05,
498
+ "loss": 2.6229,
499
  "step": 2750
500
  },
501
  {
502
  "epoch": 0.7089953788694056,
503
+ "grad_norm": 1.868569016456604,
504
  "learning_rate": 4.243575314199963e-05,
505
+ "loss": 2.6312,
506
  "step": 2800
507
  },
508
  {
509
  "epoch": 0.7089953788694056,
510
+ "eval_loss": 2.487884283065796,
511
+ "eval_runtime": 41.4603,
512
+ "eval_samples_per_second": 42.354,
513
+ "eval_steps_per_second": 42.354,
514
  "step": 2800
515
  },
516
  {
517
  "epoch": 0.7216560106349307,
518
+ "grad_norm": 1.3528209924697876,
519
  "learning_rate": 4.220127555805665e-05,
520
  "loss": 2.5675,
521
  "step": 2850
522
  },
523
  {
524
  "epoch": 0.7343166424004558,
525
+ "grad_norm": 1.319753646850586,
526
  "learning_rate": 4.196679797411368e-05,
527
+ "loss": 2.5599,
528
  "step": 2900
529
  },
530
  {
531
  "epoch": 0.7469772741659809,
532
+ "grad_norm": 1.338115930557251,
533
  "learning_rate": 4.1732320390170706e-05,
534
+ "loss": 2.6528,
535
  "step": 2950
536
  },
537
  {
538
  "epoch": 0.7596379059315059,
539
+ "grad_norm": 1.2844877243041992,
540
  "learning_rate": 4.149784280622773e-05,
541
  "loss": 2.698,
542
  "step": 3000
543
  },
544
  {
545
  "epoch": 0.7596379059315059,
546
+ "eval_loss": 2.4847159385681152,
547
+ "eval_runtime": 41.1324,
548
+ "eval_samples_per_second": 42.691,
549
+ "eval_steps_per_second": 42.691,
550
  "step": 3000
551
  },
552
  {
553
  "epoch": 0.772298537697031,
554
+ "grad_norm": 1.4525926113128662,
555
  "learning_rate": 4.126336522228475e-05,
556
+ "loss": 2.622,
557
  "step": 3050
558
  },
559
  {
560
  "epoch": 0.7849591694625562,
561
+ "grad_norm": 1.5551460981369019,
562
  "learning_rate": 4.102888763834178e-05,
563
+ "loss": 2.6219,
564
  "step": 3100
565
  },
566
  {
567
  "epoch": 0.7976198012280813,
568
+ "grad_norm": 1.39869225025177,
569
  "learning_rate": 4.0794410054398805e-05,
570
+ "loss": 2.5807,
571
  "step": 3150
572
  },
573
  {
574
  "epoch": 0.8102804329936064,
575
+ "grad_norm": 1.4835882186889648,
576
  "learning_rate": 4.0559932470455826e-05,
577
+ "loss": 2.6733,
578
  "step": 3200
579
  },
580
  {
581
  "epoch": 0.8102804329936064,
582
+ "eval_loss": 2.4816081523895264,
583
+ "eval_runtime": 40.9609,
584
+ "eval_samples_per_second": 42.87,
585
+ "eval_steps_per_second": 42.87,
586
  "step": 3200
587
  },
588
  {
589
  "epoch": 0.8229410647591315,
590
+ "grad_norm": 1.2404175996780396,
591
  "learning_rate": 4.0325454886512854e-05,
592
+ "loss": 2.6991,
593
  "step": 3250
594
  },
595
  {
596
  "epoch": 0.8356016965246565,
597
+ "grad_norm": 1.3770995140075684,
598
  "learning_rate": 4.0090977302569876e-05,
599
+ "loss": 2.5512,
600
  "step": 3300
601
  },
602
  {
603
  "epoch": 0.8482623282901817,
604
+ "grad_norm": 1.1706722974777222,
605
  "learning_rate": 3.98564997186269e-05,
606
+ "loss": 2.6126,
607
  "step": 3350
608
  },
609
  {
610
  "epoch": 0.8609229600557068,
611
+ "grad_norm": 1.290719985961914,
612
  "learning_rate": 3.9622022134683925e-05,
613
  "loss": 2.6178,
614
  "step": 3400
615
  },
616
  {
617
  "epoch": 0.8609229600557068,
618
+ "eval_loss": 2.4776341915130615,
619
+ "eval_runtime": 41.0702,
620
+ "eval_samples_per_second": 42.756,
621
+ "eval_steps_per_second": 42.756,
622
  "step": 3400
623
  },
624
  {
625
  "epoch": 0.8735835918212319,
626
+ "grad_norm": 1.32352614402771,
627
  "learning_rate": 3.938754455074095e-05,
628
+ "loss": 2.5755,
629
  "step": 3450
630
  },
631
  {
632
  "epoch": 0.886244223586757,
633
+ "grad_norm": 1.4078598022460938,
634
  "learning_rate": 3.9153066966797975e-05,
635
+ "loss": 2.6678,
636
  "step": 3500
637
  },
638
  {
639
  "epoch": 0.898904855352282,
640
+ "grad_norm": 1.1207985877990723,
641
  "learning_rate": 3.8918589382855e-05,
642
+ "loss": 2.5674,
643
  "step": 3550
644
  },
645
  {
646
  "epoch": 0.9115654871178072,
647
+ "grad_norm": 1.4133316278457642,
648
  "learning_rate": 3.8684111798912024e-05,
649
+ "loss": 2.5949,
650
  "step": 3600
651
  },
652
  {
653
  "epoch": 0.9115654871178072,
654
+ "eval_loss": 2.473680019378662,
655
+ "eval_runtime": 42.1618,
656
+ "eval_samples_per_second": 41.649,
657
+ "eval_steps_per_second": 41.649,
658
  "step": 3600
659
  },
660
  {
661
  "epoch": 0.9242261188833323,
662
+ "grad_norm": 0.9091076254844666,
663
  "learning_rate": 3.844963421496905e-05,
664
+ "loss": 2.6154,
665
  "step": 3650
666
  },
667
  {
668
  "epoch": 0.9368867506488574,
669
+ "grad_norm": 1.3824701309204102,
670
  "learning_rate": 3.821515663102608e-05,
671
+ "loss": 2.6569,
672
  "step": 3700
673
  },
674
  {
675
  "epoch": 0.9495473824143825,
676
+ "grad_norm": 1.3944271802902222,
677
  "learning_rate": 3.79806790470831e-05,
678
  "loss": 2.5874,
679
  "step": 3750
680
  },
681
  {
682
  "epoch": 0.9622080141799075,
683
+ "grad_norm": 1.504271388053894,
684
  "learning_rate": 3.774620146314012e-05,
685
  "loss": 2.6422,
686
  "step": 3800
687
  },
688
  {
689
  "epoch": 0.9622080141799075,
690
+ "eval_loss": 2.4708938598632812,
691
+ "eval_runtime": 41.7484,
692
+ "eval_samples_per_second": 42.061,
693
+ "eval_steps_per_second": 42.061,
694
  "step": 3800
695
  },
696
  {
697
  "epoch": 0.9748686459454327,
698
+ "grad_norm": 1.1897855997085571,
699
  "learning_rate": 3.751172387919715e-05,
700
+ "loss": 2.6979,
701
  "step": 3850
702
  },
703
  {
704
  "epoch": 0.9875292777109578,
705
+ "grad_norm": 0.9344286918640137,
706
  "learning_rate": 3.727724629525417e-05,
707
+ "loss": 2.6678,
708
  "step": 3900
709
  },
710
  {
711
  "epoch": 1.0,
712
+ "grad_norm": 4.620224475860596,
713
  "learning_rate": 3.70427687113112e-05,
714
+ "loss": 2.5652,
715
  "step": 3950
716
  },
717
  {
718
  "epoch": 1.0126606317655251,
719
+ "grad_norm": 1.275289535522461,
720
  "learning_rate": 3.680829112736823e-05,
721
+ "loss": 2.5655,
722
  "step": 4000
723
  },
724
  {
725
  "epoch": 1.0126606317655251,
726
+ "eval_loss": 2.4711084365844727,
727
+ "eval_runtime": 40.8651,
728
+ "eval_samples_per_second": 42.971,
729
+ "eval_steps_per_second": 42.971,
730
  "step": 4000
731
  },
732
  {
733
  "epoch": 1.0253212635310502,
734
+ "grad_norm": 1.460325837135315,
735
  "learning_rate": 3.657381354342525e-05,
736
  "loss": 2.627,
737
  "step": 4050
738
  },
739
  {
740
  "epoch": 1.0379818952965754,
741
+ "grad_norm": 1.2776564359664917,
742
  "learning_rate": 3.633933595948227e-05,
743
+ "loss": 2.626,
744
  "step": 4100
745
  },
746
  {
747
  "epoch": 1.0506425270621005,
748
+ "grad_norm": 1.5591661930084229,
749
+ "learning_rate": 3.6109547927218154e-05,
750
+ "loss": 2.6603,
751
  "step": 4150
752
  },
753
  {
754
  "epoch": 1.0633031588276256,
755
+ "grad_norm": 1.0031243562698364,
756
+ "learning_rate": 3.587507034327519e-05,
757
  "loss": 2.5955,
758
  "step": 4200
759
  },
760
  {
761
  "epoch": 1.0633031588276256,
762
+ "eval_loss": 2.4686498641967773,
763
+ "eval_runtime": 41.059,
764
+ "eval_samples_per_second": 42.768,
765
+ "eval_steps_per_second": 42.768,
766
  "step": 4200
767
  },
768
  {
769
  "epoch": 1.0759637905931505,
770
+ "grad_norm": 1.662988543510437,
771
+ "learning_rate": 3.564059275933221e-05,
772
+ "loss": 2.5906,
773
  "step": 4250
774
  },
775
  {
776
  "epoch": 1.0886244223586756,
777
+ "grad_norm": 1.5336205959320068,
778
+ "learning_rate": 3.540611517538923e-05,
779
+ "loss": 2.62,
780
  "step": 4300
781
  },
782
  {
783
  "epoch": 1.1012850541242007,
784
+ "grad_norm": 1.2656798362731934,
785
+ "learning_rate": 3.517163759144626e-05,
786
+ "loss": 2.6229,
787
  "step": 4350
788
  },
789
  {
790
  "epoch": 1.1139456858897259,
791
+ "grad_norm": 1.5082098245620728,
792
+ "learning_rate": 3.493716000750328e-05,
793
+ "loss": 2.6015,
794
  "step": 4400
795
  },
796
  {
797
  "epoch": 1.1139456858897259,
798
+ "eval_loss": 2.466660737991333,
799
+ "eval_runtime": 40.8118,
800
+ "eval_samples_per_second": 43.027,
801
+ "eval_steps_per_second": 43.027,
802
  "step": 4400
803
  },
804
  {
805
  "epoch": 1.126606317655251,
806
+ "grad_norm": 1.8201966285705566,
807
+ "learning_rate": 3.470268242356031e-05,
808
+ "loss": 2.6495,
809
  "step": 4450
810
  },
811
  {
812
  "epoch": 1.139266949420776,
813
+ "grad_norm": 1.3035717010498047,
814
  "learning_rate": 3.446820483961734e-05,
815
+ "loss": 2.531,
816
  "step": 4500
817
  },
818
  {
819
  "epoch": 1.1519275811863012,
820
+ "grad_norm": 1.2087314128875732,
821
  "learning_rate": 3.423372725567436e-05,
822
+ "loss": 2.5412,
823
  "step": 4550
824
  },
825
  {
826
  "epoch": 1.1645882129518264,
827
+ "grad_norm": 1.2561825513839722,
828
  "learning_rate": 3.399924967173139e-05,
829
+ "loss": 2.6465,
830
  "step": 4600
831
  },
832
  {
833
  "epoch": 1.1645882129518264,
834
+ "eval_loss": 2.4628918170928955,
835
+ "eval_runtime": 40.8309,
836
+ "eval_samples_per_second": 43.007,
837
+ "eval_steps_per_second": 43.007,
838
  "step": 4600
839
  },
840
  {
841
  "epoch": 1.1772488447173515,
842
+ "grad_norm": 1.7700440883636475,
843
  "learning_rate": 3.376477208778841e-05,
844
  "loss": 2.5658,
845
  "step": 4650
846
  },
847
  {
848
  "epoch": 1.1899094764828764,
849
+ "grad_norm": 1.4953458309173584,
850
  "learning_rate": 3.3530294503845436e-05,
851
+ "loss": 2.577,
852
  "step": 4700
853
  },
854
  {
855
  "epoch": 1.2025701082484015,
856
+ "grad_norm": 1.3659100532531738,
857
  "learning_rate": 3.3295816919902464e-05,
858
+ "loss": 2.6531,
859
  "step": 4750
860
  },
861
  {
862
  "epoch": 1.2152307400139266,
863
+ "grad_norm": 1.156020998954773,
864
  "learning_rate": 3.3061339335959486e-05,
865
+ "loss": 2.6418,
866
  "step": 4800
867
  },
868
  {
869
  "epoch": 1.2152307400139266,
870
+ "eval_loss": 2.462512969970703,
871
+ "eval_runtime": 40.8077,
872
+ "eval_samples_per_second": 43.031,
873
+ "eval_steps_per_second": 43.031,
874
  "step": 4800
875
  },
876
  {
877
  "epoch": 1.2278913717794517,
878
+ "grad_norm": 1.7687715291976929,
879
  "learning_rate": 3.282686175201651e-05,
880
+ "loss": 2.6085,
881
  "step": 4850
882
  },
883
  {
884
  "epoch": 1.2405520035449769,
885
+ "grad_norm": 3.3047523498535156,
886
  "learning_rate": 3.2592384168073535e-05,
887
+ "loss": 2.6002,
888
  "step": 4900
889
  },
890
  {
891
  "epoch": 1.253212635310502,
892
+ "grad_norm": 1.040693998336792,
893
  "learning_rate": 3.2357906584130557e-05,
894
+ "loss": 2.6145,
895
  "step": 4950
896
  },
897
  {
898
  "epoch": 1.265873267076027,
899
+ "grad_norm": 0.9686591029167175,
900
  "learning_rate": 3.2123429000187585e-05,
901
+ "loss": 2.5709,
902
  "step": 5000
903
  },
904
  {
905
  "epoch": 1.265873267076027,
906
+ "eval_loss": 2.460991621017456,
907
+ "eval_runtime": 40.9408,
908
+ "eval_samples_per_second": 42.891,
909
+ "eval_steps_per_second": 42.891,
910
  "step": 5000
911
  },
912
  {
913
  "epoch": 1.2785338988415522,
914
+ "grad_norm": 1.2371070384979248,
915
  "learning_rate": 3.188895141624461e-05,
916
+ "loss": 2.5449,
917
  "step": 5050
918
  },
919
  {
920
  "epoch": 1.2911945306070773,
921
+ "grad_norm": 1.422345757484436,
922
  "learning_rate": 3.1654473832301634e-05,
923
+ "loss": 2.6032,
924
  "step": 5100
925
  },
926
  {
927
  "epoch": 1.3038551623726025,
928
+ "grad_norm": 2.229543447494507,
929
  "learning_rate": 3.1419996248358656e-05,
930
+ "loss": 2.6611,
931
  "step": 5150
932
  },
933
  {
934
  "epoch": 1.3165157941381276,
935
+ "grad_norm": 2.4649646282196045,
936
  "learning_rate": 3.1185518664415684e-05,
937
+ "loss": 2.5963,
938
  "step": 5200
939
  },
940
  {
941
  "epoch": 1.3165157941381276,
942
+ "eval_loss": 2.455350637435913,
943
+ "eval_runtime": 40.876,
944
+ "eval_samples_per_second": 42.959,
945
+ "eval_steps_per_second": 42.959,
946
  "step": 5200
947
  },
948
  {
949
  "epoch": 1.3291764259036527,
950
+ "grad_norm": 1.2330511808395386,
951
  "learning_rate": 3.095104108047271e-05,
952
+ "loss": 2.5561,
953
  "step": 5250
954
  },
955
  {
956
  "epoch": 1.3418370576691778,
957
+ "grad_norm": 2.1780569553375244,
958
  "learning_rate": 3.071656349652973e-05,
959
+ "loss": 2.5878,
960
  "step": 5300
961
  },
962
  {
963
  "epoch": 1.3544976894347027,
964
+ "grad_norm": 1.5878489017486572,
965
  "learning_rate": 3.048208591258676e-05,
966
+ "loss": 2.5788,
967
  "step": 5350
968
  },
969
  {
970
  "epoch": 1.3671583212002278,
971
+ "grad_norm": 1.2362117767333984,
972
  "learning_rate": 3.0247608328643783e-05,
973
+ "loss": 2.685,
974
  "step": 5400
975
  },
976
  {
977
  "epoch": 1.3671583212002278,
978
+ "eval_loss": 2.4557485580444336,
979
+ "eval_runtime": 40.7838,
980
+ "eval_samples_per_second": 43.056,
981
+ "eval_steps_per_second": 43.056,
982
  "step": 5400
983
  },
984
  {
985
  "epoch": 1.379818952965753,
986
+ "grad_norm": 1.4540385007858276,
987
  "learning_rate": 3.0013130744700808e-05,
988
+ "loss": 2.5653,
989
  "step": 5450
990
  },
991
  {
992
  "epoch": 1.392479584731278,
993
+ "grad_norm": 1.560059905052185,
994
  "learning_rate": 2.9778653160757836e-05,
995
+ "loss": 2.5448,
996
  "step": 5500
997
  },
998
  {
999
  "epoch": 1.4051402164968032,
1000
+ "grad_norm": 3.153442144393921,
1001
  "learning_rate": 2.9544175576814857e-05,
1002
+ "loss": 2.5042,
1003
  "step": 5550
1004
  },
1005
  {
1006
  "epoch": 1.4178008482623283,
1007
+ "grad_norm": 1.250948429107666,
1008
  "learning_rate": 2.9309697992871882e-05,
1009
+ "loss": 2.575,
1010
  "step": 5600
1011
  },
1012
  {
1013
  "epoch": 1.4178008482623283,
1014
+ "eval_loss": 2.4553444385528564,
1015
+ "eval_runtime": 40.9798,
1016
+ "eval_samples_per_second": 42.85,
1017
+ "eval_steps_per_second": 42.85,
1018
  "step": 5600
1019
  },
1020
  {
1021
  "epoch": 1.4304614800278534,
1022
+ "grad_norm": 1.6559193134307861,
1023
  "learning_rate": 2.907522040892891e-05,
1024
+ "loss": 2.6065,
1025
  "step": 5650
1026
  },
1027
  {
1028
  "epoch": 1.4431221117933786,
1029
+ "grad_norm": 1.6024394035339355,
1030
  "learning_rate": 2.8840742824985935e-05,
1031
+ "loss": 2.5194,
1032
  "step": 5700
1033
  },
1034
  {
1035
  "epoch": 1.4557827435589035,
1036
+ "grad_norm": 1.3071702718734741,
1037
  "learning_rate": 2.8606265241042956e-05,
1038
+ "loss": 2.5348,
1039
  "step": 5750
1040
  },
1041
  {
1042
  "epoch": 1.4684433753244286,
1043
+ "grad_norm": 1.1332521438598633,
1044
  "learning_rate": 2.8371787657099984e-05,
1045
+ "loss": 2.5913,
1046
  "step": 5800
1047
  },
1048
  {
1049
  "epoch": 1.4684433753244286,
1050
+ "eval_loss": 2.454563617706299,
1051
+ "eval_runtime": 40.8474,
1052
+ "eval_samples_per_second": 42.989,
1053
+ "eval_steps_per_second": 42.989,
1054
  "step": 5800
1055
  },
1056
  {
1057
  "epoch": 1.4811040070899537,
1058
+ "grad_norm": 1.260486364364624,
1059
  "learning_rate": 2.813731007315701e-05,
1060
+ "loss": 2.612,
1061
  "step": 5850
1062
  },
1063
  {
1064
  "epoch": 1.4937646388554788,
1065
+ "grad_norm": 1.009621500968933,
1066
  "learning_rate": 2.790283248921403e-05,
1067
+ "loss": 2.6078,
1068
  "step": 5900
1069
  },
1070
  {
1071
  "epoch": 1.506425270621004,
1072
+ "grad_norm": 1.3116769790649414,
1073
  "learning_rate": 2.766835490527106e-05,
1074
+ "loss": 2.5739,
1075
  "step": 5950
1076
  },
1077
  {
1078
  "epoch": 1.519085902386529,
1079
+ "grad_norm": 2.485499143600464,
1080
  "learning_rate": 2.7433877321328083e-05,
1081
+ "loss": 2.6272,
1082
  "step": 6000
1083
  },
1084
  {
1085
  "epoch": 1.519085902386529,
1086
+ "eval_loss": 2.450514316558838,
1087
+ "eval_runtime": 40.6819,
1088
+ "eval_samples_per_second": 43.164,
1089
+ "eval_steps_per_second": 43.164,
1090
  "step": 6000
1091
  },
1092
  {
1093
  "epoch": 1.5317465341520542,
1094
+ "grad_norm": 1.934110164642334,
1095
  "learning_rate": 2.7199399737385105e-05,
1096
+ "loss": 2.5319,
1097
  "step": 6050
1098
  },
1099
  {
1100
  "epoch": 1.5444071659175793,
1101
+ "grad_norm": 1.9517920017242432,
1102
  "learning_rate": 2.6964922153442136e-05,
1103
  "loss": 2.5759,
1104
  "step": 6100
1105
  },
1106
  {
1107
  "epoch": 1.5570677976831044,
1108
+ "grad_norm": 1.3010960817337036,
1109
  "learning_rate": 2.6730444569499157e-05,
1110
+ "loss": 2.5811,
1111
  "step": 6150
1112
  },
1113
  {
1114
  "epoch": 1.5697284294486296,
1115
+ "grad_norm": 2.7256052494049072,
1116
  "learning_rate": 2.6495966985556182e-05,
1117
+ "loss": 2.6294,
1118
  "step": 6200
1119
  },
1120
  {
1121
  "epoch": 1.5697284294486296,
1122
+ "eval_loss": 2.4498414993286133,
1123
+ "eval_runtime": 41.0253,
1124
+ "eval_samples_per_second": 42.803,
1125
+ "eval_steps_per_second": 42.803,
1126
  "step": 6200
1127
  },
1128
  {
1129
  "epoch": 1.5823890612141547,
1130
+ "grad_norm": 1.6172245740890503,
1131
  "learning_rate": 2.626148940161321e-05,
1132
+ "loss": 2.6309,
1133
  "step": 6250
1134
  },
1135
  {
1136
  "epoch": 1.5950496929796798,
1137
+ "grad_norm": 1.3149018287658691,
1138
  "learning_rate": 2.6027011817670232e-05,
1139
+ "loss": 2.5658,
1140
  "step": 6300
1141
  },
1142
  {
1143
  "epoch": 1.607710324745205,
1144
+ "grad_norm": 1.6285394430160522,
1145
  "learning_rate": 2.5792534233727257e-05,
1146
+ "loss": 2.611,
1147
  "step": 6350
1148
  },
1149
  {
1150
  "epoch": 1.62037095651073,
1151
+ "grad_norm": 2.0910215377807617,
1152
  "learning_rate": 2.5558056649784285e-05,
1153
+ "loss": 2.6277,
1154
  "step": 6400
1155
  },
1156
  {
1157
  "epoch": 1.62037095651073,
1158
+ "eval_loss": 2.449920892715454,
1159
+ "eval_runtime": 40.7694,
1160
+ "eval_samples_per_second": 43.071,
1161
+ "eval_steps_per_second": 43.071,
1162
  "step": 6400
1163
  },
1164
  {
1165
  "epoch": 1.633031588276255,
1166
+ "grad_norm": 1.497223138809204,
1167
  "learning_rate": 2.5323579065841306e-05,
1168
+ "loss": 2.6336,
1169
  "step": 6450
1170
  },
1171
  {
1172
  "epoch": 1.64569222004178,
1173
+ "grad_norm": 1.3010990619659424,
1174
  "learning_rate": 2.508910148189833e-05,
1175
+ "loss": 2.5497,
1176
  "step": 6500
1177
  },
1178
  {
1179
  "epoch": 1.6583528518073052,
1180
+ "grad_norm": 1.4681612253189087,
1181
  "learning_rate": 2.4854623897955356e-05,
1182
  "loss": 2.5628,
1183
  "step": 6550
1184
  },
1185
  {
1186
  "epoch": 1.6710134835728303,
1187
+ "grad_norm": 1.3477168083190918,
1188
  "learning_rate": 2.4620146314012384e-05,
1189
+ "loss": 2.5352,
1190
  "step": 6600
1191
  },
1192
  {
1193
  "epoch": 1.6710134835728303,
1194
+ "eval_loss": 2.4485294818878174,
1195
+ "eval_runtime": 40.856,
1196
+ "eval_samples_per_second": 42.98,
1197
+ "eval_steps_per_second": 42.98,
1198
  "step": 6600
1199
  },
1200
  {
1201
  "epoch": 1.6836741153383554,
1202
+ "grad_norm": 1.2609894275665283,
1203
  "learning_rate": 2.4385668730069405e-05,
1204
+ "loss": 2.5984,
1205
  "step": 6650
1206
  },
1207
  {
1208
  "epoch": 1.6963347471038803,
1209
+ "grad_norm": 1.498071312904358,
1210
  "learning_rate": 2.415119114612643e-05,
1211
+ "loss": 2.6117,
1212
  "step": 6700
1213
  },
1214
  {
1215
  "epoch": 1.7089953788694054,
1216
+ "grad_norm": 1.5235400199890137,
1217
  "learning_rate": 2.3916713562183458e-05,
1218
+ "loss": 2.6127,
1219
  "step": 6750
1220
  },
1221
  {
1222
  "epoch": 1.7216560106349306,
1223
+ "grad_norm": 1.7103843688964844,
1224
  "learning_rate": 2.368223597824048e-05,
1225
+ "loss": 2.5761,
1226
  "step": 6800
1227
  },
1228
  {
1229
  "epoch": 1.7216560106349306,
1230
+ "eval_loss": 2.4469785690307617,
1231
+ "eval_runtime": 40.7827,
1232
+ "eval_samples_per_second": 43.058,
1233
+ "eval_steps_per_second": 43.058,
1234
  "step": 6800
1235
  },
1236
  {
1237
  "epoch": 1.7343166424004557,
1238
+ "grad_norm": 1.2467267513275146,
1239
  "learning_rate": 2.3447758394297507e-05,
1240
+ "loss": 2.6174,
1241
  "step": 6850
1242
  },
1243
  {
1244
  "epoch": 1.7469772741659808,
1245
+ "grad_norm": 1.8229267597198486,
1246
  "learning_rate": 2.3213280810354532e-05,
1247
+ "loss": 2.6364,
1248
  "step": 6900
1249
  },
1250
  {
1251
  "epoch": 1.759637905931506,
1252
+ "grad_norm": 2.1323461532592773,
1253
  "learning_rate": 2.2978803226411554e-05,
1254
+ "loss": 2.5595,
1255
  "step": 6950
1256
  },
1257
  {
1258
  "epoch": 1.772298537697031,
1259
+ "grad_norm": 1.150225043296814,
1260
  "learning_rate": 2.2744325642468582e-05,
1261
+ "loss": 2.6266,
1262
  "step": 7000
1263
  },
1264
  {
1265
  "epoch": 1.772298537697031,
1266
+ "eval_loss": 2.445380926132202,
1267
+ "eval_runtime": 40.7346,
1268
+ "eval_samples_per_second": 43.108,
1269
+ "eval_steps_per_second": 43.108,
1270
  "step": 7000
1271
  },
1272
  {
1273
  "epoch": 1.7849591694625562,
1274
+ "grad_norm": 1.36672842502594,
1275
  "learning_rate": 2.2509848058525606e-05,
1276
+ "loss": 2.6212,
1277
  "step": 7050
1278
  },
1279
  {
1280
  "epoch": 1.7976198012280813,
1281
+ "grad_norm": 1.244776725769043,
1282
  "learning_rate": 2.227537047458263e-05,
1283
+ "loss": 2.5734,
1284
  "step": 7100
1285
  },
1286
  {
1287
  "epoch": 1.8102804329936064,
1288
+ "grad_norm": 1.3731275796890259,
1289
  "learning_rate": 2.2040892890639656e-05,
1290
+ "loss": 2.536,
1291
  "step": 7150
1292
  },
1293
  {
1294
  "epoch": 1.8229410647591315,
1295
+ "grad_norm": 2.2051963806152344,
1296
  "learning_rate": 2.180641530669668e-05,
1297
+ "loss": 2.6097,
1298
  "step": 7200
1299
  },
1300
  {
1301
  "epoch": 1.8229410647591315,
1302
+ "eval_loss": 2.4447412490844727,
1303
+ "eval_runtime": 40.7537,
1304
+ "eval_samples_per_second": 43.088,
1305
+ "eval_steps_per_second": 43.088,
1306
  "step": 7200
1307
  },
1308
  {
1309
  "epoch": 1.8356016965246567,
1310
+ "grad_norm": 1.2323483228683472,
1311
  "learning_rate": 2.1571937722753706e-05,
1312
+ "loss": 2.555,
1313
  "step": 7250
1314
  },
1315
  {
1316
  "epoch": 1.8482623282901818,
1317
+ "grad_norm": 1.0700924396514893,
1318
  "learning_rate": 2.133746013881073e-05,
1319
+ "loss": 2.5598,
1320
  "step": 7300
1321
  },
1322
  {
1323
  "epoch": 1.860922960055707,
1324
+ "grad_norm": 2.785604238510132,
1325
  "learning_rate": 2.1102982554867755e-05,
1326
+ "loss": 2.5682,
1327
  "step": 7350
1328
  },
1329
  {
1330
  "epoch": 1.873583591821232,
1331
+ "grad_norm": 1.6302391290664673,
1332
  "learning_rate": 2.086850497092478e-05,
1333
+ "loss": 2.6002,
1334
  "step": 7400
1335
  },
1336
  {
1337
  "epoch": 1.873583591821232,
1338
+ "eval_loss": 2.4443070888519287,
1339
+ "eval_runtime": 40.7834,
1340
+ "eval_samples_per_second": 43.057,
1341
+ "eval_steps_per_second": 43.057,
1342
  "step": 7400
1343
  },
1344
  {
1345
  "epoch": 1.8862442235867571,
1346
+ "grad_norm": 1.270948886871338,
1347
  "learning_rate": 2.0634027386981805e-05,
1348
  "loss": 2.5563,
1349
  "step": 7450
1350
  },
1351
  {
1352
  "epoch": 1.898904855352282,
1353
+ "grad_norm": 1.0166101455688477,
1354
  "learning_rate": 2.0399549803038833e-05,
1355
+ "loss": 2.5687,
1356
  "step": 7500
1357
  },
1358
  {
1359
  "epoch": 1.9115654871178072,
1360
+ "grad_norm": 1.4803165197372437,
1361
  "learning_rate": 2.0165072219095854e-05,
1362
  "loss": 2.5689,
1363
  "step": 7550
1364
  },
1365
  {
1366
  "epoch": 1.9242261188833323,
1367
+ "grad_norm": 1.66029953956604,
1368
+ "learning_rate": 1.993059463515288e-05,
1369
+ "loss": 2.5815,
1370
  "step": 7600
1371
  },
1372
  {
1373
  "epoch": 1.9242261188833323,
1374
+ "eval_loss": 2.4443864822387695,
1375
+ "eval_runtime": 40.8884,
1376
+ "eval_samples_per_second": 42.946,
1377
+ "eval_steps_per_second": 42.946,
1378
  "step": 7600
1379
  },
1380
  {
1381
  "epoch": 1.9368867506488574,
1382
+ "grad_norm": 1.5316967964172363,
1383
+ "learning_rate": 1.9696117051209907e-05,
1384
+ "loss": 2.5979,
1385
  "step": 7650
1386
  },
1387
  {
1388
  "epoch": 1.9495473824143825,
1389
+ "grad_norm": 1.3586021661758423,
1390
+ "learning_rate": 1.946163946726693e-05,
1391
+ "loss": 2.6304,
1392
  "step": 7700
1393
  },
1394
  {
1395
  "epoch": 1.9622080141799074,
1396
+ "grad_norm": 2.293283462524414,
1397
+ "learning_rate": 1.9227161883323953e-05,
1398
+ "loss": 2.5601,
1399
  "step": 7750
1400
  },
1401
  {
1402
  "epoch": 1.9748686459454325,
1403
+ "grad_norm": 1.6579082012176514,
1404
+ "learning_rate": 1.899268429938098e-05,
1405
+ "loss": 2.5124,
1406
  "step": 7800
1407
  },
1408
  {
1409
  "epoch": 1.9748686459454325,
1410
+ "eval_loss": 2.443239688873291,
1411
+ "eval_runtime": 41.5253,
1412
+ "eval_samples_per_second": 42.288,
1413
+ "eval_steps_per_second": 42.288,
1414
  "step": 7800
1415
  },
1416
  {
1417
  "epoch": 1.9875292777109577,
1418
+ "grad_norm": 1.2292983531951904,
1419
+ "learning_rate": 1.8758206715438003e-05,
1420
+ "loss": 2.5449,
1421
  "step": 7850
1422
  },
1423
  {
1424
  "epoch": 2.0,
1425
+ "grad_norm": 2.1584088802337646,
1426
+ "learning_rate": 1.852372913149503e-05,
1427
+ "loss": 2.576,
1428
  "step": 7900
1429
  },
1430
  {
1431
  "epoch": 2.012660631765525,
1432
+ "grad_norm": 1.248931646347046,
1433
+ "learning_rate": 1.8289251547552055e-05,
1434
+ "loss": 2.5218,
1435
  "step": 7950
1436
  },
1437
  {
1438
  "epoch": 2.0253212635310502,
1439
+ "grad_norm": 1.5526643991470337,
1440
+ "learning_rate": 1.8054773963609077e-05,
1441
+ "loss": 2.5839,
1442
  "step": 8000
1443
  },
1444
  {
1445
  "epoch": 2.0253212635310502,
1446
+ "eval_loss": 2.4422366619110107,
1447
+ "eval_runtime": 41.5286,
1448
+ "eval_samples_per_second": 42.284,
1449
+ "eval_steps_per_second": 42.284,
1450
  "step": 8000
1451
  },
1452
  {
1453
  "epoch": 2.0379818952965754,
1454
+ "grad_norm": 1.4182465076446533,
1455
+ "learning_rate": 1.7820296379666105e-05,
1456
  "loss": 2.5597,
1457
  "step": 8050
1458
  },
1459
  {
1460
  "epoch": 2.0506425270621005,
1461
+ "grad_norm": 1.2547794580459595,
1462
+ "learning_rate": 1.758581879572313e-05,
1463
+ "loss": 2.7643,
1464
  "step": 8100
1465
  },
1466
  {
1467
  "epoch": 2.0633031588276256,
1468
+ "grad_norm": 1.093676209449768,
1469
+ "learning_rate": 1.7351341211780155e-05,
1470
+ "loss": 2.5877,
1471
  "step": 8150
1472
  },
1473
  {
1474
  "epoch": 2.0759637905931507,
1475
+ "grad_norm": 2.055103302001953,
1476
+ "learning_rate": 1.711686362783718e-05,
1477
+ "loss": 2.4874,
1478
  "step": 8200
1479
  },
1480
  {
1481
  "epoch": 2.0759637905931507,
1482
+ "eval_loss": 2.4419164657592773,
1483
+ "eval_runtime": 40.7627,
1484
+ "eval_samples_per_second": 43.079,
1485
+ "eval_steps_per_second": 43.079,
1486
  "step": 8200
1487
  },
1488
  {
1489
  "epoch": 2.088624422358676,
1490
+ "grad_norm": 1.0890482664108276,
1491
+ "learning_rate": 1.6882386043894204e-05,
1492
+ "loss": 2.5942,
1493
  "step": 8250
1494
  },
1495
  {
1496
  "epoch": 2.101285054124201,
1497
+ "grad_norm": 1.8730581998825073,
1498
+ "learning_rate": 1.6647908459951232e-05,
1499
+ "loss": 2.592,
1500
  "step": 8300
1501
  },
1502
  {
1503
  "epoch": 2.113945685889726,
1504
+ "grad_norm": 1.6372568607330322,
1505
+ "learning_rate": 1.6413430876008254e-05,
1506
+ "loss": 2.5051,
1507
  "step": 8350
1508
  },
1509
  {
1510
  "epoch": 2.126606317655251,
1511
+ "grad_norm": 1.4793121814727783,
1512
+ "learning_rate": 1.6178953292065278e-05,
1513
+ "loss": 2.5644,
1514
  "step": 8400
1515
  },
1516
  {
1517
  "epoch": 2.126606317655251,
1518
+ "eval_loss": 2.44052791595459,
1519
+ "eval_runtime": 40.8302,
1520
+ "eval_samples_per_second": 43.007,
1521
+ "eval_steps_per_second": 43.007,
1522
  "step": 8400
1523
  },
1524
  {
1525
  "epoch": 2.139266949420776,
1526
+ "grad_norm": 1.4595574140548706,
1527
+ "learning_rate": 1.5944475708122306e-05,
1528
+ "loss": 2.6267,
1529
  "step": 8450
1530
  },
1531
  {
1532
  "epoch": 2.151927581186301,
1533
+ "grad_norm": 1.3399115800857544,
1534
+ "learning_rate": 1.5709998124179328e-05,
1535
  "loss": 2.5277,
1536
  "step": 8500
1537
  },
1538
  {
1539
  "epoch": 2.164588212951826,
1540
+ "grad_norm": 1.6734541654586792,
1541
  "learning_rate": 1.5480210091915216e-05,
1542
+ "loss": 2.5633,
1543
  "step": 8550
1544
  },
1545
  {
1546
  "epoch": 2.1772488447173513,
1547
+ "grad_norm": 1.5579371452331543,
1548
  "learning_rate": 1.5245732507972238e-05,
1549
+ "loss": 2.5467,
1550
  "step": 8600
1551
  },
1552
  {
1553
  "epoch": 2.1772488447173513,
1554
+ "eval_loss": 2.4398272037506104,
1555
+ "eval_runtime": 40.8947,
1556
+ "eval_samples_per_second": 42.94,
1557
+ "eval_steps_per_second": 42.94,
1558
  "step": 8600
1559
  },
1560
  {
1561
  "epoch": 2.1899094764828764,
1562
+ "grad_norm": 1.932307243347168,
1563
  "learning_rate": 1.5011254924029264e-05,
1564
  "loss": 2.6112,
1565
  "step": 8650
1566
  },
1567
  {
1568
  "epoch": 2.2025701082484015,
1569
+ "grad_norm": 1.9798572063446045,
1570
  "learning_rate": 1.4776777340086289e-05,
1571
+ "loss": 2.5891,
1572
  "step": 8700
1573
  },
1574
  {
1575
  "epoch": 2.2152307400139266,
1576
+ "grad_norm": 1.8812506198883057,
1577
  "learning_rate": 1.4542299756143312e-05,
1578
+ "loss": 2.5659,
1579
  "step": 8750
1580
  },
1581
  {
1582
  "epoch": 2.2278913717794517,
1583
+ "grad_norm": 1.5422954559326172,
1584
  "learning_rate": 1.4307822172200339e-05,
1585
+ "loss": 2.5315,
1586
  "step": 8800
1587
  },
1588
  {
1589
  "epoch": 2.2278913717794517,
1590
+ "eval_loss": 2.439927816390991,
1591
+ "eval_runtime": 40.7058,
1592
+ "eval_samples_per_second": 43.139,
1593
+ "eval_steps_per_second": 43.139,
1594
  "step": 8800
1595
  },
1596
  {
1597
  "epoch": 2.240552003544977,
1598
+ "grad_norm": 1.2686810493469238,
1599
  "learning_rate": 1.4073344588257365e-05,
1600
+ "loss": 2.5809,
1601
  "step": 8850
1602
  },
1603
  {
1604
  "epoch": 2.253212635310502,
1605
+ "grad_norm": 1.905816674232483,
1606
  "learning_rate": 1.3838867004314388e-05,
1607
+ "loss": 2.5225,
1608
  "step": 8900
1609
  },
1610
  {
1611
  "epoch": 2.265873267076027,
1612
+ "grad_norm": 1.9044383764266968,
1613
  "learning_rate": 1.3604389420371413e-05,
1614
+ "loss": 2.5301,
1615
  "step": 8950
1616
  },
1617
  {
1618
  "epoch": 2.278533898841552,
1619
+ "grad_norm": 1.2211689949035645,
1620
  "learning_rate": 1.336991183642844e-05,
1621
+ "loss": 2.5885,
1622
  "step": 9000
1623
  },
1624
  {
1625
  "epoch": 2.278533898841552,
1626
+ "eval_loss": 2.4387078285217285,
1627
+ "eval_runtime": 40.6013,
1628
+ "eval_samples_per_second": 43.25,
1629
+ "eval_steps_per_second": 43.25,
1630
  "step": 9000
1631
  },
1632
  {
1633
  "epoch": 2.2911945306070773,
1634
+ "grad_norm": 1.7181427478790283,
1635
  "learning_rate": 1.3135434252485462e-05,
1636
+ "loss": 2.5422,
1637
  "step": 9050
1638
  },
1639
  {
1640
  "epoch": 2.3038551623726025,
1641
+ "grad_norm": 1.714859127998352,
1642
  "learning_rate": 1.2900956668542489e-05,
1643
+ "loss": 2.4957,
1644
  "step": 9100
1645
  },
1646
  {
1647
  "epoch": 2.3165157941381276,
1648
+ "grad_norm": 1.473822832107544,
1649
  "learning_rate": 1.2666479084599514e-05,
1650
  "loss": 2.606,
1651
  "step": 9150
1652
  },
1653
  {
1654
  "epoch": 2.3291764259036527,
1655
+ "grad_norm": 1.6518057584762573,
1656
  "learning_rate": 1.2432001500656538e-05,
1657
+ "loss": 2.5488,
1658
  "step": 9200
1659
  },
1660
  {
1661
  "epoch": 2.3291764259036527,
1662
+ "eval_loss": 2.438912868499756,
1663
+ "eval_runtime": 40.9773,
1664
+ "eval_samples_per_second": 42.853,
1665
+ "eval_steps_per_second": 42.853,
1666
  "step": 9200
1667
  },
1668
  {
1669
  "epoch": 2.341837057669178,
1670
+ "grad_norm": 1.0921835899353027,
1671
  "learning_rate": 1.2197523916713563e-05,
1672
+ "loss": 2.5456,
1673
  "step": 9250
1674
  },
1675
  {
1676
  "epoch": 2.354497689434703,
1677
+ "grad_norm": 2.0887908935546875,
1678
  "learning_rate": 1.1963046332770588e-05,
1679
+ "loss": 2.5298,
1680
  "step": 9300
1681
  },
1682
  {
1683
  "epoch": 2.367158321200228,
1684
+ "grad_norm": 2.09403133392334,
1685
  "learning_rate": 1.1728568748827613e-05,
1686
+ "loss": 2.5843,
1687
  "step": 9350
1688
  },
1689
  {
1690
  "epoch": 2.3798189529657527,
1691
+ "grad_norm": 1.2155842781066895,
1692
  "learning_rate": 1.1494091164884637e-05,
1693
+ "loss": 2.639,
1694
  "step": 9400
1695
  },
1696
  {
1697
  "epoch": 2.3798189529657527,
1698
+ "eval_loss": 2.4376986026763916,
1699
+ "eval_runtime": 40.7687,
1700
+ "eval_samples_per_second": 43.072,
1701
+ "eval_steps_per_second": 43.072,
1702
  "step": 9400
1703
  },
1704
  {
1705
  "epoch": 2.3924795847312783,
1706
+ "grad_norm": 1.2745308876037598,
1707
  "learning_rate": 1.1259613580941662e-05,
1708
  "loss": 2.572,
1709
  "step": 9450
1710
  },
1711
  {
1712
  "epoch": 2.405140216496803,
1713
+ "grad_norm": 1.243294358253479,
1714
  "learning_rate": 1.1025135996998689e-05,
1715
+ "loss": 2.6086,
1716
  "step": 9500
1717
  },
1718
  {
1719
  "epoch": 2.417800848262328,
1720
+ "grad_norm": 1.3740507364273071,
1721
  "learning_rate": 1.0790658413055713e-05,
1722
+ "loss": 2.5203,
1723
  "step": 9550
1724
  },
1725
  {
1726
  "epoch": 2.4304614800278532,
1727
+ "grad_norm": 1.3419544696807861,
1728
  "learning_rate": 1.0556180829112736e-05,
1729
+ "loss": 2.4968,
1730
  "step": 9600
1731
  },
1732
  {
1733
  "epoch": 2.4304614800278532,
1734
+ "eval_loss": 2.4370713233947754,
1735
+ "eval_runtime": 40.6311,
1736
+ "eval_samples_per_second": 43.218,
1737
+ "eval_steps_per_second": 43.218,
1738
  "step": 9600
1739
  },
1740
  {
1741
  "epoch": 2.4431221117933783,
1742
+ "grad_norm": 1.2722185850143433,
1743
  "learning_rate": 1.0321703245169763e-05,
1744
+ "loss": 2.617,
1745
  "step": 9650
1746
  },
1747
  {
1748
  "epoch": 2.4557827435589035,
1749
+ "grad_norm": 1.336860179901123,
1750
  "learning_rate": 1.0087225661226788e-05,
1751
+ "loss": 2.5443,
1752
  "step": 9700
1753
  },
1754
  {
1755
  "epoch": 2.4684433753244286,
1756
+ "grad_norm": 1.5844101905822754,
1757
  "learning_rate": 9.852748077283812e-06,
1758
+ "loss": 2.5358,
1759
  "step": 9750
1760
  },
1761
  {
1762
  "epoch": 2.4811040070899537,
1763
+ "grad_norm": 1.376717209815979,
1764
  "learning_rate": 9.618270493340837e-06,
1765
+ "loss": 2.5065,
1766
  "step": 9800
1767
  },
1768
  {
1769
  "epoch": 2.4811040070899537,
1770
+ "eval_loss": 2.4364349842071533,
1771
+ "eval_runtime": 40.68,
1772
+ "eval_samples_per_second": 43.166,
1773
+ "eval_steps_per_second": 43.166,
1774
  "step": 9800
1775
  },
1776
  {
1777
  "epoch": 2.493764638855479,
1778
+ "grad_norm": 2.2268385887145996,
1779
  "learning_rate": 9.383792909397862e-06,
1780
+ "loss": 2.5043,
1781
  "step": 9850
1782
  },
1783
  {
1784
  "epoch": 2.506425270621004,
1785
+ "grad_norm": 1.304364800453186,
1786
  "learning_rate": 9.149315325454887e-06,
1787
+ "loss": 2.5213,
1788
  "step": 9900
1789
  },
1790
  {
1791
  "epoch": 2.519085902386529,
1792
+ "grad_norm": 1.662419319152832,
1793
+ "learning_rate": 8.914837741511913e-06,
1794
+ "loss": 2.6088,
1795
  "step": 9950
1796
  },
1797
  {
1798
  "epoch": 2.531746534152054,
1799
+ "grad_norm": 3.155359983444214,
1800
+ "learning_rate": 8.680360157568938e-06,
1801
+ "loss": 2.5311,
1802
  "step": 10000
1803
  },
1804
  {
1805
  "epoch": 2.531746534152054,
1806
+ "eval_loss": 2.4357810020446777,
1807
+ "eval_runtime": 40.7194,
1808
+ "eval_samples_per_second": 43.124,
1809
+ "eval_steps_per_second": 43.124,
1810
  "step": 10000
1811
  },
1812
  {
1813
  "epoch": 2.5444071659175793,
1814
+ "grad_norm": 1.9857499599456787,
1815
+ "learning_rate": 8.44588257362596e-06,
1816
+ "loss": 2.5105,
1817
  "step": 10050
1818
  },
1819
  {
1820
  "epoch": 2.5570677976831044,
1821
+ "grad_norm": 1.383115530014038,
1822
+ "learning_rate": 8.211404989682987e-06,
1823
+ "loss": 2.5388,
1824
  "step": 10100
1825
  },
1826
  {
1827
  "epoch": 2.5697284294486296,
1828
+ "grad_norm": 2.1307530403137207,
1829
+ "learning_rate": 7.976927405740012e-06,
1830
+ "loss": 2.5422,
1831
  "step": 10150
1832
  },
1833
  {
1834
  "epoch": 2.5823890612141547,
1835
+ "grad_norm": 1.7428008317947388,
1836
+ "learning_rate": 7.742449821797037e-06,
1837
+ "loss": 2.5139,
1838
  "step": 10200
1839
  },
1840
  {
1841
  "epoch": 2.5823890612141547,
1842
+ "eval_loss": 2.4361467361450195,
1843
+ "eval_runtime": 40.6659,
1844
+ "eval_samples_per_second": 43.181,
1845
+ "eval_steps_per_second": 43.181,
1846
  "step": 10200
1847
  },
1848
  {
1849
  "epoch": 2.59504969297968,
1850
+ "grad_norm": 1.9510554075241089,
1851
+ "learning_rate": 7.507972237854062e-06,
1852
+ "loss": 2.6383,
1853
  "step": 10250
1854
  },
1855
  {
1856
  "epoch": 2.607710324745205,
1857
+ "grad_norm": 1.658544898033142,
1858
+ "learning_rate": 7.273494653911086e-06,
1859
+ "loss": 2.5507,
1860
  "step": 10300
1861
  },
1862
  {
1863
  "epoch": 2.62037095651073,
1864
+ "grad_norm": 1.4996962547302246,
1865
+ "learning_rate": 7.039017069968111e-06,
1866
+ "loss": 2.5093,
1867
  "step": 10350
1868
  },
1869
  {
1870
  "epoch": 2.633031588276255,
1871
+ "grad_norm": 1.6710158586502075,
1872
+ "learning_rate": 6.804539486025137e-06,
1873
+ "loss": 2.6495,
1874
  "step": 10400
1875
  },
1876
  {
1877
  "epoch": 2.633031588276255,
1878
+ "eval_loss": 2.435485601425171,
1879
+ "eval_runtime": 40.7734,
1880
+ "eval_samples_per_second": 43.067,
1881
+ "eval_steps_per_second": 43.067,
1882
  "step": 10400
1883
  },
1884
  {
1885
  "epoch": 2.64569222004178,
1886
+ "grad_norm": 1.3965766429901123,
1887
+ "learning_rate": 6.570061902082161e-06,
1888
  "loss": 2.5756,
1889
  "step": 10450
1890
  },
1891
  {
1892
  "epoch": 2.6583528518073054,
1893
+ "grad_norm": 1.1116695404052734,
1894
+ "learning_rate": 6.335584318139186e-06,
1895
+ "loss": 2.584,
1896
  "step": 10500
1897
  },
1898
  {
1899
  "epoch": 2.67101348357283,
1900
+ "grad_norm": 1.0421807765960693,
1901
+ "learning_rate": 6.101106734196211e-06,
1902
+ "loss": 2.5244,
1903
  "step": 10550
1904
  },
1905
  {
1906
  "epoch": 2.6836741153383556,
1907
+ "grad_norm": 1.374508023262024,
1908
+ "learning_rate": 5.8666291502532365e-06,
1909
+ "loss": 2.5097,
1910
  "step": 10600
1911
  },
1912
  {
1913
  "epoch": 2.6836741153383556,
1914
+ "eval_loss": 2.4343748092651367,
1915
+ "eval_runtime": 40.8106,
1916
+ "eval_samples_per_second": 43.028,
1917
+ "eval_steps_per_second": 43.028,
1918
  "step": 10600
1919
  },
1920
  {
1921
  "epoch": 2.6963347471038803,
1922
+ "grad_norm": 1.139459252357483,
1923
+ "learning_rate": 5.632151566310261e-06,
1924
+ "loss": 2.586,
1925
  "step": 10650
1926
  },
1927
  {
1928
  "epoch": 2.7089953788694054,
1929
+ "grad_norm": 1.1283456087112427,
1930
+ "learning_rate": 5.397673982367286e-06,
1931
+ "loss": 2.5461,
1932
  "step": 10700
1933
  },
1934
  {
1935
  "epoch": 2.7216560106349306,
1936
+ "grad_norm": 1.2529475688934326,
1937
+ "learning_rate": 5.163196398424311e-06,
1938
+ "loss": 2.5577,
1939
  "step": 10750
1940
  },
1941
  {
1942
  "epoch": 2.7343166424004557,
1943
+ "grad_norm": 1.7139452695846558,
1944
+ "learning_rate": 4.928718814481336e-06,
1945
+ "loss": 2.5476,
1946
  "step": 10800
1947
  },
1948
  {
1949
  "epoch": 2.7343166424004557,
1950
+ "eval_loss": 2.435107707977295,
1951
+ "eval_runtime": 40.9961,
1952
+ "eval_samples_per_second": 42.833,
1953
+ "eval_steps_per_second": 42.833,
1954
  "step": 10800
1955
  },
1956
  {
1957
  "epoch": 2.746977274165981,
1958
+ "grad_norm": 1.3778159618377686,
1959
+ "learning_rate": 4.69424123053836e-06,
1960
+ "loss": 2.5561,
1961
  "step": 10850
1962
  },
1963
  {
1964
  "epoch": 2.759637905931506,
1965
+ "grad_norm": 1.0923840999603271,
1966
+ "learning_rate": 4.459763646595386e-06,
1967
+ "loss": 2.5184,
1968
  "step": 10900
1969
  },
1970
  {
1971
  "epoch": 2.772298537697031,
1972
+ "grad_norm": 1.2169151306152344,
1973
+ "learning_rate": 4.225286062652411e-06,
1974
+ "loss": 2.6378,
1975
  "step": 10950
1976
  },
1977
  {
1978
  "epoch": 2.784959169462556,
1979
+ "grad_norm": 1.5901875495910645,
1980
+ "learning_rate": 3.9908084787094354e-06,
1981
+ "loss": 2.6822,
1982
  "step": 11000
1983
  },
1984
  {
1985
  "epoch": 2.784959169462556,
1986
+ "eval_loss": 2.434779644012451,
1987
+ "eval_runtime": 40.6453,
1988
+ "eval_samples_per_second": 43.203,
1989
+ "eval_steps_per_second": 43.203,
1990
  "step": 11000
1991
  },
1992
  {
1993
  "epoch": 2.7976198012280813,
1994
+ "grad_norm": 1.7463274002075195,
1995
+ "learning_rate": 3.7563308947664606e-06,
1996
+ "loss": 2.5749,
1997
  "step": 11050
1998
  },
1999
  {
2000
  "epoch": 2.8102804329936064,
2001
+ "grad_norm": 1.236441731452942,
2002
+ "learning_rate": 3.521853310823486e-06,
2003
+ "loss": 2.5416,
2004
  "step": 11100
2005
  },
2006
  {
2007
  "epoch": 2.8229410647591315,
2008
+ "grad_norm": 1.132720708847046,
2009
+ "learning_rate": 3.28737572688051e-06,
2010
+ "loss": 2.597,
2011
  "step": 11150
2012
  },
2013
  {
2014
  "epoch": 2.8356016965246567,
2015
+ "grad_norm": 2.339376926422119,
2016
+ "learning_rate": 3.0528981429375353e-06,
2017
+ "loss": 2.5564,
2018
  "step": 11200
2019
  },
2020
  {
2021
  "epoch": 2.8356016965246567,
2022
+ "eval_loss": 2.4349372386932373,
2023
+ "eval_runtime": 40.7164,
2024
+ "eval_samples_per_second": 43.128,
2025
+ "eval_steps_per_second": 43.128,
2026
  "step": 11200
2027
  },
2028
  {
2029
  "epoch": 2.8482623282901818,
2030
+ "grad_norm": 1.878458857536316,
2031
+ "learning_rate": 2.81842055899456e-06,
2032
+ "loss": 2.6238,
2033
  "step": 11250
2034
  },
2035
  {
2036
  "epoch": 2.860922960055707,
2037
+ "grad_norm": 1.8116399049758911,
2038
+ "learning_rate": 2.5839429750515852e-06,
2039
+ "loss": 2.5114,
2040
  "step": 11300
2041
  },
2042
  {
2043
  "epoch": 2.873583591821232,
2044
+ "grad_norm": 1.155181884765625,
2045
+ "learning_rate": 2.34946539110861e-06,
2046
+ "loss": 2.5582,
2047
  "step": 11350
2048
  },
2049
  {
2050
  "epoch": 2.886244223586757,
2051
+ "grad_norm": 1.505588173866272,
2052
+ "learning_rate": 2.1149878071656348e-06,
2053
+ "loss": 2.6288,
2054
  "step": 11400
2055
  },
2056
  {
2057
  "epoch": 2.886244223586757,
2058
+ "eval_loss": 2.434521436691284,
2059
+ "eval_runtime": 41.0956,
2060
+ "eval_samples_per_second": 42.73,
2061
+ "eval_steps_per_second": 42.73,
2062
  "step": 11400
2063
  },
2064
  {
2065
  "epoch": 2.8989048553522823,
2066
+ "grad_norm": 1.4831116199493408,
2067
+ "learning_rate": 1.8805102232226601e-06,
2068
+ "loss": 2.4811,
2069
  "step": 11450
2070
  },
2071
  {
2072
  "epoch": 2.911565487117807,
2073
+ "grad_norm": 1.931284785270691,
2074
+ "learning_rate": 1.646032639279685e-06,
2075
+ "loss": 2.5426,
2076
  "step": 11500
2077
  },
2078
  {
2079
  "epoch": 2.9242261188833325,
2080
+ "grad_norm": 1.6025974750518799,
2081
+ "learning_rate": 1.4115550553367099e-06,
2082
+ "loss": 2.509,
2083
  "step": 11550
2084
  },
2085
  {
2086
  "epoch": 2.936886750648857,
2087
+ "grad_norm": 1.3426520824432373,
2088
+ "learning_rate": 1.1770774713937348e-06,
2089
+ "loss": 2.6057,
2090
  "step": 11600
2091
  },
2092
  {
2093
  "epoch": 2.936886750648857,
2094
+ "eval_loss": 2.4339404106140137,
2095
+ "eval_runtime": 41.3931,
2096
+ "eval_samples_per_second": 42.423,
2097
+ "eval_steps_per_second": 42.423,
2098
  "step": 11600
2099
  },
2100
  {
2101
  "epoch": 2.9495473824143827,
2102
+ "grad_norm": 1.4393337965011597,
2103
+ "learning_rate": 9.425998874507597e-07,
2104
+ "loss": 2.5014,
2105
  "step": 11650
2106
  },
2107
  {
2108
  "epoch": 2.9622080141799074,
2109
+ "grad_norm": 1.692421317100525,
2110
+ "learning_rate": 7.081223035077847e-07,
2111
+ "loss": 2.5586,
2112
  "step": 11700
2113
  },
2114
  {
2115
  "epoch": 2.9748686459454325,
2116
+ "grad_norm": 1.0518131256103516,
2117
+ "learning_rate": 4.7364471956480963e-07,
2118
+ "loss": 2.602,
2119
  "step": 11750
2120
  },
2121
  {
2122
  "epoch": 2.9875292777109577,
2123
+ "grad_norm": 1.4363523721694946,
2124
+ "learning_rate": 2.3916713562183455e-07,
2125
+ "loss": 2.6237,
2126
  "step": 11800
2127
  },
2128
  {
2129
  "epoch": 2.9875292777109577,
2130
+ "eval_loss": 2.4338691234588623,
2131
+ "eval_runtime": 41.308,
2132
+ "eval_samples_per_second": 42.51,
2133
+ "eval_steps_per_second": 42.51,
2134
  "step": 11800
2135
  }
2136
  ],
checkpoint-11847/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8421dc43c44b3cc68cec62a0d36963570aa58f934fcd2e92f9f288f7caa6d69
3
  size 5304
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d46b7107c3dfd17ff0fb12b8cdcc256a49e8d4594358d4495904a5fcb069382
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8421dc43c44b3cc68cec62a0d36963570aa58f934fcd2e92f9f288f7caa6d69
3
  size 5304