ErrorAI commited on
Commit
19981f8
·
verified ·
1 Parent(s): 653fc06

Training in progress, step 690, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d94ef6bdc97bf939001e00a3eec4dc3910747528528e91a46806b3d34347b2f
3
  size 4731640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb3724c874ea65782d8dcd4cb5c3209c89ff98f654f1190a20ab5fa9de8602ff
3
  size 4731640
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce84a0b8e34030d580523275904d4c7a50cd2bf803807f4c60772c5e3a0df929
3
  size 2505850
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbcbb0ae8f5791ad43b6585f30b8675ce7b11665e8aa85d4cd2b157c82c0a59d
3
  size 2505850
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d69466bc6c844b889e21aa8ca57bcb74721160c3b9d42bb9b7806591c1ecf00
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eccaa3eca2e911dfe6c95134e6dab45713a2d23ff2880dd37ecc335fd49e8a8f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7769c444878ce8e9c8e8a6ec90449c1e52a5df6741e803674eff7c3726f007cc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4c2fbcfa9fc7eb2f52e6e150e8b4f82c141787d028ff20f699dcb142cbe5fb
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5010893246187363,
5
  "eval_steps": 230,
6
- "global_step": 460,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3251,6 +3251,1624 @@
3251
  "eval_samples_per_second": 139.095,
3252
  "eval_steps_per_second": 69.727,
3253
  "step": 460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3254
  }
3255
  ],
3256
  "logging_steps": 1,
@@ -3270,7 +4888,7 @@
3270
  "attributes": {}
3271
  }
3272
  },
3273
- "total_flos": 1564312385617920.0,
3274
  "train_batch_size": 2,
3275
  "trial_name": null,
3276
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7516339869281046,
5
  "eval_steps": 230,
6
+ "global_step": 690,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3251
  "eval_samples_per_second": 139.095,
3252
  "eval_steps_per_second": 69.727,
3253
  "step": 460
3254
+ },
3255
+ {
3256
+ "epoch": 0.5021786492374728,
3257
+ "grad_norm": 1697.1995849609375,
3258
+ "learning_rate": 0.00010103795250292778,
3259
+ "loss": 15.0099,
3260
+ "step": 461
3261
+ },
3262
+ {
3263
+ "epoch": 0.5032679738562091,
3264
+ "grad_norm": 1438.0911865234375,
3265
+ "learning_rate": 0.00010069197523827833,
3266
+ "loss": 16.3135,
3267
+ "step": 462
3268
+ },
3269
+ {
3270
+ "epoch": 0.5043572984749455,
3271
+ "grad_norm": 1578.0306396484375,
3272
+ "learning_rate": 0.00010034598969004705,
3273
+ "loss": 14.3354,
3274
+ "step": 463
3275
+ },
3276
+ {
3277
+ "epoch": 0.5054466230936819,
3278
+ "grad_norm": 1145.7254638671875,
3279
+ "learning_rate": 0.0001,
3280
+ "loss": 15.1201,
3281
+ "step": 464
3282
+ },
3283
+ {
3284
+ "epoch": 0.5065359477124183,
3285
+ "grad_norm": 1281.2977294921875,
3286
+ "learning_rate": 9.965401030995301e-05,
3287
+ "loss": 15.0423,
3288
+ "step": 465
3289
+ },
3290
+ {
3291
+ "epoch": 0.5076252723311547,
3292
+ "grad_norm": 2497.565673828125,
3293
+ "learning_rate": 9.930802476172169e-05,
3294
+ "loss": 12.9232,
3295
+ "step": 466
3296
+ },
3297
+ {
3298
+ "epoch": 0.5087145969498911,
3299
+ "grad_norm": 1575.5599365234375,
3300
+ "learning_rate": 9.896204749707228e-05,
3301
+ "loss": 14.6681,
3302
+ "step": 467
3303
+ },
3304
+ {
3305
+ "epoch": 0.5098039215686274,
3306
+ "grad_norm": 941.0279541015625,
3307
+ "learning_rate": 9.861608265767167e-05,
3308
+ "loss": 14.7297,
3309
+ "step": 468
3310
+ },
3311
+ {
3312
+ "epoch": 0.5108932461873639,
3313
+ "grad_norm": 1556.48388671875,
3314
+ "learning_rate": 9.827013438503822e-05,
3315
+ "loss": 14.6033,
3316
+ "step": 469
3317
+ },
3318
+ {
3319
+ "epoch": 0.5119825708061002,
3320
+ "grad_norm": 1751.578125,
3321
+ "learning_rate": 9.792420682049174e-05,
3322
+ "loss": 15.0415,
3323
+ "step": 470
3324
+ },
3325
+ {
3326
+ "epoch": 0.5130718954248366,
3327
+ "grad_norm": 1077.553955078125,
3328
+ "learning_rate": 9.757830410510433e-05,
3329
+ "loss": 13.5249,
3330
+ "step": 471
3331
+ },
3332
+ {
3333
+ "epoch": 0.514161220043573,
3334
+ "grad_norm": 1023.6539916992188,
3335
+ "learning_rate": 9.723243037965056e-05,
3336
+ "loss": 13.8337,
3337
+ "step": 472
3338
+ },
3339
+ {
3340
+ "epoch": 0.5152505446623094,
3341
+ "grad_norm": 1367.58984375,
3342
+ "learning_rate": 9.688658978455784e-05,
3343
+ "loss": 15.3791,
3344
+ "step": 473
3345
+ },
3346
+ {
3347
+ "epoch": 0.5163398692810458,
3348
+ "grad_norm": 1138.333740234375,
3349
+ "learning_rate": 9.654078645985722e-05,
3350
+ "loss": 14.343,
3351
+ "step": 474
3352
+ },
3353
+ {
3354
+ "epoch": 0.5174291938997821,
3355
+ "grad_norm": 958.8590698242188,
3356
+ "learning_rate": 9.619502454513338e-05,
3357
+ "loss": 14.1471,
3358
+ "step": 475
3359
+ },
3360
+ {
3361
+ "epoch": 0.5185185185185185,
3362
+ "grad_norm": 1362.31689453125,
3363
+ "learning_rate": 9.584930817947544e-05,
3364
+ "loss": 14.2489,
3365
+ "step": 476
3366
+ },
3367
+ {
3368
+ "epoch": 0.5196078431372549,
3369
+ "grad_norm": 1034.6534423828125,
3370
+ "learning_rate": 9.550364150142713e-05,
3371
+ "loss": 13.4652,
3372
+ "step": 477
3373
+ },
3374
+ {
3375
+ "epoch": 0.5206971677559913,
3376
+ "grad_norm": 1429.5125732421875,
3377
+ "learning_rate": 9.515802864893739e-05,
3378
+ "loss": 14.1628,
3379
+ "step": 478
3380
+ },
3381
+ {
3382
+ "epoch": 0.5217864923747276,
3383
+ "grad_norm": 1044.7430419921875,
3384
+ "learning_rate": 9.481247375931094e-05,
3385
+ "loss": 13.1049,
3386
+ "step": 479
3387
+ },
3388
+ {
3389
+ "epoch": 0.5228758169934641,
3390
+ "grad_norm": 1164.5643310546875,
3391
+ "learning_rate": 9.446698096915847e-05,
3392
+ "loss": 13.8229,
3393
+ "step": 480
3394
+ },
3395
+ {
3396
+ "epoch": 0.5239651416122004,
3397
+ "grad_norm": 1188.510986328125,
3398
+ "learning_rate": 9.412155441434741e-05,
3399
+ "loss": 13.4517,
3400
+ "step": 481
3401
+ },
3402
+ {
3403
+ "epoch": 0.5250544662309368,
3404
+ "grad_norm": 1218.689208984375,
3405
+ "learning_rate": 9.377619822995219e-05,
3406
+ "loss": 15.032,
3407
+ "step": 482
3408
+ },
3409
+ {
3410
+ "epoch": 0.5261437908496732,
3411
+ "grad_norm": 802.5537109375,
3412
+ "learning_rate": 9.343091655020495e-05,
3413
+ "loss": 13.9797,
3414
+ "step": 483
3415
+ },
3416
+ {
3417
+ "epoch": 0.5272331154684096,
3418
+ "grad_norm": 938.1593627929688,
3419
+ "learning_rate": 9.308571350844584e-05,
3420
+ "loss": 13.8941,
3421
+ "step": 484
3422
+ },
3423
+ {
3424
+ "epoch": 0.528322440087146,
3425
+ "grad_norm": 1472.3194580078125,
3426
+ "learning_rate": 9.274059323707366e-05,
3427
+ "loss": 13.2638,
3428
+ "step": 485
3429
+ },
3430
+ {
3431
+ "epoch": 0.5294117647058824,
3432
+ "grad_norm": 1048.2354736328125,
3433
+ "learning_rate": 9.239555986749645e-05,
3434
+ "loss": 14.3531,
3435
+ "step": 486
3436
+ },
3437
+ {
3438
+ "epoch": 0.5305010893246187,
3439
+ "grad_norm": 1296.9527587890625,
3440
+ "learning_rate": 9.205061753008183e-05,
3441
+ "loss": 14.203,
3442
+ "step": 487
3443
+ },
3444
+ {
3445
+ "epoch": 0.5315904139433552,
3446
+ "grad_norm": 1657.743896484375,
3447
+ "learning_rate": 9.170577035410783e-05,
3448
+ "loss": 13.7112,
3449
+ "step": 488
3450
+ },
3451
+ {
3452
+ "epoch": 0.5326797385620915,
3453
+ "grad_norm": 1212.4053955078125,
3454
+ "learning_rate": 9.136102246771314e-05,
3455
+ "loss": 13.2756,
3456
+ "step": 489
3457
+ },
3458
+ {
3459
+ "epoch": 0.5337690631808278,
3460
+ "grad_norm": 1304.51220703125,
3461
+ "learning_rate": 9.101637799784804e-05,
3462
+ "loss": 14.4237,
3463
+ "step": 490
3464
+ },
3465
+ {
3466
+ "epoch": 0.5348583877995643,
3467
+ "grad_norm": 1542.832275390625,
3468
+ "learning_rate": 9.06718410702247e-05,
3469
+ "loss": 14.4378,
3470
+ "step": 491
3471
+ },
3472
+ {
3473
+ "epoch": 0.5359477124183006,
3474
+ "grad_norm": 1302.9571533203125,
3475
+ "learning_rate": 9.032741580926787e-05,
3476
+ "loss": 13.1055,
3477
+ "step": 492
3478
+ },
3479
+ {
3480
+ "epoch": 0.5370370370370371,
3481
+ "grad_norm": 1939.5728759765625,
3482
+ "learning_rate": 8.998310633806571e-05,
3483
+ "loss": 12.4175,
3484
+ "step": 493
3485
+ },
3486
+ {
3487
+ "epoch": 0.5381263616557734,
3488
+ "grad_norm": 1355.798095703125,
3489
+ "learning_rate": 8.963891677832011e-05,
3490
+ "loss": 12.9592,
3491
+ "step": 494
3492
+ },
3493
+ {
3494
+ "epoch": 0.5392156862745098,
3495
+ "grad_norm": 2074.481201171875,
3496
+ "learning_rate": 8.929485125029766e-05,
3497
+ "loss": 13.6742,
3498
+ "step": 495
3499
+ },
3500
+ {
3501
+ "epoch": 0.5403050108932462,
3502
+ "grad_norm": 1087.583740234375,
3503
+ "learning_rate": 8.895091387277999e-05,
3504
+ "loss": 14.9366,
3505
+ "step": 496
3506
+ },
3507
+ {
3508
+ "epoch": 0.5413943355119826,
3509
+ "grad_norm": 1182.4432373046875,
3510
+ "learning_rate": 8.860710876301484e-05,
3511
+ "loss": 12.0956,
3512
+ "step": 497
3513
+ },
3514
+ {
3515
+ "epoch": 0.5424836601307189,
3516
+ "grad_norm": 1062.96923828125,
3517
+ "learning_rate": 8.826344003666647e-05,
3518
+ "loss": 13.0663,
3519
+ "step": 498
3520
+ },
3521
+ {
3522
+ "epoch": 0.5435729847494554,
3523
+ "grad_norm": 1257.4434814453125,
3524
+ "learning_rate": 8.791991180776648e-05,
3525
+ "loss": 14.0272,
3526
+ "step": 499
3527
+ },
3528
+ {
3529
+ "epoch": 0.5446623093681917,
3530
+ "grad_norm": 1445.0548095703125,
3531
+ "learning_rate": 8.757652818866471e-05,
3532
+ "loss": 12.8504,
3533
+ "step": 500
3534
+ },
3535
+ {
3536
+ "epoch": 0.545751633986928,
3537
+ "grad_norm": 1665.5859375,
3538
+ "learning_rate": 8.723329328997973e-05,
3539
+ "loss": 13.8356,
3540
+ "step": 501
3541
+ },
3542
+ {
3543
+ "epoch": 0.5468409586056645,
3544
+ "grad_norm": 995.88427734375,
3545
+ "learning_rate": 8.689021122054996e-05,
3546
+ "loss": 12.8165,
3547
+ "step": 502
3548
+ },
3549
+ {
3550
+ "epoch": 0.5479302832244008,
3551
+ "grad_norm": 1658.434326171875,
3552
+ "learning_rate": 8.654728608738418e-05,
3553
+ "loss": 14.196,
3554
+ "step": 503
3555
+ },
3556
+ {
3557
+ "epoch": 0.5490196078431373,
3558
+ "grad_norm": 984.2662963867188,
3559
+ "learning_rate": 8.620452199561254e-05,
3560
+ "loss": 13.6334,
3561
+ "step": 504
3562
+ },
3563
+ {
3564
+ "epoch": 0.5501089324618736,
3565
+ "grad_norm": 1411.3946533203125,
3566
+ "learning_rate": 8.58619230484374e-05,
3567
+ "loss": 13.7294,
3568
+ "step": 505
3569
+ },
3570
+ {
3571
+ "epoch": 0.55119825708061,
3572
+ "grad_norm": 1083.3192138671875,
3573
+ "learning_rate": 8.551949334708415e-05,
3574
+ "loss": 12.9704,
3575
+ "step": 506
3576
+ },
3577
+ {
3578
+ "epoch": 0.5522875816993464,
3579
+ "grad_norm": 1257.832275390625,
3580
+ "learning_rate": 8.51772369907522e-05,
3581
+ "loss": 12.5718,
3582
+ "step": 507
3583
+ },
3584
+ {
3585
+ "epoch": 0.5533769063180828,
3586
+ "grad_norm": 871.9485473632812,
3587
+ "learning_rate": 8.483515807656576e-05,
3588
+ "loss": 13.1949,
3589
+ "step": 508
3590
+ },
3591
+ {
3592
+ "epoch": 0.5544662309368191,
3593
+ "grad_norm": 1094.6038818359375,
3594
+ "learning_rate": 8.449326069952506e-05,
3595
+ "loss": 14.2022,
3596
+ "step": 509
3597
+ },
3598
+ {
3599
+ "epoch": 0.5555555555555556,
3600
+ "grad_norm": 1254.0933837890625,
3601
+ "learning_rate": 8.415154895245697e-05,
3602
+ "loss": 13.1929,
3603
+ "step": 510
3604
+ },
3605
+ {
3606
+ "epoch": 0.5566448801742919,
3607
+ "grad_norm": 819.5022583007812,
3608
+ "learning_rate": 8.381002692596635e-05,
3609
+ "loss": 13.739,
3610
+ "step": 511
3611
+ },
3612
+ {
3613
+ "epoch": 0.5577342047930284,
3614
+ "grad_norm": 1080.34521484375,
3615
+ "learning_rate": 8.346869870838685e-05,
3616
+ "loss": 12.4172,
3617
+ "step": 512
3618
+ },
3619
+ {
3620
+ "epoch": 0.5588235294117647,
3621
+ "grad_norm": 844.1549072265625,
3622
+ "learning_rate": 8.312756838573208e-05,
3623
+ "loss": 14.1843,
3624
+ "step": 513
3625
+ },
3626
+ {
3627
+ "epoch": 0.5599128540305011,
3628
+ "grad_norm": 1664.24462890625,
3629
+ "learning_rate": 8.278664004164665e-05,
3630
+ "loss": 13.9817,
3631
+ "step": 514
3632
+ },
3633
+ {
3634
+ "epoch": 0.5610021786492375,
3635
+ "grad_norm": 1837.83544921875,
3636
+ "learning_rate": 8.244591775735732e-05,
3637
+ "loss": 14.2748,
3638
+ "step": 515
3639
+ },
3640
+ {
3641
+ "epoch": 0.5620915032679739,
3642
+ "grad_norm": 794.2677612304688,
3643
+ "learning_rate": 8.210540561162412e-05,
3644
+ "loss": 13.3806,
3645
+ "step": 516
3646
+ },
3647
+ {
3648
+ "epoch": 0.5631808278867102,
3649
+ "grad_norm": 1501.6959228515625,
3650
+ "learning_rate": 8.176510768069147e-05,
3651
+ "loss": 13.5997,
3652
+ "step": 517
3653
+ },
3654
+ {
3655
+ "epoch": 0.5642701525054467,
3656
+ "grad_norm": 1058.7418212890625,
3657
+ "learning_rate": 8.142502803823955e-05,
3658
+ "loss": 12.9171,
3659
+ "step": 518
3660
+ },
3661
+ {
3662
+ "epoch": 0.565359477124183,
3663
+ "grad_norm": 1096.179443359375,
3664
+ "learning_rate": 8.108517075533531e-05,
3665
+ "loss": 13.1182,
3666
+ "step": 519
3667
+ },
3668
+ {
3669
+ "epoch": 0.5664488017429193,
3670
+ "grad_norm": 1459.3277587890625,
3671
+ "learning_rate": 8.074553990038395e-05,
3672
+ "loss": 14.0618,
3673
+ "step": 520
3674
+ },
3675
+ {
3676
+ "epoch": 0.5675381263616558,
3677
+ "grad_norm": 1117.51318359375,
3678
+ "learning_rate": 8.040613953908005e-05,
3679
+ "loss": 12.8453,
3680
+ "step": 521
3681
+ },
3682
+ {
3683
+ "epoch": 0.5686274509803921,
3684
+ "grad_norm": 1164.1474609375,
3685
+ "learning_rate": 8.0066973734359e-05,
3686
+ "loss": 13.228,
3687
+ "step": 522
3688
+ },
3689
+ {
3690
+ "epoch": 0.5697167755991286,
3691
+ "grad_norm": 1310.04296875,
3692
+ "learning_rate": 7.972804654634834e-05,
3693
+ "loss": 13.3479,
3694
+ "step": 523
3695
+ },
3696
+ {
3697
+ "epoch": 0.5708061002178649,
3698
+ "grad_norm": 1271.54833984375,
3699
+ "learning_rate": 7.938936203231912e-05,
3700
+ "loss": 12.7188,
3701
+ "step": 524
3702
+ },
3703
+ {
3704
+ "epoch": 0.5718954248366013,
3705
+ "grad_norm": 1231.713623046875,
3706
+ "learning_rate": 7.905092424663735e-05,
3707
+ "loss": 14.596,
3708
+ "step": 525
3709
+ },
3710
+ {
3711
+ "epoch": 0.5729847494553377,
3712
+ "grad_norm": 2087.1767578125,
3713
+ "learning_rate": 7.871273724071553e-05,
3714
+ "loss": 12.5966,
3715
+ "step": 526
3716
+ },
3717
+ {
3718
+ "epoch": 0.5740740740740741,
3719
+ "grad_norm": 1349.419189453125,
3720
+ "learning_rate": 7.837480506296404e-05,
3721
+ "loss": 12.99,
3722
+ "step": 527
3723
+ },
3724
+ {
3725
+ "epoch": 0.5751633986928104,
3726
+ "grad_norm": 1333.2598876953125,
3727
+ "learning_rate": 7.803713175874275e-05,
3728
+ "loss": 12.8456,
3729
+ "step": 528
3730
+ },
3731
+ {
3732
+ "epoch": 0.5762527233115469,
3733
+ "grad_norm": 1339.005615234375,
3734
+ "learning_rate": 7.769972137031262e-05,
3735
+ "loss": 14.4523,
3736
+ "step": 529
3737
+ },
3738
+ {
3739
+ "epoch": 0.5773420479302832,
3740
+ "grad_norm": 1520.0718994140625,
3741
+ "learning_rate": 7.736257793678714e-05,
3742
+ "loss": 13.4148,
3743
+ "step": 530
3744
+ },
3745
+ {
3746
+ "epoch": 0.5784313725490197,
3747
+ "grad_norm": 1333.5904541015625,
3748
+ "learning_rate": 7.702570549408428e-05,
3749
+ "loss": 13.4515,
3750
+ "step": 531
3751
+ },
3752
+ {
3753
+ "epoch": 0.579520697167756,
3754
+ "grad_norm": 1032.6591796875,
3755
+ "learning_rate": 7.668910807487783e-05,
3756
+ "loss": 13.9835,
3757
+ "step": 532
3758
+ },
3759
+ {
3760
+ "epoch": 0.5806100217864923,
3761
+ "grad_norm": 1167.9801025390625,
3762
+ "learning_rate": 7.635278970854943e-05,
3763
+ "loss": 13.1408,
3764
+ "step": 533
3765
+ },
3766
+ {
3767
+ "epoch": 0.5816993464052288,
3768
+ "grad_norm": 1407.249267578125,
3769
+ "learning_rate": 7.601675442114009e-05,
3770
+ "loss": 13.9407,
3771
+ "step": 534
3772
+ },
3773
+ {
3774
+ "epoch": 0.5827886710239651,
3775
+ "grad_norm": 2209.746826171875,
3776
+ "learning_rate": 7.568100623530217e-05,
3777
+ "loss": 12.4755,
3778
+ "step": 535
3779
+ },
3780
+ {
3781
+ "epoch": 0.5838779956427015,
3782
+ "grad_norm": 1797.746826171875,
3783
+ "learning_rate": 7.534554917025119e-05,
3784
+ "loss": 12.8268,
3785
+ "step": 536
3786
+ },
3787
+ {
3788
+ "epoch": 0.5849673202614379,
3789
+ "grad_norm": 1302.199462890625,
3790
+ "learning_rate": 7.501038724171756e-05,
3791
+ "loss": 13.197,
3792
+ "step": 537
3793
+ },
3794
+ {
3795
+ "epoch": 0.5860566448801743,
3796
+ "grad_norm": 2067.767822265625,
3797
+ "learning_rate": 7.46755244618988e-05,
3798
+ "loss": 12.8685,
3799
+ "step": 538
3800
+ },
3801
+ {
3802
+ "epoch": 0.5871459694989106,
3803
+ "grad_norm": 1580.0186767578125,
3804
+ "learning_rate": 7.434096483941115e-05,
3805
+ "loss": 13.4972,
3806
+ "step": 539
3807
+ },
3808
+ {
3809
+ "epoch": 0.5882352941176471,
3810
+ "grad_norm": 894.298828125,
3811
+ "learning_rate": 7.400671237924202e-05,
3812
+ "loss": 13.1393,
3813
+ "step": 540
3814
+ },
3815
+ {
3816
+ "epoch": 0.5893246187363834,
3817
+ "grad_norm": 1165.6744384765625,
3818
+ "learning_rate": 7.367277108270156e-05,
3819
+ "loss": 13.9111,
3820
+ "step": 541
3821
+ },
3822
+ {
3823
+ "epoch": 0.5904139433551199,
3824
+ "grad_norm": 1790.1239013671875,
3825
+ "learning_rate": 7.333914494737514e-05,
3826
+ "loss": 13.351,
3827
+ "step": 542
3828
+ },
3829
+ {
3830
+ "epoch": 0.5915032679738562,
3831
+ "grad_norm": 984.1708374023438,
3832
+ "learning_rate": 7.300583796707539e-05,
3833
+ "loss": 14.5447,
3834
+ "step": 543
3835
+ },
3836
+ {
3837
+ "epoch": 0.5925925925925926,
3838
+ "grad_norm": 1341.8519287109375,
3839
+ "learning_rate": 7.267285413179421e-05,
3840
+ "loss": 14.1327,
3841
+ "step": 544
3842
+ },
3843
+ {
3844
+ "epoch": 0.593681917211329,
3845
+ "grad_norm": 1000.2693481445312,
3846
+ "learning_rate": 7.234019742765532e-05,
3847
+ "loss": 13.3989,
3848
+ "step": 545
3849
+ },
3850
+ {
3851
+ "epoch": 0.5947712418300654,
3852
+ "grad_norm": 1023.481689453125,
3853
+ "learning_rate": 7.200787183686625e-05,
3854
+ "loss": 13.069,
3855
+ "step": 546
3856
+ },
3857
+ {
3858
+ "epoch": 0.5958605664488017,
3859
+ "grad_norm": 1108.57470703125,
3860
+ "learning_rate": 7.167588133767091e-05,
3861
+ "loss": 12.2698,
3862
+ "step": 547
3863
+ },
3864
+ {
3865
+ "epoch": 0.5969498910675382,
3866
+ "grad_norm": 964.7139282226562,
3867
+ "learning_rate": 7.134422990430176e-05,
3868
+ "loss": 13.6585,
3869
+ "step": 548
3870
+ },
3871
+ {
3872
+ "epoch": 0.5980392156862745,
3873
+ "grad_norm": 1239.9150390625,
3874
+ "learning_rate": 7.101292150693241e-05,
3875
+ "loss": 13.0273,
3876
+ "step": 549
3877
+ },
3878
+ {
3879
+ "epoch": 0.599128540305011,
3880
+ "grad_norm": 846.2174072265625,
3881
+ "learning_rate": 7.068196011162994e-05,
3882
+ "loss": 13.0756,
3883
+ "step": 550
3884
+ },
3885
+ {
3886
+ "epoch": 0.6002178649237473,
3887
+ "grad_norm": 987.310546875,
3888
+ "learning_rate": 7.03513496803075e-05,
3889
+ "loss": 14.1704,
3890
+ "step": 551
3891
+ },
3892
+ {
3893
+ "epoch": 0.6013071895424836,
3894
+ "grad_norm": 5245.39697265625,
3895
+ "learning_rate": 7.002109417067697e-05,
3896
+ "loss": 14.4467,
3897
+ "step": 552
3898
+ },
3899
+ {
3900
+ "epoch": 0.6023965141612201,
3901
+ "grad_norm": 1111.834228515625,
3902
+ "learning_rate": 6.969119753620135e-05,
3903
+ "loss": 13.517,
3904
+ "step": 553
3905
+ },
3906
+ {
3907
+ "epoch": 0.6034858387799564,
3908
+ "grad_norm": 996.4967041015625,
3909
+ "learning_rate": 6.936166372604773e-05,
3910
+ "loss": 13.5025,
3911
+ "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.6045751633986928,
3915
+ "grad_norm": 867.002685546875,
3916
+ "learning_rate": 6.903249668503972e-05,
3917
+ "loss": 12.8567,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.6056644880174292,
3922
+ "grad_norm": 1155.432373046875,
3923
+ "learning_rate": 6.87037003536104e-05,
3924
+ "loss": 13.0582,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.6067538126361656,
3929
+ "grad_norm": 1074.625732421875,
3930
+ "learning_rate": 6.837527866775522e-05,
3931
+ "loss": 13.5709,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.6078431372549019,
3936
+ "grad_norm": 1409.87841796875,
3937
+ "learning_rate": 6.804723555898458e-05,
3938
+ "loss": 13.7728,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.6089324618736384,
3943
+ "grad_norm": 987.9152221679688,
3944
+ "learning_rate": 6.771957495427716e-05,
3945
+ "loss": 13.0499,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.6100217864923747,
3950
+ "grad_norm": 1056.71240234375,
3951
+ "learning_rate": 6.739230077603259e-05,
3952
+ "loss": 13.4185,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.6111111111111112,
3957
+ "grad_norm": 965.7537841796875,
3958
+ "learning_rate": 6.706541694202471e-05,
3959
+ "loss": 13.3033,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.6122004357298475,
3964
+ "grad_norm": 950.843505859375,
3965
+ "learning_rate": 6.673892736535448e-05,
3966
+ "loss": 13.2638,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.6132897603485838,
3971
+ "grad_norm": 888.200439453125,
3972
+ "learning_rate": 6.641283595440323e-05,
3973
+ "loss": 14.2555,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.6143790849673203,
3978
+ "grad_norm": 1082.6995849609375,
3979
+ "learning_rate": 6.608714661278606e-05,
3980
+ "loss": 13.0727,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.6154684095860566,
3985
+ "grad_norm": 1750.720947265625,
3986
+ "learning_rate": 6.576186323930466e-05,
3987
+ "loss": 12.9111,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.616557734204793,
3992
+ "grad_norm": 1437.4244384765625,
3993
+ "learning_rate": 6.543698972790117e-05,
3994
+ "loss": 14.7961,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.6176470588235294,
3999
+ "grad_norm": 1005.5140380859375,
4000
+ "learning_rate": 6.51125299676111e-05,
4001
+ "loss": 14.3845,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.6187363834422658,
4006
+ "grad_norm": 802.7247924804688,
4007
+ "learning_rate": 6.478848784251713e-05,
4008
+ "loss": 14.1054,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.6198257080610022,
4013
+ "grad_norm": 780.930908203125,
4014
+ "learning_rate": 6.446486723170236e-05,
4015
+ "loss": 13.4999,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.6209150326797386,
4020
+ "grad_norm": 1568.1295166015625,
4021
+ "learning_rate": 6.414167200920391e-05,
4022
+ "loss": 14.3808,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.6220043572984749,
4027
+ "grad_norm": 649.1087036132812,
4028
+ "learning_rate": 6.381890604396687e-05,
4029
+ "loss": 14.4266,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.6230936819172114,
4034
+ "grad_norm": 1113.2542724609375,
4035
+ "learning_rate": 6.349657319979742e-05,
4036
+ "loss": 14.2873,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.6241830065359477,
4041
+ "grad_norm": 735.5198974609375,
4042
+ "learning_rate": 6.317467733531712e-05,
4043
+ "loss": 14.1431,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.6252723311546841,
4048
+ "grad_norm": 889.42822265625,
4049
+ "learning_rate": 6.28532223039163e-05,
4050
+ "loss": 14.5709,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.6263616557734205,
4055
+ "grad_norm": 1000.8912963867188,
4056
+ "learning_rate": 6.253221195370826e-05,
4057
+ "loss": 14.9789,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.6274509803921569,
4062
+ "grad_norm": 1058.2774658203125,
4063
+ "learning_rate": 6.221165012748297e-05,
4064
+ "loss": 13.2536,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.6285403050108932,
4069
+ "grad_norm": 752.534912109375,
4070
+ "learning_rate": 6.189154066266112e-05,
4071
+ "loss": 14.1625,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.6296296296296297,
4076
+ "grad_norm": 756.6798095703125,
4077
+ "learning_rate": 6.157188739124834e-05,
4078
+ "loss": 13.6298,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.630718954248366,
4083
+ "grad_norm": 1065.2354736328125,
4084
+ "learning_rate": 6.125269413978907e-05,
4085
+ "loss": 13.4351,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.6318082788671024,
4090
+ "grad_norm": 1031.8475341796875,
4091
+ "learning_rate": 6.093396472932103e-05,
4092
+ "loss": 12.9427,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.6328976034858388,
4097
+ "grad_norm": 545.2572631835938,
4098
+ "learning_rate": 6.0615702975329194e-05,
4099
+ "loss": 13.4271,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.6339869281045751,
4104
+ "grad_norm": 629.153076171875,
4105
+ "learning_rate": 6.029791268770029e-05,
4106
+ "loss": 13.7342,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.6350762527233116,
4111
+ "grad_norm": 808.3855590820312,
4112
+ "learning_rate": 5.998059767067728e-05,
4113
+ "loss": 12.8523,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.6361655773420479,
4118
+ "grad_norm": 966.9733276367188,
4119
+ "learning_rate": 5.9663761722813495e-05,
4120
+ "loss": 13.8446,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.6372549019607843,
4125
+ "grad_norm": 832.753173828125,
4126
+ "learning_rate": 5.934740863692759e-05,
4127
+ "loss": 14.0291,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.6383442265795207,
4132
+ "grad_norm": 870.2808837890625,
4133
+ "learning_rate": 5.903154220005771e-05,
4134
+ "loss": 14.7135,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.6394335511982571,
4139
+ "grad_norm": 888.448974609375,
4140
+ "learning_rate": 5.871616619341653e-05,
4141
+ "loss": 13.6237,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.6405228758169934,
4146
+ "grad_norm": 604.5468139648438,
4147
+ "learning_rate": 5.840128439234571e-05,
4148
+ "loss": 13.1112,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.6416122004357299,
4153
+ "grad_norm": 740.3869018554688,
4154
+ "learning_rate": 5.80869005662708e-05,
4155
+ "loss": 13.2927,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.6427015250544662,
4160
+ "grad_norm": 729.2908935546875,
4161
+ "learning_rate": 5.777301847865629e-05,
4162
+ "loss": 13.1883,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.6437908496732027,
4167
+ "grad_norm": 679.76171875,
4168
+ "learning_rate": 5.7459641886960244e-05,
4169
+ "loss": 12.7278,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.644880174291939,
4174
+ "grad_norm": 771.1044921875,
4175
+ "learning_rate": 5.714677454258947e-05,
4176
+ "loss": 14.2043,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.6459694989106753,
4181
+ "grad_norm": 525.4684448242188,
4182
+ "learning_rate": 5.6834420190854745e-05,
4183
+ "loss": 13.7949,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.6470588235294118,
4188
+ "grad_norm": 1367.8231201171875,
4189
+ "learning_rate": 5.652258257092569e-05,
4190
+ "loss": 13.5713,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.6481481481481481,
4195
+ "grad_norm": 1199.23876953125,
4196
+ "learning_rate": 5.621126541578632e-05,
4197
+ "loss": 13.6268,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.6492374727668845,
4202
+ "grad_norm": 995.0260009765625,
4203
+ "learning_rate": 5.590047245219009e-05,
4204
+ "loss": 14.3565,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.6503267973856209,
4209
+ "grad_norm": 805.2388916015625,
4210
+ "learning_rate": 5.559020740061549e-05,
4211
+ "loss": 14.2207,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.6514161220043573,
4216
+ "grad_norm": 779.6170654296875,
4217
+ "learning_rate": 5.528047397522133e-05,
4218
+ "loss": 13.4574,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.6525054466230937,
4223
+ "grad_norm": 974.6607055664062,
4224
+ "learning_rate": 5.497127588380244e-05,
4225
+ "loss": 13.5537,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.6535947712418301,
4230
+ "grad_norm": 957.2470092773438,
4231
+ "learning_rate": 5.4662616827745185e-05,
4232
+ "loss": 13.6918,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.6546840958605664,
4237
+ "grad_norm": 564.2131958007812,
4238
+ "learning_rate": 5.4354500501983074e-05,
4239
+ "loss": 14.5732,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.6557734204793029,
4244
+ "grad_norm": 970.4561157226562,
4245
+ "learning_rate": 5.404693059495285e-05,
4246
+ "loss": 14.259,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.6568627450980392,
4251
+ "grad_norm": 1475.91943359375,
4252
+ "learning_rate": 5.373991078854992e-05,
4253
+ "loss": 13.7511,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.6579520697167756,
4258
+ "grad_norm": 1206.491943359375,
4259
+ "learning_rate": 5.3433444758084604e-05,
4260
+ "loss": 13.9658,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.659041394335512,
4265
+ "grad_norm": 729.8560791015625,
4266
+ "learning_rate": 5.312753617223794e-05,
4267
+ "loss": 12.3194,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.6601307189542484,
4272
+ "grad_norm": 833.2518310546875,
4273
+ "learning_rate": 5.282218869301788e-05,
4274
+ "loss": 13.2262,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.6612200435729847,
4279
+ "grad_norm": 1121.98388671875,
4280
+ "learning_rate": 5.251740597571542e-05,
4281
+ "loss": 13.1887,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.6623093681917211,
4286
+ "grad_norm": 732.55810546875,
4287
+ "learning_rate": 5.221319166886073e-05,
4288
+ "loss": 11.9579,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.6633986928104575,
4293
+ "grad_norm": 795.4283447265625,
4294
+ "learning_rate": 5.190954941417977e-05,
4295
+ "loss": 13.6673,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.664488017429194,
4300
+ "grad_norm": 742.3599243164062,
4301
+ "learning_rate": 5.160648284655032e-05,
4302
+ "loss": 13.4396,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.6655773420479303,
4307
+ "grad_norm": 872.8551635742188,
4308
+ "learning_rate": 5.1303995593958824e-05,
4309
+ "loss": 14.0764,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.6666666666666666,
4314
+ "grad_norm": 865.8281860351562,
4315
+ "learning_rate": 5.100209127745661e-05,
4316
+ "loss": 13.2594,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.6677559912854031,
4321
+ "grad_norm": 553.1524658203125,
4322
+ "learning_rate": 5.0700773511116906e-05,
4323
+ "loss": 13.4783,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.6688453159041394,
4328
+ "grad_norm": 981.0257568359375,
4329
+ "learning_rate": 5.040004590199128e-05,
4330
+ "loss": 14.2846,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.6699346405228758,
4335
+ "grad_norm": 817.2470703125,
4336
+ "learning_rate": 5.0099912050066556e-05,
4337
+ "loss": 12.6324,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.6710239651416122,
4342
+ "grad_norm": 593.127197265625,
4343
+ "learning_rate": 4.9800375548221845e-05,
4344
+ "loss": 13.0678,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.6721132897603486,
4349
+ "grad_norm": 940.2001953125,
4350
+ "learning_rate": 4.950143998218531e-05,
4351
+ "loss": 13.5529,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.673202614379085,
4356
+ "grad_norm": 717.47021484375,
4357
+ "learning_rate": 4.920310893049146e-05,
4358
+ "loss": 12.7114,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.6742919389978214,
4363
+ "grad_norm": 874.2285766601562,
4364
+ "learning_rate": 4.89053859644381e-05,
4365
+ "loss": 14.4422,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.6753812636165577,
4370
+ "grad_norm": 847.6348876953125,
4371
+ "learning_rate": 4.860827464804383e-05,
4372
+ "loss": 12.8582,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.6764705882352942,
4377
+ "grad_norm": 552.3037719726562,
4378
+ "learning_rate": 4.831177853800511e-05,
4379
+ "loss": 13.5564,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.6775599128540305,
4384
+ "grad_norm": 704.1598510742188,
4385
+ "learning_rate": 4.801590118365383e-05,
4386
+ "loss": 14.2366,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.6786492374727668,
4391
+ "grad_norm": 535.1793823242188,
4392
+ "learning_rate": 4.77206461269149e-05,
4393
+ "loss": 13.5502,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.6797385620915033,
4398
+ "grad_norm": 1089.44775390625,
4399
+ "learning_rate": 4.7426016902263636e-05,
4400
+ "loss": 13.391,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.6808278867102396,
4405
+ "grad_norm": 793.9139404296875,
4406
+ "learning_rate": 4.713201703668367e-05,
4407
+ "loss": 13.9273,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.681917211328976,
4412
+ "grad_norm": 1263.728759765625,
4413
+ "learning_rate": 4.683865004962452e-05,
4414
+ "loss": 13.2224,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.6830065359477124,
4419
+ "grad_norm": 583.6502685546875,
4420
+ "learning_rate": 4.654591945295969e-05,
4421
+ "loss": 13.1198,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.6840958605664488,
4426
+ "grad_norm": 546.5349731445312,
4427
+ "learning_rate": 4.6253828750944375e-05,
4428
+ "loss": 14.2708,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.6851851851851852,
4433
+ "grad_norm": 573.7505493164062,
4434
+ "learning_rate": 4.596238144017369e-05,
4435
+ "loss": 14.4553,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.6862745098039216,
4440
+ "grad_norm": 485.56976318359375,
4441
+ "learning_rate": 4.567158100954083e-05,
4442
+ "loss": 13.9009,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.6873638344226579,
4447
+ "grad_norm": 859.7692260742188,
4448
+ "learning_rate": 4.53814309401951e-05,
4449
+ "loss": 13.4996,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.6884531590413944,
4454
+ "grad_norm": 806.7265014648438,
4455
+ "learning_rate": 4.509193470550056e-05,
4456
+ "loss": 13.8831,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.6895424836601307,
4461
+ "grad_norm": 678.3446044921875,
4462
+ "learning_rate": 4.4803095770994106e-05,
4463
+ "loss": 12.5697,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.690631808278867,
4468
+ "grad_norm": 760.7838745117188,
4469
+ "learning_rate": 4.4514917594344184e-05,
4470
+ "loss": 13.5064,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.6917211328976035,
4475
+ "grad_norm": 1103.8131103515625,
4476
+ "learning_rate": 4.422740362530945e-05,
4477
+ "loss": 14.8133,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.6928104575163399,
4482
+ "grad_norm": 737.2034912109375,
4483
+ "learning_rate": 4.3940557305697226e-05,
4484
+ "loss": 14.7038,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.6938997821350763,
4489
+ "grad_norm": 579.3258056640625,
4490
+ "learning_rate": 4.3654382069322644e-05,
4491
+ "loss": 14.0494,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.6949891067538126,
4496
+ "grad_norm": 680.4292602539062,
4497
+ "learning_rate": 4.3368881341967135e-05,
4498
+ "loss": 14.3115,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.696078431372549,
4503
+ "grad_norm": 617.1260986328125,
4504
+ "learning_rate": 4.308405854133786e-05,
4505
+ "loss": 13.788,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.6971677559912854,
4510
+ "grad_norm": 951.0760498046875,
4511
+ "learning_rate": 4.2799917077026394e-05,
4512
+ "loss": 13.1248,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.6982570806100218,
4517
+ "grad_norm": 606.4050903320312,
4518
+ "learning_rate": 4.251646035046814e-05,
4519
+ "loss": 14.0699,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.6993464052287581,
4524
+ "grad_norm": 596.975830078125,
4525
+ "learning_rate": 4.223369175490162e-05,
4526
+ "loss": 11.8161,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.7004357298474946,
4531
+ "grad_norm": 867.7359619140625,
4532
+ "learning_rate": 4.195161467532769e-05,
4533
+ "loss": 13.4987,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.7015250544662309,
4538
+ "grad_norm": 802.6876831054688,
4539
+ "learning_rate": 4.167023248846925e-05,
4540
+ "loss": 12.8087,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.7026143790849673,
4545
+ "grad_norm": 687.2286987304688,
4546
+ "learning_rate": 4.138954856273054e-05,
4547
+ "loss": 12.1043,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.7037037037037037,
4552
+ "grad_norm": 669.9190673828125,
4553
+ "learning_rate": 4.110956625815713e-05,
4554
+ "loss": 12.1478,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.7047930283224401,
4559
+ "grad_norm": 841.2921752929688,
4560
+ "learning_rate": 4.083028892639541e-05,
4561
+ "loss": 12.806,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.7058823529411765,
4566
+ "grad_norm": 808.080078125,
4567
+ "learning_rate": 4.055171991065262e-05,
4568
+ "loss": 13.3545,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.7069716775599129,
4573
+ "grad_norm": 919.4234619140625,
4574
+ "learning_rate": 4.027386254565688e-05,
4575
+ "loss": 13.9735,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.7080610021786492,
4580
+ "grad_norm": 769.6393432617188,
4581
+ "learning_rate": 3.9996720157617094e-05,
4582
+ "loss": 14.6133,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.7091503267973857,
4587
+ "grad_norm": 880.525146484375,
4588
+ "learning_rate": 3.972029606418335e-05,
4589
+ "loss": 13.0994,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.710239651416122,
4594
+ "grad_norm": 945.1436767578125,
4595
+ "learning_rate": 3.9444593574406915e-05,
4596
+ "loss": 14.2055,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.7113289760348583,
4601
+ "grad_norm": 690.2129516601562,
4602
+ "learning_rate": 3.9169615988701e-05,
4603
+ "loss": 13.2737,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.7124183006535948,
4608
+ "grad_norm": 563.0156860351562,
4609
+ "learning_rate": 3.8895366598800896e-05,
4610
+ "loss": 12.8401,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.7135076252723311,
4615
+ "grad_norm": 814.678466796875,
4616
+ "learning_rate": 3.862184868772473e-05,
4617
+ "loss": 12.2818,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.7145969498910676,
4622
+ "grad_norm": 712.4736938476562,
4623
+ "learning_rate": 3.834906552973424e-05,
4624
+ "loss": 12.5831,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.7156862745098039,
4629
+ "grad_norm": 1027.3265380859375,
4630
+ "learning_rate": 3.807702039029539e-05,
4631
+ "loss": 13.827,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.7167755991285403,
4636
+ "grad_norm": 1087.4530029296875,
4637
+ "learning_rate": 3.780571652603949e-05,
4638
+ "loss": 13.3193,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.7178649237472767,
4643
+ "grad_norm": 926.59326171875,
4644
+ "learning_rate": 3.753515718472402e-05,
4645
+ "loss": 13.3761,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.7189542483660131,
4650
+ "grad_norm": 758.7875366210938,
4651
+ "learning_rate": 3.726534560519381e-05,
4652
+ "loss": 12.2786,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.7200435729847494,
4657
+ "grad_norm": 693.2098999023438,
4658
+ "learning_rate": 3.6996285017342406e-05,
4659
+ "loss": 12.9392,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.7211328976034859,
4664
+ "grad_norm": 1132.9970703125,
4665
+ "learning_rate": 3.672797864207316e-05,
4666
+ "loss": 12.6866,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.7222222222222222,
4671
+ "grad_norm": 868.7568969726562,
4672
+ "learning_rate": 3.646042969126093e-05,
4673
+ "loss": 12.7426,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.7233115468409586,
4678
+ "grad_norm": 800.123291015625,
4679
+ "learning_rate": 3.619364136771337e-05,
4680
+ "loss": 12.4544,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.724400871459695,
4685
+ "grad_norm": 509.1244201660156,
4686
+ "learning_rate": 3.5927616865132884e-05,
4687
+ "loss": 13.2459,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.7254901960784313,
4692
+ "grad_norm": 568.7617797851562,
4693
+ "learning_rate": 3.566235936807808e-05,
4694
+ "loss": 13.3732,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.7265795206971678,
4699
+ "grad_norm": 839.1197509765625,
4700
+ "learning_rate": 3.539787205192586e-05,
4701
+ "loss": 12.6018,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.7276688453159041,
4706
+ "grad_norm": 750.1957397460938,
4707
+ "learning_rate": 3.513415808283341e-05,
4708
+ "loss": 13.1899,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.7287581699346405,
4713
+ "grad_norm": 604.1763305664062,
4714
+ "learning_rate": 3.4871220617700126e-05,
4715
+ "loss": 13.0681,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.7298474945533769,
4720
+ "grad_norm": 637.3112182617188,
4721
+ "learning_rate": 3.460906280413007e-05,
4722
+ "loss": 13.0931,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.7309368191721133,
4727
+ "grad_norm": 778.7605590820312,
4728
+ "learning_rate": 3.4347687780394e-05,
4729
+ "loss": 13.7031,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.7320261437908496,
4734
+ "grad_norm": 1044.1873779296875,
4735
+ "learning_rate": 3.4087098675392104e-05,
4736
+ "loss": 12.2163,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.7331154684095861,
4741
+ "grad_norm": 833.3889770507812,
4742
+ "learning_rate": 3.382729860861632e-05,
4743
+ "loss": 13.2927,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.7342047930283224,
4748
+ "grad_norm": 655.1200561523438,
4749
+ "learning_rate": 3.3568290690113034e-05,
4750
+ "loss": 12.0668,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.7352941176470589,
4755
+ "grad_norm": 702.258056640625,
4756
+ "learning_rate": 3.331007802044601e-05,
4757
+ "loss": 11.1181,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.7363834422657952,
4762
+ "grad_norm": 791.837890625,
4763
+ "learning_rate": 3.305266369065901e-05,
4764
+ "loss": 12.7031,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.7374727668845316,
4769
+ "grad_norm": 636.5592041015625,
4770
+ "learning_rate": 3.279605078223906e-05,
4771
+ "loss": 14.1468,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.738562091503268,
4776
+ "grad_norm": 1960.34228515625,
4777
+ "learning_rate": 3.25402423670793e-05,
4778
+ "loss": 12.7793,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.7396514161220044,
4783
+ "grad_norm": 987.132080078125,
4784
+ "learning_rate": 3.228524150744249e-05,
4785
+ "loss": 13.8105,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.7407407407407407,
4790
+ "grad_norm": 536.3349609375,
4791
+ "learning_rate": 3.2031051255924085e-05,
4792
+ "loss": 13.0451,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.7418300653594772,
4797
+ "grad_norm": 829.6603393554688,
4798
+ "learning_rate": 3.1777674655415834e-05,
4799
+ "loss": 13.352,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.7429193899782135,
4804
+ "grad_norm": 825.07275390625,
4805
+ "learning_rate": 3.1525114739069415e-05,
4806
+ "loss": 13.4622,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.7440087145969498,
4811
+ "grad_norm": 1563.3502197265625,
4812
+ "learning_rate": 3.127337453025994e-05,
4813
+ "loss": 13.9679,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.7450980392156863,
4818
+ "grad_norm": 955.0587158203125,
4819
+ "learning_rate": 3.102245704254995e-05,
4820
+ "loss": 12.3828,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.7461873638344226,
4825
+ "grad_norm": 784.834716796875,
4826
+ "learning_rate": 3.077236527965318e-05,
4827
+ "loss": 13.1804,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.7472766884531591,
4832
+ "grad_norm": 735.6173706054688,
4833
+ "learning_rate": 3.0523102235398714e-05,
4834
+ "loss": 13.0926,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.7483660130718954,
4839
+ "grad_norm": 1419.035888671875,
4840
+ "learning_rate": 3.0274670893695147e-05,
4841
+ "loss": 13.365,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.7494553376906318,
4846
+ "grad_norm": 1074.7969970703125,
4847
+ "learning_rate": 3.002707422849472e-05,
4848
+ "loss": 13.357,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.7505446623093682,
4853
+ "grad_norm": 1391.6951904296875,
4854
+ "learning_rate": 2.978031520375798e-05,
4855
+ "loss": 13.65,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.7516339869281046,
4860
+ "grad_norm": 789.4784545898438,
4861
+ "learning_rate": 2.9534396773417994e-05,
4862
+ "loss": 12.0752,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.7516339869281046,
4867
+ "eval_loss": 3.2088863849639893,
4868
+ "eval_runtime": 2.6648,
4869
+ "eval_samples_per_second": 145.229,
4870
+ "eval_steps_per_second": 72.802,
4871
+ "step": 690
4872
  }
4873
  ],
4874
  "logging_steps": 1,
 
4888
  "attributes": {}
4889
  }
4890
  },
4891
+ "total_flos": 2341481805250560.0,
4892
  "train_batch_size": 2,
4893
  "trial_name": null,
4894
  "trial_params": null