Kudod committed on
Commit f169e50 · verified · 1 Parent(s): 6b319f6

Training in progress, step 30000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6789ecf17de6b09ea80d804f0b8dc5e1c1b21bcba924a77a805e56dd4ee42061
+ oid sha256:a8abd655a9b44d771c5739a43035851191180013c05198446b425dced1ab3d2e
  size 641630264
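
Each checkpoint file in this commit is stored with Git LFS, so the diff only touches the pointer file: a version line, the blob's oid sha256, and its size in bytes. Below is a minimal sketch for sanity-checking a downloaded blob against its pointer; the local path is an assumption, and the oid/size are copied from the new model.safetensors pointer above.

```python
import hashlib
import os


def verify_lfs_pointer(file_path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded LFS blob against the oid/size from its pointer file."""
    if os.path.getsize(file_path) != expected_size:
        return False
    sha = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            sha.update(chunk)
    return sha.hexdigest() == expected_oid


# Values taken from the new last-checkpoint/model.safetensors pointer above.
print(verify_lfs_pointer(
    "last-checkpoint/model.safetensors",
    "a8abd655a9b44d771c5739a43035851191180013c05198446b425dced1ab3d2e",
    641630264,
))
```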
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ffca1600b0efb977373a4c2ece9c0412c55a4fd4474eebaa16d8012352dae00e
+ oid sha256:768112938401618c4edaf6704bba0a776118317185538856e2178b0e3f6df8f3
  size 1283324282
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9b03a5c8e99d354bc4c6dd731292cef24d6f5abef66e79af41e19acbb72904f5
+ oid sha256:1e8920ec89da786a5fc2fb4b9f1869b6bd25c311f6d06cbb6e1066128e3d88f7
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:74bef543997127e7c2c660d0faf1701a82784e882d7ffc358a865a081ae18884
+ oid sha256:e494a2773f091703acc430a5f9f465e98f323447a3c54b31aebd20d1297c3d30
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:634cc21d2ee6681c0a2a9372f98b9644e031750fd180f98cae66b1e01b326f8b
+ oid sha256:49bc4c081ed4c8705851a14034a365f34c3c0a4efcee412cbd9dc7fcb3ff7092
  size 1064
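
The file set follows the usual Hugging Face Trainer checkpoint layout: model weights in safetensors format plus torch-serialized optimizer, scheduler, gradient-scaler and RNG state. A minimal sketch for inspecting these artifacts locally, assuming they have been downloaded into last-checkpoint/ and that safetensors and torch are installed:

```python
import torch
from safetensors.torch import load_file

# Model weights (the ~640 MB safetensors blob) load as a plain name -> tensor dict.
state_dict = load_file("last-checkpoint/model.safetensors")
num_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {num_params:,} parameters")

# Optimizer / scheduler states are regular torch pickles; weights_only=False
# because these files contain more than bare tensors.
optimizer_state = torch.load("last-checkpoint/optimizer.pt",
                             map_location="cpu", weights_only=False)
scheduler_state = torch.load("last-checkpoint/scheduler.pt",
                             map_location="cpu", weights_only=False)
print(sorted(optimizer_state.keys()))
print(scheduler_state)
```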
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.49662296384584825,
+ "epoch": 0.7449344457687723,
  "eval_steps": 5000,
- "global_step": 20000,
+ "global_step": 30000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -320,6 +320,162 @@
  "eval_samples_per_second": 305.443,
  "eval_steps_per_second": 9.545,
  "step": 20000
+ },
+ {
+ "epoch": 0.5090385379419944,
+ "grad_norm": 2.7354369163513184,
+ "learning_rate": 0.0004730348487195945,
+ "loss": 2.6474,
+ "step": 20500
+ },
+ {
+ "epoch": 0.5214541120381406,
+ "grad_norm": 3.169034719467163,
+ "learning_rate": 0.0004723431862951241,
+ "loss": 2.6225,
+ "step": 21000
+ },
+ {
+ "epoch": 0.5338696861342869,
+ "grad_norm": 2.654437780380249,
+ "learning_rate": 0.0004716515238706536,
+ "loss": 2.6248,
+ "step": 21500
+ },
+ {
+ "epoch": 0.546285260230433,
+ "grad_norm": 3.1007559299468994,
+ "learning_rate": 0.0004709598614461831,
+ "loss": 2.6233,
+ "step": 22000
+ },
+ {
+ "epoch": 0.5587008343265792,
+ "grad_norm": 2.5275120735168457,
+ "learning_rate": 0.0004702681990217127,
+ "loss": 2.6037,
+ "step": 22500
+ },
+ {
+ "epoch": 0.5711164084227255,
+ "grad_norm": 2.6772823333740234,
+ "learning_rate": 0.0004695765365972422,
+ "loss": 2.6014,
+ "step": 23000
+ },
+ {
+ "epoch": 0.5835319825188717,
+ "grad_norm": 3.494856595993042,
+ "learning_rate": 0.00046888625749762073,
+ "loss": 2.5994,
+ "step": 23500
+ },
+ {
+ "epoch": 0.5959475566150179,
+ "grad_norm": 3.0711567401885986,
+ "learning_rate": 0.00046819597839799915,
+ "loss": 2.5539,
+ "step": 24000
+ },
+ {
+ "epoch": 0.6083631307111641,
+ "grad_norm": 2.804976224899292,
+ "learning_rate": 0.0004675043159735287,
+ "loss": 2.5649,
+ "step": 24500
+ },
+ {
+ "epoch": 0.6207787048073103,
+ "grad_norm": 2.4519996643066406,
+ "learning_rate": 0.00046681265354905823,
+ "loss": 2.5486,
+ "step": 25000
+ },
+ {
+ "epoch": 0.6207787048073103,
+ "eval_loss": 1.5942130088806152,
+ "eval_runtime": 4197.6423,
+ "eval_samples_per_second": 307.004,
+ "eval_steps_per_second": 9.594,
+ "step": 25000
+ },
+ {
+ "epoch": 0.6331942789034565,
+ "grad_norm": 2.5043067932128906,
+ "learning_rate": 0.00046612099112458777,
+ "loss": 2.5406,
+ "step": 25500
+ },
+ {
+ "epoch": 0.6456098529996027,
+ "grad_norm": 2.5639116764068604,
+ "learning_rate": 0.00046543071202496624,
+ "loss": 2.5053,
+ "step": 26000
+ },
+ {
+ "epoch": 0.658025427095749,
+ "grad_norm": 3.016726493835449,
+ "learning_rate": 0.0004647390496004958,
+ "loss": 2.5144,
+ "step": 26500
+ },
+ {
+ "epoch": 0.6704410011918951,
+ "grad_norm": 3.4621164798736572,
+ "learning_rate": 0.0004640487705008743,
+ "loss": 2.5116,
+ "step": 27000
+ },
+ {
+ "epoch": 0.6828565752880413,
+ "grad_norm": 3.288008451461792,
+ "learning_rate": 0.0004633571080764038,
+ "loss": 2.4934,
+ "step": 27500
+ },
+ {
+ "epoch": 0.6952721493841876,
+ "grad_norm": 2.741086721420288,
+ "learning_rate": 0.00046266544565193334,
+ "loss": 2.4951,
+ "step": 28000
+ },
+ {
+ "epoch": 0.7076877234803337,
+ "grad_norm": 2.276293992996216,
+ "learning_rate": 0.0004619737832274629,
+ "loss": 2.4767,
+ "step": 28500
+ },
+ {
+ "epoch": 0.72010329757648,
+ "grad_norm": 2.5310287475585938,
+ "learning_rate": 0.0004612821208029924,
+ "loss": 2.5034,
+ "step": 29000
+ },
+ {
+ "epoch": 0.7325188716726262,
+ "grad_norm": 2.1986629962921143,
+ "learning_rate": 0.00046059045837852197,
+ "loss": 2.459,
+ "step": 29500
+ },
+ {
+ "epoch": 0.7449344457687723,
+ "grad_norm": 1.948871374130249,
+ "learning_rate": 0.0004598987959540515,
+ "loss": 2.4328,
+ "step": 30000
+ },
+ {
+ "epoch": 0.7449344457687723,
+ "eval_loss": 1.53118896484375,
+ "eval_runtime": 4144.3778,
+ "eval_samples_per_second": 310.949,
+ "eval_steps_per_second": 9.717,
+ "step": 30000
  }
  ],
  "logging_steps": 500,
@@ -339,7 +495,7 @@
  "attributes": {}
  }
  },
- "total_flos": 8.405239526988595e+16,
+ "total_flos": 1.260592966550446e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null