shulijia commited on
Commit
d9108b4
·
verified ·
1 Parent(s): dbb1f47

Training in progress, step 2808, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efcff6c48c08e94349cc49cc783b8ae3ecc9cd09a53a4758ee78a29572cccc1f
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:722a15a41fb1cf0c0aed8d758c5e1a337541cfa86629d2c46863528950276e2f
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0517d23275a28b9e0e485af28a3fd8e3367187b1b3fa646e2dad946ff34488fc
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d12628f51c7f4bc8cb3f237b3ca8b7dcf3d7dfee77e7169e602bea9973ebd683
3
  size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:020a6f14b3cac02d92f73eeb404b16f26c01f0db6a2cbf8681f2b679e8af8524
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29adc3f04b16f4ac7f5b1dc3d9fcb19c78040ad671c4b2bf4a3cc4d244df933
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a080da6311c41d255bb3f245c892db20898f0a63654357045b9affb58ad25368
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:284b167bb422ee615f8e9d8e8547811bfc46bf29da5b79cfc94d4da6720ac4b5
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.6711194229227893,
6
  "eval_steps": 100,
7
- "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2458,6 +2458,300 @@
2458
  "eval_samples_per_second": 30.289,
2459
  "eval_steps_per_second": 1.893,
2460
  "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2461
  }
2462
  ],
2463
  "logging_steps": 10,
@@ -2472,12 +2766,12 @@
2472
  "should_evaluate": false,
2473
  "should_log": false,
2474
  "should_save": true,
2475
- "should_training_stop": false
2476
  },
2477
  "attributes": {}
2478
  }
2479
  },
2480
- "total_flos": 5.411381606508134e+16,
2481
  "train_batch_size": 2,
2482
  "trial_name": null,
2483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 100,
7
+ "global_step": 2808,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2458
  "eval_samples_per_second": 30.289,
2459
  "eval_steps_per_second": 1.893,
2460
  "step": 2500
2461
+ },
2462
+ {
2463
+ "epoch": 2.6818060379374833,
2464
+ "grad_norm": 1.0547696352005005,
2465
+ "learning_rate": 1.183221210922042e-06,
2466
+ "loss": 0.158,
2467
+ "mean_token_accuracy": 0.826284246146679,
2468
+ "num_tokens": 20557824.0,
2469
+ "step": 2510
2470
+ },
2471
+ {
2472
+ "epoch": 2.6924926529521773,
2473
+ "grad_norm": 1.7910685539245605,
2474
+ "learning_rate": 1.143648595172141e-06,
2475
+ "loss": 0.1559,
2476
+ "mean_token_accuracy": 0.8219667319208384,
2477
+ "num_tokens": 20639744.0,
2478
+ "step": 2520
2479
+ },
2480
+ {
2481
+ "epoch": 2.7031792679668714,
2482
+ "grad_norm": 1.1875276565551758,
2483
+ "learning_rate": 1.10407597942224e-06,
2484
+ "loss": 0.1595,
2485
+ "mean_token_accuracy": 0.8250611502677202,
2486
+ "num_tokens": 20721664.0,
2487
+ "step": 2530
2488
+ },
2489
+ {
2490
+ "epoch": 2.7138658829815654,
2491
+ "grad_norm": 1.1379560232162476,
2492
+ "learning_rate": 1.064503363672339e-06,
2493
+ "loss": 0.1273,
2494
+ "mean_token_accuracy": 0.8377813111990691,
2495
+ "num_tokens": 20803584.0,
2496
+ "step": 2540
2497
+ },
2498
+ {
2499
+ "epoch": 2.7245524979962594,
2500
+ "grad_norm": 1.1959253549575806,
2501
+ "learning_rate": 1.024930747922438e-06,
2502
+ "loss": 0.1348,
2503
+ "mean_token_accuracy": 0.832387474551797,
2504
+ "num_tokens": 20885504.0,
2505
+ "step": 2550
2506
+ },
2507
+ {
2508
+ "epoch": 2.735239113010954,
2509
+ "grad_norm": 1.2150254249572754,
2510
+ "learning_rate": 9.853581321725367e-07,
2511
+ "loss": 0.1551,
2512
+ "mean_token_accuracy": 0.821587573364377,
2513
+ "num_tokens": 20967424.0,
2514
+ "step": 2560
2515
+ },
2516
+ {
2517
+ "epoch": 2.745925728025648,
2518
+ "grad_norm": 1.5686252117156982,
2519
+ "learning_rate": 9.457855164226357e-07,
2520
+ "loss": 0.1461,
2521
+ "mean_token_accuracy": 0.8346624251455068,
2522
+ "num_tokens": 21049344.0,
2523
+ "step": 2570
2524
+ },
2525
+ {
2526
+ "epoch": 2.756612343040342,
2527
+ "grad_norm": 1.5219730138778687,
2528
+ "learning_rate": 9.062129006727345e-07,
2529
+ "loss": 0.1545,
2530
+ "mean_token_accuracy": 0.8117783740162849,
2531
+ "num_tokens": 21131264.0,
2532
+ "step": 2580
2533
+ },
2534
+ {
2535
+ "epoch": 2.767298958055036,
2536
+ "grad_norm": 1.216352939605713,
2537
+ "learning_rate": 8.666402849228335e-07,
2538
+ "loss": 0.1249,
2539
+ "mean_token_accuracy": 0.831531309708953,
2540
+ "num_tokens": 21213184.0,
2541
+ "step": 2590
2542
+ },
2543
+ {
2544
+ "epoch": 2.77798557306973,
2545
+ "grad_norm": 0.99131840467453,
2546
+ "learning_rate": 8.270676691729324e-07,
2547
+ "loss": 0.1551,
2548
+ "step": 2600
2549
+ },
2550
+ {
2551
+ "epoch": 2.77798557306973,
2552
+ "eval_loss": 0.35856378078460693,
2553
+ "eval_mean_token_accuracy": 0.9245799160920657,
2554
+ "eval_num_tokens": 21295104.0,
2555
+ "eval_runtime": 54.645,
2556
+ "eval_samples_per_second": 30.451,
2557
+ "eval_steps_per_second": 1.903,
2558
+ "step": 2600
2559
+ },
2560
+ {
2561
+ "epoch": 2.7886721880844245,
2562
+ "grad_norm": 1.2371041774749756,
2563
+ "learning_rate": 7.874950534230314e-07,
2564
+ "loss": 0.155,
2565
+ "mean_token_accuracy": 0.8295437873341143,
2566
+ "num_tokens": 21377024.0,
2567
+ "step": 2610
2568
+ },
2569
+ {
2570
+ "epoch": 2.7993588030991186,
2571
+ "grad_norm": 1.222365140914917,
2572
+ "learning_rate": 7.479224376731302e-07,
2573
+ "loss": 0.1379,
2574
+ "mean_token_accuracy": 0.8334271032363176,
2575
+ "num_tokens": 21458944.0,
2576
+ "step": 2620
2577
+ },
2578
+ {
2579
+ "epoch": 2.8100454181138126,
2580
+ "grad_norm": 1.0691900253295898,
2581
+ "learning_rate": 7.083498219232292e-07,
2582
+ "loss": 0.1298,
2583
+ "mean_token_accuracy": 0.8295743606984616,
2584
+ "num_tokens": 21540864.0,
2585
+ "step": 2630
2586
+ },
2587
+ {
2588
+ "epoch": 2.8207320331285066,
2589
+ "grad_norm": 1.298315167427063,
2590
+ "learning_rate": 6.687772061733282e-07,
2591
+ "loss": 0.1742,
2592
+ "mean_token_accuracy": 0.8030210331082344,
2593
+ "num_tokens": 21622784.0,
2594
+ "step": 2640
2595
+ },
2596
+ {
2597
+ "epoch": 2.8314186481432007,
2598
+ "grad_norm": 1.385136604309082,
2599
+ "learning_rate": 6.29204590423427e-07,
2600
+ "loss": 0.1365,
2601
+ "mean_token_accuracy": 0.8225048929452896,
2602
+ "num_tokens": 21704704.0,
2603
+ "step": 2650
2604
+ },
2605
+ {
2606
+ "epoch": 2.8421052631578947,
2607
+ "grad_norm": 1.5449044704437256,
2608
+ "learning_rate": 5.89631974673526e-07,
2609
+ "loss": 0.1761,
2610
+ "mean_token_accuracy": 0.8003669247031212,
2611
+ "num_tokens": 21786624.0,
2612
+ "step": 2660
2613
+ },
2614
+ {
2615
+ "epoch": 2.8527918781725887,
2616
+ "grad_norm": 1.9411510229110718,
2617
+ "learning_rate": 5.500593589236249e-07,
2618
+ "loss": 0.1482,
2619
+ "mean_token_accuracy": 0.8060420740395784,
2620
+ "num_tokens": 21868544.0,
2621
+ "step": 2670
2622
+ },
2623
+ {
2624
+ "epoch": 2.8634784931872828,
2625
+ "grad_norm": 1.0051461458206177,
2626
+ "learning_rate": 5.104867431737238e-07,
2627
+ "loss": 0.1443,
2628
+ "mean_token_accuracy": 0.8296232867985964,
2629
+ "num_tokens": 21950464.0,
2630
+ "step": 2680
2631
+ },
2632
+ {
2633
+ "epoch": 2.874165108201977,
2634
+ "grad_norm": 1.6511443853378296,
2635
+ "learning_rate": 4.7091412742382274e-07,
2636
+ "loss": 0.1427,
2637
+ "mean_token_accuracy": 0.829244127869606,
2638
+ "num_tokens": 22032384.0,
2639
+ "step": 2690
2640
+ },
2641
+ {
2642
+ "epoch": 2.8848517232166713,
2643
+ "grad_norm": 0.8944096565246582,
2644
+ "learning_rate": 4.3134151167392167e-07,
2645
+ "loss": 0.1335,
2646
+ "step": 2700
2647
+ },
2648
+ {
2649
+ "epoch": 2.8848517232166713,
2650
+ "eval_loss": 0.35777753591537476,
2651
+ "eval_mean_token_accuracy": 0.9246739997313573,
2652
+ "eval_num_tokens": 22114304.0,
2653
+ "eval_runtime": 55.0863,
2654
+ "eval_samples_per_second": 30.207,
2655
+ "eval_steps_per_second": 1.888,
2656
+ "step": 2700
2657
+ },
2658
+ {
2659
+ "epoch": 2.8955383382313653,
2660
+ "grad_norm": 1.2431285381317139,
2661
+ "learning_rate": 3.917688959240206e-07,
2662
+ "loss": 0.1319,
2663
+ "mean_token_accuracy": 0.8427042558789253,
2664
+ "num_tokens": 22196224.0,
2665
+ "step": 2710
2666
+ },
2667
+ {
2668
+ "epoch": 2.9062249532460593,
2669
+ "grad_norm": 1.1439481973648071,
2670
+ "learning_rate": 3.5219628017411953e-07,
2671
+ "loss": 0.1556,
2672
+ "mean_token_accuracy": 0.82298189625144,
2673
+ "num_tokens": 22278144.0,
2674
+ "step": 2720
2675
+ },
2676
+ {
2677
+ "epoch": 2.9169115682607534,
2678
+ "grad_norm": 1.1549094915390015,
2679
+ "learning_rate": 3.1262366442421846e-07,
2680
+ "loss": 0.1336,
2681
+ "mean_token_accuracy": 0.8487646751105785,
2682
+ "num_tokens": 22360064.0,
2683
+ "step": 2730
2684
+ },
2685
+ {
2686
+ "epoch": 2.9275981832754474,
2687
+ "grad_norm": 1.5530976057052612,
2688
+ "learning_rate": 2.730510486743174e-07,
2689
+ "loss": 0.1643,
2690
+ "mean_token_accuracy": 0.8154109574854373,
2691
+ "num_tokens": 22441984.0,
2692
+ "step": 2740
2693
+ },
2694
+ {
2695
+ "epoch": 2.9382847982901414,
2696
+ "grad_norm": 1.057889699935913,
2697
+ "learning_rate": 2.3347843292441632e-07,
2698
+ "loss": 0.1249,
2699
+ "mean_token_accuracy": 0.8479207433760166,
2700
+ "num_tokens": 22523904.0,
2701
+ "step": 2750
2702
+ },
2703
+ {
2704
+ "epoch": 2.948971413304836,
2705
+ "grad_norm": 1.149165153503418,
2706
+ "learning_rate": 1.9390581717451524e-07,
2707
+ "loss": 0.1343,
2708
+ "mean_token_accuracy": 0.8305650681257248,
2709
+ "num_tokens": 22605824.0,
2710
+ "step": 2760
2711
+ },
2712
+ {
2713
+ "epoch": 2.95965802831953,
2714
+ "grad_norm": 1.325223684310913,
2715
+ "learning_rate": 1.5433320142461417e-07,
2716
+ "loss": 0.1251,
2717
+ "mean_token_accuracy": 0.8496453046798706,
2718
+ "num_tokens": 22687744.0,
2719
+ "step": 2770
2720
+ },
2721
+ {
2722
+ "epoch": 2.970344643334224,
2723
+ "grad_norm": 1.2108739614486694,
2724
+ "learning_rate": 1.1476058567471311e-07,
2725
+ "loss": 0.14,
2726
+ "mean_token_accuracy": 0.8448997050523758,
2727
+ "num_tokens": 22769664.0,
2728
+ "step": 2780
2729
+ },
2730
+ {
2731
+ "epoch": 2.981031258348918,
2732
+ "grad_norm": 1.3373411893844604,
2733
+ "learning_rate": 7.518796992481203e-08,
2734
+ "loss": 0.1486,
2735
+ "mean_token_accuracy": 0.8307362981140614,
2736
+ "num_tokens": 22851584.0,
2737
+ "step": 2790
2738
+ },
2739
+ {
2740
+ "epoch": 2.991717873363612,
2741
+ "grad_norm": 1.7406622171401978,
2742
+ "learning_rate": 3.5615354174910966e-08,
2743
+ "loss": 0.1905,
2744
+ "step": 2800
2745
+ },
2746
+ {
2747
+ "epoch": 2.991717873363612,
2748
+ "eval_loss": 0.3580096960067749,
2749
+ "eval_mean_token_accuracy": 0.924671647640375,
2750
+ "eval_num_tokens": 22933504.0,
2751
+ "eval_runtime": 54.8159,
2752
+ "eval_samples_per_second": 30.356,
2753
+ "eval_steps_per_second": 1.897,
2754
+ "step": 2800
2755
  }
2756
  ],
2757
  "logging_steps": 10,
 
2766
  "should_evaluate": false,
2767
  "should_log": false,
2768
  "should_save": true,
2769
+ "should_training_stop": true
2770
  },
2771
  "attributes": {}
2772
  }
2773
  },
2774
+ "total_flos": 6.077655937135411e+16,
2775
  "train_batch_size": 2,
2776
  "trial_name": null,
2777
  "trial_params": null