shulijia committed on
Commit
f69583f
·
verified ·
1 Parent(s): 9d4732a

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ca7c4e3749f06ebc1778c062b3d70c4f488a26b411ddd7c3d301ae4023802d1
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0036e61ddac96c13d28af5b7348463838da31642973c16e5370deba79e225fb7
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d45b61751a61e4b6f882922592537bf8e092f455741fa220a9008ab320f07ad
3
  size 4768662910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81c5b9d8b1806de7455aa1e925033cdc78ae0c9f0b199eac035d87169284a120
3
  size 4768662910
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a81369904e00a468d2ec4beb1dd4e8f30c6191c2e29c4144f662ff07eadf5eab
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ba427af78f54355503e8fb146121e9f936d278226f07d5bf09468fc62083d77
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9182736455463728,
6
  "eval_steps": 100,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8558,6 +8558,456 @@
8558
  "mean_token_accuracy": 0.9684931464493275,
8559
  "num_tokens": 38912000.0,
8560
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8561
  }
8562
  ],
8563
  "logging_steps": 10,
@@ -8577,7 +9027,7 @@
8577
  "attributes": {}
8578
  }
8579
  },
8580
- "total_flos": 1.02836817887232e+17,
8581
  "train_batch_size": 1,
8582
  "trial_name": null,
8583
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9666038374172345,
6
  "eval_steps": 100,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8558
  "mean_token_accuracy": 0.9684931464493275,
8559
  "num_tokens": 38912000.0,
8560
  "step": 9500
8561
+ },
8562
+ {
8563
+ "epoch": 0.91924024938379,
8564
+ "grad_norm": 0.8302338123321533,
8565
+ "learning_rate": 4.489795918367347e-06,
8566
+ "loss": 0.1492,
8567
+ "mean_token_accuracy": 0.9706213280558587,
8568
+ "num_tokens": 38952960.0,
8569
+ "step": 9510
8570
+ },
8571
+ {
8572
+ "epoch": 0.9202068532212073,
8573
+ "grad_norm": 0.7152886390686035,
8574
+ "learning_rate": 4.436090225563911e-06,
8575
+ "loss": 0.123,
8576
+ "mean_token_accuracy": 0.9749999962747097,
8577
+ "num_tokens": 38993920.0,
8578
+ "step": 9520
8579
+ },
8580
+ {
8581
+ "epoch": 0.9211734570586245,
8582
+ "grad_norm": 0.5830357074737549,
8583
+ "learning_rate": 4.382384532760473e-06,
8584
+ "loss": 0.1219,
8585
+ "mean_token_accuracy": 0.9751712322235108,
8586
+ "num_tokens": 39034880.0,
8587
+ "step": 9530
8588
+ },
8589
+ {
8590
+ "epoch": 0.9221400608960417,
8591
+ "grad_norm": 0.8357058763504028,
8592
+ "learning_rate": 4.328678839957036e-06,
8593
+ "loss": 0.1569,
8594
+ "mean_token_accuracy": 0.9674412876367569,
8595
+ "num_tokens": 39075840.0,
8596
+ "step": 9540
8597
+ },
8598
+ {
8599
+ "epoch": 0.923106664733459,
8600
+ "grad_norm": 0.8819809556007385,
8601
+ "learning_rate": 4.274973147153598e-06,
8602
+ "loss": 0.143,
8603
+ "mean_token_accuracy": 0.9700097791850567,
8604
+ "num_tokens": 39116800.0,
8605
+ "step": 9550
8606
+ },
8607
+ {
8608
+ "epoch": 0.9240732685708762,
8609
+ "grad_norm": 0.6170474290847778,
8610
+ "learning_rate": 4.221267454350161e-06,
8611
+ "loss": 0.1453,
8612
+ "mean_token_accuracy": 0.9699363976716995,
8613
+ "num_tokens": 39157760.0,
8614
+ "step": 9560
8615
+ },
8616
+ {
8617
+ "epoch": 0.9250398724082934,
8618
+ "grad_norm": 0.7918187975883484,
8619
+ "learning_rate": 4.1675617615467236e-06,
8620
+ "loss": 0.1473,
8621
+ "mean_token_accuracy": 0.9694960817694664,
8622
+ "num_tokens": 39198720.0,
8623
+ "step": 9570
8624
+ },
8625
+ {
8626
+ "epoch": 0.9260064762457106,
8627
+ "grad_norm": 0.7999791502952576,
8628
+ "learning_rate": 4.113856068743287e-06,
8629
+ "loss": 0.12,
8630
+ "mean_token_accuracy": 0.9749755367636681,
8631
+ "num_tokens": 39239680.0,
8632
+ "step": 9580
8633
+ },
8634
+ {
8635
+ "epoch": 0.926973080083128,
8636
+ "grad_norm": 0.794882595539093,
8637
+ "learning_rate": 4.06015037593985e-06,
8638
+ "loss": 0.1322,
8639
+ "mean_token_accuracy": 0.9725782789289952,
8640
+ "num_tokens": 39280640.0,
8641
+ "step": 9590
8642
+ },
8643
+ {
8644
+ "epoch": 0.9279396839205452,
8645
+ "grad_norm": 0.6439830660820007,
8646
+ "learning_rate": 4.006444683136413e-06,
8647
+ "loss": 0.1463,
8648
+ "mean_token_accuracy": 0.9704745531082153,
8649
+ "num_tokens": 39321600.0,
8650
+ "step": 9600
8651
+ },
8652
+ {
8653
+ "epoch": 0.9289062877579624,
8654
+ "grad_norm": 0.7629963159561157,
8655
+ "learning_rate": 3.952738990332976e-06,
8656
+ "loss": 0.1556,
8657
+ "mean_token_accuracy": 0.9674657486379147,
8658
+ "num_tokens": 39362560.0,
8659
+ "step": 9610
8660
+ },
8661
+ {
8662
+ "epoch": 0.9298728915953797,
8663
+ "grad_norm": 0.7481008172035217,
8664
+ "learning_rate": 3.899033297529538e-06,
8665
+ "loss": 0.1459,
8666
+ "mean_token_accuracy": 0.9704256355762482,
8667
+ "num_tokens": 39403520.0,
8668
+ "step": 9620
8669
+ },
8670
+ {
8671
+ "epoch": 0.9308394954327969,
8672
+ "grad_norm": 0.7382989525794983,
8673
+ "learning_rate": 3.845327604726101e-06,
8674
+ "loss": 0.1481,
8675
+ "mean_token_accuracy": 0.9690802298486233,
8676
+ "num_tokens": 39444480.0,
8677
+ "step": 9630
8678
+ },
8679
+ {
8680
+ "epoch": 0.9318060992702141,
8681
+ "grad_norm": 0.7181780934333801,
8682
+ "learning_rate": 3.791621911922664e-06,
8683
+ "loss": 0.1432,
8684
+ "mean_token_accuracy": 0.9703767105937005,
8685
+ "num_tokens": 39485440.0,
8686
+ "step": 9640
8687
+ },
8688
+ {
8689
+ "epoch": 0.9327727031076314,
8690
+ "grad_norm": 0.7073920369148254,
8691
+ "learning_rate": 3.7379162191192266e-06,
8692
+ "loss": 0.1423,
8693
+ "mean_token_accuracy": 0.9701320916414261,
8694
+ "num_tokens": 39526400.0,
8695
+ "step": 9650
8696
+ },
8697
+ {
8698
+ "epoch": 0.9337393069450486,
8699
+ "grad_norm": 0.665108859539032,
8700
+ "learning_rate": 3.6842105263157892e-06,
8701
+ "loss": 0.1367,
8702
+ "mean_token_accuracy": 0.9713796436786651,
8703
+ "num_tokens": 39567360.0,
8704
+ "step": 9660
8705
+ },
8706
+ {
8707
+ "epoch": 0.9347059107824658,
8708
+ "grad_norm": 0.6856437921524048,
8709
+ "learning_rate": 3.6305048335123527e-06,
8710
+ "loss": 0.1487,
8711
+ "mean_token_accuracy": 0.9685665339231491,
8712
+ "num_tokens": 39608320.0,
8713
+ "step": 9670
8714
+ },
8715
+ {
8716
+ "epoch": 0.935672514619883,
8717
+ "grad_norm": 0.6998845934867859,
8718
+ "learning_rate": 3.5767991407089154e-06,
8719
+ "loss": 0.1325,
8720
+ "mean_token_accuracy": 0.9735078237950802,
8721
+ "num_tokens": 39649280.0,
8722
+ "step": 9680
8723
+ },
8724
+ {
8725
+ "epoch": 0.9366391184573003,
8726
+ "grad_norm": 0.7220867276191711,
8727
+ "learning_rate": 3.523093447905478e-06,
8728
+ "loss": 0.1322,
8729
+ "mean_token_accuracy": 0.9729941241443157,
8730
+ "num_tokens": 39690240.0,
8731
+ "step": 9690
8732
+ },
8733
+ {
8734
+ "epoch": 0.9376057222947175,
8735
+ "grad_norm": 0.750056803226471,
8736
+ "learning_rate": 3.469387755102041e-06,
8737
+ "loss": 0.1321,
8738
+ "mean_token_accuracy": 0.9733365938067436,
8739
+ "num_tokens": 39731200.0,
8740
+ "step": 9700
8741
+ },
8742
+ {
8743
+ "epoch": 0.9385723261321347,
8744
+ "grad_norm": 0.5716467499732971,
8745
+ "learning_rate": 3.415682062298604e-06,
8746
+ "loss": 0.1513,
8747
+ "mean_token_accuracy": 0.9702299371361732,
8748
+ "num_tokens": 39772160.0,
8749
+ "step": 9710
8750
+ },
8751
+ {
8752
+ "epoch": 0.939538929969552,
8753
+ "grad_norm": 0.7372239828109741,
8754
+ "learning_rate": 3.3619763694951665e-06,
8755
+ "loss": 0.1608,
8756
+ "mean_token_accuracy": 0.9661448121070861,
8757
+ "num_tokens": 39813120.0,
8758
+ "step": 9720
8759
+ },
8760
+ {
8761
+ "epoch": 0.9405055338069692,
8762
+ "grad_norm": 0.6863879561424255,
8763
+ "learning_rate": 3.308270676691729e-06,
8764
+ "loss": 0.141,
8765
+ "mean_token_accuracy": 0.9709148697555066,
8766
+ "num_tokens": 39854080.0,
8767
+ "step": 9730
8768
+ },
8769
+ {
8770
+ "epoch": 0.9414721376443864,
8771
+ "grad_norm": 0.7031144499778748,
8772
+ "learning_rate": 3.2545649838882926e-06,
8773
+ "loss": 0.1355,
8774
+ "mean_token_accuracy": 0.9717954933643341,
8775
+ "num_tokens": 39895040.0,
8776
+ "step": 9740
8777
+ },
8778
+ {
8779
+ "epoch": 0.9424387414818037,
8780
+ "grad_norm": 0.6682131886482239,
8781
+ "learning_rate": 3.2008592910848553e-06,
8782
+ "loss": 0.145,
8783
+ "mean_token_accuracy": 0.9700342446565628,
8784
+ "num_tokens": 39936000.0,
8785
+ "step": 9750
8786
+ },
8787
+ {
8788
+ "epoch": 0.9434053453192209,
8789
+ "grad_norm": 0.6650647521018982,
8790
+ "learning_rate": 3.147153598281418e-06,
8791
+ "loss": 0.1413,
8792
+ "mean_token_accuracy": 0.9703767091035843,
8793
+ "num_tokens": 39976960.0,
8794
+ "step": 9760
8795
+ },
8796
+ {
8797
+ "epoch": 0.9443719491566381,
8798
+ "grad_norm": 0.8530674576759338,
8799
+ "learning_rate": 3.0934479054779806e-06,
8800
+ "loss": 0.151,
8801
+ "mean_token_accuracy": 0.9692025408148766,
8802
+ "num_tokens": 40017920.0,
8803
+ "step": 9770
8804
+ },
8805
+ {
8806
+ "epoch": 0.9453385529940553,
8807
+ "grad_norm": 0.7780562043190002,
8808
+ "learning_rate": 3.0397422126745437e-06,
8809
+ "loss": 0.1308,
8810
+ "mean_token_accuracy": 0.9729207396507263,
8811
+ "num_tokens": 40058880.0,
8812
+ "step": 9780
8813
+ },
8814
+ {
8815
+ "epoch": 0.9463051568314726,
8816
+ "grad_norm": 0.613500714302063,
8817
+ "learning_rate": 2.9860365198711068e-06,
8818
+ "loss": 0.1301,
8819
+ "mean_token_accuracy": 0.9727984338998794,
8820
+ "num_tokens": 40099840.0,
8821
+ "step": 9790
8822
+ },
8823
+ {
8824
+ "epoch": 0.9472717606688899,
8825
+ "grad_norm": 0.629189670085907,
8826
+ "learning_rate": 2.9323308270676694e-06,
8827
+ "loss": 0.1198,
8828
+ "mean_token_accuracy": 0.9759295471012592,
8829
+ "num_tokens": 40140800.0,
8830
+ "step": 9800
8831
+ },
8832
+ {
8833
+ "epoch": 0.948238364506307,
8834
+ "grad_norm": 0.7294339537620544,
8835
+ "learning_rate": 2.878625134264232e-06,
8836
+ "loss": 0.1376,
8837
+ "mean_token_accuracy": 0.9716731905937195,
8838
+ "num_tokens": 40181760.0,
8839
+ "step": 9810
8840
+ },
8841
+ {
8842
+ "epoch": 0.9492049683437244,
8843
+ "grad_norm": 0.7860731482505798,
8844
+ "learning_rate": 2.8249194414607948e-06,
8845
+ "loss": 0.1398,
8846
+ "mean_token_accuracy": 0.9713307186961174,
8847
+ "num_tokens": 40222720.0,
8848
+ "step": 9820
8849
+ },
8850
+ {
8851
+ "epoch": 0.9501715721811416,
8852
+ "grad_norm": 0.6915313601493835,
8853
+ "learning_rate": 2.771213748657358e-06,
8854
+ "loss": 0.1543,
8855
+ "mean_token_accuracy": 0.9675146743655205,
8856
+ "num_tokens": 40263680.0,
8857
+ "step": 9830
8858
+ },
8859
+ {
8860
+ "epoch": 0.9511381760185588,
8861
+ "grad_norm": 0.6553166508674622,
8862
+ "learning_rate": 2.7175080558539205e-06,
8863
+ "loss": 0.143,
8864
+ "mean_token_accuracy": 0.9713307216763496,
8865
+ "num_tokens": 40304640.0,
8866
+ "step": 9840
8867
+ },
8868
+ {
8869
+ "epoch": 0.9521047798559761,
8870
+ "grad_norm": 0.5625308752059937,
8871
+ "learning_rate": 2.6638023630504836e-06,
8872
+ "loss": 0.1415,
8873
+ "mean_token_accuracy": 0.9711105637252331,
8874
+ "num_tokens": 40345600.0,
8875
+ "step": 9850
8876
+ },
8877
+ {
8878
+ "epoch": 0.9530713836933933,
8879
+ "grad_norm": 0.7295253872871399,
8880
+ "learning_rate": 2.6100966702470467e-06,
8881
+ "loss": 0.1567,
8882
+ "mean_token_accuracy": 0.9681262217462063,
8883
+ "num_tokens": 40386560.0,
8884
+ "step": 9860
8885
+ },
8886
+ {
8887
+ "epoch": 0.9540379875308105,
8888
+ "grad_norm": 0.5923715233802795,
8889
+ "learning_rate": 2.5563909774436093e-06,
8890
+ "loss": 0.1241,
8891
+ "mean_token_accuracy": 0.9743150658905506,
8892
+ "num_tokens": 40427520.0,
8893
+ "step": 9870
8894
+ },
8895
+ {
8896
+ "epoch": 0.9550045913682277,
8897
+ "grad_norm": 0.6577922105789185,
8898
+ "learning_rate": 2.502685284640172e-06,
8899
+ "loss": 0.1238,
8900
+ "mean_token_accuracy": 0.9746086105704308,
8901
+ "num_tokens": 40468480.0,
8902
+ "step": 9880
8903
+ },
8904
+ {
8905
+ "epoch": 0.955971195205645,
8906
+ "grad_norm": 0.607589066028595,
8907
+ "learning_rate": 2.4489795918367347e-06,
8908
+ "loss": 0.1403,
8909
+ "mean_token_accuracy": 0.9714041076600551,
8910
+ "num_tokens": 40509440.0,
8911
+ "step": 9890
8912
+ },
8913
+ {
8914
+ "epoch": 0.9569377990430622,
8915
+ "grad_norm": 0.620296835899353,
8916
+ "learning_rate": 2.3952738990332978e-06,
8917
+ "loss": 0.1438,
8918
+ "mean_token_accuracy": 0.9697896286845207,
8919
+ "num_tokens": 40550400.0,
8920
+ "step": 9900
8921
+ },
8922
+ {
8923
+ "epoch": 0.9579044028804794,
8924
+ "grad_norm": 0.783877968788147,
8925
+ "learning_rate": 2.3415682062298604e-06,
8926
+ "loss": 0.1415,
8927
+ "mean_token_accuracy": 0.9718688800930977,
8928
+ "num_tokens": 40591360.0,
8929
+ "step": 9910
8930
+ },
8931
+ {
8932
+ "epoch": 0.9588710067178967,
8933
+ "grad_norm": 0.6063189506530762,
8934
+ "learning_rate": 2.287862513426423e-06,
8935
+ "loss": 0.1497,
8936
+ "mean_token_accuracy": 0.9689579211175442,
8937
+ "num_tokens": 40632320.0,
8938
+ "step": 9920
8939
+ },
8940
+ {
8941
+ "epoch": 0.9598376105553139,
8942
+ "grad_norm": 0.7489660382270813,
8943
+ "learning_rate": 2.234156820622986e-06,
8944
+ "loss": 0.135,
8945
+ "mean_token_accuracy": 0.9724559679627418,
8946
+ "num_tokens": 40673280.0,
8947
+ "step": 9930
8948
+ },
8949
+ {
8950
+ "epoch": 0.9608042143927311,
8951
+ "grad_norm": 0.5833399891853333,
8952
+ "learning_rate": 2.1804511278195492e-06,
8953
+ "loss": 0.1408,
8954
+ "mean_token_accuracy": 0.9712328761816025,
8955
+ "num_tokens": 40714240.0,
8956
+ "step": 9940
8957
+ },
8958
+ {
8959
+ "epoch": 0.9617708182301484,
8960
+ "grad_norm": 0.6912499666213989,
8961
+ "learning_rate": 2.126745435016112e-06,
8962
+ "loss": 0.1114,
8963
+ "mean_token_accuracy": 0.975831700116396,
8964
+ "num_tokens": 40755200.0,
8965
+ "step": 9950
8966
+ },
8967
+ {
8968
+ "epoch": 0.9627374220675656,
8969
+ "grad_norm": 0.8001022934913635,
8970
+ "learning_rate": 2.0730397422126746e-06,
8971
+ "loss": 0.1424,
8972
+ "mean_token_accuracy": 0.9710371777415275,
8973
+ "num_tokens": 40796160.0,
8974
+ "step": 9960
8975
+ },
8976
+ {
8977
+ "epoch": 0.9637040259049828,
8978
+ "grad_norm": 0.6807326674461365,
8979
+ "learning_rate": 2.0193340494092377e-06,
8980
+ "loss": 0.1397,
8981
+ "mean_token_accuracy": 0.9719911940395832,
8982
+ "num_tokens": 40837120.0,
8983
+ "step": 9970
8984
+ },
8985
+ {
8986
+ "epoch": 0.9646706297424,
8987
+ "grad_norm": 0.603573203086853,
8988
+ "learning_rate": 1.9656283566058003e-06,
8989
+ "loss": 0.143,
8990
+ "mean_token_accuracy": 0.9709393292665481,
8991
+ "num_tokens": 40878080.0,
8992
+ "step": 9980
8993
+ },
8994
+ {
8995
+ "epoch": 0.9656372335798173,
8996
+ "grad_norm": 0.7243348360061646,
8997
+ "learning_rate": 1.911922663802363e-06,
8998
+ "loss": 0.1455,
8999
+ "mean_token_accuracy": 0.9693737730383873,
9000
+ "num_tokens": 40919040.0,
9001
+ "step": 9990
9002
+ },
9003
+ {
9004
+ "epoch": 0.9666038374172345,
9005
+ "grad_norm": 0.6356106996536255,
9006
+ "learning_rate": 1.8582169709989259e-06,
9007
+ "loss": 0.1502,
9008
+ "mean_token_accuracy": 0.968175146728754,
9009
+ "num_tokens": 40960000.0,
9010
+ "step": 10000
9011
  }
9012
  ],
9013
  "logging_steps": 10,
 
9027
  "attributes": {}
9028
  }
9029
  },
9030
+ "total_flos": 1.0824928198656e+17,
9031
  "train_batch_size": 1,
9032
  "trial_name": null,
9033
  "trial_params": null