shulijia committed on
Commit
9b441ac
·
verified ·
1 Parent(s): 304515b

Training in progress, step 2144, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e31a11b8093a7dc9efbf932e92cc2201b7cba0591316ca570ba6fc5cf01792fb
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5d6179563befe7adfc76c6862ae789878edfc301425a31f7c05e460c293e42c
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a1c8f58f9175b9d2f69570c18408756e4438131f09244dd384c4cf26f3fe8f8
3
  size 4768662910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c5fb3a478c067735c5106afc5a92aafb91022f2494eb9c1a86a597f9aad06d7
3
  size 4768662910
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d8715e1d936b90bedf8fa9339815a5475f6dd3a74a7e90e59a4e0f3a7bbb964
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2021d3136a5f60737933a3df1beba61e770d3dfaf0f9f8fec5ae750dacb73b71
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9325096165054202,
6
  "eval_steps": 100,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1808,6 +1808,132 @@
1808
  "mean_token_accuracy": 0.9734833620488643,
1809
  "num_tokens": 8192000.0,
1810
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1811
  }
1812
  ],
1813
  "logging_steps": 10,
@@ -1822,12 +1948,12 @@
1822
  "should_evaluate": false,
1823
  "should_log": false,
1824
  "should_save": true,
1825
- "should_training_stop": false
1826
  },
1827
  "attributes": {}
1828
  }
1829
  },
1830
- "total_flos": 2.1649856397312e+16,
1831
  "train_batch_size": 1,
1832
  "trial_name": null,
1833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9996503088938105,
6
  "eval_steps": 100,
7
+ "global_step": 2144,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1808
  "mean_token_accuracy": 0.9734833620488643,
1809
  "num_tokens": 8192000.0,
1810
  "step": 2000
1811
+ },
1812
+ {
1813
+ "epoch": 0.9371721645879473,
1814
+ "grad_norm": 0.8202114701271057,
1815
+ "learning_rate": 3.4992223950233285e-06,
1816
+ "loss": 0.1578,
1817
+ "mean_token_accuracy": 0.9717221096158027,
1818
+ "num_tokens": 8232960.0,
1819
+ "step": 2010
1820
+ },
1821
+ {
1822
+ "epoch": 0.9418347126704744,
1823
+ "grad_norm": 0.731518566608429,
1824
+ "learning_rate": 3.2400207361327116e-06,
1825
+ "loss": 0.1481,
1826
+ "mean_token_accuracy": 0.972725048661232,
1827
+ "num_tokens": 8273920.0,
1828
+ "step": 2020
1829
+ },
1830
+ {
1831
+ "epoch": 0.9464972607530016,
1832
+ "grad_norm": 1.1924803256988525,
1833
+ "learning_rate": 2.9808190772420947e-06,
1834
+ "loss": 0.1497,
1835
+ "mean_token_accuracy": 0.9719667322933674,
1836
+ "num_tokens": 8314880.0,
1837
+ "step": 2030
1838
+ },
1839
+ {
1840
+ "epoch": 0.9511598088355286,
1841
+ "grad_norm": 0.8626778721809387,
1842
+ "learning_rate": 2.721617418351478e-06,
1843
+ "loss": 0.1553,
1844
+ "mean_token_accuracy": 0.9710861027240754,
1845
+ "num_tokens": 8355840.0,
1846
+ "step": 2040
1847
+ },
1848
+ {
1849
+ "epoch": 0.9558223569180557,
1850
+ "grad_norm": 0.7663152813911438,
1851
+ "learning_rate": 2.462415759460861e-06,
1852
+ "loss": 0.1342,
1853
+ "mean_token_accuracy": 0.9749266132712364,
1854
+ "num_tokens": 8396800.0,
1855
+ "step": 2050
1856
+ },
1857
+ {
1858
+ "epoch": 0.9604849050005828,
1859
+ "grad_norm": 0.9175160527229309,
1860
+ "learning_rate": 2.2032141005702436e-06,
1861
+ "loss": 0.1494,
1862
+ "mean_token_accuracy": 0.9725293479859829,
1863
+ "num_tokens": 8437760.0,
1864
+ "step": 2060
1865
+ },
1866
+ {
1867
+ "epoch": 0.9651474530831099,
1868
+ "grad_norm": 0.6397636532783508,
1869
+ "learning_rate": 1.9440124416796267e-06,
1870
+ "loss": 0.1353,
1871
+ "mean_token_accuracy": 0.9759050846099854,
1872
+ "num_tokens": 8478720.0,
1873
+ "step": 2070
1874
+ },
1875
+ {
1876
+ "epoch": 0.969810001165637,
1877
+ "grad_norm": 0.8350099325180054,
1878
+ "learning_rate": 1.6848107827890098e-06,
1879
+ "loss": 0.1424,
1880
+ "mean_token_accuracy": 0.974143834412098,
1881
+ "num_tokens": 8519680.0,
1882
+ "step": 2080
1883
+ },
1884
+ {
1885
+ "epoch": 0.9744725492481642,
1886
+ "grad_norm": 0.7387396097183228,
1887
+ "learning_rate": 1.4256091238983931e-06,
1888
+ "loss": 0.1371,
1889
+ "mean_token_accuracy": 0.9749266110360623,
1890
+ "num_tokens": 8560640.0,
1891
+ "step": 2090
1892
+ },
1893
+ {
1894
+ "epoch": 0.9791350973306913,
1895
+ "grad_norm": 0.7936656475067139,
1896
+ "learning_rate": 1.1664074650077762e-06,
1897
+ "loss": 0.1373,
1898
+ "mean_token_accuracy": 0.9743150658905506,
1899
+ "num_tokens": 8601600.0,
1900
+ "step": 2100
1901
+ },
1902
+ {
1903
+ "epoch": 0.9837976454132183,
1904
+ "grad_norm": 0.7077323198318481,
1905
+ "learning_rate": 9.072058061171591e-07,
1906
+ "loss": 0.1429,
1907
+ "mean_token_accuracy": 0.9742172166705132,
1908
+ "num_tokens": 8642560.0,
1909
+ "step": 2110
1910
+ },
1911
+ {
1912
+ "epoch": 0.9884601934957454,
1913
+ "grad_norm": 0.7181702256202698,
1914
+ "learning_rate": 6.480041472265423e-07,
1915
+ "loss": 0.1482,
1916
+ "mean_token_accuracy": 0.9726516641676426,
1917
+ "num_tokens": 8683520.0,
1918
+ "step": 2120
1919
+ },
1920
+ {
1921
+ "epoch": 0.9931227415782725,
1922
+ "grad_norm": 0.7022804021835327,
1923
+ "learning_rate": 3.888024883359254e-07,
1924
+ "loss": 0.1486,
1925
+ "mean_token_accuracy": 0.9724804282188415,
1926
+ "num_tokens": 8724480.0,
1927
+ "step": 2130
1928
+ },
1929
+ {
1930
+ "epoch": 0.9977852896607996,
1931
+ "grad_norm": 0.8867694735527039,
1932
+ "learning_rate": 1.2960082944530845e-07,
1933
+ "loss": 0.1601,
1934
+ "mean_token_accuracy": 0.9704256311058999,
1935
+ "num_tokens": 8765440.0,
1936
+ "step": 2140
1937
  }
1938
  ],
1939
  "logging_steps": 10,
 
1948
  "should_evaluate": false,
1949
  "should_log": false,
1950
  "should_save": true,
1951
+ "should_training_stop": true
1952
  },
1953
  "attributes": {}
1954
  }
1955
  },
1956
+ "total_flos": 2.3208646057918464e+16,
1957
  "train_batch_size": 1,
1958
  "trial_name": null,
1959
  "trial_params": null