shulijia committed · verified
Commit 09b1852 · 1 parent: e296930

Training in progress, step 2340, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a987788873c021c9d673558815b71bbfcf77d57a0a39a1d0c79e832fc62b27f
+oid sha256:0e25481045a998d4e040b2d3114349e4f2ea96e62e32a39f2adeb51d25de7a48
 size 2384234968
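
The pointer above references the 2.4 GB weights blob itself. Once the blob has been fetched (e.g. via `git lfs pull`), one might inspect tensor names and shapes without loading the weights into memory, using the safetensors lazy-open API; a minimal sketch, assuming the blob is present locally at the path shown:

import sys
from safetensors import safe_open

# Lazily open the checkpoint; get_slice reads shape metadata from the
# file header without materializing the tensor data itself.
with safe_open("last-checkpoint/model.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, f.get_slice(name).get_shape(), file=sys.stdout)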
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:11aa88d182a9ec6576ec4228fa3402e4d824bd766717f06c8bdc910de1884d09
+oid sha256:0ffcde3cd0ac7235d16ff44cbb1d360ca8c7288c11306fb4a4310cba5b85f523
 size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:47785a56208855ad691ee4751f427a600446221eaa9a2b5467a89680132ac2e0
+oid sha256:74b6236cf0b98164ed85bb41f9abc6098fd976f945ac9580a1a26ceb2561076b
 size 1465
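
Each of the three files above is stored as a Git LFS pointer: three lines giving the spec version, the blob's sha256 OID, and its size in bytes. A new checkpoint therefore shows up in the diff as a swapped OID with an unchanged size. A minimal sketch of verifying a fetched blob against its pointer, using only the Python standard library (the file paths are hypothetical):

import hashlib

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Check a downloaded blob against its Git LFS pointer file.

    Pointer format: "version <url>", "oid sha256:<hex>", "size <bytes>".
    """
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    expected_oid = fields["oid"].removeprefix("sha256:")  # Python 3.9+
    expected_size = int(fields["size"])

    h = hashlib.sha256()
    actual_size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
            actual_size += len(chunk)
    return h.hexdigest() == expected_oid and actual_size == expected_size

# Hypothetical usage: pointer as checked into git, blob as fetched by `git lfs pull`
# print(verify_lfs_pointer("model.safetensors.pointer", "model.safetensors"))

A mismatch would mean the blob is corrupted or belongs to a different revision of the checkpoint.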
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.5646454924606994,
+  "epoch": 3.0,
   "eval_steps": 100,
-  "global_step": 2000,
+  "global_step": 2340,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1808,6 +1808,312 @@
       "mean_token_accuracy": 0.8290239717811346,
       "num_tokens": 16371712.0,
       "step": 2000
+    },
+    {
+      "epoch": 2.577478344562079,
+      "grad_norm": 1.7604620456695557,
+      "learning_rate": 1.5716999050332385e-06,
+      "loss": 0.1638,
+      "mean_token_accuracy": 0.7912915859371423,
+      "num_tokens": 16453632.0,
+      "step": 2010
+    },
+    {
+      "epoch": 2.5903111966634587,
+      "grad_norm": 0.9279603958129883,
+      "learning_rate": 1.5242165242165245e-06,
+      "loss": 0.1439,
+      "mean_token_accuracy": 0.8103840544819831,
+      "num_tokens": 16535552.0,
+      "step": 2020
+    },
+    {
+      "epoch": 2.603144048764838,
+      "grad_norm": 1.5552164316177368,
+      "learning_rate": 1.4767331433998102e-06,
+      "loss": 0.1597,
+      "mean_token_accuracy": 0.8110934458673,
+      "num_tokens": 16617472.0,
+      "step": 2030
+    },
+    {
+      "epoch": 2.6159769008662175,
+      "grad_norm": 1.2231167554855347,
+      "learning_rate": 1.429249762583096e-06,
+      "loss": 0.16,
+      "mean_token_accuracy": 0.7961350310593843,
+      "num_tokens": 16699392.0,
+      "step": 2040
+    },
+    {
+      "epoch": 2.628809752967597,
+      "grad_norm": 1.4283727407455444,
+      "learning_rate": 1.3817663817663818e-06,
+      "loss": 0.1861,
+      "mean_token_accuracy": 0.7857387486845255,
+      "num_tokens": 16781312.0,
+      "step": 2050
+    },
+    {
+      "epoch": 2.6416426050689767,
+      "grad_norm": 1.2813409566879272,
+      "learning_rate": 1.3342830009496678e-06,
+      "loss": 0.1599,
+      "mean_token_accuracy": 0.7960371825844049,
+      "num_tokens": 16863232.0,
+      "step": 2060
+    },
+    {
+      "epoch": 2.6544754571703564,
+      "grad_norm": 1.814128041267395,
+      "learning_rate": 1.2867996201329535e-06,
+      "loss": 0.1769,
+      "mean_token_accuracy": 0.7898116439580918,
+      "num_tokens": 16945152.0,
+      "step": 2070
+    },
+    {
+      "epoch": 2.6673083092717356,
+      "grad_norm": 1.8410574197769165,
+      "learning_rate": 1.2393162393162394e-06,
+      "loss": 0.1609,
+      "mean_token_accuracy": 0.8141511753201485,
+      "num_tokens": 17027072.0,
+      "step": 2080
+    },
+    {
+      "epoch": 2.680141161373115,
+      "grad_norm": 1.4829483032226562,
+      "learning_rate": 1.1918328584995251e-06,
+      "loss": 0.1742,
+      "mean_token_accuracy": 0.7898361060768366,
+      "num_tokens": 17108992.0,
+      "step": 2090
+    },
+    {
+      "epoch": 2.692974013474495,
+      "grad_norm": 1.1964958906173706,
+      "learning_rate": 1.144349477682811e-06,
+      "loss": 0.1827,
+      "mean_token_accuracy": 0.7777886476367712,
+      "num_tokens": 17190912.0,
+      "step": 2100
+    },
+    {
+      "epoch": 2.705806865575874,
+      "grad_norm": 1.4900107383728027,
+      "learning_rate": 1.096866096866097e-06,
+      "loss": 0.1289,
+      "mean_token_accuracy": 0.8270180996507406,
+      "num_tokens": 17272832.0,
+      "step": 2110
+    },
+    {
+      "epoch": 2.7186397176772537,
+      "grad_norm": 1.7298692464828491,
+      "learning_rate": 1.0493827160493827e-06,
+      "loss": 0.1671,
+      "mean_token_accuracy": 0.787377692013979,
+      "num_tokens": 17354752.0,
+      "step": 2120
+    },
+    {
+      "epoch": 2.7314725697786333,
+      "grad_norm": 1.5013244152069092,
+      "learning_rate": 1.0018993352326686e-06,
+      "loss": 0.1639,
+      "mean_token_accuracy": 0.7969789639115333,
+      "num_tokens": 17436672.0,
+      "step": 2130
+    },
+    {
+      "epoch": 2.744305421880013,
+      "grad_norm": 1.1740142107009888,
+      "learning_rate": 9.544159544159546e-07,
+      "loss": 0.1611,
+      "mean_token_accuracy": 0.794043542444706,
+      "num_tokens": 17518592.0,
+      "step": 2140
+    },
+    {
+      "epoch": 2.7571382739813926,
+      "grad_norm": 1.4351530075073242,
+      "learning_rate": 9.069325735992403e-07,
+      "loss": 0.1642,
+      "mean_token_accuracy": 0.8051247522234917,
+      "num_tokens": 17600512.0,
+      "step": 2150
+    },
+    {
+      "epoch": 2.7699711260827717,
+      "grad_norm": 1.5310108661651611,
+      "learning_rate": 8.594491927825262e-07,
+      "loss": 0.1683,
+      "mean_token_accuracy": 0.8000000014901161,
+      "num_tokens": 17682432.0,
+      "step": 2160
+    },
+    {
+      "epoch": 2.7828039781841514,
+      "grad_norm": 1.3762239217758179,
+      "learning_rate": 8.11965811965812e-07,
+      "loss": 0.1641,
+      "mean_token_accuracy": 0.7894324846565723,
+      "num_tokens": 17764352.0,
+      "step": 2170
+    },
+    {
+      "epoch": 2.795636830285531,
+      "grad_norm": 1.4585622549057007,
+      "learning_rate": 7.644824311490979e-07,
+      "loss": 0.1591,
+      "mean_token_accuracy": 0.7889432465657592,
+      "num_tokens": 17846272.0,
+      "step": 2180
+    },
+    {
+      "epoch": 2.8084696823869106,
+      "grad_norm": 1.2188738584518433,
+      "learning_rate": 7.169990503323837e-07,
+      "loss": 0.1447,
+      "mean_token_accuracy": 0.8227128185331821,
+      "num_tokens": 17928192.0,
+      "step": 2190
+    },
+    {
+      "epoch": 2.8213025344882903,
+      "grad_norm": 1.2576043605804443,
+      "learning_rate": 6.695156695156696e-07,
+      "loss": 0.1439,
+      "mean_token_accuracy": 0.8094300415366888,
+      "num_tokens": 18010112.0,
+      "step": 2200
+    },
+    {
+      "epoch": 2.8341353865896695,
+      "grad_norm": 1.0408450365066528,
+      "learning_rate": 6.220322886989554e-07,
+      "loss": 0.2027,
+      "mean_token_accuracy": 0.7656066533178091,
+      "num_tokens": 18092032.0,
+      "step": 2210
+    },
+    {
+      "epoch": 2.846968238691049,
+      "grad_norm": 1.3711516857147217,
+      "learning_rate": 5.745489078822413e-07,
+      "loss": 0.1703,
+      "mean_token_accuracy": 0.7800391372293234,
+      "num_tokens": 18173952.0,
+      "step": 2220
+    },
+    {
+      "epoch": 2.8598010907924287,
+      "grad_norm": 1.3574531078338623,
+      "learning_rate": 5.270655270655271e-07,
+      "loss": 0.1682,
+      "mean_token_accuracy": 0.7911692764610052,
+      "num_tokens": 18255872.0,
+      "step": 2230
+    },
+    {
+      "epoch": 2.872633942893808,
+      "grad_norm": 1.412182331085205,
+      "learning_rate": 4.795821462488129e-07,
+      "loss": 0.1805,
+      "mean_token_accuracy": 0.788820942863822,
+      "num_tokens": 18337792.0,
+      "step": 2240
+    },
+    {
+      "epoch": 2.8854667949951875,
+      "grad_norm": 1.5910438299179077,
+      "learning_rate": 4.320987654320988e-07,
+      "loss": 0.1588,
+      "mean_token_accuracy": 0.8040606647729873,
+      "num_tokens": 18419712.0,
+      "step": 2250
+    },
+    {
+      "epoch": 2.898299647096567,
+      "grad_norm": 1.3554024696350098,
+      "learning_rate": 3.846153846153847e-07,
+      "loss": 0.1601,
+      "mean_token_accuracy": 0.801565557718277,
+      "num_tokens": 18501632.0,
+      "step": 2260
+    },
+    {
+      "epoch": 2.911132499197947,
+      "grad_norm": 1.2987529039382935,
+      "learning_rate": 3.371320037986705e-07,
+      "loss": 0.1377,
+      "mean_token_accuracy": 0.8144080229103565,
+      "num_tokens": 18583552.0,
+      "step": 2270
+    },
+    {
+      "epoch": 2.9239653512993264,
+      "grad_norm": 1.268310308456421,
+      "learning_rate": 2.8964862298195633e-07,
+      "loss": 0.1426,
+      "mean_token_accuracy": 0.8171110570430755,
+      "num_tokens": 18665472.0,
+      "step": 2280
+    },
+    {
+      "epoch": 2.9367982034007056,
+      "grad_norm": 1.5489550828933716,
+      "learning_rate": 2.4216524216524215e-07,
+      "loss": 0.1686,
+      "mean_token_accuracy": 0.8061276923865079,
+      "num_tokens": 18747392.0,
+      "step": 2290
+    },
+    {
+      "epoch": 2.9496310555020853,
+      "grad_norm": 1.5315167903900146,
+      "learning_rate": 1.9468186134852803e-07,
+      "loss": 0.1734,
+      "mean_token_accuracy": 0.8045376718044281,
+      "num_tokens": 18829312.0,
+      "step": 2300
+    },
+    {
+      "epoch": 2.962463907603465,
+      "grad_norm": 1.887633204460144,
+      "learning_rate": 1.4719848053181388e-07,
+      "loss": 0.1881,
+      "mean_token_accuracy": 0.7852617412805557,
+      "num_tokens": 18911232.0,
+      "step": 2310
+    },
+    {
+      "epoch": 2.9752967597048445,
+      "grad_norm": 1.3356051445007324,
+      "learning_rate": 9.971509971509972e-08,
+      "loss": 0.1747,
+      "mean_token_accuracy": 0.7836839504539966,
+      "num_tokens": 18993152.0,
+      "step": 2320
+    },
+    {
+      "epoch": 2.988129611806224,
+      "grad_norm": 1.3567869663238525,
+      "learning_rate": 5.223171889838557e-08,
+      "loss": 0.1577,
+      "mean_token_accuracy": 0.8161815080791712,
+      "num_tokens": 19075072.0,
+      "step": 2330
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 3.3493268489837646,
+      "learning_rate": 4.7483380816714155e-09,
+      "loss": 0.1777,
+      "mean_token_accuracy": 0.7969138461190302,
+      "num_tokens": 19150848.0,
+      "step": 2340
     }
   ],
   "logging_steps": 10,
@@ -1822,12 +2128,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": false
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos": 4.326723801002803e+16,
+  "total_flos": 5.061195179281613e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null