shulijia committed on
Commit 87c8dde · verified · 1 Parent(s): 6843419

Training in progress, step 2000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:26aeee8082b762a47da2b73c5109b724ff428ec39574f7e7869b2955c2742438
+ oid sha256:383e7ccd95f2dc8638fcfa2e1aea691b7a5012239f94f88b0c027486d3e81534
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ffb78fbe6df8d154184d04fe4a38dada15fb2b41a26e20271bd06e2ac87f8479
+ oid sha256:6e8832080f4ad58fc5492e49ef30f7b5a9c5492859d5d3ba2bfa8cc2e763339b
  size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:14d9e6012c6b97b605d67334319a24f115c4686d9f7afc657c65afaed6893946
+ oid sha256:c5c708d5f614ec082a45510d833ad3c779e04045466fdb236d84ee02df75e1e2
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:155edb0600918d427d776d6c6d2b7d0773bb9551ba0efb64457f101d8ff17495
+ oid sha256:09649a50475eb2d1586d1fccd870b8855df045b32800fa9e87df238979448da3
  size 1465
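The LFS pointer files above only record a SHA-256 digest (`oid`) and a byte `size` for each tracked file, while `trainer_state.json` (diff below) carries the actual training log. A minimal sketch of how one might sanity-check a locally downloaded copy of this checkpoint against those values follows; the local path is a hypothetical example, and `log_history` is the standard field name the Hugging Face `Trainer` uses for the per-step records shown in the diff below.

```python
# Minimal sketch, not part of the commit: verify a locally downloaded
# checkpoint against the LFS pointer values above and read the training log.
# "last-checkpoint/" is assumed to be a local copy of this repo's folder.
import hashlib
import json
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file through SHA-256 so multi-GB checkpoints fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

model = Path("last-checkpoint/model.safetensors")
expected_oid = "383e7ccd95f2dc8638fcfa2e1aea691b7a5012239f94f88b0c027486d3e81534"
expected_size = 2_384_234_968  # bytes, from the pointer above

assert model.stat().st_size == expected_size, "size mismatch"
assert sha256_of(model) == expected_oid, "sha256 mismatch"

# trainer_state.json is plain JSON; log_history holds the per-step entries
# added in this commit (training steps 1510-2000 plus the eval records).
state = json.loads(Path("last-checkpoint/trainer_state.json").read_text())
print("global_step:", state["global_step"])  # 2000 after this commit
latest_eval = [e for e in state["log_history"] if "eval_loss" in e][-1]
print("eval_loss at step", latest_eval["step"], "=", latest_eval["eval_loss"])
```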
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 2.136873997504901,
+ "epoch": 2.849759401176261,
  "eval_steps": 100,
- "global_step": 1500,
+ "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1478,6 +1478,496 @@
  "eval_samples_per_second": 30.2,
  "eval_steps_per_second": 1.889,
  "step": 1500
+ },
+ {
+ "epoch": 2.151131705578328,
+ "grad_norm": 1.1753822565078735,
+ "learning_rate": 3.1503957783641167e-06,
+ "loss": 0.1709,
+ "mean_token_accuracy": 0.7948324346914888,
+ "num_tokens": 12358656.0,
+ "step": 1510
+ },
+ {
+ "epoch": 2.1653894136517553,
+ "grad_norm": 1.3583290576934814,
+ "learning_rate": 3.0976253298153036e-06,
+ "loss": 0.1516,
+ "mean_token_accuracy": 0.7987769071012736,
+ "num_tokens": 12440576.0,
+ "step": 1520
+ },
+ {
+ "epoch": 2.1796471217251825,
+ "grad_norm": 1.6773642301559448,
+ "learning_rate": 3.044854881266491e-06,
+ "loss": 0.1582,
+ "mean_token_accuracy": 0.8161937419325114,
+ "num_tokens": 12522496.0,
+ "step": 1530
+ },
+ {
+ "epoch": 2.1939048297986097,
+ "grad_norm": 1.700421929359436,
+ "learning_rate": 2.9920844327176783e-06,
+ "loss": 0.1651,
+ "mean_token_accuracy": 0.7837084148079156,
+ "num_tokens": 12604416.0,
+ "step": 1540
+ },
+ {
+ "epoch": 2.208162537872037,
+ "grad_norm": 1.278611183166504,
+ "learning_rate": 2.9393139841688656e-06,
+ "loss": 0.1459,
+ "mean_token_accuracy": 0.8016634039580822,
+ "num_tokens": 12686336.0,
+ "step": 1550
+ },
+ {
+ "epoch": 2.2224202459454645,
+ "grad_norm": 1.3623602390289307,
+ "learning_rate": 2.8865435356200525e-06,
+ "loss": 0.1754,
+ "mean_token_accuracy": 0.7952054802328348,
+ "num_tokens": 12768256.0,
+ "step": 1560
+ },
+ {
+ "epoch": 2.2366779540188917,
+ "grad_norm": 1.1797006130218506,
+ "learning_rate": 2.8337730870712403e-06,
+ "loss": 0.1854,
+ "mean_token_accuracy": 0.7857632093131542,
+ "num_tokens": 12850176.0,
+ "step": 1570
+ },
+ {
+ "epoch": 2.250935662092319,
+ "grad_norm": 1.2017779350280762,
+ "learning_rate": 2.7810026385224277e-06,
+ "loss": 0.1482,
+ "mean_token_accuracy": 0.8103106629103423,
+ "num_tokens": 12932096.0,
+ "step": 1580
+ },
+ {
+ "epoch": 2.265193370165746,
+ "grad_norm": 1.1322146654129028,
+ "learning_rate": 2.7282321899736154e-06,
+ "loss": 0.1539,
+ "mean_token_accuracy": 0.8084882564842701,
+ "num_tokens": 13014016.0,
+ "step": 1590
+ },
+ {
+ "epoch": 2.2794510782391733,
+ "grad_norm": 1.2803654670715332,
+ "learning_rate": 2.6754617414248023e-06,
+ "loss": 0.1495,
+ "step": 1600
+ },
+ {
+ "epoch": 2.2794510782391733,
+ "eval_loss": 0.423663467168808,
+ "eval_mean_token_accuracy": 0.9079369283639468,
+ "eval_num_tokens": 13095936.0,
+ "eval_runtime": 41.2866,
+ "eval_samples_per_second": 30.203,
+ "eval_steps_per_second": 1.889,
+ "step": 1600
+ },
+ {
+ "epoch": 2.2937087863126004,
+ "grad_norm": 1.110379934310913,
+ "learning_rate": 2.6226912928759897e-06,
+ "loss": 0.157,
+ "mean_token_accuracy": 0.8001467704772949,
+ "num_tokens": 13177856.0,
+ "step": 1610
+ },
+ {
+ "epoch": 2.3079664943860276,
+ "grad_norm": 1.2236034870147705,
+ "learning_rate": 2.5699208443271775e-06,
+ "loss": 0.1566,
+ "mean_token_accuracy": 0.807118396833539,
+ "num_tokens": 13259776.0,
+ "step": 1620
+ },
+ {
+ "epoch": 2.322224202459455,
+ "grad_norm": 1.439042329788208,
+ "learning_rate": 2.5171503957783644e-06,
+ "loss": 0.1979,
+ "mean_token_accuracy": 0.7804794508963824,
+ "num_tokens": 13341696.0,
+ "step": 1630
+ },
+ {
+ "epoch": 2.336481910532882,
+ "grad_norm": 1.3598966598510742,
+ "learning_rate": 2.4643799472295517e-06,
+ "loss": 0.1514,
+ "mean_token_accuracy": 0.8212695695459843,
+ "num_tokens": 13423616.0,
+ "step": 1640
+ },
+ {
+ "epoch": 2.350739618606309,
+ "grad_norm": 1.401573896408081,
+ "learning_rate": 2.411609498680739e-06,
+ "loss": 0.1588,
+ "mean_token_accuracy": 0.8089774928987026,
+ "num_tokens": 13505536.0,
+ "step": 1650
+ },
+ {
+ "epoch": 2.3649973266797364,
+ "grad_norm": 1.6068435907363892,
+ "learning_rate": 2.3588390501319264e-06,
+ "loss": 0.1647,
+ "mean_token_accuracy": 0.8134907066822052,
+ "num_tokens": 13587456.0,
+ "step": 1660
+ },
+ {
+ "epoch": 2.3792550347531636,
+ "grad_norm": 1.2568259239196777,
+ "learning_rate": 2.3060686015831133e-06,
+ "loss": 0.1664,
+ "mean_token_accuracy": 0.7954256378114224,
+ "num_tokens": 13669376.0,
+ "step": 1670
+ },
+ {
+ "epoch": 2.3935127428265908,
+ "grad_norm": 1.6980928182601929,
+ "learning_rate": 2.253298153034301e-06,
+ "loss": 0.1707,
+ "mean_token_accuracy": 0.7994985327124595,
+ "num_tokens": 13751296.0,
+ "step": 1680
+ },
+ {
+ "epoch": 2.407770450900018,
+ "grad_norm": 1.6247879266738892,
+ "learning_rate": 2.2005277044854884e-06,
+ "loss": 0.1579,
+ "mean_token_accuracy": 0.7971624247729778,
+ "num_tokens": 13833216.0,
+ "step": 1690
+ },
+ {
+ "epoch": 2.422028158973445,
+ "grad_norm": 1.6872649192810059,
+ "learning_rate": 2.1477572559366753e-06,
+ "loss": 0.1703,
+ "step": 1700
+ },
+ {
+ "epoch": 2.422028158973445,
+ "eval_loss": 0.4227621853351593,
+ "eval_mean_token_accuracy": 0.9080967650963709,
+ "eval_num_tokens": 13915136.0,
+ "eval_runtime": 41.2123,
+ "eval_samples_per_second": 30.258,
+ "eval_steps_per_second": 1.893,
+ "step": 1700
+ },
+ {
+ "epoch": 2.4362858670468723,
+ "grad_norm": 1.6167148351669312,
+ "learning_rate": 2.094986807387863e-06,
+ "loss": 0.1801,
+ "mean_token_accuracy": 0.7794765178114176,
+ "num_tokens": 13997056.0,
+ "step": 1710
+ },
+ {
+ "epoch": 2.4505435751202995,
+ "grad_norm": 1.2795140743255615,
+ "learning_rate": 2.0422163588390505e-06,
+ "loss": 0.1466,
+ "mean_token_accuracy": 0.8015288673341274,
+ "num_tokens": 14078976.0,
+ "step": 1720
+ },
+ {
+ "epoch": 2.4648012831937267,
+ "grad_norm": 1.2836272716522217,
+ "learning_rate": 1.989445910290238e-06,
+ "loss": 0.1587,
+ "mean_token_accuracy": 0.7941046960651874,
+ "num_tokens": 14160896.0,
+ "step": 1730
+ },
+ {
+ "epoch": 2.479058991267154,
+ "grad_norm": 1.1510287523269653,
+ "learning_rate": 1.9366754617414247e-06,
+ "loss": 0.1807,
+ "mean_token_accuracy": 0.7942025430500508,
+ "num_tokens": 14242816.0,
+ "step": 1740
+ },
+ {
+ "epoch": 2.493316699340581,
+ "grad_norm": 1.2959060668945312,
+ "learning_rate": 1.8839050131926123e-06,
+ "loss": 0.187,
+ "mean_token_accuracy": 0.7789016582071782,
+ "num_tokens": 14324736.0,
+ "step": 1750
+ },
+ {
+ "epoch": 2.5075744074140083,
+ "grad_norm": 1.0948452949523926,
+ "learning_rate": 1.8311345646437998e-06,
+ "loss": 0.1995,
+ "mean_token_accuracy": 0.761827296577394,
+ "num_tokens": 14406656.0,
+ "step": 1760
+ },
+ {
+ "epoch": 2.5218321154874355,
+ "grad_norm": 1.3183213472366333,
+ "learning_rate": 1.778364116094987e-06,
+ "loss": 0.1709,
+ "mean_token_accuracy": 0.7887353241443634,
+ "num_tokens": 14488576.0,
+ "step": 1770
+ },
+ {
+ "epoch": 2.5360898235608627,
+ "grad_norm": 1.2092057466506958,
+ "learning_rate": 1.7255936675461743e-06,
+ "loss": 0.1325,
+ "mean_token_accuracy": 0.8213796474039554,
+ "num_tokens": 14570496.0,
+ "step": 1780
+ },
+ {
+ "epoch": 2.55034753163429,
+ "grad_norm": 1.418562889099121,
+ "learning_rate": 1.6728232189973616e-06,
+ "loss": 0.1827,
+ "mean_token_accuracy": 0.7853595890104771,
+ "num_tokens": 14652416.0,
+ "step": 1790
+ },
+ {
+ "epoch": 2.564605239707717,
+ "grad_norm": 1.0960406064987183,
+ "learning_rate": 1.6200527704485488e-06,
+ "loss": 0.1758,
+ "step": 1800
+ },
+ {
+ "epoch": 2.564605239707717,
+ "eval_loss": 0.4227621257305145,
+ "eval_mean_token_accuracy": 0.9082627732020158,
+ "eval_num_tokens": 14734336.0,
+ "eval_runtime": 41.1309,
+ "eval_samples_per_second": 30.318,
+ "eval_steps_per_second": 1.896,
+ "step": 1800
+ },
+ {
+ "epoch": 2.578862947781144,
+ "grad_norm": 1.5267870426177979,
+ "learning_rate": 1.5672823218997363e-06,
+ "loss": 0.1732,
+ "mean_token_accuracy": 0.7900256833992898,
+ "num_tokens": 14816256.0,
+ "step": 1810
+ },
+ {
+ "epoch": 2.5931206558545714,
+ "grad_norm": 2.303779125213623,
+ "learning_rate": 1.5145118733509237e-06,
+ "loss": 0.1717,
+ "mean_token_accuracy": 0.8003057725727558,
+ "num_tokens": 14898176.0,
+ "step": 1820
+ },
+ {
+ "epoch": 2.6073783639279986,
+ "grad_norm": 1.3814704418182373,
+ "learning_rate": 1.4617414248021108e-06,
+ "loss": 0.1691,
+ "mean_token_accuracy": 0.8011741682887077,
+ "num_tokens": 14980096.0,
+ "step": 1830
+ },
+ {
+ "epoch": 2.621636072001426,
+ "grad_norm": 1.4888346195220947,
+ "learning_rate": 1.4089709762532984e-06,
+ "loss": 0.1665,
+ "mean_token_accuracy": 0.7911203544586897,
+ "num_tokens": 15062016.0,
+ "step": 1840
+ },
+ {
+ "epoch": 2.635893780074853,
+ "grad_norm": 1.7252527475357056,
+ "learning_rate": 1.3562005277044857e-06,
+ "loss": 0.1462,
+ "mean_token_accuracy": 0.8204623281955719,
+ "num_tokens": 15143936.0,
+ "step": 1850
+ },
+ {
+ "epoch": 2.65015148814828,
+ "grad_norm": 1.3731549978256226,
+ "learning_rate": 1.3034300791556728e-06,
+ "loss": 0.1469,
+ "mean_token_accuracy": 0.8153620343655348,
+ "num_tokens": 15225856.0,
+ "step": 1860
+ },
+ {
+ "epoch": 2.6644091962217074,
+ "grad_norm": 1.1390541791915894,
+ "learning_rate": 1.2506596306068602e-06,
+ "loss": 0.1511,
+ "mean_token_accuracy": 0.7933586105704308,
+ "num_tokens": 15307776.0,
+ "step": 1870
+ },
+ {
+ "epoch": 2.6786669042951345,
+ "grad_norm": 1.3843096494674683,
+ "learning_rate": 1.1978891820580475e-06,
+ "loss": 0.1743,
+ "mean_token_accuracy": 0.7874510768800974,
+ "num_tokens": 15389696.0,
+ "step": 1880
+ },
+ {
+ "epoch": 2.6929246123685617,
+ "grad_norm": 1.4261775016784668,
+ "learning_rate": 1.1451187335092349e-06,
+ "loss": 0.1775,
+ "mean_token_accuracy": 0.7992783728986979,
+ "num_tokens": 15471616.0,
+ "step": 1890
+ },
+ {
+ "epoch": 2.707182320441989,
+ "grad_norm": 1.4358237981796265,
+ "learning_rate": 1.0923482849604222e-06,
+ "loss": 0.1488,
+ "step": 1900
+ },
+ {
+ "epoch": 2.707182320441989,
+ "eval_loss": 0.4216897487640381,
+ "eval_mean_token_accuracy": 0.9083614570972247,
+ "eval_num_tokens": 15553536.0,
+ "eval_runtime": 41.1549,
+ "eval_samples_per_second": 30.3,
+ "eval_steps_per_second": 1.895,
+ "step": 1900
+ },
+ {
+ "epoch": 2.721440028515416,
+ "grad_norm": 1.4193668365478516,
+ "learning_rate": 1.0395778364116096e-06,
+ "loss": 0.1432,
+ "mean_token_accuracy": 0.8027458423748612,
+ "num_tokens": 15635456.0,
+ "step": 1910
+ },
+ {
+ "epoch": 2.7356977365888433,
+ "grad_norm": 1.3984283208847046,
+ "learning_rate": 9.86807387862797e-07,
+ "loss": 0.1751,
+ "mean_token_accuracy": 0.7997309185564518,
+ "num_tokens": 15717376.0,
+ "step": 1920
+ },
+ {
+ "epoch": 2.7499554446622705,
+ "grad_norm": 1.2041066884994507,
+ "learning_rate": 9.340369393139842e-07,
+ "loss": 0.2063,
+ "mean_token_accuracy": 0.770768103376031,
+ "num_tokens": 15799296.0,
+ "step": 1930
+ },
+ {
+ "epoch": 2.7642131527356977,
+ "grad_norm": 1.4668165445327759,
+ "learning_rate": 8.812664907651716e-07,
+ "loss": 0.1496,
+ "mean_token_accuracy": 0.7937133066356182,
+ "num_tokens": 15881216.0,
+ "step": 1940
+ },
+ {
+ "epoch": 2.778470860809125,
+ "grad_norm": 1.1798230409622192,
+ "learning_rate": 8.284960422163589e-07,
+ "loss": 0.1696,
+ "mean_token_accuracy": 0.7978228941559792,
+ "num_tokens": 15963136.0,
+ "step": 1950
+ },
+ {
+ "epoch": 2.792728568882552,
+ "grad_norm": 1.4253802299499512,
+ "learning_rate": 7.757255936675462e-07,
+ "loss": 0.1602,
+ "mean_token_accuracy": 0.8014432441443204,
+ "num_tokens": 16045056.0,
+ "step": 1960
+ },
+ {
+ "epoch": 2.8069862769559792,
+ "grad_norm": 1.3596400022506714,
+ "learning_rate": 7.229551451187336e-07,
+ "loss": 0.1672,
+ "mean_token_accuracy": 0.808916338160634,
+ "num_tokens": 16126976.0,
+ "step": 1970
+ },
+ {
+ "epoch": 2.8212439850294064,
+ "grad_norm": 1.4225387573242188,
+ "learning_rate": 6.701846965699208e-07,
+ "loss": 0.1767,
+ "mean_token_accuracy": 0.7800391383469105,
+ "num_tokens": 16208896.0,
+ "step": 1980
+ },
+ {
+ "epoch": 2.8355016931028336,
+ "grad_norm": 1.8448420763015747,
+ "learning_rate": 6.174142480211082e-07,
+ "loss": 0.1846,
+ "mean_token_accuracy": 0.792747063189745,
+ "num_tokens": 16290816.0,
+ "step": 1990
+ },
+ {
+ "epoch": 2.849759401176261,
+ "grad_norm": 1.4115536212921143,
+ "learning_rate": 5.646437994722955e-07,
+ "loss": 0.1398,
+ "step": 2000
+ },
+ {
+ "epoch": 2.849759401176261,
+ "eval_loss": 0.42159923911094666,
+ "eval_mean_token_accuracy": 0.9084225067725549,
+ "eval_num_tokens": 16372736.0,
+ "eval_runtime": 41.1801,
+ "eval_samples_per_second": 30.282,
+ "eval_steps_per_second": 1.894,
+ "step": 2000
  }
  ],
  "logging_steps": 10,
@@ -1497,7 +1987,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3.2445016043421696e+16,
+ "total_flos": 4.32699442420777e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null