bourdoiscatie
committed on
Upload 35 files
- .gitattributes +3 -0
- README.md +6 -6
- dist/assets/BWD-causal-True.png +0 -0
- dist/assets/BWD-causal-True_dark.png +0 -0
- dist/assets/CE.png +0 -0
- dist/assets/CE_dark.png +0 -0
- dist/assets/FAT5.gif +3 -0
- dist/assets/FAT5_dark.gif +3 -0
- dist/assets/FWD-causal-True.png +0 -0
- dist/assets/FWD-causal-True_dark.png +0 -0
- dist/assets/LN.png +0 -0
- dist/assets/LN_dark.png +0 -0
- dist/assets/LinFAT5_dark.gif +3 -0
- dist/assets/bwd-bfloat16-b16-dark.png +0 -0
- dist/assets/bwd-bfloat16-b16.png +0 -0
- dist/assets/convergence_masked_accuracy_FAT5.png +0 -0
- dist/assets/fwd-bfloat16-b16-dark.png +0 -0
- dist/assets/fwd-bfloat16-b16.png +0 -0
- dist/assets/loss_eval.png +0 -0
- dist/assets/loss_train.png +0 -0
- dist/assets/mem-bfloat16-b32-dark.png +0 -0
- dist/assets/mem-bfloat16-b32.png +0 -0
- dist/assets/mem-bfloat16-b8-dark.png +0 -0
- dist/assets/mem-bfloat16-b8.png +0 -0
- dist/assets/nvidiasmi.png +0 -0
- dist/bibliography.bib +590 -0
- dist/distill.bundle.js +0 -0
- dist/distill.bundle.js.map +0 -0
- dist/index.html +1380 -0
- dist/main.bundle.js +0 -0
- dist/main.bundle.js.LICENSE.txt +19 -0
- dist/main.bundle.js.map +0 -0
- dist/style.css +349 -0
- package-lock.json +0 -0
- package.json +32 -0
- webpack.config.js +96 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dist/assets/FAT5_dark.gif filter=lfs diff=lfs merge=lfs -text
+dist/assets/FAT5.gif filter=lfs diff=lfs merge=lfs -text
+dist/assets/LinFAT5_dark.gif filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title: FAT5
+title: 'FAT5 (Flash Attention T5) report'
 emoji: ⚡
-colorFrom:
-colorTo:
+colorFrom: blue
+colorTo: red
 sdk: static
 pinned: false
-
-
-
+header: default
+app_file: dist/index.html
+---
dist/assets/BWD-causal-True.png
ADDED
dist/assets/BWD-causal-True_dark.png
ADDED
dist/assets/CE.png
ADDED
dist/assets/CE_dark.png
ADDED
dist/assets/FAT5.gif
ADDED
dist/assets/FAT5_dark.gif
ADDED
dist/assets/FWD-causal-True.png
ADDED
dist/assets/FWD-causal-True_dark.png
ADDED
dist/assets/LN.png
ADDED
dist/assets/LN_dark.png
ADDED
dist/assets/LinFAT5_dark.gif
ADDED
dist/assets/bwd-bfloat16-b16-dark.png
ADDED
dist/assets/bwd-bfloat16-b16.png
ADDED
dist/assets/convergence_masked_accuracy_FAT5.png
ADDED
dist/assets/fwd-bfloat16-b16-dark.png
ADDED
dist/assets/fwd-bfloat16-b16.png
ADDED
dist/assets/loss_eval.png
ADDED
dist/assets/loss_train.png
ADDED
dist/assets/mem-bfloat16-b32-dark.png
ADDED
dist/assets/mem-bfloat16-b32.png
ADDED
dist/assets/mem-bfloat16-b8-dark.png
ADDED
dist/assets/mem-bfloat16-b8.png
ADDED
dist/assets/nvidiasmi.png
ADDED
dist/bibliography.bib
ADDED
@@ -0,0 +1,590 @@
+@article{JMLR:v21:20-074,
+  author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+  journal = {Journal of Machine Learning Research},
+  year = {2020},
+  volume = {21},
+  number = {140},
+  pages = {1--67},
+  url = {http://jmlr.org/papers/v21/20-074.html}
+}
+@misc{chia2023instructeval,
+  title={INSTRUCTEVAL: Towards Holistic Evaluation of Instruction-Tuned Large Language Models},
+  author={Yew Ken Chia and Pengfei Hong and Lidong Bing and Soujanya Poria},
+  year={2023},
+  url={https://arxiv.org/abs/2306.04757},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{fu2024tiny,
+  title={Tiny Titans: Can Smaller Large Language Models Punch Above Their Weight in the Real World for Meeting Summarization?},
+  author={Xue-Yong Fu and Md Tahmid Rahman Laskar and Elena Khasanova and Cheng Chen and Shashi Bhushan TN},
+  year={2024},
+  url={https://arxiv.org/abs/2402.00841},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{hsieh2023distilling,
+  title={Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes},
+  author={Cheng-Yu Hsieh and Chun-Liang Li and Chih-Kuan Yeh and Hootan Nakhost and Yasuhisa Fujii and Alexander Ratner and Ranjay Krishna and Chen-Yu Lee and Tomas Pfister},
+  year={2023},
+  url={https://arxiv.org/abs/2305.02301},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{nawrot2023nanot5,
+  title={nanoT5: A PyTorch Framework for Pre-training and Fine-tuning T5-style Models with Limited Resources},
+  author={Piotr Nawrot},
+  year={2023},
+  url={https://arxiv.org/abs/2309.02373},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{tay2023ul2,
+  title={UL2: Unifying Language Learning Paradigms},
+  author={Yi Tay and Mostafa Dehghani and Vinh Q. Tran and Xavier Garcia and Jason Wei and Xuezhi Wang and Hyung Won Chung and Siamak Shakeri and Dara Bahri and Tal Schuster and Huaixiu Steven Zheng and Denny Zhou and Neil Houlsby and Donald Metzler},
+  year={2023},
+  url={https://arxiv.org/abs/2205.05131},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{dao2022flashattention,
+  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
+  author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher Ré},
+  year={2022},
+  url={https://arxiv.org/abs/2205.14135},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{turbot5,
+  title={TurboT5},
+  author={Knowledgator},
+  year={2024},
+  publisher={GitHub},
+  url={https://github.com/Knowledgator/TurboT5},
+}
+@misc{nguyen2023culturax,
+  title={CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages},
+  author={Thuat Nguyen and Chien Van Nguyen and Viet Dac Lai and Hieu Man and Nghia Trung Ngo and Franck Dernoncourt and Ryan A. Rossi and Thien Huu Nguyen},
+  year={2023},
+  url={https://arxiv.org/abs/2309.09400},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{leclerc2023ffcv,
+  title={FFCV: Accelerating Training by Removing Data Bottlenecks},
+  author={Guillaume Leclerc and Andrew Ilyas and Logan Engstrom and Sung Min Park and Hadi Salman and Aleksander Madry},
+  year={2023},
+  url={https://arxiv.org/abs/2306.12517},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{lhoest2021datasets,
+  title={Datasets: A Community Library for Natural Language Processing},
author={Quentin Lhoest and Albert Villanova del Moral and Yacine Jernite and Abhishek Thakur and Patrick von Platen and Suraj Patil and Julien Chaumond and Mariama Drame and Julien Plu and Lewis Tunstall and Joe Davison and Mario Šaško and Gunjan Chhablani and Bhavitvya Malik and Simon Brandeis and Teven Le Scao and Victor Sanh and Canwen Xu and Nicolas Patry and Angelina McMillan-Major and Philipp Schmid and Sylvain Gugger and Clément Delangue and Théo Matussière and Lysandre Debut and Stas Bekman and Pierric Cistac and Thibault Goehringer and Victor Mustar and François Lagunas and Alexander M. Rush and Thomas Wolf},
+  year={2021},
+  url={https://arxiv.org/abs/2109.02846},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{paszke2019pytorch,
+  title={PyTorch: An Imperative Style, High-Performance Deep Learning Library},
author={Adam Paszke and Sam Gross and Francisco Massa and Adam Lerer and James Bradbury and Gregory Chanan and Trevor Killeen and Zeming Lin and Natalia Gimelshein and Luca Antiga and Alban Desmaison and Andreas Köpf and Edward Yang and Zach DeVito and Martin Raison and Alykhan Tejani and Sasank Chilamkurthy and Benoit Steiner and Lu Fang and Junjie Bai and Soumith Chintala},
+  year={2019},
+  url={https://arxiv.org/abs/1912.01703},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@inproceedings{10.1145/2833157.2833162,
+  author = {Lam, Siu Kwan and Pitrou, Antoine and Seibert, Stanley},
+  title = {Numba: a LLVM-based Python JIT compiler},
+  year = {2015},
+  isbn = {9781450340052},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/2833157.2833162},
+  doi = {10.1145/2833157.2833162},
abstract = {Dynamic, interpreted languages, like Python, are attractive for domain-experts and scientists experimenting with new ideas. However, the performance of the interpreter is often a barrier when scaling to larger data sets. This paper presents a just-in-time compiler for Python that focuses in scientific and array-oriented computing. Starting with the simple syntax of Python, Numba compiles a subset of the language into efficient machine code that is comparable in performance to a traditional compiled language. In addition, we share our experience in building a JIT compiler using LLVM[1].},
+  booktitle = {Proceedings of the Second Workshop on the LLVM Compiler Infrastructure in HPC},
+  articleno = {7},
+  numpages = {6},
+  keywords = {compiler, Python, LLVM},
+  location = {Austin, Texas},
+  series = {LLVM '15}
+}
+@inproceedings{10.1145/3315508.3329973,
+  author = {Tillet, Philippe and Kung, H. T. and Cox, David},
+  title = {Triton: an intermediate language and compiler for tiled neural network computations},
+  year = {2019},
+  isbn = {9781450367196},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3315508.3329973},
+  doi = {10.1145/3315508.3329973},
abstract = {The validation and deployment of novel research ideas in the field of Deep Learning is often limited by the availability of efficient compute kernels for certain basic primitives. In particular, operations that cannot leverage existing vendor libraries (e.g., cuBLAS, cuDNN) are at risk of facing poor device utilization unless custom implementations are written by experts – usually at the expense of portability. For this reason, the development of new programming abstractions for specifying custom Deep Learning workloads at a minimal performance cost has become crucial. We present Triton, a language and compiler centered around the concept of tile, i.e., statically shaped multi-dimensional sub-arrays. Our approach revolves around (1) a C-based language and an LLVM-based intermediate representation (IR) for expressing tensor programs in terms of operations on parametric tile variables and (2) a set of novel tile-level optimization passes for compiling these programs into efficient GPU code. We demonstrate how Triton can be used to build portable implementations of matrix multiplication and convolution kernels on par with hand-tuned vendor libraries (cuBLAS / cuDNN), or for efficiently implementing recent research ideas such as shift convolutions.},
+  booktitle = {Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages},
+  pages = {10–19},
+  numpages = {10},
+  keywords = {GPU, compiler, neural networks},
+  location = {Phoenix, AZ, USA},
+  series = {MAPL 2019}
+}
+@misc{flagattention,
+  title={FlagAttention},
+  author={FlagOpen},
+  year={2023},
+  publisher={GitHub},
+  url={https://github.com/FlagOpen/FlagAttention}
+}
+@misc{nvidiapex,
+  title={Apex},
+  author={NVIDIA},
+  year={2018},
+  publisher={GitHub},
+  url={https://github.com/NVIDIA/apex},
+}
+@misc{jansen2022perplexed,
+  title={Perplexed by Quality: A Perplexity-based Method for Adult and Harmful Content Detection in Multilingual Heterogeneous Web Data},
+  author={Tim Jansen and Yangling Tong and Victoria Zevallos and Pedro Ortiz Suarez},
+  year={2022},
+  url={https://arxiv.org/abs/2212.10440},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{micikevicius2018mixed,
+  title={Mixed Precision Training},
+  author={Paulius Micikevicius and Sharan Narang and Jonah Alben and Gregory Diamos and Erich Elsen and David Garcia and Boris Ginsburg and Michael Houston and Oleksii Kuchaiev and Ganesh Venkatesh and Hao Wu},
+  year={2018},
+  url={https://arxiv.org/abs/1710.03740},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI}
+}
+@misc{you2020large,
+  title={Large Batch Optimization for Deep Learning: Training BERT in 76 minutes},
+  author={Yang You and Jing Li and Sashank Reddi and Jonathan Hseu and Sanjiv Kumar and Srinadh Bhojanapalli and Xiaodan Song and James Demmel and Kurt Keutzer and Cho-Jui Hsieh},
+  year={2020},
+  url={https://arxiv.org/abs/1904.00962},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{liu2024sophia,
+  title={Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training},
+  author={Hong Liu and Zhiyuan Li and David Hall and Percy Liang and Tengyu Ma},
+  year={2024},
+  url={https://arxiv.org/abs/2305.14342},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{rajbhandari2020zero,
+  title={ZeRO: Memory Optimizations Toward Training Trillion Parameter Models},
+  author={Samyam Rajbhandari and Jeff Rasley and Olatunji Ruwase and Yuxiong He},
+  year={2020},
+  url={https://arxiv.org/abs/1910.02054},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{debrébisson2016zloss,
+  title={The Z-loss: a shift and scale invariant classification loss belonging to the Spherical Family},
+  author={Alexandre de Brébisson and Pascal Vincent},
+  year={2016},
+  url={https://arxiv.org/abs/1604.08859},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{zhang2019root,
+  title={Root Mean Square Layer Normalization},
+  author={Biao Zhang and Rico Sennrich},
+  year={2019},
+  url={https://arxiv.org/abs/1910.07467},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{liu2021pay,
+  title={Pay Attention to MLPs},
+  author={Hanxiao Liu and Zihang Dai and David R. So and Quoc V. Le},
+  year={2021},
+  url={https://arxiv.org/abs/2105.08050},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{wolf2020huggingfaces,
+  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
+  year={2020},
+  url={https://arxiv.org/abs/1910.03771},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{liu2022enct5,
+  title={EncT5: A Framework for Fine-tuning T5 as Non-autoregressive Models},
+  author={Frederick Liu and Terry Huang and Shihang Lyu and Siamak Shakeri and Hongkun Yu and Jing Li},
+  year={2022},
+  url={https://arxiv.org/abs/2110.08426},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{shaw2018selfattention,
+  title={Self-Attention with Relative Position Representations},
+  author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
+  year={2018},
+  url={https://arxiv.org/abs/1803.02155},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{press2022train,
+  title={Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
+  author={Ofir Press and Noah A. Smith and Mike Lewis},
+  year={2022},
+  url={https://arxiv.org/abs/2108.12409},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{su2023roformer,
+  title={RoFormer: Enhanced Transformer with Rotary Position Embedding},
+  author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
+  year={2023},
+  url={https://arxiv.org/abs/2104.09864},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{li2024functional,
+  title={Functional Interpolation for Relative Positions Improves Long Context Transformers},
+  author={Shanda Li and Chong You and Guru Guruganesh and Joshua Ainslie and Santiago Ontanon and Manzil Zaheer and Sumit Sanghai and Yiming Yang and Sanjiv Kumar and Srinadh Bhojanapalli},
+  year={2024},
+  url={https://arxiv.org/abs/2310.04418},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{lacoste2019quantifying,
+  title={Quantifying the Carbon Emissions of Machine Learning},
+  author={Alexandre Lacoste and Alexandra Luccioni and Victor Schmidt and Thomas Dandres},
+  year={2019},
+  url={https://arxiv.org/abs/1910.09700},
+  archivePrefix={arXiv},
+  primaryClass={cs.CY}
+}
+@misc{devlin2019bert,
+  title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
+  author={Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
+  year={2019},
+  url={https://arxiv.org/abs/1810.04805},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{eddine2021barthez,
+  title={BARThez: a Skilled Pretrained French Sequence-to-Sequence Model},
+  author={Moussa Kamal Eddine and Antoine J. -P. Tixier and Michalis Vazirgiannis},
+  year={2021},
+  url={https://arxiv.org/abs/2010.12321},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{allocine,
+  title={French sentiment analysis with BERT},
+  author={Théophile Blard},
+  year={2020},
+  url={https://github.com/TheophileBlard/french-sentiment-analysis-with-bert},
+}
+@misc{delestre2022distilcamembert,
+  title={DistilCamemBERT: a distillation of the French model CamemBERT},
+  author={Cyrile Delestre and Abibatou Amar},
+  year={2022},
+  url={https://arxiv.org/abs/2205.11111},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{frenchQA2023,
+  author = {ALBAR, Boris and BEDU, Pierre and BOURDOIS, Loïck},
+  organization = {Centre Aquitain des Technologies de l'Information et Electroniques},
+  title = {frenchQA (Revision 6249cd5)},
+  year = 2023,
+  url = {https://huggingface.co/CATIE-AQ/frenchQA},
+  doi = {10.57967/hf/0862},
+  publisher = {Hugging Face}
+}
+@misc{frenchNER2024,
+  author = {BOURDOIS, Loïck},
+  organization = {Centre Aquitain des Technologies de l'Information et Electroniques},
+  title = {frenchNER_4entities (Revision f1e8fef)},
+  year = 2024,
+  url = {https://huggingface.co/datasets/CATIE-AQ/frenchNER_4entities},
+  doi = {10.57967/hf/1751},
+  publisher = {Hugging Face}
+}
+@inproceedings{Martin_2020,
+  title={CamemBERT: a Tasty French Language Model},
+  url={http://dx.doi.org/10.18653/v1/2020.acl-main.645},
+  DOI={10.18653/v1/2020.acl-main.645},
+  booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
+  publisher={Association for Computational Linguistics},
+  author={Martin, Louis and Muller, Benjamin and Ortiz Suárez, Pedro Javier and Dupont, Yoann and Romary, Laurent and de la Clergerie, Éric and Seddah, Djamé and Sagot, Benoît},
+  year={2020}
+}
+@misc{le2020flaubert,
+  title={FlauBERT: Unsupervised Language Model Pre-training for French},
+  author={Hang Le and Loïc Vial and Jibril Frej and Vincent Segonne and Maximin Coavoux and Benjamin Lecouteux and Alexandre Allauzen and Benoît Crabbé and Laurent Besacier and Didier Schwab},
+  year={2020},
+  url={https://arxiv.org/abs/1912.05372},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{chung2022scaling,
+  title={Scaling Instruction-Finetuned Language Models},
author={Hyung Won Chung and Le Hou and Shayne Longpre and Barret Zoph and Yi Tay and William Fedus and Yunxuan Li and Xuezhi Wang and Mostafa Dehghani and Siddhartha Brahma and Albert Webson and Shixiang Shane Gu and Zhuyun Dai and Mirac Suzgun and Xinyun Chen and Aakanksha Chowdhery and Alex Castro-Ros and Marie Pellat and Kevin Robinson and Dasha Valter and Sharan Narang and Gaurav Mishra and Adams Yu and Vincent Zhao and Yanping Huang and Andrew Dai and Hongkun Yu and Slav Petrov and Ed H. Chi and Jeff Dean and Jacob Devlin and Adam Roberts and Denny Zhou and Quoc V. Le and Jason Wei},
+  year={2022},
+  url={https://arxiv.org/abs/2210.11416},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{guo2022longt5,
+  title={LongT5: Efficient Text-To-Text Transformer for Long Sequences},
+  author={Mandy Guo and Joshua Ainslie and David Uthus and Santiago Ontanon and Jianmo Ni and Yun-Hsuan Sung and Yinfei Yang},
+  year={2022},
+  url={https://arxiv.org/abs/2112.07916},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{arora2024simple,
+  title={Simple linear attention language models balance the recall-throughput tradeoff},
+  author={Simran Arora and Sabri Eyuboglu and Michael Zhang and Aman Timalsina and Silas Alberti and Dylan Zinsley and James Zou and Atri Rudra and Christopher Ré},
+  year={2024},
+  url={https://arxiv.org/abs/2402.18668},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{zhang2024hedgehog,
+  title={The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax Mimicry},
+  author={Michael Zhang and Kush Bhatia and Hermann Kumbong and Christopher Ré},
+  year={2024},
+  url={https://arxiv.org/abs/2402.04347},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{thunderkittens,
+  title={ThunderKittens},
+  author={HazyResearch},
+  year={2024},
+  publisher={GitHub},
+  url={https://github.com/HazyResearch/ThunderKittens},
+}
+@misc{flashdecoding,
+  title={Flash-Decoding for long-context inference},
+  author={Tri Dao and Daniel Haziza and Francisco Massa and Grigory Sizov},
+  year={2023},
+  url={https://crfm.stanford.edu/2023/10/12/flashdecoding.html},
+}
+@inproceedings{huggingface:dataset:stsb_multi_mt,
+  title = {Machine translated multilingual STS benchmark dataset.},
+  author={Philip May},
+  year={2021},
+  url={https://github.com/PhilipMay/stsb-multi-mt}
+}
+@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
+  title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
+  author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini},
+  year={2024},
+  eprint={2405.20468},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2405.20468},
+}
+@misc{dubey2024llama3herdmodels,
+  title={The Llama 3 Herd of Models},
author={Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Amy Yang and Angela Fan and Anirudh Goyal and Anthony Hartshorn and Aobo Yang and Archi Mitra and Archie Sravankumar and Artem Korenev and Arthur Hinsvark and Arun Rao and Aston Zhang and Aurelien Rodriguez and Austen Gregerson and Ava Spataru and Baptiste Roziere and Bethany Biron and Binh Tang and Bobbie Chern and Charlotte Caucheteux and Chaya Nayak and Chloe Bi and Chris Marra and Chris McConnell and Christian Keller and Christophe Touret and Chunyang Wu and Corinne Wong and Cristian Canton Ferrer and Cyrus Nikolaidis and Damien Allonsius and Daniel Song and Danielle Pintz and Danny Livshits and David Esiobu and Dhruv Choudhary and Dhruv Mahajan and Diego Garcia-Olano and Diego Perino and Dieuwke Hupkes and Egor Lakomkin and Ehab AlBadawy and Elina Lobanova and Emily Dinan and Eric Michael Smith and Filip Radenovic and Frank Zhang and Gabriel Synnaeve and Gabrielle Lee and Georgia Lewis Anderson and Graeme Nail and Gregoire Mialon and Guan Pang and Guillem Cucurell and Hailey Nguyen and Hannah Korevaar and Hu Xu and Hugo Touvron and Iliyan Zarov and Imanol Arrieta Ibarra and Isabel Kloumann and Ishan Misra and Ivan Evtimov and Jade Copet and Jaewon Lee and Jan Geffert and Jana Vranes and Jason Park and Jay Mahadeokar and Jeet Shah and Jelmer van der Linde and Jennifer Billock and Jenny Hong and Jenya Lee and Jeremy Fu and Jianfeng Chi and Jianyu Huang and Jiawen Liu and Jie Wang and Jiecao Yu and Joanna Bitton and Joe Spisak and Jongsoo Park and Joseph Rocca and Joshua Johnstun and Joshua Saxe and Junteng Jia and Kalyan Vasuden Alwala and Kartikeya Upasani and Kate Plawiak and Ke Li and Kenneth Heafield and Kevin Stone and Khalid El-Arini and Krithika Iyer and Kshitiz Malik and Kuenley Chiu and Kunal Bhalla and Lauren Rantala-Yeary and Laurens van der Maaten and Lawrence Chen and Liang Tan and Liz Jenkins and Louis Martin and Lovish Madaan and Lubo Malo and Lukas Blecher and Lukas Landzaat and Luke de Oliveira and Madeline Muzzi and Mahesh Pasupuleti and Mannat Singh and Manohar Paluri and Marcin Kardas and Mathew Oldham and Mathieu Rita and Maya Pavlova and Melanie Kambadur and Mike Lewis and Min Si and Mitesh Kumar Singh and Mona Hassan and Naman Goyal and Narjes Torabi and Nikolay Bashlykov and Nikolay Bogoychev and Niladri Chatterji and Olivier Duchenne and Onur Çelebi and Patrick Alrassy and Pengchuan Zhang and Pengwei Li and Petar Vasic and Peter Weng and Prajjwal Bhargava and Pratik Dubal and Praveen Krishnan and Punit Singh Koura and Puxin Xu and Qing He and Qingxiao Dong and Ragavan Srinivasan and Raj Ganapathy and Ramon Calderer and Ricardo Silveira Cabral and Robert Stojnic and Roberta Raileanu and Rohit Girdhar and Rohit Patel and Romain Sauvestre and Ronnie Polidoro and Roshan Sumbaly and Ross Taylor and Ruan Silva and Rui Hou and Rui Wang and Saghar Hosseini and Sahana Chennabasappa and Sanjay Singh and Sean Bell and Seohyun Sonia Kim and Sergey Edunov and Shaoliang Nie and Sharan Narang and Sharath Raparthy and Sheng Shen and Shengye Wan and Shruti Bhosale and Shun Zhang and Simon Vandenhende and Soumya Batra and Spencer Whitman and Sten Sootla and Stephane Collot and Suchin Gururangan and Sydney Borodinsky and Tamar Herman and Tara Fowler and Tarek Sheasha and Thomas Georgiou and Thomas Scialom and Tobias Speckbacher and Todor Mihaylov and Tong Xiao and Ujjwal Karn and Vedanuj Goswami and Vibhor Gupta and Vignesh 
Ramanathan and Viktor Kerkez and Vincent Gonguet and Virginie Do and Vish Vogeti and Vladan Petrovic and Weiwei Chu and Wenhan Xiong and Wenyin Fu and Whitney Meers and Xavier Martinet and Xiaodong Wang and Xiaoqing Ellen Tan and Xinfeng Xie and Xuchao Jia and Xuewei Wang and Yaelle Goldschlag and Yashesh Gaur and Yasmine Babaei and Yi Wen and Yiwen Song and Yuchen Zhang and Yue Li and Yuning Mao and Zacharie Delpierre Coudert and Zheng Yan and Zhengxing Chen and Zoe Papakipos and Aaditya Singh and Aaron Grattafiori and Abha Jain and Adam Kelsey and Adam Shajnfeld and Adithya Gangidi and Adolfo Victoria and Ahuva Goldstand and Ajay Menon and Ajay Sharma and Alex Boesenberg and Alex Vaughan and Alexei Baevski and Allie Feinstein and Amanda Kallet and Amit Sangani and Anam Yunus and Andrei Lupu and Andres Alvarado and Andrew Caples and Andrew Gu and Andrew Ho and Andrew Poulton and Andrew Ryan and Ankit Ramchandani and Annie Franco and Aparajita Saraf and Arkabandhu Chowdhury and Ashley Gabriel and Ashwin Bharambe and Assaf Eisenman and Azadeh Yazdan and Beau James and Ben Maurer and Benjamin Leonhardi and Bernie Huang and Beth Loyd and Beto De Paola and Bhargavi Paranjape and Bing Liu and Bo Wu and Boyu Ni and Braden Hancock and Bram Wasti and Brandon Spence and Brani Stojkovic and Brian Gamido and Britt Montalvo and Carl Parker and Carly Burton and Catalina Mejia and Changhan Wang and Changkyu Kim and Chao Zhou and Chester Hu and Ching-Hsiang Chu and Chris Cai and Chris Tindal and Christoph Feichtenhofer and Damon Civin and Dana Beaty and Daniel Kreymer and Daniel Li and Danny Wyatt and David Adkins and David Xu and Davide Testuggine and Delia David and Devi Parikh and Diana Liskovich and Didem Foss and Dingkang Wang and Duc Le and Dustin Holland and Edward Dowling and Eissa Jamil and Elaine Montgomery and Eleonora Presani and Emily Hahn and Emily Wood and Erik Brinkman and Esteban Arcaute and Evan Dunbar and Evan Smothers and Fei Sun and Felix Kreuk and Feng Tian and Firat Ozgenel and Francesco Caggioni and Francisco Guzmán and Frank Kanayet and Frank Seide and Gabriela Medina Florez and Gabriella Schwarz and Gada Badeer and Georgia Swee and Gil Halpern and Govind Thattai and Grant Herman and Grigory Sizov and Guangyi and Zhang and Guna Lakshminarayanan and Hamid Shojanazeri and Han Zou and Hannah Wang and Hanwen Zha and Haroun Habeeb and Harrison Rudolph and Helen Suk and Henry Aspegren and Hunter Goldman and Ibrahim Damlaj and Igor Molybog and Igor Tufanov and Irina-Elena Veliche and Itai Gat and Jake Weissman and James Geboski and James Kohli and Japhet Asher and Jean-Baptiste Gaya and Jeff Marcus and Jeff Tang and Jennifer Chan and Jenny Zhen and Jeremy Reizenstein and Jeremy Teboul and Jessica Zhong and Jian Jin and Jingyi Yang and Joe Cummings and Jon Carvill and Jon Shepard and Jonathan McPhie and Jonathan Torres and Josh Ginsburg and Junjie Wang and Kai Wu and Kam Hou U and Karan Saxena and Karthik Prasad and Kartikay Khandelwal and Katayoun Zand and Kathy Matosich and Kaushik Veeraraghavan and Kelly Michelena and Keqian Li and Kun Huang and Kunal Chawla and Kushal Lakhotia and Kyle Huang and Lailin Chen and Lakshya Garg and Lavender A and Leandro Silva and Lee Bell and Lei Zhang and Liangpeng Guo and Licheng Yu and Liron Moshkovich and Luca Wehrstedt and Madian Khabsa and Manav Avalani and Manish Bhatt and Maria Tsimpoukelli and Martynas Mankus and Matan Hasson and Matthew Lennie and Matthias Reso and Maxim Groshev and Maxim Naumov and Maya Lathi and Meghan Keneally and Michael 
L. Seltzer and Michal Valko and Michelle Restrepo and Mihir Patel and Mik Vyatskov and Mikayel Samvelyan and Mike Clark and Mike Macey and Mike Wang and Miquel Jubert Hermoso and Mo Metanat and Mohammad Rastegari and Munish Bansal and Nandhini Santhanam and Natascha Parks and Natasha White and Navyata Bawa and Nayan Singhal and Nick Egebo and Nicolas Usunier and Nikolay Pavlovich Laptev and Ning Dong and Ning Zhang and Norman Cheng and Oleg Chernoguz and Olivia Hart and Omkar Salpekar and Ozlem Kalinli and Parkin Kent and Parth Parekh and Paul Saab and Pavan Balaji and Pedro Rittner and Philip Bontrager and Pierre Roux and Piotr Dollar and Polina Zvyagina and Prashant Ratanchandani and Pritish Yuvraj and Qian Liang and Rachad Alao and Rachel Rodriguez and Rafi Ayub and Raghotham Murthy and Raghu Nayani and Rahul Mitra and Raymond Li and Rebekkah Hogan and Robin Battey and Rocky Wang and Rohan Maheswari and Russ Howes and Ruty Rinott and Sai Jayesh Bondu and Samyak Datta and Sara Chugh and Sara Hunt and Sargun Dhillon and Sasha Sidorov and Satadru Pan and Saurabh Verma and Seiji Yamamoto and Sharadh Ramaswamy and Shaun Lindsay and Shaun Lindsay and Sheng Feng and Shenghao Lin and Shengxin Cindy Zha and Shiva Shankar and Shuqiang Zhang and Shuqiang Zhang and Sinong Wang and Sneha Agarwal and Soji Sajuyigbe and Soumith Chintala and Stephanie Max and Stephen Chen and Steve Kehoe and Steve Satterfield and Sudarshan Govindaprasad and Sumit Gupta and Sungmin Cho and Sunny Virk and Suraj Subramanian and Sy Choudhury and Sydney Goldman and Tal Remez and Tamar Glaser and Tamara Best and Thilo Kohler and Thomas Robinson and Tianhe Li and Tianjun Zhang and Tim Matthews and Timothy Chou and Tzook Shaked and Varun Vontimitta and Victoria Ajayi and Victoria Montanez and Vijai Mohan and Vinay Satish Kumar and Vishal Mangla and Vítor Albiero and Vlad Ionescu and Vlad Poenaru and Vlad Tiberiu Mihailescu and Vladimir Ivanov and Wei Li and Wenchen Wang and Wenwen Jiang and Wes Bouaziz and Will Constable and Xiaocheng Tang and Xiaofang Wang and Xiaojian Wu and Xiaolan Wang and Xide Xia and Xilun Wu and Xinbo Gao and Yanjun Chen and Ye Hu and Ye Jia and Ye Qi and Yenda Li and Yilin Zhang and Ying Zhang and Yossi Adi and Youngjin Nam and Yu and Wang and Yuchen Hao and Yundi Qian and Yuzi He and Zach Rait and Zachary DeVito and Zef Rosnbrick and Zhaoduo Wen and Zhenyu Yang and Zhiwei Zhao},
+  year={2024},
+  eprint={2407.21783},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2407.21783},
+}
+@misc{accelerate,
+  title = {Accelerate: Training and inference at scale made simple, efficient and adaptable.},
+  author = {Sylvain Gugger and Lysandre Debut and Thomas Wolf and Philipp Schmid and Zachary Mueller and Sourab Mangrulkar and Marc Sun and Benjamin Bossan},
+  howpublished = {\url{https://github.com/huggingface/accelerate}},
+  year = {2022}
+}
+@misc{wang2022languagemodelarchitecturepretraining,
+  title={What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?},
+  author={Thomas Wang and Adam Roberts and Daniel Hesslow and Teven Le Scao and Hyung Won Chung and Iz Beltagy and Julien Launay and Colin Raffel},
+  year={2022},
+  eprint={2204.05832},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2204.05832},
+}
+@misc{kaddour2023minipilechallengedataefficientlanguage,
+  title={The MiniPile Challenge for Data-Efficient Language Models},
+  author={Jean Kaddour},
+  year={2023},
+  eprint={2304.08442},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2304.08442},
+}
+@article{Kocetkov2022TheStack,
+  title={The Stack: 3 TB of permissively licensed source code},
+  author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou, Chenghao and Muñoz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm},
+  journal={Preprint},
+  year={2022}
+}
+@misc{sennrich2016neuralmachinetranslationrare,
+  title={Neural Machine Translation of Rare Words with Subword Units},
+  author={Rico Sennrich and Barry Haddow and Alexandra Birch},
+  year={2016},
+  eprint={1508.07909},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1508.07909},
+}
+@online{wikidump,
+  author = {Wikimedia Foundation},
+  title = {Wikimedia Downloads},
+  url = {https://dumps.wikimedia.org}
+}
+@misc{justice_fr,
+  author = {ECKENDOERFFER, Guillaume},
+  title = {justice_fr},
+  year = 2023,
+  url = {https://huggingface.co/datasets/eckendoerffer/justice_fr},
+  publisher = {Hugging Face}
+}
+@software{unsloth,
+  author = {Daniel Han and Michael Han and Unsloth team},
+  title = {Unsloth},
+  url = {http://github.com/unslothai/unsloth},
+  year = {2023}
+}
+@misc{antoun2024camembert20smarterfrench,
+  title={CamemBERT 2.0: A Smarter French Language Model Aged to Perfection},
+  author={Wissam Antoun and Francis Kulumba and Rian Touchent and Éric de la Clergerie and Benoît Sagot and Djamé Seddah},
+  year={2024},
+  eprint={2411.08868},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2411.08868},
+}
+@article{wijmans2024cut,
+  author = {Erik Wijmans and Brody Huval and Alexander Hertzberg and Vladlen Koltun and Philipp Kr\"ahenb\"uhl},
+  title = {Cut Your Losses in Large-Vocabulary Language Models},
+  journal = {arXiv},
+  year = {2024},
+  url = {https://arxiv.org/abs/2411.09009},
+}
+@misc{rafailov2024directpreferenceoptimizationlanguage,
+  title={Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
+  author={Rafael Rafailov and Archit Sharma and Eric Mitchell and Stefano Ermon and Christopher D. Manning and Chelsea Finn},
+  year={2024},
+  eprint={2305.18290},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2305.18290},
+}
+@misc{muennighoff2023crosslingualgeneralizationmultitaskfinetuning,
+  title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
+  year={2023},
+  eprint={2211.01786},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2211.01786},
+}
+@misc{centre_aquitain_des_technologies_de_l'information_et_electroniques_2023,
+  author = {{BOURDOIS, Loïck}},
+  organization = {{Centre Aquitain des Technologies de l'Information et Electroniques}},
+  title = {Dataset of French Prompts (DFP) (Revision 1d24c09)},
+  year = 2023,
+  url = {https://huggingface.co/datasets/CATIE-AQ/DFP},
+  doi = {10.57967/hf/1200},
+  publisher = {Hugging Face}
+}
+@misc{zhang2024lolcatslowranklinearizinglarge,
+  title={LoLCATs: On Low-Rank Linearizing of Large Language Models},
+  author={Michael Zhang and Simran Arora and Rahul Chalamala and Alan Wu and Benjamin Spector and Aaryan Singhal and Krithik Ramesh and Christopher Ré},
+  year={2024},
+  eprint={2410.10254},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2410.10254},
+}
+@misc{hu2021loralowrankadaptationlarge,
+  title={LoRA: Low-Rank Adaptation of Large Language Models},
+  author={Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Lu Wang and Weizhu Chen},
+  year={2021},
+  eprint={2106.09685},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2106.09685},
+}
+@misc{zhang2024sageattention2efficientattentionthorough,
+  title={SageAttention2: Efficient Attention with Thorough Outlier Smoothing and Per-thread INT4 Quantization},
+  author={Jintao Zhang and Haofeng Huang and Pengle Zhang and Jia Wei and Jun Zhu and Jianfei Chen},
+  year={2024},
+  eprint={2411.10958},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2411.10958},
+}
+@software{penedo2024fineweb-2,
+  author = {Penedo, Guilherme and Kydlíček, Hynek and Sabolčec, Vinko and Messmer, Bettina and Foroutan, Negar and Jaggi, Martin and von Werra, Leandro and Wolf, Thomas},
+  title = {FineWeb2: A sparkling update with 1000s of languages},
+  month = dec,
+  year = 2024,
+  url = {https://huggingface.co/datasets/HuggingFaceFW/fineweb-2}
+}
+@misc{lewis2019bartdenoisingsequencetosequencepretraining,
+  title={BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
+  author={Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Ves Stoyanov and Luke Zettlemoyer},
+  year={2019},
+  eprint={1910.13461},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1910.13461},
+}
+@misc{gloeckle2024betterfasterlarge,
+  title={Better & Faster Large Language Models via Multi-token Prediction},
+  author={Fabian Gloeckle and Badr Youbi Idrissi and Baptiste Rozière and David Lopez-Paz and Gabriel Synnaeve},
+  year={2024},
+  eprint={2404.19737},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2404.19737},
+}
+@misc{he2023debertav3improvingdebertausing,
+  title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
+  author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
+  year={2023},
+  eprint={2111.09543},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2111.09543},
+}
+@misc{liu2019robertarobustlyoptimizedbert,
+  title={RoBERTa: A Robustly Optimized BERT Pretraining Approach},
+  author={Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
+  year={2019},
+  eprint={1907.11692},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1907.11692},
+}
dist/distill.bundle.js
ADDED
The diff for this file is too large to render.
dist/distill.bundle.js.map
ADDED
The diff for this file is too large to render.
dist/index.html
ADDED
@@ -0,0 +1,1380 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
+  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <meta charset="utf8">
+  <base target="_blank">
+  <title>FAT5 : Flash Attention T5</title>
+  <link rel="stylesheet" href="style.css">
+</head>
+
+<body>
+<d-front-matter>
+  <script id='distill-front-matter' type="text/json">{
+    "title": "FAT5 : Flash Attention T5",
+    "description": "",
+    "published": "May 28, 2024",
+    "authors": [
+      {
+        "author":"Boris ALBAR",
+        "authorURL":"https://github.com/b-albar",
+        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
+      },
+      {
+        "author":"Loïck BOURDOIS",
+        "authorURL":"https://github.com/lbourdois",
+        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
+      }
+    ],
+    "color": "#9CA3AF",
+    "katex": {
+      "delimiters": [
+        {"left": "$$", "right": "$$", "display": false}
+      ]
+    }
+  }</script>
+</d-front-matter>
+<d-title>
+  <h1 class="l-page" style="text-align: center;">FAT5 : Flash Attention T5</h1>
+  <p><img src="./assets/FAT5_dark.gif" alt="FAT5" width="100%"></p>
+</d-title>
+
+<d-article>
+  <d-contents>
+  </d-contents>
+
+  <div class="note"> For a better experience, we do not recommend reading on a cell phone. </div>
+
+  <h2 id="motivation">Motivation</h2>
+  <p class="width_125">
+  While much effort has been devoted to optimising decoder transformers, thus abandoning the encoder, we believe it is essential to maintain an encoder-decoder architecture.<br>
+
+  Indeed, this architecture, which offers interesting performance for instruction tuning <d-cite bibtex-key="chia2023instructeval"></d-cite>, is suitable for distillation <d-cite bibtex-key="hsieh2023distilling"></d-cite> and seems superior to decoder models when finetuned <d-cite bibtex-key="fu2024tiny"></d-cite>.
+  It has also been shown that encoder-decoder models trained with masked language modelling achieve better zero-shot performance after multitasking finetuning compared with a decoder model <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite>.<br>
+  Beyond NLP, which is the focus of this blog post, encoder-decoder architecture is widely used in other fields such as audio or time series, for example. Note also that the encoder of such architecture is also used in some diffusion models.<br>
+
+  That's why we've decided to focus on the T5 <d-cite bibtex-key="JMLR:v21:20-074"></d-cite>.<br><br>
+
+  This article presents the optimisations we have implemented to efficiently pre-train a T5 with 147M French parameters in a reasonable time (1,461 H for 419B tokens) and with limited resources (1 single A100; i.e. a computing budget of around 2,200 euros).
+  To achieve this, we designed CUDA/Triton kernels to make Flash Attention compatible with T5 and provide linear inference, thus extending the context size that can be taken into account by the model.<br><br>
+  <strong>The pre-training code is available in our <a class="link" href="https://github.com/catie-aq/flashT5">GitHub repository</a> under Apache-2.0 license and weights on our <a class="link" href="https://hf.co/CATIE-AQ">Hugging Face</a> account.</strong>
+  <p class="width_125"><br><br><br></p>
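+  <p class="width_125">For scale, the figures above imply a pre-training throughput of roughly $$419 \times 10^9 \div (1{,}461 \times 3{,}600) \approx 80{,}000$$ tokens per second on the single A100.</p>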
+
+
+  <h2 id="vue-d-ensemble-de-notre-travail">Overview of our work</h2>
+  <p class="width_125">We therefore chose to work with a T5 and in practice with the nanoT5 <d-cite bibtex-key="nawrot2023nanot5"></d-cite>.<br>
+  For pretext tasks during pre-training, we followed the UL2 ones <d-cite bibtex-key='tay2023ul2'></d-cite> with the following 7 tasks:</p>
<pre><code class="lang-py">
|
71 |
+
denoiser_list=[
|
72 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
|
73 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
|
74 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">4.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">0</span>, <span class="hljs-string">"max_spans"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[S]"</span>},
|
75 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
76 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
77 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
78 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>}]
|
79 |
+
denoiser_proportions=[<span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">34</span>, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825]
|
80 |
+
</code></pre>
|
81 |
+
<p class="width_125">with <code>mu</code> the n-gram size, <code>r</code> the masking percentage in the n-gram and <code>prefix</code> the type of pretext task.
|
82 |
+
The meaning of letters <code>[R]</code>, <code>[S]</code> and <code>[X]</code> is described <a class="link" href="https://huggingface.co/google/ul2#mixture-of-denoisers">here</a>
|
83 |
+
and we invite you to take a look at the following <a class="link" href="https://raw.githubusercontent.com/google-research/google-research/master/ul2/figs/mod.png">image</a> in particular.</p>
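<p class="width_125">To make this more concrete, here is a minimal, purely illustrative sketch (not our actual <code>DataCollator</code>, and with a deliberately simplified span placement) of how a single denoiser turns a token sequence into an input/target pair with sentinel tokens:</p>
<pre><code class="lang-py">
import numpy as np

def corrupt(tokens, mu=3.0, r=0.15, prefix="[R]"):
    """Mask roughly a fraction r of the tokens in spans of average length mu (T5-style sentinels)."""
    n_spans = max(1, int(round(len(tokens) * r / mu)))
    span_len = max(1, int(round(mu)))
    # evenly spaced span starts, purely for readability of the example
    starts = np.linspace(0, len(tokens) - span_len, n_spans, dtype=int)
    inp, tgt, i = [prefix], [], 0
    for k, s in enumerate(starts):
        s = max(int(s), i)  # keep spans disjoint
        inp += tokens[i:s] + [f"&lt;extra_id_{k}&gt;"]
        tgt += [f"&lt;extra_id_{k}&gt;"] + tokens[s:s + span_len]
        i = s + span_len
    inp += tokens[i:]
    return inp, tgt

print(corrupt("le chat dort sur le canapé du salon depuis ce matin".split()))
</code></pre>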
|
84 |
+
<p class="width_125">For a quick training, we decided to focus on the Flash Attention <d-cite bibtex-key="dao2022flashattention"></d-cite>.
|
85 |
+
However, as it does not handle the (additive) attention biases of the T5, we had to extend it by developing a custom kernel.
|
86 |
+
More specifically, we successively developed two versions of this kernel.
|
87 |
+
In the first version, at the start of our work, we passed the full bias matrix to the kernel.
|
88 |
+
In the current version, inspired by TurboT5 <d-cite bibtex-key='turbot5'></d-cite>, we only pass a tensor containing the merged biases, from which the bias matrix is materialised on the fly.
|
89 |
+
This makes it possible to switch from a T5 with quadratic memory to a T5 with linear memory, and consequently greatly increases the size of context that the model can support.</p>
|
90 |
+
|
91 |
+
<p class="width_125">Our work resulted in the pre-training of a T5 in French with 147M parameters: the FAT5 <i>small</i>.<br>
|
92 |
+
The dataset we used is made up of the French part of the CulturaX corpus <d-cite bibtex-key='nguyen2023culturax'></d-cite> (the main source with over 1258 GB of text),
|
93 |
+
the French part of Wikipedia <d-cite bibtex-key="wikidump"></d-cite> (dump 20231101),
|
94 |
+
justice_fr (French legal texts) <d-cite bibtex-key="justice_fr"></d-cite>
|
95 |
+
and 25,000,000 lines from TheStack <d-cite bibtex-key="Kocetkov2022TheStack"></d-cite>
|
96 |
+
(the idea here is to show our model a bit of code, although this is not our main objective).<br>
|
97 |
+
|
98 |
+
This model was evaluated on five tasks: text summarization, binary classification, question answering, named entity recognition and sentence similarity.</p>
|
99 |
+
<p class="width_125"><br><br><br></p>
|
100 |
+
|
101 |
+
<h2 id="les-d-tails-de-la-recette">Recipe details</h2>
|
102 |
+
<p class="width_125">With only two A100 (one 80GB and one 40GB), we had to spend some time implementing optimisations to get the best out of our hardware.
|
103 |
+
Indeed, before even training a model, or even modifying its architecture, we need to ensure that we are optimising the use of our GPUs' computing capacity.
|
104 |
+
There are several factors that can explain sub-optimal training of a deep learning model:<br>
|
105 |
+
• Disk-bounded<br>
|
106 |
+
• Memory-bounded<br>
|
107 |
+
• Compute-bounded</p>
|
108 |
+
|
109 |
+
<p class="width_125">Ideally, we would like the model to be limited by the speed of calculation, i.e. the GPU to be used at full capacity.
|
110 |
+
With this in mind, we worked on three main points: <br>
|
111 |
+
• GPU disk optimisation <br>
|
112 |
+
• GPU memory bandwidth optimisation <br>
|
113 |
+
• Optimisation of the use of Tensor Cores<br>
|
114 |
+
</p>
|
115 |
+
|
116 |
+
<p class="width_125">So it's a combination of hardware and software issues.</p>
|
117 |
+
<p></p>
|
118 |
+
<p class="width_125">In the rest of this section, everything we have done/implemented to address the limitations encountered is available in a green box. Notes/comments can be found in a blue box.
|
119 |
+
<br><br></p>
|
120 |
+
<h3 id="optimisation-du-disque-du-gpu">GPU disk optimisation</h3>
|
121 |
+
<p class="width_125">Disk limitation occurs either during data loading or during pre-processing operations.
|
122 |
+
In both cases, the problem manifests itself as slowness.
|
123 |
+
<br></p>
|
124 |
+
<h4 id="acc-s-disques">Disk access</h4>
|
125 |
+
<p class="width_125">If the limitation comes from disk access, there are several possible solutions:</p>
|
126 |
+
<ul>
|
127 |
+
<li><p class="width_125"><u>Put data in RAM</u><br>
|
128 |
+
This solves the problem radically, but assumes that the database fits into RAM, which is far from obvious given its small size.</p>
|
129 |
+
<div class="tip"><p>So this is not the solution we have chosen.</p></div>
|
130 |
+
</li>
|
131 |
+
<li><p class="width_125"><u>Put data on a faster and/or less-used disk</u><br>
|
132 |
+
If you have physical access to your GPU server, it is very useful to integrate <a class="link" href="https://fr.wikipedia.org/wiki/NVM_Express">NVMe</a> in its configuration.</p>
|
133 |
+
<p class="width_125">You also need to be careful not to have too many processes from different training pulling on the same disc.
|
134 |
+
It is therefore preferable to have several small discs rather than one large one.</p>
|
135 |
+
<div class="note"><p>A beneficial indirect effect is that such a configuration costs less 😉</p></div>
|
136 |
+
</li>
|
137 |
+
</ul>
|
138 |
+
<ul>
|
139 |
+
<li class="width_125"><u>Use more efficient file formats, particularly in terms of random accesses</u><br>
|
140 |
+
For example <code>.parquet</code> files are more efficient than <code>.csv</code>.
|
141 |
+
We can also use formats specifically developed for this purpose, such as the <code>.beton</code> from ffcv <d-cite bibtex-key="leclerc2023ffcv"></d-cite>.</li>
|
142 |
+
<div class="tip"><p>We use the Datasets library <d-cite bibtex-key="lhoest2021datasets"></d-cite> to load and process the data we use.
|
143 |
+
With this library, the data is decompressed locally in the <code>Arrow</code> format.
|
144 |
+
Moreover, if the data loaded from the Hugging Face Hub has been added using the <code>push_to_hub()</code> function,
|
145 |
+
then the dataset is converted by default in <code>parquet</code>.</p></div>
|
146 |
+
</ul>
|
147 |
+
<ul>
|
148 |
+
<li class="width_125"><u>Pre-tokenise data</u><br>
|
149 |
+
The most effective option is probably to pre-tokenise the data in order to optimise access.
|
150 |
+
In other words, tokenisation takes place in a preliminary stage and not on the fly.
|
151 |
+
</li>
|
152 |
+
<div class="tip"><p>Readers are invited to consult the following
|
153 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/examples/minipile/pretokenize_minipile.py">code</a> which
|
154 |
+
illustrates how we proceed in our FAT5 tutorial applied to the Minipile dataset <d-cite bibtex-key="kaddour2023minipilechallengedataefficientlanguage"></d-cite>.</p></div>
|
155 |
+
</ul>
|
156 |
+
<p><br></p>
|
157 |
+
<h4 id="traitement-des-donn-es">Data processing</h4>
|
158 |
+
<p class="width_125">If the limitation comes from the processing of data after they have been uploaded:</p>
|
159 |
+
<ul>
|
160 |
+
<li><p class="width_125"><u>Several processes can be used to process data in parallel</u><br>
|
161 |
+
For example, the parameter <code>num_workers</code> of the <code>Dataloader</code> of PyTorch <d-cite bibtex-key="paszke2019pytorch"></d-cite>.</p></li>
|
162 |
+
<div class="tip"><p>You can find in our code the values we use for this parameter for our FAT5 small <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/configs/fr/fat5-fr-small.yaml#L42">small</a>.</div>
|
163 |
+
</ul>
|
164 |
+
<ul>
|
165 |
+
<li><p class="width_125"><u>The bottleneck can also come from the <code>DataCollator</code></u><br>
|
166 |
+
This is especially the case when there are complex tasks to perform (image masking or multiple denoisers on NLP tasks).<br>
|
167 |
+
We can then build a custom <code>DataCollator</code> for the task.
|
168 |
+
Traditional methods can then be applied to optimise its speed.
|
169 |
+
Similarly, using Numpy's vectorisation will allow lists to be processed more quickly than with <code>for</code> loops.
|
170 |
+
Generally speaking, Numpy is faster than PyTorch for this type of task.
|
171 |
+
You can also use compilation methods such as numba <d-cite bibtex-key="10.1145/2833157.2833162"></d-cite> for Python, for example.</p></li>
|
172 |
+
<div class="tip"><p>We followed this principle and developed a custom <code>DataCollator</code> for our FAT5.
|
173 |
+
You can find the code <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">here</a>.
|
174 |
+
It handles the UL2 pretext tasks and has a dynamic batch mechanism to reduce padding (more information in the next section).</p></div>
|
175 |
+
<div class="note"><p>As there was no implementation of UL2's <code>DataCollator</code> available in PyTorch until now,
|
176 |
+
we hope this may be useful for other work.</p></div>
|
177 |
+
</ul>
|
178 |
+
<ul>
|
179 |
+
<li><p class="width_125"><u>Effective padding</u><br>
|
180 |
+
<p class="width_125">When working with sequences, there is a natural tendency to pad a set of sequences in order to build batches.
|
181 |
+
The padding tokens then generate unnecessary calculations.<br>
|
182 |
+
The first thing to do is to limit padding to the maximum size sequence and not to a maximum value.
|
183 |
+
This is the <a class="link" href="https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt#dynamic-padding">dynamic padding</a> technique.<br>
|
184 |
+
With this approach, padding tokens may still remain. There are two ways of managing them:<br>
|
185 |
+
• use a method for grouping data of similar sizes
|
186 |
+
(for example, <a class="link" href="https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.group_by_length">this parameter</a>
|
187 |
+
in the Transformers library <d-cite bibtex-key="wolf2020huggingfaces"></d-cite> or
|
188 |
+
<a class="link" href="https://discuss.huggingface.co/t/how-to-implement-trainers-group-by-length-in-pytorch/9232">by retrieving this sampler</a> for PyTorch)<br>
|
189 |
+
• concatenate different examples in a custom DataCollator.</p>
|
190 |
+
<div class="tip"><p>We have opted for the second option and refer the reader back to the
|
191 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">code</a> our DataCollator.</p></div>
|
192 |
+
<div class="note"><p>More optimised heuristics probably need to be put in place.
|
193 |
+
We carried out a test by proposing a
|
194 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/data/data_collator_ul2.py#L45">function</a>
|
195 |
+
in the <code>DataCollator</code> to sort <code>input_ids</code> and <code>labels</code> by descending length.
|
196 |
+
However, this is rather time-consuming for a minimal packing gain.
|
197 |
+
More work needs to be done on this point.
|
198 |
+
</p></div>
|
199 |
+
</ul>
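<p class="width_125">As a rough illustration of the concatenation idea announced above (a hypothetical helper, not our actual implementation, which additionally handles the UL2 denoisers and the labels):</p>
<pre><code class="lang-py">
def pack_examples(examples, max_len, pad_id=0):
    """Greedily concatenate tokenised examples so that each packed row stays close to max_len."""
    rows, current = [], []
    for ex in examples:              # ex: list of token ids
        if current and len(current) + len(ex) > max_len:
            rows.append(current)
            current = []
        current += ex[:max_len]
    if current:
        rows.append(current)
    # dynamic padding: pad to the longest packed row of the batch, not to a fixed value
    width = max(len(r) for r in rows)
    return [r + [pad_id] * (width - len(r)) for r in rows]
</code></pre>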
|
200 |
+
<p class="width_125"><br><br></p>
|
201 |
+
|
202 |
+
|
203 |
+
<h3 id="optimisation-de-la-bande-passante-de-la-m-moire-du-gpu">GPU memory bandwidth optimisation</h3>
|
204 |
+
<p class="width_125">Memory bandwidth limitation is more difficult to deal with.
|
205 |
+
A memory-limited operation is one whose overall execution time is restricted by memory accesses.
|
206 |
+
This is particularly the case for LLMs, especially at the inference level.
|
207 |
+
The diagnosis can be made from the <a class="link" href="https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html">PyTorch profiler</a>:</p>
|
208 |
+
<figure class="width_125">
|
209 |
+
<img src="https://pytorch.org/tutorials/_static/img/profiler_overview1.png" alt="profiler_overview1.png" width="100%">
|
210 |
+
<figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_overview1.png">https://pytorch.org/tutorials/_static/img/profiler_overview1.png</a></i></center></figcaption>
|
211 |
+
</figure>
|
212 |
+
<br><br><br>
|
213 |
+
<p class="width_125">Another way of establishing a diagnosis is to use a simple <code>nvidia-smi</code>:</p>
|
214 |
+
<figure class="width_125">
|
215 |
+
<img src="./assets/nvidiasmi.png" alt="nvidiasmi.png" width="100%">
|
216 |
+
</figure>
|
217 |
+
<br>
|
218 |
+
<p class="width_125">Useful for finding out if a problem is present, but gives limited information about the nature of the problem.
|
219 |
+
That's why we prefer the profiler.</p>
|
220 |
+
<p><br></p>
|
221 |
+
<h4 id="noyau-cuda">CUDA kernel</h4>
|
222 |
+
<p class="width_125">The main technique for optimising GPU memory bandwidth is to develop a CUDA kernel that merges several limiting operations into SRAM.
|
223 |
+
This avoids copying large matrices to the HBM only to immediately reload them into SRAM.
|
224 |
+
This is now a common feature of decoder transformers thanks to the <a class="link" href="https://github.com/Dao-AILab/flash-attention">Flash Attention</a>.</p>
|
225 |
+
<div class="tip"><p>
|
226 |
+
As Flash Attention does not handle the (additive) attention biases of the T5, we extended it by developing a custom CUDA kernel.
|
227 |
+
As mentioned in the introduction, we actually implemented two successive versions of this kernel.
|
228 |
+
Without going into the details of the 650 lines of code in the implementation of the first version (which can be consulted
|
229 |
+
<a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a>),
|
230 |
+
the general and simplified idea (for a forward pass) is as follows:</p>
|
231 |
+
<ul>
|
232 |
+
<li>The expected output O, initialised with 0's, is loaded from the HBM to the SRAM, as well as the query Q, the key K, the value V and the biases B.</li>
|
233 |
+
<li>Our CUDA kernel calculates the following steps:<br>
|
234 |
+
• Compute the matrix S using the matrix product of Q and the transpose of K<br>
|
235 |
+
• Compute S', which is the sum of the S matrix and the bias matrix B<br>
|
236 |
+
• Compute P, which is the softmax of S' (computed online, block by block, under the hood)<br>
|
237 |
+
• Compute the output O, which is the matrix product of P and V<br>
|
238 |
+
</li>
|
239 |
+
<li>Output O is loaded on the HBM and the SRAM is cleared.
|
240 |
+
<picture>
|
241 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/FAT5_dark.gif">
|
242 |
+
<img alt="FAT5 animation" src="./assets/FAT5.gif" width="100%">
|
243 |
+
</picture>
|
244 |
+
</ul>
|
245 |
+
<br>
|
246 |
+
<p>While the first version of the kernel is generic, the second (available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>)
|
247 |
+
is specific to the working of models with relative positional encoding (which is the case of the T5).
|
248 |
+
The general and simplified idea (for a forward pass) is as follows:</p>
|
249 |
+
<ul>
|
250 |
+
<li>In the HBM, we have the expected output O initialised with 0s, as well as the query Q, the key K and the value V.
|
251 |
+
However, instead of the bias matrix B as before, we have the bucket of tensors T.</li>
|
252 |
+
<li>O is loaded from the HBM to the SRAM, along with the query Q, the key K, the value V and the tensor bucket T.</li>
|
253 |
+
<li>Our CUDA kernel calculates the following steps:<br>
|
254 |
+
• Compute the matrix S using the matrix product of Q and the transpose of K<br>
|
255 |
+
• Compute S', which is the sum of the matrix S and a matrix filled with the elements of T<br>
|
256 |
+
• Compute P, which is the softmax of S' (computed online, block by block, under the hood)<br>
|
257 |
+
• Compute the output O, which is the matrix product of P and V<br>
|
258 |
+
</li>
|
259 |
+
<li>Output O is loaded on the HBM and the SRAM is cleared.
|
260 |
+
</ul>
|
261 |
+
<p>
|
262 |
+
In this way, whereas the first version required materialising the bias matrix B in quadratic memory,
|
263 |
+
here we are back to linear memory, enabling inference on tens of thousands of tokens.<br>
|
264 |
+
To design this second version, we were inspired by TurboT5's Triton kernel, which we ported to CUDA and extended to full BF16.
|
265 |
+
</p>
|
266 |
+
</div>
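<p class="width_125">For reference, the computation that these two kernels fuse corresponds to the following naive PyTorch code, which materialises the full attention and bias matrices in HBM; this is precisely what the fused kernels avoid:</p>
<pre><code class="lang-py">
import torch

def t5_attention_reference(q, k, v, bias):
    # q, k, v: (batch, heads, seq, head_dim); bias: broadcastable to (batch, heads, seq, seq)
    s = torch.matmul(q, k.transpose(-1, -2))                 # S = Q K^T (no 1/sqrt(d) scaling in T5)
    s_prime = s + bias                                       # S' = S + B (additive bias)
    p = torch.softmax(s_prime.float(), dim=-1).to(q.dtype)   # P = softmax(S')
    return torch.matmul(p, v)                                # O = P V
</code></pre>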
|
267 |
+
<br>
|
268 |
+
|
269 |
+
<div class="tip"><p>Note that the two versions developed can be used with several positional encodings.<br>
|
270 |
+
We invite the reader to consult this <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/positional_encoding.py">file</a>
|
271 |
+
containing classes compatible with Flash Attention for the
|
272 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L10">RelativePositionalEncoding</a>
|
273 |
+
<d-cite bibtex-key="shaw2018selfattention"></d-cite>,
|
274 |
+
the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L113">ALiBiPositionalEncoding</a>
|
275 |
+
<d-cite bibtex-key="press2022train"></d-cite>,
|
276 |
+
the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L205">RotaryPositionalEncoding</a>
|
277 |
+
<d-cite bibtex-key="su2023roformer"></d-cite> and
|
278 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L341">FIRE</a> <d-cite bibtex-key="li2024functional"></d-cite>.</p></div>
|
279 |
+
|
280 |
+
<div class="note"><p>At the time of writing, the two pull requests (one for each kernel version,
|
281 |
+
available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a>
|
282 |
+
and <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>)
|
283 |
+
opened on the official Flash Attention repository have not been merged.
|
284 |
+
Readers will therefore have to temporarily recompile our custom Flash Attention patches to be able to use our models.<br>
|
285 |
+
Readers are invited to consult the Benchmark section further below to see the improvements brought by these two kernels.</p></div>
|
286 |
+
<br>
|
287 |
+
<div class="note"><p>Although we didn't use them, it should be noted that some libraries contain merged implementations of common operators, for example Apex <d-cite bibtex-key="nvidiapex"></d-cite>.</p></div>
|
288 |
+
|
289 |
+
<p><br></p>
|
290 |
+
<h4 id="noyau-triton">Triton kernel</h4>
|
291 |
+
<p class="width_125">Triton <d-cite bibtex-key="10.1145/3315508.3329973"></d-cite> is a maintained programming language that allows Python code to be compiled efficiently, like CUDA, but with the advantage of being (from our point of view) easier to learn. Unlike CUDA, which requires an in-depth understanding of GPU hardware architecture, Triton ignores many low-level details such as memory coalescing, shared memory management and scheduling within CUDA thread blocks.</p>
|
292 |
+
|
293 |
+
<div class="tip"><p>A Triton implementation of the
|
294 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/flash_attention_v2_bias.py">Flash Attention 2 managing attention bias</a>
|
295 |
+
is provided for those who do not wish to recompile a custom patch for Flash Attention.
|
296 |
+
To do this, we based ourselves on the FlagAttention repository <d-cite bibtex-key="flagattention"></d-cite>.
|
297 |
+
<br>
|
298 |
+
<br>
|
299 |
+
In addition to this implementation (whose use is optional), other parts of the architecture have been optimised using ad hoc Triton kernels, namely:
|
300 |
+
<br>
|
301 |
+
• the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/cross_entropy_loss.py">cross entropy loss</a> (and the loss z <d-cite bibtex-key="debrébisson2016zloss"></d-cite>) <br>
|
302 |
+
• the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/rms_norm.py">RMSNorm layer</a> <d-cite bibtex-key="zhang2019root"></d-cite> <br>
|
303 |
+
<br>
|
304 |
+
We drew inspiration from <a class="link" href="https://github.com/unslothai/unsloth">Unsloth</a> <d-cite bibtex-key="unsloth"></d-cite>.<br>
|
305 |
+
<br>
|
306 |
+
Readers are invited to refer to the Benchmark section below to see the impact of this optimisation.</div>
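<p class="width_125">To give an idea of what such a kernel looks like, here is a minimal RMSNorm forward pass written in Triton. It is a simplified sketch in the spirit of our kernel (forward pass only, one program per row), not the exact code we use:</p>
<pre><code class="lang-py">
import torch
import triton
import triton.language as tl

@triton.jit
def rmsnorm_fwd(X, W, Y, stride, N, eps, BLOCK_SIZE: tl.constexpr):
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE)
    mask = cols &lt; N
    x = tl.load(X + row * stride + cols, mask=mask, other=0.0).to(tl.float32)
    ms = tl.sum(x * x, axis=0) / N              # mean of squares over the hidden dimension
    rstd = 1.0 / tl.sqrt(ms + eps)
    w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
    tl.store(Y + row * stride + cols, x * rstd * w, mask=mask)

def rmsnorm(x, weight, eps=1e-6):
    x2d = x.reshape(-1, x.shape[-1])
    y = torch.empty_like(x2d)
    M, N = x2d.shape
    rmsnorm_fwd[(M,)](x2d, weight, y, x2d.stride(0), N, eps,
                      BLOCK_SIZE=triton.next_power_of_2(N))
    return y.reshape_as(x)
</code></pre>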
|
307 |
+
|
308 |
+
<p><br></p>
|
309 |
+
<h4 id="utiliser-torch-compile-">Use <code>torch.compile</code></h4>
|
310 |
+
<p class="width_125">A simpler approach is to compile the models with <code>torch.compile</code>.
|
311 |
+
PyTorch then takes care of performing the possible fusions, possibly by reordering operations.
|
312 |
+
This involves hunting down breaks in the compilation graph, i.e. fallbacks to eager execution that hurt performance.</p>
|
313 |
+
<div class="note"><p>See the <a class="link" href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">official documentation</a> for more details.</p></div>
|
314 |
+
<p class="width_125">Another possibility is to use both a custom kernel and <code>torch.compile</code>.
|
315 |
+
The implementation of this option has been greatly simplified since the
|
316 |
+
<a class="link" href="https://github.com/pytorch/pytorch/releases/tag/v2.4.0">version 2.4 of PyTorch</a>.</p>
|
317 |
+
<p class="width_125">Readers are invited to refer to the benchmark section at the end of the article to measure the memory performance
|
318 |
+
performance of the various techniques described.</p>
|
319 |
+
<p class="width_125"><br><br></p>
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
<h3 id="optimisation-de-l-utilisation-des-tensor-cores">Optimisation of the use of Tensor Cores</h3>
|
324 |
+
<p class="width_125">Recent GPUs have units dedicated to tensorial operations: the TensorCore. Using them correctly is essential.</p>
|
325 |
+
<p class="width_125">Once again, to establish a diagnosis, it is advisable to refer to the PyTorch profiler, which indicates the proportion of TensorCore used for each CUDA kernel:</p>
|
326 |
+
<p><figure class="width_125">
|
327 |
+
<img src="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png" alt="profiler_kernel_view.png" width="100%">
|
328 |
+
<figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png">https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png</a></i></center></figcaption>
|
329 |
+
</figure>
|
330 |
+
<br><br>
|
331 |
+
<p class="width_125">The optimisations that can be made are:<br></p>
|
332 |
+
<h4 id="puissances-de-2">Use multiples of 8 or 64</h4>
|
333 |
+
<p class="width_125">The first is to use tensor sizes that are multiples of 8 or 64.
|
334 |
+
Please refer to the Nvidia documentation,
|
335 |
+
in particular this <a class="link" href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc">article</a>
|
336 |
+
and this <a class="link" href="https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/">article</a>
|
337 |
+
to determine the multiple to select according to the desired precision.</p>
|
338 |
+
<div class="tip"><p>With this in mind, we trained a tokenizer of size 32 768 (8**5),
|
339 |
+
following <a class="link" href="https://twitter.com/karpathy/status/1621578354024677377">this observation by KARPATHY</a>.
|
340 |
+
This is a BPE tokenizer <d-cite bibtex-key="sennrich2016neuralmachinetranslationrare"></d-cite> trained on CulturaX and The Stack, using 256 extra tokens, and numbers are separated.<br>
|
341 |
+
Readers can find the code used <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/examples/fat5-fr/train_tokenizer.py">here</a>.
|
342 |
+
</p></div>
|
343 |
+
<p><br></p>
|
344 |
+
|
345 |
+
<h4 id="utiliser-le-bon-optimiseur">Use the right optimiser</h4>
|
346 |
+
<p class="width_125">Changing optimisers from the initial implementation of the model can be a good way of speeding up convergence of the model (although it may prevent the results of the original paper from being reproduced).<br>
|
347 |
+
Optimisers speed up convergence by allowing large batch sizes, as in the case of LAMB <d-cite bibtex-key="you2020large"></d-cite>
|
348 |
+
or the use of higher learning rates such as Sophia <d-cite bibtex-key="liu2024sophia"></d-cite>.<br>
|
349 |
+
More efficient versions of the optimisers can also be used, such as the <code>fused</code> option
|
350 |
+
in the <a class="link" href="https://pytorch.org/docs/stable/generated/torch.optim.Adam.html">Adam optimiser</a> available in PyTorch
|
351 |
+
or the optimisers available in <a class="link" href="https://github.com/NVIDIA/apex">Apex</a>.</p>
|
352 |
+
<div class="tip"><p>
|
353 |
+
We used the original T5 optimiser, <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">AdamWScale</a>.
|
354 |
+
For hyperparameter values, we use <code>lr = 5e-3</code>, <code>betas = (0.9, 0.999)</code>, <code>eps = 1e-6</code> and <code>weight_decay = 0.0</code>
|
355 |
+
based on the observations of <a class="link" href="https://github.com/PiotrNawrot/nanoT5/issues/25#issuecomment-1922731400">Wilson Wongso</a>.
|
356 |
+
Indeed, it turns out that not all the alternative optimisers tested converged.</p></div>
|
357 |
+
<div class="note"><p>We have added the parameter <code>foreach</code> in our version of AdamWScale.</p>
|
358 |
+
</div>
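<p class="width_125">As an illustration of the <code>fused</code> option mentioned above (purely illustrative: we actually use our AdamWScale rather than <code>torch.optim.AdamW</code>, and the hyperparameter values are simply the ones quoted in the tip):</p>
<pre><code class="lang-py">
import torch

optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-3, betas=(0.9, 0.999),
    eps=1e-6, weight_decay=0.0,
    fused=True,  # use the fused CUDA implementation of the update step
)
</code></pre>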
|
359 |
+
<p><br></p>
|
360 |
+
|
361 |
+
<h4 id="entra-ner-ses-mod-les-en-bf16-ou-fp16-">Training models in <code>bf16</code></h4>
|
362 |
+
<p class="width_125">Recent GPUs make it possible to full exploit the use of reduced precision
|
363 |
+
(enabling a gain of a factor of 2 in throughput compared to the <code>fp32</code> precision).
|
364 |
+
<code>bf16</code> is only available on Ampere or more recent architectures, but makes it possible to avoid the loss scaling
|
365 |
+
<d-cite bibtex-key="micikevicius2018mixed"></d-cite> method which is generally necessary in <code>fp16</code>
|
366 |
+
thanks to a wider dynamic range (the exponent is coded on 8 bits like the <code>fp32</code>).</p>
|
367 |
+
<div class="tip"><p>With this in mind, we train our models in <code>bf16</code>.
|
368 |
+
More specifically, while at the beginning of our experiments we used <code>bf16-mixed</code>, we have used the
|
369 |
+
<a class="link" href="https://en.wikipedia.org/wiki/Kahan_summation_algorithm">Kahan summation algorithm</a>
|
370 |
+
so that we can use <code>full bf16</code> in our optimizer.<br>
|
371 |
+
Once again, the code for our optimizer is accessible <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">here</a>.
|
372 |
+
</p></div>
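<p class="width_125">The idea behind the Kahan summation is to carry over, in an extra buffer, the part of each update that is lost to <code>bf16</code> rounding. A minimal sketch of the principle (not our actual optimizer code):</p>
<pre><code class="lang-py">
import torch

@torch.no_grad()
def kahan_update_(param, update, compensation):
    """Apply `param += update` in low precision while keeping track of rounding errors."""
    corrected = update + compensation            # re-inject what was lost at previous steps
    new_param = param + corrected
    # store the part of `corrected` that did not survive the rounding of `new_param`
    compensation.copy_(corrected - (new_param - param))
    param.copy_(new_param)
</code></pre>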
|
373 |
+
<p><br></p>
|
374 |
+
|
375 |
+
<h4 id="utiliser-moins-de-m-moire-du-gpu">Use less GPU memory</h4>
|
376 |
+
<p class="width_125">Certain techniques exist to limit the use of GPU memory by the model, such as the
|
377 |
+
<a class="link" href="https://pytorch.org/docs/stable/checkpoint.html">gradient checkpointing</a>
|
378 |
+
or ZeRO-type methods <d-cite bibtex-key="rajbhandari2020zero"></d-cite> implemented in
|
379 |
+
<a class="link" href="https://github.com/microsoft/DeepSpeed">DeepSpeed</a>.
|
380 |
+
By limiting the amount of memory used, larger batch sizes can be used to speed up model training.</p>
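<p class="width_125">For instance, with a model implemented in Transformers, activation checkpointing can usually be enabled with a single call (whether it applies depends on the model implementation):</p>
<pre><code class="lang-py">
# trade compute for memory: activations are recomputed during the backward pass
model.gradient_checkpointing_enable()
</code></pre>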
|
381 |
+
<p class="width_125"><br><br></p>
|
382 |
+
|
383 |
+
|
384 |
+
<h3 id="autres">Other</h3>
|
385 |
+
<h4 id="le-parall-lisme">Parallelism</h4>
|
386 |
+
<p class="width_125">Using several GPUs is tricky.
|
387 |
+
Done naively, it can result in lower performance than implementation on a single GPU, wasting computing resources.
|
388 |
+
This is particularly the case when bottlenecks occur in communications between GPUs.
|
389 |
+
The aim is to ensure that the model is not limited by the bandwidth between the cards, or to ensure that the cards are connected with sufficient
|
390 |
+
bandwidths via techniques such as <a class="link" href="https://en.wikipedia.org/wiki/NVLink">NVLink</a> for example. </p>
|
391 |
+
<p class="width_125">It should also be noted that optimisation techniques generally require all the GPUs to be synchronised at the end of a batch.
|
392 |
+
As a result, if one GPU is slower than the others (or is being used by another process), the model is limited to the speed of the slowest GPU in the group. </p>
|
393 |
+
<div class="note"><p>
|
394 |
+
Having pre-trained our model on a single 80GB A100, we were unable to experiment with parallelism.</p>
|
395 |
+
</div>
|
396 |
+
<p><br></p>
|
397 |
+
|
398 |
+
<h4 id="les-t-tes-pour-le-finetuning">Finetuning heads</h4>
|
399 |
+
<p class="width_125">We looked at the elements listed above with a view to optimising the pre-training of our model.
|
400 |
+
In practice, we then need to fine-tune it to specialise on the final tasks that interest us.
|
401 |
+
To do this, we use heads. For the <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5">vanilla T5</a>,
|
402 |
+
five are available in Transformers to perform all feasible tasks:
|
403 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration"><code>T5ForConditionalGeneration</code></a>,
|
404 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForSequenceClassification"><code>T5ForSequenceClassification</code></a>,
|
405 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForTokenClassification"><code>T5ForTokenClassification</code></a>,
|
406 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForQuestionAnswering"><code>T5ForQuestionAnswering</code></a>
|
407 |
+
et <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel"><code>T5EncoderModel</code></a>.<br><br>
|
408 |
+
Here again, optimisation work can be carried out.<br>
|
409 |
+
For conditional generation, the main point is to ensure that the generation process is efficient.<br>
|
410 |
+
For heads involved in classification tasks (sequence, NER and QA), it is necessary to ensure that the encoder part
|
411 |
+
of the T5 is used, since the decoder is not essential for these tasks, as shown in EncT5 <d-cite bibtex-key="liu2022enct5"></d-cite>.
|
412 |
+
The decoder weights take up unnecessary memory space, and the execution time of the finetuning code is doubled unnecessarily.<br>
|
413 |
+
The last head is simply used to retain only the encoder part of an encoder-decoder model. It therefore does not need to be optimised.</p>
|
414 |
+
<div class="tip"><p>
|
415 |
+
Regarding the <code>ForConditionalGeneration</code> head, our
|
416 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/684d02640464ea8bd2339689ce37da2d4e3b5f0b/src/model/modeling_flash_t5.py#L593">implementation</a>
|
417 |
+
is based on the generation process available in the
|
418 |
+
<a class="link" href="https://github.com/PiotrNawrot/nanoT5/blob/1c82d67bf8dea635be68a3b2a68a43b68b665193/nanoT5/utils/t5_model.py#L407">nanoT5</a>
|
419 |
+
because it is 14% faster than the Hugging Face implementation.<br>
|
420 |
+
For classification heads, the implementation is available in this
|
421 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/custom_heads_flash_t5.py">file</a>.
|
422 |
+
This file is separate from the modelling file because our implementations differ from those available in Transformers.
|
423 |
+
Indeed, heads <code>T5ForSequenceClassification</code> and <code>T5ForQuestionAnswering</code> available in Transformers are based
|
424 |
+
on the T5 encoder and decoder, which is inefficient.
|
425 |
+
We therefore recoded these two heads to use only the encoder.
|
426 |
+
We then followed the same structure as the <code>T5ForTokenClassification</code> head available in Transformers,
|
427 |
+
which also only uses the encoder, and which we therefore used as is.</p>
|
428 |
+
</div>
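<p class="width_125">Schematically, an encoder-only classification head in the spirit of EncT5 looks like the following (a simplified sketch with mean pooling; our actual heads may differ in the details):</p>
<pre><code class="lang-py">
import torch
import torch.nn as nn

class EncoderOnlySequenceClassifier(nn.Module):
    def __init__(self, encoder, hidden_size, num_labels):
        super().__init__()
        self.encoder = encoder                       # e.g. the encoder stack of a T5
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        # mean-pool over non-padding positions only
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return self.classifier(pooled)
</code></pre>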
|
429 |
+
|
430 |
+
|
431 |
+
<p class="width_125"><br><br><br></p>
|
432 |
+
<h2 id="benchmark">Benchmark</h2>
|
433 |
+
|
434 |
+
<h3 id="TFLOPS">TFLOPS</h3>
|
435 |
+
<p class="width_125">
|
436 |
+
The number of TFLOPS (trillions of floating-point operations per second) is probably the most telling metric to demonstrate the impact of the optimisations carried out.<br>
|
437 |
+
We compare four approaches:<br>
|
438 |
+
• the SDPA (Scaled Dot Product Attention) implementation with the full bias matrix,<br>
|
439 |
+
• the same implementation but in Triton,<br>
|
440 |
+
• the Flash Attention RPE implementation, i.e. the second kernel we developed (can be seen as TurboT5 but in C++/CUDA),<br>
|
441 |
+
• the Flash Attention implementation, i.e. without bias. We've included it for reference, but it's unusable in practice for a T5.<br>
|
442 |
+
<br>
|
443 |
+
For the forward pass, we have:
|
444 |
+
</p>
|
445 |
+
<p class="width_125">
|
446 |
+
<picture>
|
447 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/FWD-causal-True_dark.png" width="100%">
|
448 |
+
<img alt="Benchmark memory backward pass" src="./assets/FWD-causal-True.png" width="100%">
|
449 |
+
</picture>
|
450 |
+
|
451 |
+
<div class="width_125"><p>For the forward pass, the Triton approach achieves 1.34 times more FLOPS than the SPDA approach, while the Flash Attention RPE approach achieves 1.99 times more FLOPS than the SPDA approach.<br>
|
452 |
+
We can also see that our bf16 implementation is equivalent to fp16 (doing even better at size 512).<br>
|
453 |
+
Following this benchmark, we decided to train our French model in bf16, head_dim = 128 and with a sequence of 1024.</p></div>
|
454 |
+
|
455 |
+
<br>
|
456 |
+
<p class="width_125">
|
457 |
+
For the backward pass, we have:
|
458 |
+
</p>
|
459 |
+
|
460 |
+
<p class="width_125">
|
461 |
+
<picture>
|
462 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/BWD-causal-True_dark.png" width="100%">
|
463 |
+
<img alt="Benchmark memory backward pass" src="./assets/BWD-causal-True.png" width="100%">
|
464 |
+
</picture>
|
465 |
+
|
466 |
+
<div class="width_125"><p>For the backward pass, the Triton implementation performed worse than SPDA, with 0.71 times the FLOPS of SPDA. The Flash Attention RPE implementation is more or less equivalent to SPDA (1.018 times more FLOPS).<br>
|
467 |
+
We can also observe that Triton in head_dim 64 is more efficient than Triton in head_dim 128.</p></div>
|
468 |
+
|
469 |
+
<p><br></p>
|
470 |
+
<h4 id="torchvstriton">Torch vs Triton</h4>
|
471 |
+
<p class="width_125">
|
472 |
+
We mentioned previously that we had optimised parts of the architecture using ad hoc Triton kernels, namely the cross-entropy loss and the RMSNorm layer.
|
473 |
+
The following benchmarks should illustrate why.<br>
|
474 |
+
|
475 |
+
For cross-entropy, we get a forward pass 7 to 11.4 times faster, a backward pass 3.26 to 3.75 times faster, and memory reduced by a factor of 4:</p>
|
476 |
+
|
477 |
+
<p class="width_125">
|
478 |
+
<picture>
|
479 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/CE_dark.png" width="100%">
|
480 |
+
<img alt="Benchmark memory backward pass" src="./assets/CE.png" width="100%">
|
481 |
+
</picture>
|
482 |
+
|
483 |
+
<p class="width_125">
|
484 |
+
For the RMSNorm layer, we get a forward pass 3 to 5 times faster, a backward pass 2.33 to 4.33 times faster, and memory reduced by a factor of 3.2:</p>
|
485 |
+
|
486 |
+
<p class="width_125">
|
487 |
+
<picture>
|
488 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/LN_dark.png" width="100%">
|
489 |
+
<img alt="Benchmark memory backward pass" src="./assets/BLN.png" width="100%">
|
490 |
+
</picture>
|
491 |
+
|
492 |
+
|
493 |
+
<p class="note">
|
494 |
+
Note that all the benchmark graphs can be generated automatically using the following <a href="https://github.com/catie-aq/flashT5/tree/main/benchmarks">code</a>.
|
495 |
+
</p>
|
496 |
+
|
497 |
+
|
498 |
+
<p><br><br></p>
|
499 |
+
<h3 id="mod-le-en-fran-ais">Model in French</h3>
|
500 |
+
<p class="width_125">We applied our work to French by pre-training a 147M parameter model. <br>
|
501 |
+
The dataset we used is a mixture of CulturaX, Wikipedia, justice_fr and The Stack. <br>
|
502 |
+
Our tokenizer of size 32,768 (8**5) is trained on CulturaX and The Stack.<br>
|
503 |
+
Our model is pre-trained on a sequence of 1,024 tokens.</p>
|
504 |
+
|
505 |
+
<p class="width_125">
|
506 |
+
We wanted to compare the performance of our model with other previously published French-language models, such as CamemBERT <d-cite bibtex-key="Martin_2020"></d-cite> for classification tasks and BARThez <d-cite bibtex-key="eddine2021barthez"></d-cite> for generation tasks.<br>
|
507 |
+
For this reason, we thought it important to make comparisons with an equivalent number of tokens seen.
|
508 |
+
We therefore tried to estimate the number of tokens seen by these two models using the formula number of steps × sequence size × batch size. We couldn't find the necessary information in the BARThez publication. For CamemBERT, we estimate a maximum of 419.4B tokens. The actual figure could be lower, as we don't know the number of padding tokens seen by this model (whereas in our case, we don't use any). We therefore pre-trained our model on the maximum number of tokens seen by CamemBERT.<br></p>
|
509 |
+
|
510 |
+
<p><br></p>
|
511 |
+
|
512 |
+
<p class="width_125">
|
513 |
+
<picture>
|
514 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/loss_train.png" width="49%">
|
515 |
+
<img alt="Convergence masked accuracy FAT5" src="./assets/loss_train.png" width="49%">
|
516 |
+
</picture>
|
517 |
+
<picture>
|
518 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/loss_eval.png" width="49%">
|
519 |
+
<img alt="Convergence masked accuracy FAT5" src="./assets/loss_eval.png" width="49%">
|
520 |
+
</picture>
|
521 |
+
</p>
|
522 |
+
|
523 |
+
<p><br></p>
|
524 |
+
|
525 |
+
<p class="width_125">
|
526 |
+
We were also interested in comparing our model against itself, i.e. we evaluated its performance on downstream tasks every 100,000 steps (~26 billion tokens) during pre-training.<br>
|
527 |
+
In the table below, we have listed the number of tokens equivalent to each interval of 100,000 steps.<br>
|
528 |
+
</p>
|
529 |
+
|
530 |
+
<table class="width_125">
|
531 |
+
<thead>
|
532 |
+
<tr>
|
533 |
+
<th>Model</th>
|
534 |
+
<th>Number of tokens ✝</th>
|
535 |
+
</tr>
|
536 |
+
</thead>
|
537 |
+
<tbody>
|
538 |
+
<tr>
|
539 |
+
<td>FAT5-small-100K</td>
|
540 |
+
<td>26,214,400,000 (100,000 × 1,024 × 256)</td>
|
541 |
+
</tr>
|
542 |
+
<tr>
|
543 |
+
<td>FAT5-small-200K</td>
|
544 |
+
<td>52,428,800,000 (200,000 × 1,024 × 256)</td>
|
545 |
+
</tr>
|
546 |
+
<tr>
|
547 |
+
<td>FAT5-small-300K</td>
|
548 |
+
<td>78,643,200,000 (300,000 × 1,024 × 256)</td>
|
549 |
+
</tr>
|
550 |
+
<tr>
|
551 |
+
<td>FAT5-small-400K</td>
|
552 |
+
<td>104,857,600,000 (400,000 × 1,024 × 256)</td>
|
553 |
+
</tr>
|
554 |
+
<tr>
|
555 |
+
<td>FAT5-small-500K</td>
|
556 |
+
<td>131,072,000,000 (500,000 × 1,024 × 256)</td>
|
557 |
+
</tr>
|
558 |
+
<tr>
|
559 |
+
<td>FAT5-small-600K</td>
|
560 |
+
<td>157,286,400,000 (600,000 × 1,024 × 256)</td>
|
561 |
+
</tr>
|
562 |
+
<tr>
|
563 |
+
<td>FAT5-small-700K</td>
|
564 |
+
<td>183,500,800,000 (700,000 × 1,024 × 256)</td>
|
565 |
+
</tr>
|
566 |
+
<tr>
|
567 |
+
<td>FAT5-small-800K</td>
|
568 |
+
<td>209,715,200,000 (800,000 × 1,024 × 256)</td>
|
569 |
+
</tr>
|
570 |
+
<tr>
|
571 |
+
<td>FAT5-small-900K</td>
|
572 |
+
<td>235,929,600,000 (900,000 × 1,024 × 256)</td>
|
573 |
+
</tr>
|
574 |
+
<tr>
|
575 |
+
<td>FAT5-small-1000K</td>
|
576 |
+
<td>262,144,000,000 (1,000,000 × 1,024 × 256)</td>
|
577 |
+
</tr>
|
578 |
+
<tr>
|
579 |
+
<td>FAT5-small-1100K</td>
|
580 |
+
<td>288,358,400,000 (1,100,000× 1,024 × 256)</td>
|
581 |
+
</tr>
|
582 |
+
<tr>
|
583 |
+
<td>FAT5-small-1200K</td>
|
584 |
+
<td>314,572,800,000 (1,200,000 × 1,024 × 256)</td>
|
585 |
+
</tr>
|
586 |
+
<tr>
|
587 |
+
<td>FAT5-small-1300K</td>
|
588 |
+
<td>340,787,200,000 (1,300,000 × 1,024 × 256)</td>
|
589 |
+
</tr>
|
590 |
+
<tr>
|
591 |
+
<td>FAT5-small-1400K</td>
|
592 |
+
<td>367,001,600,000 (1,400,000 × 1,024 × 256)</td>
|
593 |
+
</tr>
|
594 |
+
<tr>
|
595 |
+
<td>FAT5-small-1500K</td>
|
596 |
+
<td>393,216,000,000 (1,500,000 × 1,024 × 256)</td>
|
597 |
+
</tr>
|
598 |
+
<tr>
|
599 |
+
<td>FAT5-small-1600K</td>
|
600 |
+
<td>419,430,400,000 (1,600,000 × 1,024 × 256)</td>
|
601 |
+
</tr>
|
602 |
+
<tr>
|
603 |
+
<td><a class="link" href="https://hf.co/almanach/camembert-base">camembert (base ou large)</a></td>
|
604 |
+
<td>419,430,400,000 (100,000 × 512 × 8,192)</td>
|
605 |
+
</tr>
|
606 |
+
</tbody>
|
607 |
+
</table>
|
608 |
+
<p class="width_125">✝ equivalent to number of steps × sequence size × batch size</p>
|
609 |
+
|
610 |
+
|
611 |
+
<p><br></p>
|
612 |
+
<h4 id="finetuning">Finetuning</h4>
|
613 |
+
<p class="width_125">We focused on five tasks:<br>
|
614 |
+
• Summarising texts to illustrate the use of the head <code>T5ForConditionalGeneration</code>,<br>
|
615 |
+
• Binary classification to illustrate the use of the head <code>T5ForSequenceClassification</code>,<br>
|
616 |
+
• Named entity recognition to illustrate the use of the head <code>T5ForTokenClassification</code>,<br>
|
617 |
+
• Question answering to illustrate the use of the head <code>T5ForQuestionAnswering</code>.<br>
|
618 |
+
• Sentence similarity to illustrate the use of the head <code>T5EncoderModel</code>.</p>
|
619 |
+
|
620 |
+
<p class="width_125"> Classification tasks seem to us important to evaluate, as they are generally ignored by benchmarks of generative language models, even though they are often used in practice by companies (document retrieval, classification for customer reviews, data anonymization, etc.).
|
621 |
+
Witness the fact that, six and a half years after its release, BERT <d-cite bibtex-key="devlin2019bert"></d-cite> alone is still downloaded more times per month than the <a class="link" href="https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads">30 most downloaded text generation models</a> on Hugging Face at the time of writing: 38.5M versus 31.3M.</p>
|
622 |
+
|
623 |
+
<p class="width_125">In the following tables, we underline for FAT5 the line with the best result for each task. We interpret the results of the generation part after the text summarization table. The classification results are interpreted after the binary classification, QA, NER and sentence-similarity tables.</p>
|
624 |
+
|
625 |
+
<p><br></p>
|
626 |
+
<h5>Summarization</h5>
|
627 |
+
<p class="width_125">For this task, we used the dataset <a class="link" href="https://huggingface.co/datasets/orange_sum">orange_sum</a><d-cite bibtex-key="eddine2021barthez"></d-cite>.</p>
|
628 |
+
<table class="width_125">
|
629 |
+
<thead>
|
630 |
+
<tr>
|
631 |
+
<th>Model</th>
|
632 |
+
<th>ROUGE-1</th>
|
633 |
+
<th>ROUGE-2</th>
|
634 |
+
<th>ROUGE-L</th>
|
635 |
+
</tr>
|
636 |
+
</thead>
|
637 |
+
<tbody>
|
638 |
+
<tr>
|
639 |
+
<td>FAT5-small-100K (147M)</td>
|
640 |
+
<td>28.17</td>
|
641 |
+
<td>10.60</td>
|
642 |
+
<td>20.62</td>
|
643 |
+
</tr>
|
644 |
+
<tr>
|
645 |
+
<td>FAT5-small-200K (147M)</td>
|
646 |
+
<td>28.72</td>
|
647 |
+
<td>10.86</td>
|
648 |
+
<td>20.68</td>
|
649 |
+
</tr>
|
650 |
+
<tr>
|
651 |
+
<td>FAT5-small-300K (147M)</td>
|
652 |
+
<td>28.76</td>
|
653 |
+
<td>10.85</td>
|
654 |
+
<td>20.63</td>
|
655 |
+
</tr>
|
656 |
+
<tr>
|
657 |
+
<td>FAT5-small-400K (147M)</td>
|
658 |
+
<td>28.59</td>
|
659 |
+
<td>10.76</td>
|
660 |
+
<td>20.60</td>
|
661 |
+
</tr>
|
662 |
+
<tr>
|
663 |
+
<td>FAT5-small-500K (147M)</td>
|
664 |
+
<td>28.98</td>
|
665 |
+
<td>10.97</td>
|
666 |
+
<td>20.72</td>
|
667 |
+
</tr>
|
668 |
+
<tr>
|
669 |
+
<td>FAT5-small-600K (147M)</td>
|
670 |
+
<td>29.04</td>
|
671 |
+
<td>11.20</td>
|
672 |
+
<td>20.89</td>
|
673 |
+
</tr>
|
674 |
+
<tr>
|
675 |
+
<td>FAT5-small-700K (147M)</td>
|
676 |
+
<td>28.72</td>
|
677 |
+
<td>10.87</td>
|
678 |
+
<td>20.77</td>
|
679 |
+
</tr>
|
680 |
+
<tr>
|
681 |
+
<td>FAT5-small-800K (147M)</td>
|
682 |
+
<td>29.00</td>
|
683 |
+
<td>10.91</td>
|
684 |
+
<td>20.78</td>
|
685 |
+
</tr>
|
686 |
+
<tr>
|
687 |
+
<td>FAT5-small-900K (147M)</td>
|
688 |
+
<td>29.30</td>
|
689 |
+
<td>11.34</td>
|
690 |
+
<td>21.22</td>
|
691 |
+
</tr>
|
692 |
+
<tr>
|
693 |
+
<td>FAT5-small-1000K (147M)</td>
|
694 |
+
<td>29.10</td>
|
695 |
+
<td>11.21</td>
|
696 |
+
<td>21.08</td>
|
697 |
+
</tr>
|
698 |
+
<tr>
|
699 |
+
<td>FAT5-small-1100K (147M)</td>
|
700 |
+
<td>29.43</td>
|
701 |
+
<td>11.40</td>
|
702 |
+
<td>21.15</td>
|
703 |
+
</tr>
|
704 |
+
<tr>
|
705 |
+
<td>FAT5-small-1200K (147M)</td>
|
706 |
+
<td>29.30</td>
|
707 |
+
<td>11.38</td>
|
708 |
+
<td>21.18</td>
|
709 |
+
</tr>
|
710 |
+
<tr>
|
711 |
+
<td>FAT5-small-1300K (147M)</td>
|
712 |
+
<td>29.38</td>
|
713 |
+
<td>11.38</td>
|
714 |
+
<td>21.18</td>
|
715 |
+
</tr>
|
716 |
+
<tr>
|
717 |
+
<td>FAT5-small-1400K (147M)</td>
|
718 |
+
<td>29.29</td>
|
719 |
+
<td>11.18</td>
|
720 |
+
<td>21.14</td>
|
721 |
+
</tr>
|
722 |
+
<tr>
|
723 |
+
<td>FAT5-small-1500K (147M)</td>
|
724 |
+
<td><u>29.48</u></td>
|
725 |
+
<td><u>11.48</u></td>
|
726 |
+
<td><u>21.22</u></td>
|
727 |
+
</tr>
|
728 |
+
<tr>
|
729 |
+
<td>FAT5-small-1600K (147M)</td>
|
730 |
+
<td>29.30</td>
|
731 |
+
<td>11.27</td>
|
732 |
+
<td>21.10</td>
|
733 |
+
</tr>
|
734 |
+
<tr>
|
735 |
+
<td><a class="link" href="https://huggingface.co/moussaKam/barthez">Barthez<d-cite bibtex-key="eddine2021barthez"></d-cite></a> (165M)</td>
|
736 |
+
<td>31.44</td>
|
737 |
+
<td>12.77</td>
|
738 |
+
<td>22.23</td>
|
739 |
+
</tr>
|
740 |
+
<tr>
|
741 |
+
<td><a class="link" href="https://huggingface.co/moussaKam/mbarthez">mBarthez</a> (458M)</td>
|
742 |
+
<td>32.67</td>
|
743 |
+
<td>13.73</td>
|
744 |
+
<td>23.18</td>
|
745 |
+
</tr>
|
746 |
+
</tbody>
|
747 |
+
</table>
|
748 |
+
|
749 |
+
<p><br></p>
|
750 |
+
<p class="width_125">We can see that our model performs worse than the Barthez. We can put forward a few hypotheses on this subject. <br>
|
751 |
+
Firstly, it's likely that our text generation process is not optimal. Not knowing the one used by the Barthez, we simply used the default parameters of the <a class="link" href="https://github.com/huggingface/transformers/blob/241c04d36867259cdf11dbb4e9d9a60f9cb65ebc/src/transformers/generation/utils.py#L1905">generate</a> function in transformers to avoid giving our model an advantage with a more sophisticated generation process.<br>
|
752 |
+
Secondly, we didn't use a prompt to condition the generation, which could have benefited our model since the T5 is the model that introduced this system.<br>
|
753 |
+
Thirdly, the Barthez surely saw more tokens than our model. Although we can't determine this number from the authors' publication, it is indicated that this is a BART model <d-cite bibtex-key="lewis2019bartdenoisingsequencetosequencepretraining"></d-cite> which received additional pre-training on French. However, BART's paper states that the model was trained on 500,000 steps × a sequence of 1,024 tokens × a batch of size 8000, i.e. 4,096,000,000,000 tokens, which is 9.76 times more than our model.
|
754 |
+
</p>
|
755 |
+
|
756 |
+
<p><br></p>
|
757 |
+
<h5 id="classification">Classification</h5>
|
758 |
+
<p class="width_125">We use a cleaned version of the allocine dataset <d-cite bibtex-key="allocine"></d-cite> : <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/allocine_clean">allocine_clean</a>. Specifically, 0.6% of the test sample was unreliable because it contained leaks or duplicate data. It is likely that the resulting dataset is still imperfect, with annotation problems requiring further proofreading/correction.
|
759 |
+
</p>
|
760 |
+
<table class="width_125">
|
761 |
+
<thead>
|
762 |
+
<tr>
|
763 |
+
<th>Model</th>
|
764 |
+
<th>Accuracy</th>
|
765 |
+
</tr>
|
766 |
+
</thead>
|
767 |
+
<tbody>
|
768 |
+
<tr>
|
769 |
+
<td>FAT5-small-100K (67.4M)</td>
|
770 |
+
<td>96.05</td>
|
771 |
+
</tr>
|
772 |
+
<tr>
|
773 |
+
<td>FAT5-small-200K (67.4M)</td>
|
774 |
+
<td>96.20</td>
|
775 |
+
</tr>
|
776 |
+
<tr>
|
777 |
+
<td>FAT5-small-300K (67.4M)</td>
|
778 |
+
<td>96.48</td>
|
779 |
+
</tr>
|
780 |
+
<tr>
|
781 |
+
<td>FAT5-small-400K (67.4M)</td>
|
782 |
+
<td>96.60</td>
|
783 |
+
</tr>
|
784 |
+
<tr>
|
785 |
+
<td>FAT5-small-500K (67.4M)</td>
|
786 |
+
<td>96.60</td>
|
787 |
+
</tr>
|
788 |
+
<tr>
|
789 |
+
<td>FAT5-small-600K (67.4M)</td>
|
790 |
+
<td>96.60</td>
|
791 |
+
</tr>
|
792 |
+
<tr>
|
793 |
+
<td>FAT5-small-700K (67.4M)</td>
|
794 |
+
<td>96.68</td>
|
795 |
+
</tr>
|
796 |
+
<tr>
|
797 |
+
<td>FAT5-small-800K (67.4M)</td>
|
798 |
+
<td>96.59</td>
|
799 |
+
</tr>
|
800 |
+
<tr>
|
801 |
+
<td>FAT5-small-900K (67.4M)</td>
|
802 |
+
<td><u>96.75</u></td>
|
803 |
+
</tr>
|
804 |
+
<tr>
|
805 |
+
<td>FAT5-small-1000K (67.4M)</td>
|
806 |
+
<td>96.62</td>
|
807 |
+
</tr>
|
808 |
+
<tr>
|
809 |
+
<td>FAT5-small-1100K (67.4M)</td>
|
810 |
+
<td>96.69</td>
|
811 |
+
</tr>
|
812 |
+
<tr>
|
813 |
+
<td>FAT5-small-1200K (67.4M)</td>
|
814 |
+
<td>96.71</td>
|
815 |
+
</tr>
|
816 |
+
<tr>
|
817 |
+
<td>FAT5-small-1300K (67.4M)</td>
|
818 |
+
<td>96.69</td>
|
819 |
+
</tr>
|
820 |
+
<tr>
|
821 |
+
<td>FAT5-small-1400K (67.4M)</td>
|
822 |
+
<td>96.65</td>
|
823 |
+
</tr>
|
824 |
+
<tr>
|
825 |
+
<td>FAT5-small-1500K (67.4M)</td>
|
826 |
+
<td>96.57</td>
|
827 |
+
</tr>
|
828 |
+
<tr>
|
829 |
+
<td>FAT5-small-1600K (67.4M)</td>
|
830 |
+
<td>96.69</td>
|
831 |
+
</tr>
|
832 |
+
<tr>
|
833 |
+
<td><a class="link" href="">distillcamembert</a> (68.1M)</td>
|
834 |
+
<td>96.74</td>
|
835 |
+
</tr>
|
836 |
+
<tr>
|
837 |
+
<td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_base_cls">camembert-base</a> (111M)</td>
|
838 |
+
<td>97.27</td>
|
839 |
+
</tr>
|
840 |
+
<tr>
|
841 |
+
<td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_large_cls">camembert-large</a> (337M)</td>
<td>97.15</td>
</tr>
</tbody>
</table>
<p class="width_125">Note: in this and the following tables, distillcamembert refers to a <a class="link" href="https://huggingface.co/cmarkea/distilcamembert-base">distilcamembert-base</a> <d-cite bibtex-key="delestre2022distilcamembert"></d-cite> that we have finetuned.</p>

<p><br></p>
<h5>Named entity recognition</h5>
<p class="width_125">For this task, we used frenchNER in its <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/frenchNER_4entities">4 entities</a> (PER, LOC, ORG, MISC) <d-cite bibtex-key="frenchNER2024"></d-cite> configuration.</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>F1 PER</th><th>F1 LOC</th><th>F1 ORG</th><th>F1 MISC</th></tr>
  </thead>
  <tbody>
    <tr><td>FAT5-small-100K (67.1M)</td><td>96.51</td><td>94.48</td><td>87.24</td><td>75.81</td></tr>
    <tr><td>FAT5-small-200K (67.1M)</td><td>96.90</td><td>94.83</td><td>88.78</td><td>76.82</td></tr>
    <tr><td>FAT5-small-300K (67.1M)</td><td>97.25</td><td>95.11</td><td>88.86</td><td><u>77.48</u></td></tr>
    <tr><td>FAT5-small-400K (67.1M)</td><td>97.18</td><td>95.08</td><td>89.11</td><td>77.42</td></tr>
    <tr><td>FAT5-small-500K (67.1M)</td><td>97.25</td><td>95.16</td><td>89.16</td><td>76.91</td></tr>
    <tr><td>FAT5-small-600K (67.1M)</td><td>97.19</td><td>95.19</td><td>88.85</td><td>76.88</td></tr>
    <tr><td>FAT5-small-700K (67.1M)</td><td>97.17</td><td>95.14</td><td>89.39</td><td>76.82</td></tr>
    <tr><td>FAT5-small-800K (67.1M)</td><td><u>97.34</u></td><td>95.20</td><td>89.18</td><td>77.27</td></tr>
    <tr><td>FAT5-small-900K (67.1M)</td><td>97.19</td><td>95.21</td><td>89.04</td><td>76.83</td></tr>
    <tr><td>FAT5-small-1000K (67.1M)</td><td>97.31</td><td>95.26</td><td>89.24</td><td>76.84</td></tr>
    <tr><td>FAT5-small-1100K (67.1M)</td><td>97.11</td><td>94.99</td><td>88.52</td><td>76.30</td></tr>
    <tr><td>FAT5-small-1200K (67.1M)</td><td>97.19</td><td>95.11</td><td>88.79</td><td>76.86</td></tr>
    <tr><td>FAT5-small-1300K (67.1M)</td><td>97.15</td><td>95.00</td><td>88.62</td><td>76.58</td></tr>
    <tr><td>FAT5-small-1400K (67.1M)</td><td>97.22</td><td>95.09</td><td>89.01</td><td>77.00</td></tr>
    <tr><td>FAT5-small-1500K (67.1M)</td><td>97.32</td><td><u>95.34</u></td><td><u>89.39</u></td><td>77.30</td></tr>
    <tr><td>FAT5-small-1600K (67.1M)</td><td>97.14</td><td>95.22</td><td>89.24</td><td>76.88</td></tr>
    <tr><td><a class="link" href="">distillcamembert</a> (67.5M)</td><td>97.26</td><td>95.24</td><td>89.10</td><td>79.88</td></tr>
    <tr><td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-base-4entities">camembert-base</a> (110M)</td><td>97.80</td><td>95.78</td><td>90.27</td><td>81.38</td></tr>
    <tr><td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-large-4entities">camembert-large</a> (336M)</td><td>98.17</td><td>96.37</td><td>91.87</td><td>83.35</td></tr>
  </tbody>
</table>

<p><br></p>
<h5 id="question-answering">Question Answering</h5>
<p class="width_125">
We wanted to finetune our model on this task but realized that our tokenizer has two problems.<br>
Firstly, we forgot to add the special token at the beginning of sentences.
Secondly, we decided to use a fast BPE tokenizer, and we only learned afterwards that the <code>add_special_tokens=True</code> argument doesn't work with this type of tokenizer.
Correcting these two points requires us to post-process the tokenizer's encodings before running our finetuning, which isn't elegant and requires time we don't have right now.
</p>
<p><br></p>
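<p class="width_125">
To give an idea of the post-processing this would involve, here is a minimal sketch. The checkpoint name and the name of the start token are illustrative, not necessarily those of our tokenizer:
</p>
<pre><code class="language-python">
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CATIE-AQ/FAT5-small")  # illustrative checkpoint name

def encode_with_special_tokens(text):
    # The fast BPE tokenizer ignores add_special_tokens=True, so the missing
    # start-of-sequence token is prepended and the end-of-sequence token appended by hand.
    bos_id = tokenizer.convert_tokens_to_ids("&lt;s&gt;")  # whichever start token the tokenizer was trained with
    eos_id = tokenizer.eos_token_id
    ids = tokenizer(text)["input_ids"]
    input_ids = [bos_id] + ids + [eos_id]
    return {"input_ids": input_ids, "attention_mask": [1] * len(input_ids)}
</code></pre>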
<h5><i>Sentence Similarity</i></h5>
<p class="width_125">
We invite the reader to take the results of this section with a grain of salt.<br>
We performed a finetuning on this task in order to verify that the <code>T5EncoderModel</code> head was working, but we are not focusing on the results obtained because we question the quality of the benchmark on which we evaluate the models, namely MTEB FR <d-cite bibtex-key="ciancone2024mtebfrenchresourcesfrenchsentence"></d-cite>, a French version of MTEB.<br>
Indeed, Nils Reimers, creator of MTEB, recently questioned the relevance of this benchmark in a <a class="link" href="https://x.com/Nils_Reimers/status/1870812625505849849">tweet</a>, declaring it "dead".
Earlier in the year, we had already observed data leaks and duplications in this benchmark
(see <a class="link" href="https://huggingface.co/datasets/lbourdois/MTEB_leaks_and_duplications">here</a> and
<a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1036">here</a>).
Alexey Vatolin then extended these observations to include empty lines (see <a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1049#issuecomment-2463095122">here</a>).
<br>
In the table below, we finetuned on a cleaned version of the <code>stsb_multi_mt</code> dataset <d-cite bibtex-key="huggingface:dataset:stsb_multi_mt"></d-cite> (0.653% of the test split was unreliable because it contained leaks or duplicated data) before evaluating on MTEB FR.
<br>
</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>Average</th><th>Classification</th><th>Clustering</th><th>PairClassification</th><th>Reranking</th><th>Retrieval</th><th>STS</th><th>Summary</th></tr>
  </thead>
  <tbody>
    <tr><td>FAT5-small-400K (67.1M)</td><td>52.2</td><td>59.8</td><td>39.1</td><td>77.5</td><td>56.1</td><td>29.1</td><td>74.0</td><td>29.8</td></tr>
    <tr><td>distillcamembert (68.1M)</td><td>51.3</td><td>60.7</td><td>37.4</td><td>77.0</td><td>51.1</td><td>25.2</td><td>76.4</td><td>31.3</td></tr>
  </tbody>
</table>
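<p class="width_125">
As an aside, the <code>T5EncoderModel</code> usage we verified boils down to mean pooling the encoder outputs to obtain one vector per sentence. A rough sketch follows, in which the FLAN-T5 checkpoint is only a stand-in:
</p>
<pre><code class="language-python">
import torch
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
encoder = T5EncoderModel.from_pretrained("google/flan-t5-small")

def embed(sentences):
    # Mean-pool the last hidden states, ignoring padding positions.
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state      # (batch, seq, dim)
    mask = batch["attention_mask"].unsqueeze(-1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)

emb = embed(["Le chat dort.", "Un chat est en train de dormir."])
score = torch.nn.functional.cosine_similarity(emb[0], emb[1], dim=0)
</code></pre>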
<p><br><br><br></p>

<p class="width_125">
We can see from the masked-accuracy convergence graph that the performance of the encoder part of the model progresses at first before flattening out.
</p>
<p><br></p>

<p class="width_125">
<picture>
  <source media="(prefers-color-scheme: dark)" srcset="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
  <img alt="Convergence masked accuracy FAT5" src="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
</picture>
</p>

<p><br></p>

<p class="width_125">
This phenomenon can also be observed in the finetuning results: FAT5 matches the performance of distillcamembert at around 800K or 900K steps (except for the MISC entity in the NER task), but does no better beyond that. This is nevertheless encouraging with a view to scaling up, since distilled models derived from larger models usually perform better than models of the same size trained from scratch.<br>
Note that this apparent plateau in performance still needs to be confirmed by running several executions with different configurations (notably different seeds), so as to report results as an interval rather than a single number (for each step evaluated, we use a seed of 42).<br>
It should also be mentioned that this plateau for the encoder part has already been observed by other authors. One example is CamemBERT(a) 2.0 <d-cite bibtex-key="antoun2024camembert20smarterfrench"></d-cite>, which was also trained on the French-language part of CulturaX. CamemBERT 2.0 did not perform any better than CamemBERT 1.0 despite having seen more tokens, whereas the authors did obtain performance gains with CamemBERTa 2.0. This suggests that, for encoders, the most important thing is to focus on the architecture (CamemBERTa 2.0 is a DeBERTaV3 <d-cite bibtex-key="he2023debertav3improvingdebertausing"></d-cite> while CamemBERT 2.0 is a RoBERTa <d-cite bibtex-key="liu2019robertarobustlyoptimizedbert"></d-cite>) rather than the data. This result invites us to think about updating the T5 encoder architecture.<br>
A final observation is that, if performance plateaus, it is possible to stop the pre-training earlier and thus reduce costs.<br>

In the table below, we list cost estimates (in euros) for the pre-training of our model according to various cloud providers.
For each of them, we use the hourly price of an A100 80GB offered on December 20, 2024.<br>
We show two cases: pre-training on 262 billion tokens (the threshold at which performance on classification tasks begins to plateau and marginal gains become low) and on 419 billion tokens (the maximum number of tokens seen by CamemBERT).
<br>
</p>
<table class="width_125">
  <thead>
    <tr><th>Cloud provider</th><th>Hourly rate for an A100</th><th>Price for 262B tokens</th><th>Price for 419B tokens</th><th>Note</th></tr>
  </thead>
  <tbody>
    <tr><td>AWS</td><td>1.77</td><td>1,616</td><td>2,586</td><td></td></tr>
    <tr><td>OVH</td><td>2.75</td><td>2,475</td><td>3,960</td><td>By opting for monthly rather than hourly payment, the price in both cases is €2,200.</td></tr>
    <tr><td>Azure</td><td>3.31</td><td>3,021</td><td>4,833</td><td>The hourly price was calculated from the monthly price of 8 A100.</td></tr>
    <tr><td>Google Cloud</td><td>3.52</td><td>3,214</td><td>5,143</td><td></td></tr>
  </tbody>
</table>
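<p class="width_125">
These estimates come down to a rule of three. Assuming our full pre-training run of 1,461 GPU hours corresponds to the 419B-token case and that the time scales linearly with the number of tokens, the table can be reproduced roughly as follows (small differences, for example on the OVH row, come from rounding of the hours):
</p>
<pre><code class="language-python">
TOTAL_HOURS = 1461      # observed pre-training time on a single A100 80GB
TOTAL_TOKENS = 419e9    # tokens seen during that run

hourly_rates = {"AWS": 1.77, "OVH": 2.75, "Azure": 3.31, "Google Cloud": 3.52}  # EUR/hour, 2024-12-20

for provider, rate in hourly_rates.items():
    for tokens in (262e9, 419e9):
        hours = TOTAL_HOURS * tokens / TOTAL_TOKENS
        print(f"{provider}: {tokens / 1e9:.0f}B tokens ~ {hours:.0f} h ~ {hours * rate:,.0f} EUR")
</code></pre>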
<p><br><br></p>
<h4>Pre-training times and emissions</h4>
<p class="width_125">Carbon emissions were estimated using the <a class="link" href="https://mlco2.github.io/impact#compute">Machine Learning Impact calculator</a> <d-cite bibtex-key="lacoste2019quantifying"></d-cite>.<br>
Our model was pre-trained on a single A100 PCIe 80GB, on a private infrastructure.
For carbon efficiency, we relied on the daily figures given by <a class="link" href="https://app.electricitymaps.com/zone/FR">electricitymaps</a> for France during our pre-training period.
The finetunings were carried out on a single A100 PCIe 40GB.
As their execution time is generally counted in hours or even minutes, we used the electricitymaps figures for the hour in question rather than the daily figure for their carbon efficiency.<br>
We estimate the emissions of our model at 14.084 kg CO2 eq., including 13.5 kg CO2 eq. for the pre-training and 0.584 kg CO2 eq. for the 49 finetunings.<br>
To this, we must add additional emissions estimated at 6.24 kg CO2 eq.
They correspond to the finetuning of the models used as baselines for comparison (0.475 kg CO2 eq.), to our preliminary work in mixed-precision bf16 (4.735 kg CO2 eq. for the pre-training of three different models over 300K steps) and to the tests in full bf16 carried out before the training of our final model (1.03 kg CO2 eq. for the pre-training of a model half the size over 400K steps).<br>
In total, we estimate the carbon footprint of our work at 20.324 kg CO2 eq.</p>
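<p class="width_125">
For transparency, the 13.5 kg CO2 eq. figure for the pre-training boils down to the following calculation. It assumes the calculator's reference power draw of 250 W for an A100 PCIe and the average French carbon intensity over our training window:
</p>
<pre><code class="language-python">
# emissions = power draw (kW) x run time (h) x carbon intensity (kg CO2 eq. per kWh)
power_kw = 0.250                 # assumed reference draw of an A100 PCIe
hours = 1461                     # pre-training time on a single GPU
intensity_kg_per_kwh = 0.03696   # France, average between 2024-10-18 and 2024-12-19

pretraining_kg = power_kw * hours * intensity_kg_per_kwh
print(f"{pretraining_kg:.1f} kg CO2 eq.")  # ~13.5 kg CO2 eq.
</code></pre>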
<p class="width_125">For the pre-training phase (we don't have enough information to make estimates for the other phases), it is then possible to compare our model with the other French pre-trained models listed above:</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>Time (H)</th><th>Emissions (kg CO2 eq.)</th><th>Note</th></tr>
  </thead>
  <tbody>
    <tr><td>Camembert</td><td>6,144</td><td>106.91 ✝</td><td>24H × 256 Tesla V100-SXM2-32GB at 58 g (average over 2019).<br>The authors do not specify the numbers for the large version.</td></tr>
    <tr><td>Flaubert base <d-cite bibtex-key="le2020flaubert"></d-cite></td><td>13,120</td><td>190.24 to 228.29 ✝</td><td>410H × 32 V100 at 58 g (average over 2019).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>Flaubert large <d-cite bibtex-key="le2020flaubert"></d-cite></td><td>49,920</td><td>723.84 to 868.61 ✝</td><td>390H × 128 V100 at 58 g (average over 2019).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>Barthez</td><td>7,680 ★</td><td>107.52 to 129.02 ✝</td><td>60H × 128 V100 at 56 g (average over 2020).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>FAT5-small</td><td>1,461</td><td>13.5</td><td>1,461H × 1 A100 at 36.96 g (average between 2024-10-18 and 2024-12-19)</td></tr>
  </tbody>
</table>
<p class="width_125">✝ The numbers given are estimates based on the information provided by the authors in their publication.<br>
★ We indicate only the hours of the French pre-training applied on top of the initial English pre-training on which the model is based.</p>
<p><br></p>

<h3 id="mod-les-en-anglais">Models in other languages</h3>
<p class="width_125">
Our contribution focuses on French, with the introduction of a new model. For other languages, we cannot afford to carry out work of the same magnitude.<br>
Nevertheless, we provide <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/convert_huggingface_t5.py">code</a> for adapting already pre-trained (m)T5/FLAN-T5 weights <d-cite bibtex-key="chung2022scaling"></d-cite> to our method. We hope that users will be able to continue the pre-training of one of these models to adapt it to more recent data, for example.<br>
Please note, however, that this adaptation is limited, since the additional pre-training will have to be carried out in the precision of the original model. For example, if the model's weights are in fp32 (which is the case for the FLAN-T5), training will not be as fast as with the FAT5, which is in bf16.<br><br>

For English speakers, we have already adapted the weights of the various FLAN-T5 versions to our method. All the weights can be found in this <a class="link" href="https://huggingface.co/collections/CATIE-AQ/catie-english-fat5-flan-662b679a8e855c7c0137d69e">Hugging Face collection</a>.<br><br>

If you'd like to pre-train your own model (to specialize in a specific domain, for example, and thus benefit from a customized tokenizer), we refer you once again to the <a class="link" href="https://github.com/catie-aq/flashT5/tree/main/examples/minipile">tutorial</a> showing how to pre-train a model on minipile. Note that we have tested and trained the model in the tutorial on an A100; it may or may not work with other GPUs.</p>
<p class="width_125"><br><br><br></p>
<h2 id="la-suite">Next stage</h2>
<p class="width_125">Let's end this article by mentioning what we intend to do, or at least would like to do, as a follow-up to this work.<br></p>

<h3>Near future</h3>
<p class="width_125">These are things that should already have been in this article but took more time than expected.
Typically, we've finished building the datasets but haven't had time to run the finetunings.<br>
The aim is to complete these tasks in the near future, so that we can include the results in an update to this blog post.
</p>

<h4>Fix the tokenizer</h4>
<p class="width_125">
The current FAT5 is usable. However, due to the tokenizer problems described above, which result in inelegant post-processing for certain tasks, we don't exclude re-training a model (on 1M steps only) with a new tokenizer allowing simpler use of the model.
<br><br></p>

<h4>Instruct model</h4>
<p class="width_125">We'd like to test FAT5's text-generation capabilities in a more optimal way, in particular through the use of prompts, by developing an instruct model.<br>
For this, we have <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/DFP">DFP</a> (<i>Dataset of French Prompts</i>) <d-cite bibtex-key="centre_aquitain_des_technologies_de_l'information_et_electroniques_2023"></d-cite>, a dataset of over 100M rows covering thirty NLP tasks. It follows the methodology of the <a class="link" href="https://huggingface.co/datasets/bigscience/xP3">xP3</a> dataset used for mT0 <d-cite bibtex-key="muennighoff2023crosslingualgeneralizationmultitaskfinetuning"></d-cite>. We could also take this opportunity to check BigScience's "Finding 2" <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite> (page 9 of the publication), which indicates that encoder-decoder models would have better 0-shot capabilities than decoder models.<br>
Beyond NLP tasks, we also have over 2M open QA prompt rows, which should enable us to test FAT5 on more general tasks/knowledge.<br><br>

The development of this instruct model should also enable us to work on its alignment, in particular via a dataset of 12M rows for performing DPO in French.<br><br></p>

<h4>Long sequences</h4>
<p class="width_125">
Pre-training is performed on sequences of 1,024 tokens. However, the CUDA kernel we've developed supports positional encodings that greatly extend the context size, as well as linear inference.<br>
With this in mind, we've created two datasets of long sequences in French (one of QA, one of text summaries) on which we'd like to finetune our model.<br><br><br></p>
<h3>Distant future</h3>
<p class="width_125">The items listed below are longer-term ideas. In other words, they will take time to implement and will be the subject of a new blog post if necessary.</p>

<h4 id="calcul-lin-aire">Memory reduction</h4>
<p class="width_125">Although we're already satisfied with the memory optimisations achieved via our CUDA kernel, we think we can take these results further using other techniques. For example, we can cite the CCE (Cut Cross-Entropy) method <d-cite bibtex-key="wijmans2024cut"></d-cite>, with which we have already obtained interesting results on decoder models.<br>
In addition, while we have concentrated on pre-training, more work needs to be done on inference, which in practice consumes the most resources over time once the model is in production. We are thinking in particular of using SageAttention2 <d-cite bibtex-key="zhang2024sageattention2efficientattentionthorough"></d-cite>, released while our model was training.
<br><br></p>

<h4 id="calcul-lin-aire">Linear computation</h4>
<p class="width_125">In this work, we present a model with linear memory.
A further improvement would be for the model to also operate with linear computation.<br>
The idea is to replace traditional quadratic attention with another form of attention.<br>
We can think of some already applied to the T5, such as that of LongT5 <d-cite bibtex-key="guo2022longt5"></d-cite>.
It is also possible to test more recent forms such as Based <d-cite bibtex-key="arora2024simple"></d-cite>.
We are also interested in testing Hedgehog <d-cite bibtex-key="zhang2024hedgehog"></d-cite>.
In fact, it is possible to combine them with the optimised kernels available in <a class="link" href="https://github.com/HazyResearch/ThunderKittens/tree/main/kernels">ThunderKittens</a> <d-cite bibtex-key="thunderkittens"></d-cite>.
The benefit is that it is then possible to keep the pre-trained model and, via an additional finetuning, replace the standard softmax attention with Hedgehog's linear attention, as illustrated in the sketch below.
LoLCATs <d-cite bibtex-key="zhang2024lolcatslowranklinearizinglarge"></d-cite> performs this finetuning via LoRA <d-cite bibtex-key="hu2021loralowrankadaptationlarge"></d-cite>.
<br><br></p>
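<p class="width_125">
As a rough illustration of what such a swap involves, here is a sketch of (non-causal) kernelised linear attention. The feature map below is a simple elu + 1, whereas Hedgehog would instead learn the map during finetuning:
</p>
<pre><code class="language-python">
import torch

def feature_map(x):
    # Simple positive feature map; Hedgehog learns this map instead.
    return torch.nn.functional.elu(x) + 1

def linear_attention(q, k, v):
    # softmax(QK^T)V is replaced by phi(Q)(phi(K)^T V), linear in the sequence length.
    q, k = feature_map(q), feature_map(k)         # (batch, heads, seq, dim)
    kv = torch.einsum("bhnd,bhne->bhde", k, v)    # accumulate keys and values
    z = 1.0 / (torch.einsum("bhnd,bhd->bhn", q, k.sum(dim=2)) + 1e-6)
    return torch.einsum("bhnd,bhde,bhn->bhne", q, kv, z)
</code></pre>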
<h4 id="passage-l-chelle">Model size</h4>
<p class="width_125">T5/FLAN-T5 models have been trained up to 11 billion parameters, demonstrating that this architecture can scale.<br>
We would like to offer larger models, with a FAT5-base and a FAT5-large of 305M and 973M parameters respectively, which we would then like to distil. The aim is to offer models that consume as little as possible in routine use/inference.<br>
We also expect the distilled models to perform better than models of equivalent size trained from scratch.
<br><br></p>

<h4 id="modeles-specialises">Training data</h4>
<p class="width_125">
In this work, we used "generic" French data, mainly from CulturaX. During the training of our model, Hugging Face introduced the FineWeb2 dataset <d-cite bibtex-key="penedo2024fineweb-2"></d-cite>, which includes French. We would like to pre-train a new model on it so that we can compare the impact of the pre-training data on downstream performance.<br>
Beyond generic French, we particularly want to be able to apply our methodology to specific domains (medicine, regional variants of French, etc.).<br>
To do this, we would need to train a new dedicated tokenizer and perform a new pre-training for each of the chosen domains.
The advantage of the optimisations implemented and presented in this blog article is that they significantly reduce the cost of pre-training.<br>
We would then like to compare these small specialised models against large generic models.<br><br></p>

<h4 id="modeles-specialises">Update of the T5 architecture</h4>
<p class="width_125">The final direction we would like to explore is an update of the T5 architecture. As encoder-decoders have been neglected, they have not benefited from the improvements that decoder models have received in recent months (more recent activation or normalisation layers, multi-token prediction <d-cite bibtex-key="gloeckle2024betterfasterlarge"></d-cite>, etc.).</p>

<p class="width_125"><br><br><br></p>
<h2 id="conclusion">Conclusion</h2>
<p class="width_125">
We introduced the FAT5 (Flash Attention T5) model, detailing our approach to optimising the various elements of the pre-training and finetuning processes.
It is based on kernels that make it possible to use Flash Attention with a T5 and give the model a linear memory.
We applied our work to French in particular, and made sure that it can also be used in any other language.
We hope that our method, which enables a model with 147M parameters to be pre-trained from scratch for €1,600, will be useful for people with limited computational resources.
It also opens the way for a possible comeback of encoder-decoder models, rather than decoder-only models.<br>
</p>
<p class="width_125"><br><br></p>
|
1284 |
+
|
1285 |
+
|
1286 |
+
<style>
|
1287 |
+
d-appendix .citation {
|
1288 |
+
font-size: 11px;
|
1289 |
+
line-height: 15px;
|
1290 |
+
border-left: 1px solid rgba(0, 0, 0, 0.1);
|
1291 |
+
padding-left: 10px;
|
1292 |
+
border: 1px solid rgba(0,0,0,0.1);
|
1293 |
+
background: #0D1117;
|
1294 |
+
padding: 10px 10px;
|
1295 |
+
border-radius: 3px;
|
1296 |
+
color: rgba(150, 150, 150, 1);
|
1297 |
+
overflow: hidden;
|
1298 |
+
margin-top: -12px;
|
1299 |
+
white-space: pre-wrap;
|
1300 |
+
word-wrap: break-word;
|
1301 |
+
}
|
1302 |
+
</style>
|
1303 |
+
|
1304 |
+
<h3 id="citation">Citation</h3>
|
1305 |
+
<pre class="citation long">@misc{FAT5_blogpost,
|
1306 |
+
title={ FAT5: Flash Attention T5 },
|
1307 |
+
author={ Boris ALBAR and Loïck BOURDOIS },
|
1308 |
+
organization={ Centre Aquitain des Technologies de l'Information et Electroniques },
|
1309 |
+
year={2024},
|
1310 |
+
url={ https://huggingface.co/spaces/CATIE-AQ/FAT5-report },
|
1311 |
+
doi={ 10.57967/hf/0821 },
|
1312 |
+
publisher= { Hugging Face }
|
1313 |
+
}</pre>
|
1314 |
+
|
1315 |
+
<d-appendix style="color: #9CA3AF;" >
|
1316 |
+
<d-bibliography src="bibliography.bib"></d-bibliography>
|
1317 |
+
</d-appendix>
|
1318 |
+
</d-article>
|
1319 |
+
|
1320 |
+
<script>
|
1321 |
+
const article = document.querySelector('d-article');
|
1322 |
+
const toc = document.querySelector('d-contents');
|
1323 |
+
if (toc) {
|
1324 |
+
const headings = article.querySelectorAll('h2, h3, h4');
|
1325 |
+
let ToC = `<nav role="navigation" class="l-text figcaption" style="color: #9CA3AF;"><h3>Table des matières</h3>`;
|
1326 |
+
let prevLevel = 0;
|
1327 |
+
for (const el of headings) {
|
1328 |
+
// should element be included in TOC?
|
1329 |
+
const isInTitle = el.parentElement.tagName == 'D-TITLE';
|
1330 |
+
const isException = el.getAttribute('no-toc');
|
1331 |
+
if (isInTitle || isException) continue;
|
1332 |
+
el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
|
1333 |
+
const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
|
1334 |
+
const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
|
1335 |
+
while (prevLevel < level) {
|
1336 |
+
ToC += '<ul>'
|
1337 |
+
prevLevel++;
|
1338 |
+
}
|
1339 |
+
while (prevLevel > level) {
|
1340 |
+
ToC += '</ul>'
|
1341 |
+
prevLevel--;
|
1342 |
+
}
|
1343 |
+
if (level === 0)
|
1344 |
+
ToC += '<div>' + link + '</div>';
|
1345 |
+
else
|
1346 |
+
ToC += '<li>' + link + '</li>';
|
1347 |
+
}
|
1348 |
+
while (prevLevel > 0) {
|
1349 |
+
ToC += '</ul>'
|
1350 |
+
prevLevel--;
|
1351 |
+
}
|
1352 |
+
ToC += '</nav>';
|
1353 |
+
toc.innerHTML = ToC;
|
1354 |
+
toc.setAttribute('prerendered', 'true');
|
1355 |
+
const toc_links = document.querySelectorAll('d-contents > nav a');
|
1356 |
+
window.addEventListener('scroll', (_event) => {
|
1357 |
+
if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
|
1358 |
+
// Then iterate forwards, on the first match highlight it and break
|
1359 |
+
find_active: {
|
1360 |
+
for (let i = headings.length - 1; i >= 0; i--) {
|
1361 |
+
if (headings[i].getBoundingClientRect().top - 50 <= 0) {
|
1362 |
+
if (!toc_links[i].classList.contains("active")) {
|
1363 |
+
toc_links.forEach((link, _index) => {
|
1364 |
+
link.classList.remove("active");
|
1365 |
+
});
|
1366 |
+
toc_links[i].classList.add('active');
|
1367 |
+
}
|
1368 |
+
break find_active;
|
1369 |
+
}
|
1370 |
+
}
|
1371 |
+
toc_links.forEach((link, _index) => {
|
1372 |
+
link.classList.remove("active");
|
1373 |
+
});
|
1374 |
+
}
|
1375 |
+
}
|
1376 |
+
});
|
1377 |
+
}
|
1378 |
+
</script>
|
1379 |
+
</body>
|
1380 |
+
</html>
|
dist/main.bundle.js
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dist/main.bundle.js.LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
1 |
+
/* @license
|
2 |
+
Papa Parse
|
3 |
+
v5.4.1
|
4 |
+
https://github.com/mholt/PapaParse
|
5 |
+
License: MIT
|
6 |
+
*/
|
7 |
+
|
8 |
+
/*! For license information please see plotly-basic.min.js.LICENSE.txt */
|
9 |
+
|
10 |
+
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
|
11 |
+
|
12 |
+
/**
|
13 |
+
* @license
|
14 |
+
* Lodash <https://lodash.com/>
|
15 |
+
* Copyright OpenJS Foundation and other contributors <https://openjsf.org/>
|
16 |
+
* Released under MIT license <https://lodash.com/license>
|
17 |
+
* Based on Underscore.js 1.8.3 <http://underscorejs.org/LICENSE>
|
18 |
+
* Copyright Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
19 |
+
*/
|
dist/main.bundle.js.map
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dist/style.css
ADDED
@@ -0,0 +1,349 @@
1 |
+
/* style.css */
|
2 |
+
/* Define colors */
|
3 |
+
/* :root {
|
4 |
+
--distill-gray: rgb(107, 114, 128);
|
5 |
+
--distill-gray-light: rgb(185, 185, 185);
|
6 |
+
--distill-gray-lighter: rgb(228, 228, 228);
|
7 |
+
--distill-gray-lightest: rgb(245, 245, 245);
|
8 |
+
--distill-blue: #007BFF;
|
9 |
+
}
|
10 |
+
*/
|
11 |
+
|
12 |
+
|
13 |
+
/* Begin dark theme */
|
14 |
+
:root {
|
15 |
+
--bg-color: #0B0F19;
|
16 |
+
--text-color: #ffffff;
|
17 |
+
}
|
18 |
+
|
19 |
+
@media (prefers-color-scheme: dark) {
|
20 |
+
:root {
|
21 |
+
--bg-color: #0B0F19;
|
22 |
+
--text-color: #ffffff;
|
23 |
+
}
|
24 |
+
}
|
25 |
+
|
26 |
+
body {
|
27 |
+
background-color: var(--bg-color);
|
28 |
+
color: #9CA3AF;
|
29 |
+
}
|
30 |
+
|
31 |
+
aside {
|
32 |
+
background-color: var(--bg-color);
|
33 |
+
color: #9CA3AF;
|
34 |
+
}
|
35 |
+
|
36 |
+
d-article {
|
37 |
+
color: #9CA3AF;
|
38 |
+
}
|
39 |
+
|
40 |
+
d-bibliography {
|
41 |
+
color: #9CA3AF;
|
42 |
+
}
|
43 |
+
|
44 |
+
d-front-matter {
|
45 |
+
color: #9CA3AF;
|
46 |
+
}
|
47 |
+
|
48 |
+
d-contents {
|
49 |
+
color: #9CA3AF;
|
50 |
+
}
|
51 |
+
|
52 |
+
.active {
|
53 |
+
color: #9CA3AF;
|
54 |
+
}
|
55 |
+
|
56 |
+
.l-text figcaption
|
57 |
+
{
|
58 |
+
color: #9CA3AF;
|
59 |
+
}
|
60 |
+
|
61 |
+
.link {
|
62 |
+
color: #0284C7
|
63 |
+
}
|
64 |
+
|
65 |
+
/* End dark theme */
|
66 |
+
|
67 |
+
|
68 |
+
.width_125 {
|
69 |
+
width: 125%;
|
70 |
+
}
|
71 |
+
|
72 |
+
|
73 |
+
/* Container for the controls */
|
74 |
+
[id^="plot-"] {
|
75 |
+
display: flex;
|
76 |
+
flex-direction: column;
|
77 |
+
align-items: center;
|
78 |
+
gap: 15px; /* Adjust the gap between controls as needed */
|
79 |
+
}
|
80 |
+
[id^="plot-"] figure {
|
81 |
+
margin-bottom: 0px;
|
82 |
+
margin-top: 0px;
|
83 |
+
padding: 0px;
|
84 |
+
}
|
85 |
+
|
86 |
+
.plotly_caption {
|
87 |
+
font-style: italic;
|
88 |
+
margin-top: 10px;
|
89 |
+
}
|
90 |
+
|
91 |
+
.plotly_controls {
|
92 |
+
display: flex;
|
93 |
+
flex-wrap: wrap;
|
94 |
+
flex-direction: row;
|
95 |
+
justify-content: center;
|
96 |
+
align-items: flex-start;
|
97 |
+
gap: 30px;
|
98 |
+
}
|
99 |
+
|
100 |
+
|
101 |
+
.plotly_input_container {
|
102 |
+
display: flex;
|
103 |
+
align-items: center;
|
104 |
+
flex-direction: column;
|
105 |
+
gap: 10px;
|
106 |
+
}
|
107 |
+
|
108 |
+
/* Style for the select dropdown */
|
109 |
+
.plotly_input_container > select {
|
110 |
+
padding: 2px 4px;
|
111 |
+
/* border: 1px solid #ccc; */
|
112 |
+
line-height: 1.5em;
|
113 |
+
text-align: center;
|
114 |
+
border-radius: 4px;
|
115 |
+
font-size: 12px;
|
116 |
+
background-color: var(--distill-gray-lightest);
|
117 |
+
outline: none;
|
118 |
+
}
|
119 |
+
|
120 |
+
|
121 |
+
/* tags */
|
122 |
+
.note {
|
123 |
+
background-color: #0B1826;
|
124 |
+
color: #9CA3AF;
|
125 |
+
border-left: 4px solid #075985;
|
126 |
+
padding: 10px 10px 3px 5px;
|
127 |
+
margin: 10px 0;
|
128 |
+
width: 125%;
|
129 |
+
}
|
130 |
+
|
131 |
+
.tip {
|
132 |
+
background-color: #0A181E;
|
133 |
+
color: #9CA3AF;
|
134 |
+
border-left: 4px solid #065F46;
|
135 |
+
padding: 10px 10px 3px 5px;;
|
136 |
+
margin: 10px 0;
|
137 |
+
width: 125%;
|
138 |
+
}
|
139 |
+
|
140 |
+
.caution {
|
141 |
+
background-color: #1C111A;
|
142 |
+
color: #9CA3AF;
|
143 |
+
border-left: 4px solid #FF474C;
|
144 |
+
padding: 10px 10px 3px 5px;;
|
145 |
+
margin: 10px 0;
|
146 |
+
width: 125%;
|
147 |
+
}
|
148 |
+
|
149 |
+
/* Style for the range input */
|
150 |
+
|
151 |
+
.plotly_slider {
|
152 |
+
display: flex;
|
153 |
+
align-items: center;
|
154 |
+
gap: 10px;
|
155 |
+
}
|
156 |
+
|
157 |
+
.plotly_slider > input[type="range"] {
|
158 |
+
-webkit-appearance: none;
|
159 |
+
height: 2px;
|
160 |
+
background: var(--distill-gray-light);
|
161 |
+
border-radius: 5px;
|
162 |
+
outline: none;
|
163 |
+
}
|
164 |
+
|
165 |
+
.plotly_slider > span {
|
166 |
+
font-size: 14px;
|
167 |
+
line-height: 1.6em;
|
168 |
+
min-width: 16px;
|
169 |
+
}
|
170 |
+
|
171 |
+
.plotly_slider > input[type="range"]::-webkit-slider-thumb {
|
172 |
+
-webkit-appearance: none;
|
173 |
+
appearance: none;
|
174 |
+
width: 18px;
|
175 |
+
height: 18px;
|
176 |
+
border-radius: 50%;
|
177 |
+
background: var(--distill-blue);
|
178 |
+
cursor: pointer;
|
179 |
+
}
|
180 |
+
|
181 |
+
.plotly_slider > input[type="range"]::-moz-range-thumb {
|
182 |
+
width: 18px;
|
183 |
+
height: 18px;
|
184 |
+
border-radius: 50%;
|
185 |
+
background: var(--distill-blue);
|
186 |
+
cursor: pointer;
|
187 |
+
}
|
188 |
+
|
189 |
+
/* Style for the labels */
|
190 |
+
.plotly_input_container > label {
|
191 |
+
font-size: 14px;
|
192 |
+
font-weight: bold;
|
193 |
+
}
|
194 |
+
|
195 |
+
.main-plot-container {
|
196 |
+
margin-top: 21px;
|
197 |
+
margin-bottom: 35px;
|
198 |
+
}
|
199 |
+
|
200 |
+
.main-plot-container > figure {
|
201 |
+
display: block !important;
|
202 |
+
/* Let this be handled by graph-container */
|
203 |
+
margin-bottom: 0px;
|
204 |
+
margin-top: 0px;
|
205 |
+
}
|
206 |
+
.main-plot-container > div {
|
207 |
+
display: none !important;
|
208 |
+
}
|
209 |
+
|
210 |
+
|
211 |
+
@media (min-width: 768px) {
|
212 |
+
.main-plot-container > figure {
|
213 |
+
display: none !important;
|
214 |
+
}
|
215 |
+
.main-plot-container > div {
|
216 |
+
display: flex !important;
|
217 |
+
}
|
218 |
+
}
|
219 |
+
|
220 |
+
d-byline .byline {
|
221 |
+
grid-template-columns: 1fr;
|
222 |
+
grid-column: text;
|
223 |
+
font-size: 0.9rem;
|
224 |
+
line-height: 1.8em;
|
225 |
+
}
|
226 |
+
|
227 |
+
@media (min-width: 768px) {
|
228 |
+
d-byline .byline {
|
229 |
+
grid-template-columns: 4fr 1fr 1fr 1fr;
|
230 |
+
}
|
231 |
+
}
|
232 |
+
|
233 |
+
#title-plot {
|
234 |
+
margin-top: 0px;
|
235 |
+
margin-bottom: 0px;
|
236 |
+
}
|
237 |
+
|
238 |
+
d-contents > nav a.active {
|
239 |
+
text-decoration: underline;
|
240 |
+
}
|
241 |
+
|
242 |
+
@media (max-width: 1199px) {
|
243 |
+
d-contents {
|
244 |
+
display: none;
|
245 |
+
justify-self: start;
|
246 |
+
align-self: start;
|
247 |
+
padding-bottom: 0.5em;
|
248 |
+
margin-bottom: 1em;
|
249 |
+
padding-left: 0.25em;
|
250 |
+
border-bottom: 1px solid rgba(0, 0, 0, 0.1);
|
251 |
+
border-bottom-width: 1px;
|
252 |
+
border-bottom-style: solid;
|
253 |
+
border-bottom-color: rgba(0, 0, 0, 0.1);
|
254 |
+
}
|
255 |
+
}
|
256 |
+
|
257 |
+
d-contents a {
|
258 |
+
text-decoration: underline;
|
259 |
+
}
|
260 |
+
|
261 |
+
|
262 |
+
@media (min-width: 1200px) {
|
263 |
+
d-article {
|
264 |
+
/* Ensure d-article does not prevent sticky positioning */
|
265 |
+
overflow: visible;
|
266 |
+
}
|
267 |
+
|
268 |
+
d-contents {
|
269 |
+
align-self: start;
|
270 |
+
grid-column-start: 1 !important;
|
271 |
+
grid-column-end: 4 !important;
|
272 |
+
grid-row: auto / span 6;
|
273 |
+
justify-self: end;
|
274 |
+
margin-top: 0em;
|
275 |
+
padding-right: 2em;
|
276 |
+
padding-left: 2em;
|
277 |
+
border-right: 1px solid rgba(0, 0, 0, 0.1);
|
278 |
+
border-right-width: 1px;
|
279 |
+
border-right-style: solid;
|
280 |
+
border-right-color: rgba(0, 0, 0, 0.1);
|
281 |
+
position: -webkit-sticky; /* For Safari */
|
282 |
+
position: sticky;
|
283 |
+
top: 10px; /* Adjust this value if needed */
|
284 |
+
}
|
285 |
+
}
|
286 |
+
|
287 |
+
d-contents nav h3 {
|
288 |
+
margin-top: 0;
|
289 |
+
margin-bottom: 1em;
|
290 |
+
}
|
291 |
+
|
292 |
+
d-contents nav div {
|
293 |
+
color: #9CA3AF;
|
294 |
+
font-weight: bold;
|
295 |
+
}
|
296 |
+
|
297 |
+
d-contents nav a {
|
298 |
+
color: #9CA3AF;
|
299 |
+
border-bottom: none;
|
300 |
+
text-decoration: none;
|
301 |
+
}
|
302 |
+
|
303 |
+
d-contents li {
|
304 |
+
list-style-type: none;
|
305 |
+
}
|
306 |
+
|
307 |
+
d-contents ul, d-article d-contents ul {
|
308 |
+
padding-left: 1em;
|
309 |
+
}
|
310 |
+
|
311 |
+
d-contents nav ul li {
|
312 |
+
margin-bottom: .25em;
|
313 |
+
}
|
314 |
+
|
315 |
+
d-contents nav a:hover {
|
316 |
+
text-decoration: underline solid rgba(0, 0, 0, 0.6);
|
317 |
+
}
|
318 |
+
|
319 |
+
d-contents nav ul {
|
320 |
+
margin-top: 0;
|
321 |
+
margin-bottom: 6px;
|
322 |
+
}
|
323 |
+
|
324 |
+
|
325 |
+
d-contents nav > div {
|
326 |
+
display: block;
|
327 |
+
outline: none;
|
328 |
+
margin-bottom: 0.5em;
|
329 |
+
}
|
330 |
+
|
331 |
+
d-contents nav > div > a {
|
332 |
+
font-size: 13px;
|
333 |
+
font-weight: 600;
|
334 |
+
}
|
335 |
+
|
336 |
+
d-article aside {
|
337 |
+
margin-bottom: 1em;
|
338 |
+
}
|
339 |
+
|
340 |
+
@media (min-width: 768px) {
|
341 |
+
d-article aside {
|
342 |
+
margin-bottom: 0;
|
343 |
+
}
|
344 |
+
}
|
345 |
+
|
346 |
+
d-contents nav > div > a:hover,
|
347 |
+
d-contents nav > ul > li > a:hover {
|
348 |
+
text-decoration: none;
|
349 |
+
}
|
package-lock.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
package.json
ADDED
@@ -0,0 +1,32 @@
1 |
+
{
|
2 |
+
"dependencies": {
|
3 |
+
"lodash": "^4.17.21",
|
4 |
+
"papaparse": "^5.4.1",
|
5 |
+
"plotly.js-basic-dist-min": "^2.33.0"
|
6 |
+
},
|
7 |
+
"name": "blogpost",
|
8 |
+
"version": "1.0.0",
|
9 |
+
"description": "--- title: \"FAT5: Flash Attention T5\" emoji: ⚡ colorFrom: pink colorTo: red sdk: static pinned: false header: mini ---",
|
10 |
+
"main": "index.js",
|
11 |
+
"scripts": {
|
12 |
+
"dev": "webpack serve --open",
|
13 |
+
"build": "NODE_ENV=production webpack"
|
14 |
+
},
|
15 |
+
"author": "",
|
16 |
+
"license": "ISC",
|
17 |
+
"devDependencies": {
|
18 |
+
"@babel/preset-env": "^7.24.6",
|
19 |
+
"babel-loader": "^9.1.3",
|
20 |
+
"clean-webpack-plugin": "^4.0.0",
|
21 |
+
"compression-webpack-plugin": "^11.1.0",
|
22 |
+
"copy-webpack-plugin": "^12.0.2",
|
23 |
+
"css-loader": "^7.1.2",
|
24 |
+
"html-webpack-change-assets-extension-plugin": "^1.3.1",
|
25 |
+
"html-webpack-plugin": "^5.6.0",
|
26 |
+
"style-loader": "^4.0.0",
|
27 |
+
"webpack": "^5.91.0",
|
28 |
+
"webpack-bundle-analyzer": "^4.10.2",
|
29 |
+
"webpack-cli": "^5.1.4",
|
30 |
+
"webpack-dev-server": "^5.0.4"
|
31 |
+
}
|
32 |
+
}
|
webpack.config.js
ADDED
@@ -0,0 +1,96 @@
1 |
+
const path = require("path");
|
2 |
+
const { CleanWebpackPlugin } = require("clean-webpack-plugin");
|
3 |
+
const CopyPlugin = require("copy-webpack-plugin");
|
4 |
+
const BundleAnalyzerPlugin = require("webpack-bundle-analyzer").BundleAnalyzerPlugin;
|
5 |
+
|
6 |
+
const COLOR_KEYS = ["color", "bgColor", "fillcolor"];
|
7 |
+
|
8 |
+
const transformDataColors = async (data, path) => {
|
9 |
+
const {getNamedColor} = await import('./src/colors.mjs');
|
10 |
+
// if not json file, return
|
11 |
+
if (!path.endsWith(".json")) {
|
12 |
+
return data;
|
13 |
+
}
|
14 |
+
const parsedData = JSON.parse(data);
|
15 |
+
// Change the color of the data
|
16 |
+
const deepIterateAndSetColor = (key, val) => {
|
17 |
+
if (val === null) {
|
18 |
+
return null;
|
19 |
+
}
|
20 |
+
if (val == undefined) {
|
21 |
+
return undefined;
|
22 |
+
}
|
23 |
+
if (Array.isArray(val)) {
|
24 |
+
return val.map(item => deepIterateAndSetColor(key, item));
|
25 |
+
}
|
26 |
+
if (typeof val === "object") {
|
27 |
+
return Object.entries(val).reduce((newObj, [key, value]) => {
|
28 |
+
newObj[key] = deepIterateAndSetColor(key, value);
|
29 |
+
return newObj;
|
30 |
+
}, {});
|
31 |
+
}
|
32 |
+
if (COLOR_KEYS.includes(key)) {
|
33 |
+
const [colorName, opacity, ...rest] = val.trim().split(/\s+/);
|
34 |
+
const floatOpacity = parseFloat(opacity);
|
35 |
+
const newColor = getNamedColor(colorName, floatOpacity);
|
36 |
+
if (newColor !== undefined && rest.length === 0 && !isNaN(floatOpacity)) {
|
37 |
+
console.log(`key: ${key} in file ${path} changed from ${val} to ${newColor}`);
|
38 |
+
return newColor;
|
39 |
+
} else {
|
40 |
+
return val;
|
41 |
+
}
|
42 |
+
}
|
43 |
+
return val;
|
44 |
+
};
|
45 |
+
return JSON.stringify(deepIterateAndSetColor(undefined, parsedData))
|
46 |
+
};
|
47 |
+
|
48 |
+
module.exports = {
|
49 |
+
entry: {
|
50 |
+
distill: "./src/distill.js",
|
51 |
+
main: "./src/index.js",
|
52 |
+
},
|
53 |
+
output: {
|
54 |
+
filename: "[name].bundle.js", // The output file
|
55 |
+
path: path.resolve(__dirname, "dist"), // Output directory
|
56 |
+
},
|
57 |
+
module: {
|
58 |
+
rules: [
|
59 |
+
// { test: /\.css$/, use: ["style-loader", "css-loader"] },
|
60 |
+
{
|
61 |
+
test: /\.(js|mjs)$/,
|
62 |
+
exclude: /node_modules/,
|
63 |
+
use: {
|
64 |
+
loader: "babel-loader",
|
65 |
+
options: {
|
66 |
+
presets: ["@babel/preset-env"],
|
67 |
+
},
|
68 |
+
},
|
69 |
+
},
|
70 |
+
],
|
71 |
+
},
|
72 |
+
plugins: [
|
73 |
+
new CleanWebpackPlugin(),
|
74 |
+
new CopyPlugin({
|
75 |
+
patterns: [
|
76 |
+
{
|
77 |
+
from: "assets",
|
78 |
+
to: "assets",
|
79 |
+
transform: transformDataColors,
|
80 |
+
},
|
81 |
+
{ from: "src/style.css", to: "style.css" },
|
82 |
+
{ from: "src/bibliography.bib", to: "bibliography.bib" },
|
83 |
+
{ from: "src/index.html", to: "index.html" },
|
84 |
+
],
|
85 |
+
}),
|
86 |
+
],
|
87 |
+
devtool: process.env.NODE_ENV === 'production' ? 'source-map' : 'eval-source-map',
|
88 |
+
devServer: {
|
89 |
+
static: "./dist", // Serve files from the 'dist' directory
|
90 |
+
open: process.env.NODE_ENV !== 'production', // Automatically open the browser unless in production
|
91 |
+
hot: process.env.NODE_ENV !== 'production', // Enable hot module replacement unless in production
|
92 |
+
},
|
93 |
+
mode: process.env.NODE_ENV === 'production' ? 'production' : 'development',
|
94 |
+
};
|
95 |
+
|
96 |
+
console.log(process.env.NODE_ENV)
|