bourdoiscatie
committed on
Upload 35 files
- .gitattributes +3 -0
- README.md +6 -6
- dist/assets/BWD-causal-True.png +0 -0
- dist/assets/BWD-causal-True_dark.png +0 -0
- dist/assets/CE.png +0 -0
- dist/assets/CE_dark.png +0 -0
- dist/assets/FAT5.gif +3 -0
- dist/assets/FAT5_dark.gif +3 -0
- dist/assets/FWD-causal-True.png +0 -0
- dist/assets/FWD-causal-True_dark.png +0 -0
- dist/assets/LN.png +0 -0
- dist/assets/LN_dark.png +0 -0
- dist/assets/LinFAT5_dark.gif +3 -0
- dist/assets/bwd-bfloat16-b16-dark.png +0 -0
- dist/assets/bwd-bfloat16-b16.png +0 -0
- dist/assets/convergence_masked_accuracy_FAT5.png +0 -0
- dist/assets/fwd-bfloat16-b16-dark.png +0 -0
- dist/assets/fwd-bfloat16-b16.png +0 -0
- dist/assets/loss_eval.png +0 -0
- dist/assets/loss_train.png +0 -0
- dist/assets/mem-bfloat16-b32-dark.png +0 -0
- dist/assets/mem-bfloat16-b32.png +0 -0
- dist/assets/mem-bfloat16-b8-dark.png +0 -0
- dist/assets/mem-bfloat16-b8.png +0 -0
- dist/assets/nvidiasmi.png +0 -0
- dist/bibliography.bib +590 -0
- dist/distill.bundle.js +0 -0
- dist/distill.bundle.js.map +0 -0
- dist/index.html +1380 -0
- dist/main.bundle.js +0 -0
- dist/main.bundle.js.LICENSE.txt +19 -0
- dist/main.bundle.js.map +0 -0
- dist/style.css +349 -0
- package-lock.json +0 -0
- package.json +32 -0
- webpack.config.js +96 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dist/assets/FAT5_dark.gif filter=lfs diff=lfs merge=lfs -text
+dist/assets/FAT5.gif filter=lfs diff=lfs merge=lfs -text
+dist/assets/LinFAT5_dark.gif filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title: FAT5
+title: 'FAT5 (Flash Attention T5) report'
 emoji: ⚡
-colorFrom:
-colorTo:
+colorFrom: blue
+colorTo: red
 sdk: static
 pinned: false
-
-
-
+header: default
+app_file: dist/index.html
+---
dist/assets/BWD-causal-True.png
ADDED
dist/assets/BWD-causal-True_dark.png
ADDED
dist/assets/CE.png
ADDED
dist/assets/CE_dark.png
ADDED
dist/assets/FAT5.gif
ADDED
dist/assets/FAT5_dark.gif
ADDED
dist/assets/FWD-causal-True.png
ADDED
dist/assets/FWD-causal-True_dark.png
ADDED
dist/assets/LN.png
ADDED
dist/assets/LN_dark.png
ADDED
dist/assets/LinFAT5_dark.gif
ADDED
dist/assets/bwd-bfloat16-b16-dark.png
ADDED
dist/assets/bwd-bfloat16-b16.png
ADDED
dist/assets/convergence_masked_accuracy_FAT5.png
ADDED
dist/assets/fwd-bfloat16-b16-dark.png
ADDED
dist/assets/fwd-bfloat16-b16.png
ADDED
dist/assets/loss_eval.png
ADDED
dist/assets/loss_train.png
ADDED
dist/assets/mem-bfloat16-b32-dark.png
ADDED
dist/assets/mem-bfloat16-b32.png
ADDED
dist/assets/mem-bfloat16-b8-dark.png
ADDED
dist/assets/mem-bfloat16-b8.png
ADDED
dist/assets/nvidiasmi.png
ADDED
dist/bibliography.bib
ADDED
@@ -0,0 +1,590 @@
+@article{JMLR:v21:20-074,
+  author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+  journal = {Journal of Machine Learning Research},
+  year = {2020},
+  volume = {21},
+  number = {140},
+  pages = {1--67},
+  url = {http://jmlr.org/papers/v21/20-074.html}
+}
+@misc{chia2023instructeval,
+  title={INSTRUCTEVAL: Towards Holistic Evaluation of Instruction-Tuned Large Language Models},
+  author={Yew Ken Chia and Pengfei Hong and Lidong Bing and Soujanya Poria},
+  year={2023},
+  url={https://arxiv.org/abs/2306.04757},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{fu2024tiny,
+  title={Tiny Titans: Can Smaller Large Language Models Punch Above Their Weight in the Real World for Meeting Summarization?},
+  author={Xue-Yong Fu and Md Tahmid Rahman Laskar and Elena Khasanova and Cheng Chen and Shashi Bhushan TN},
+  year={2024},
+  url={https://arxiv.org/abs/2402.00841},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{hsieh2023distilling,
+  title={Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes},
+  author={Cheng-Yu Hsieh and Chun-Liang Li and Chih-Kuan Yeh and Hootan Nakhost and Yasuhisa Fujii and Alexander Ratner and Ranjay Krishna and Chen-Yu Lee and Tomas Pfister},
+  year={2023},
+  url={https://arxiv.org/abs/2305.02301},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{nawrot2023nanot5,
+  title={nanoT5: A PyTorch Framework for Pre-training and Fine-tuning T5-style Models with Limited Resources},
+  author={Piotr Nawrot},
+  year={2023},
+  url={https://arxiv.org/abs/2309.02373},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{tay2023ul2,
+  title={UL2: Unifying Language Learning Paradigms},
+  author={Yi Tay and Mostafa Dehghani and Vinh Q. Tran and Xavier Garcia and Jason Wei and Xuezhi Wang and Hyung Won Chung and Siamak Shakeri and Dara Bahri and Tal Schuster and Huaixiu Steven Zheng and Denny Zhou and Neil Houlsby and Donald Metzler},
+  year={2023},
+  url={https://arxiv.org/abs/2205.05131},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{dao2022flashattention,
+  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
+  author={Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher Ré},
+  year={2022},
+  url={https://arxiv.org/abs/2205.14135},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{turbot5,
+  title={TurboT5},
+  author={Knowledgator},
+  year={2024},
+  publisher={GitHub},
+  url={https://github.com/Knowledgator/TurboT5},
+}
+@misc{nguyen2023culturax,
+  title={CulturaX: A Cleaned, Enormous, and Multilingual Dataset for Large Language Models in 167 Languages},
+  author={Thuat Nguyen and Chien Van Nguyen and Viet Dac Lai and Hieu Man and Nghia Trung Ngo and Franck Dernoncourt and Ryan A. Rossi and Thien Huu Nguyen},
+  year={2023},
+  url={https://arxiv.org/abs/2309.09400},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{leclerc2023ffcv,
+  title={FFCV: Accelerating Training by Removing Data Bottlenecks},
+  author={Guillaume Leclerc and Andrew Ilyas and Logan Engstrom and Sung Min Park and Hadi Salman and Aleksander Madry},
+  year={2023},
+  url={https://arxiv.org/abs/2306.12517},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{lhoest2021datasets,
+  title={Datasets: A Community Library for Natural Language Processing},
author={Quentin Lhoest and Albert Villanova del Moral and Yacine Jernite and Abhishek Thakur and Patrick von Platen and Suraj Patil and Julien Chaumond and Mariama Drame and Julien Plu and Lewis Tunstall and Joe Davison and Mario Šaško and Gunjan Chhablani and Bhavitvya Malik and Simon Brandeis and Teven Le Scao and Victor Sanh and Canwen Xu and Nicolas Patry and Angelina McMillan-Major and Philipp Schmid and Sylvain Gugger and Clément Delangue and Théo Matussière and Lysandre Debut and Stas Bekman and Pierric Cistac and Thibault Goehringer and Victor Mustar and François Lagunas and Alexander M. Rush and Thomas Wolf},
+  year={2021},
+  url={https://arxiv.org/abs/2109.02846},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{paszke2019pytorch,
+  title={PyTorch: An Imperative Style, High-Performance Deep Learning Library},
author={Adam Paszke and Sam Gross and Francisco Massa and Adam Lerer and James Bradbury and Gregory Chanan and Trevor Killeen and Zeming Lin and Natalia Gimelshein and Luca Antiga and Alban Desmaison and Andreas Köpf and Edward Yang and Zach DeVito and Martin Raison and Alykhan Tejani and Sasank Chilamkurthy and Benoit Steiner and Lu Fang and Junjie Bai and Soumith Chintala},
+  year={2019},
+  url={https://arxiv.org/abs/1912.01703},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@inproceedings{10.1145/2833157.2833162,
+  author = {Lam, Siu Kwan and Pitrou, Antoine and Seibert, Stanley},
+  title = {Numba: a LLVM-based Python JIT compiler},
+  year = {2015},
+  isbn = {9781450340052},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/2833157.2833162},
+  doi = {10.1145/2833157.2833162},
abstract = {Dynamic, interpreted languages, like Python, are attractive for domain-experts and scientists experimenting with new ideas. However, the performance of the interpreter is often a barrier when scaling to larger data sets. This paper presents a just-in-time compiler for Python that focuses in scientific and array-oriented computing. Starting with the simple syntax of Python, Numba compiles a subset of the language into efficient machine code that is comparable in performance to a traditional compiled language. In addition, we share our experience in building a JIT compiler using LLVM[1].},
+  booktitle = {Proceedings of the Second Workshop on the LLVM Compiler Infrastructure in HPC},
+  articleno = {7},
+  numpages = {6},
+  keywords = {compiler, Python, LLVM},
+  location = {Austin, Texas},
+  series = {LLVM '15}
+}
+@inproceedings{10.1145/3315508.3329973,
+  author = {Tillet, Philippe and Kung, H. T. and Cox, David},
+  title = {Triton: an intermediate language and compiler for tiled neural network computations},
+  year = {2019},
+  isbn = {9781450367196},
+  publisher = {Association for Computing Machinery},
+  address = {New York, NY, USA},
+  url = {https://doi.org/10.1145/3315508.3329973},
+  doi = {10.1145/3315508.3329973},
abstract = {The validation and deployment of novel research ideas in the field of Deep Learning is often limited by the availability of efficient compute kernels for certain basic primitives. In particular, operations that cannot leverage existing vendor libraries (e.g., cuBLAS, cuDNN) are at risk of facing poor device utilization unless custom implementations are written by experts – usually at the expense of portability. For this reason, the development of new programming abstractions for specifying custom Deep Learning workloads at a minimal performance cost has become crucial. We present Triton, a language and compiler centered around the concept of tile, i.e., statically shaped multi-dimensional sub-arrays. Our approach revolves around (1) a C-based language and an LLVM-based intermediate representation (IR) for expressing tensor programs in terms of operations on parametric tile variables and (2) a set of novel tile-level optimization passes for compiling these programs into efficient GPU code. We demonstrate how Triton can be used to build portable implementations of matrix multiplication and convolution kernels on par with hand-tuned vendor libraries (cuBLAS / cuDNN), or for efficiently implementing recent research ideas such as shift convolutions.},
+  booktitle = {Proceedings of the 3rd ACM SIGPLAN International Workshop on Machine Learning and Programming Languages},
+  pages = {10–19},
+  numpages = {10},
+  keywords = {GPU, compiler, neural networks},
+  location = {Phoenix, AZ, USA},
+  series = {MAPL 2019}
+}
+@misc{flagattention,
+  title={FlagAttention},
+  author={FlagOpen},
+  year={2023},
+  publisher={GitHub},
+  url={https://github.com/FlagOpen/FlagAttention}
+}
+@misc{nvidiapex,
+  title={Apex},
+  author={NVIDIA},
+  year={2018},
+  publisher={GitHub},
+  url={https://github.com/NVIDIA/apex},
+}
+@misc{jansen2022perplexed,
+  title={Perplexed by Quality: A Perplexity-based Method for Adult and Harmful Content Detection in Multilingual Heterogeneous Web Data},
+  author={Tim Jansen and Yangling Tong and Victoria Zevallos and Pedro Ortiz Suarez},
+  year={2022},
+  url={https://arxiv.org/abs/2212.10440},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{micikevicius2018mixed,
+  title={Mixed Precision Training},
+  author={Paulius Micikevicius and Sharan Narang and Jonah Alben and Gregory Diamos and Erich Elsen and David Garcia and Boris Ginsburg and Michael Houston and Oleksii Kuchaiev and Ganesh Venkatesh and Hao Wu},
+  year={2018},
+  url={https://arxiv.org/abs/1710.03740},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI}
+}
+@misc{you2020large,
+  title={Large Batch Optimization for Deep Learning: Training BERT in 76 minutes},
+  author={Yang You and Jing Li and Sashank Reddi and Jonathan Hseu and Sanjiv Kumar and Srinadh Bhojanapalli and Xiaodan Song and James Demmel and Kurt Keutzer and Cho-Jui Hsieh},
+  year={2020},
+  url={https://arxiv.org/abs/1904.00962},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{liu2024sophia,
+  title={Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training},
+  author={Hong Liu and Zhiyuan Li and David Hall and Percy Liang and Tengyu Ma},
+  year={2024},
+  url={https://arxiv.org/abs/2305.14342},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{rajbhandari2020zero,
+  title={ZeRO: Memory Optimizations Toward Training Trillion Parameter Models},
+  author={Samyam Rajbhandari and Jeff Rasley and Olatunji Ruwase and Yuxiong He},
+  year={2020},
+  url={https://arxiv.org/abs/1910.02054},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{debrébisson2016zloss,
+  title={The Z-loss: a shift and scale invariant classification loss belonging to the Spherical Family},
+  author={Alexandre de Brébisson and Pascal Vincent},
+  year={2016},
+  url={https://arxiv.org/abs/1604.08859},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{zhang2019root,
+  title={Root Mean Square Layer Normalization},
+  author={Biao Zhang and Rico Sennrich},
+  year={2019},
+  url={https://arxiv.org/abs/1910.07467},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{liu2021pay,
+  title={Pay Attention to MLPs},
+  author={Hanxiao Liu and Zihang Dai and David R. So and Quoc V. Le},
+  year={2021},
+  url={https://arxiv.org/abs/2105.08050},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{wolf2020huggingfaces,
+  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
+  year={2020},
+  url={https://arxiv.org/abs/1910.03771},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{liu2022enct5,
+  title={EncT5: A Framework for Fine-tuning T5 as Non-autoregressive Models},
+  author={Frederick Liu and Terry Huang and Shihang Lyu and Siamak Shakeri and Hongkun Yu and Jing Li},
+  year={2022},
+  url={https://arxiv.org/abs/2110.08426},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{shaw2018selfattention,
+  title={Self-Attention with Relative Position Representations},
+  author={Peter Shaw and Jakob Uszkoreit and Ashish Vaswani},
+  year={2018},
+  url={https://arxiv.org/abs/1803.02155},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{press2022train,
+  title={Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
+  author={Ofir Press and Noah A. Smith and Mike Lewis},
+  year={2022},
+  url={https://arxiv.org/abs/2108.12409},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{su2023roformer,
+  title={RoFormer: Enhanced Transformer with Rotary Position Embedding},
+  author={Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
+  year={2023},
+  url={https://arxiv.org/abs/2104.09864},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{li2024functional,
+  title={Functional Interpolation for Relative Positions Improves Long Context Transformers},
+  author={Shanda Li and Chong You and Guru Guruganesh and Joshua Ainslie and Santiago Ontanon and Manzil Zaheer and Sumit Sanghai and Yiming Yang and Sanjiv Kumar and Srinadh Bhojanapalli},
+  year={2024},
+  url={https://arxiv.org/abs/2310.04418},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{lacoste2019quantifying,
+  title={Quantifying the Carbon Emissions of Machine Learning},
+  author={Alexandre Lacoste and Alexandra Luccioni and Victor Schmidt and Thomas Dandres},
+  year={2019},
+  url={https://arxiv.org/abs/1910.09700},
+  archivePrefix={arXiv},
+  primaryClass={cs.CY}
+}
+@misc{devlin2019bert,
+  title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
+  author={Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
+  year={2019},
+  url={https://arxiv.org/abs/1810.04805},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{eddine2021barthez,
+  title={BARThez: a Skilled Pretrained French Sequence-to-Sequence Model},
+  author={Moussa Kamal Eddine and Antoine J. -P. Tixier and Michalis Vazirgiannis},
+  year={2021},
+  url={https://arxiv.org/abs/2010.12321},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{allocine,
+  title={French sentiment analysis with BERT},
+  author={Théophile Blard},
+  year={2020},
+  url={https://github.com/TheophileBlard/french-sentiment-analysis-with-bert},
+}
+@misc{delestre2022distilcamembert,
+  title={DistilCamemBERT: a distillation of the French model CamemBERT},
+  author={Cyrile Delestre and Abibatou Amar},
+  year={2022},
+  url={https://arxiv.org/abs/2205.11111},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{frenchQA2023,
+  author = {ALBAR, Boris and BEDU, Pierre and BOURDOIS, Loïck},
+  organization = {Centre Aquitain des Technologies de l'Information et Electroniques},
+  title = {frenchQA (Revision 6249cd5)},
+  year = 2023,
+  url = {https://huggingface.co/CATIE-AQ/frenchQA},
+  doi = {10.57967/hf/0862},
+  publisher = {Hugging Face}
+}
+@misc{frenchNER2024,
+  author = {BOURDOIS, Loïck},
+  organization = {Centre Aquitain des Technologies de l'Information et Electroniques},
+  title = {frenchNER_4entities (Revision f1e8fef)},
+  year = 2024,
+  url = {https://huggingface.co/datasets/CATIE-AQ/frenchNER_4entities},
+  doi = {10.57967/hf/1751},
+  publisher = {Hugging Face}
+}
+@inproceedings{Martin_2020,
+  title={CamemBERT: a Tasty French Language Model},
+  url={http://dx.doi.org/10.18653/v1/2020.acl-main.645},
+  DOI={10.18653/v1/2020.acl-main.645},
+  booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
+  publisher={Association for Computational Linguistics},
+  author={Martin, Louis and Muller, Benjamin and Ortiz Suárez, Pedro Javier and Dupont, Yoann and Romary, Laurent and de la Clergerie, Éric and Seddah, Djamé and Sagot, Benoît},
+  year={2020}
+}
+@misc{le2020flaubert,
+  title={FlauBERT: Unsupervised Language Model Pre-training for French},
+  author={Hang Le and Loïc Vial and Jibril Frej and Vincent Segonne and Maximin Coavoux and Benjamin Lecouteux and Alexandre Allauzen and Benoît Crabbé and Laurent Besacier and Didier Schwab},
+  year={2020},
+  url={https://arxiv.org/abs/1912.05372},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{chung2022scaling,
+  title={Scaling Instruction-Finetuned Language Models},
author={Hyung Won Chung and Le Hou and Shayne Longpre and Barret Zoph and Yi Tay and William Fedus and Yunxuan Li and Xuezhi Wang and Mostafa Dehghani and Siddhartha Brahma and Albert Webson and Shixiang Shane Gu and Zhuyun Dai and Mirac Suzgun and Xinyun Chen and Aakanksha Chowdhery and Alex Castro-Ros and Marie Pellat and Kevin Robinson and Dasha Valter and Sharan Narang and Gaurav Mishra and Adams Yu and Vincent Zhao and Yanping Huang and Andrew Dai and Hongkun Yu and Slav Petrov and Ed H. Chi and Jeff Dean and Jacob Devlin and Adam Roberts and Denny Zhou and Quoc V. Le and Jason Wei},
+  year={2022},
+  url={https://arxiv.org/abs/2210.11416},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{guo2022longt5,
+  title={LongT5: Efficient Text-To-Text Transformer for Long Sequences},
+  author={Mandy Guo and Joshua Ainslie and David Uthus and Santiago Ontanon and Jianmo Ni and Yun-Hsuan Sung and Yinfei Yang},
+  year={2022},
+  url={https://arxiv.org/abs/2112.07916},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{arora2024simple,
+  title={Simple linear attention language models balance the recall-throughput tradeoff},
+  author={Simran Arora and Sabri Eyuboglu and Michael Zhang and Aman Timalsina and Silas Alberti and Dylan Zinsley and James Zou and Atri Rudra and Christopher Ré},
+  year={2024},
+  url={https://arxiv.org/abs/2402.18668},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+@misc{zhang2024hedgehog,
+  title={The Hedgehog & the Porcupine: Expressive Linear Attentions with Softmax Mimicry},
+  author={Michael Zhang and Kush Bhatia and Hermann Kumbong and Christopher Ré},
+  year={2024},
+  url={https://arxiv.org/abs/2402.04347},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG}
+}
+@misc{thunderkittens,
+  title={ThunderKittens},
+  author={HazyResearch},
+  year={2024},
+  publisher={GitHub},
+  url={https://github.com/HazyResearch/ThunderKittens},
+}
+@misc{flashdecoding,
+  title={Flash-Decoding for long-context inference},
+  author={Tri Dao and Daniel Haziza and Francisco Massa and Grigory Sizov},
+  year={2023},
+  url={https://crfm.stanford.edu/2023/10/12/flashdecoding.html},
+}
+@inproceedings{huggingface:dataset:stsb_multi_mt,
+  title = {Machine translated multilingual STS benchmark dataset.},
+  author={Philip May},
+  year={2021},
+  url={https://github.com/PhilipMay/stsb-multi-mt}
+}
+@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
+  title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
+  author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini},
+  year={2024},
+  eprint={2405.20468},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2405.20468},
+}
+@misc{dubey2024llama3herdmodels,
+  title={The Llama 3 Herd of Models},
author={Abhimanyu Dubey and Abhinav Jauhri and Abhinav Pandey and Abhishek Kadian and Ahmad Al-Dahle and Aiesha Letman and Akhil Mathur and Alan Schelten and Amy Yang and Angela Fan and Anirudh Goyal and Anthony Hartshorn and Aobo Yang and Archi Mitra and Archie Sravankumar and Artem Korenev and Arthur Hinsvark and Arun Rao and Aston Zhang and Aurelien Rodriguez and Austen Gregerson and Ava Spataru and Baptiste Roziere and Bethany Biron and Binh Tang and Bobbie Chern and Charlotte Caucheteux and Chaya Nayak and Chloe Bi and Chris Marra and Chris McConnell and Christian Keller and Christophe Touret and Chunyang Wu and Corinne Wong and Cristian Canton Ferrer and Cyrus Nikolaidis and Damien Allonsius and Daniel Song and Danielle Pintz and Danny Livshits and David Esiobu and Dhruv Choudhary and Dhruv Mahajan and Diego Garcia-Olano and Diego Perino and Dieuwke Hupkes and Egor Lakomkin and Ehab AlBadawy and Elina Lobanova and Emily Dinan and Eric Michael Smith and Filip Radenovic and Frank Zhang and Gabriel Synnaeve and Gabrielle Lee and Georgia Lewis Anderson and Graeme Nail and Gregoire Mialon and Guan Pang and Guillem Cucurell and Hailey Nguyen and Hannah Korevaar and Hu Xu and Hugo Touvron and Iliyan Zarov and Imanol Arrieta Ibarra and Isabel Kloumann and Ishan Misra and Ivan Evtimov and Jade Copet and Jaewon Lee and Jan Geffert and Jana Vranes and Jason Park and Jay Mahadeokar and Jeet Shah and Jelmer van der Linde and Jennifer Billock and Jenny Hong and Jenya Lee and Jeremy Fu and Jianfeng Chi and Jianyu Huang and Jiawen Liu and Jie Wang and Jiecao Yu and Joanna Bitton and Joe Spisak and Jongsoo Park and Joseph Rocca and Joshua Johnstun and Joshua Saxe and Junteng Jia and Kalyan Vasuden Alwala and Kartikeya Upasani and Kate Plawiak and Ke Li and Kenneth Heafield and Kevin Stone and Khalid El-Arini and Krithika Iyer and Kshitiz Malik and Kuenley Chiu and Kunal Bhalla and Lauren Rantala-Yeary and Laurens van der Maaten and Lawrence Chen and Liang Tan and Liz Jenkins and Louis Martin and Lovish Madaan and Lubo Malo and Lukas Blecher and Lukas Landzaat and Luke de Oliveira and Madeline Muzzi and Mahesh Pasupuleti and Mannat Singh and Manohar Paluri and Marcin Kardas and Mathew Oldham and Mathieu Rita and Maya Pavlova and Melanie Kambadur and Mike Lewis and Min Si and Mitesh Kumar Singh and Mona Hassan and Naman Goyal and Narjes Torabi and Nikolay Bashlykov and Nikolay Bogoychev and Niladri Chatterji and Olivier Duchenne and Onur Çelebi and Patrick Alrassy and Pengchuan Zhang and Pengwei Li and Petar Vasic and Peter Weng and Prajjwal Bhargava and Pratik Dubal and Praveen Krishnan and Punit Singh Koura and Puxin Xu and Qing He and Qingxiao Dong and Ragavan Srinivasan and Raj Ganapathy and Ramon Calderer and Ricardo Silveira Cabral and Robert Stojnic and Roberta Raileanu and Rohit Girdhar and Rohit Patel and Romain Sauvestre and Ronnie Polidoro and Roshan Sumbaly and Ross Taylor and Ruan Silva and Rui Hou and Rui Wang and Saghar Hosseini and Sahana Chennabasappa and Sanjay Singh and Sean Bell and Seohyun Sonia Kim and Sergey Edunov and Shaoliang Nie and Sharan Narang and Sharath Raparthy and Sheng Shen and Shengye Wan and Shruti Bhosale and Shun Zhang and Simon Vandenhende and Soumya Batra and Spencer Whitman and Sten Sootla and Stephane Collot and Suchin Gururangan and Sydney Borodinsky and Tamar Herman and Tara Fowler and Tarek Sheasha and Thomas Georgiou and Thomas Scialom and Tobias Speckbacher and Todor Mihaylov and Tong Xiao and Ujjwal Karn and Vedanuj Goswami and Vibhor Gupta and Vignesh 
Ramanathan and Viktor Kerkez and Vincent Gonguet and Virginie Do and Vish Vogeti and Vladan Petrovic and Weiwei Chu and Wenhan Xiong and Wenyin Fu and Whitney Meers and Xavier Martinet and Xiaodong Wang and Xiaoqing Ellen Tan and Xinfeng Xie and Xuchao Jia and Xuewei Wang and Yaelle Goldschlag and Yashesh Gaur and Yasmine Babaei and Yi Wen and Yiwen Song and Yuchen Zhang and Yue Li and Yuning Mao and Zacharie Delpierre Coudert and Zheng Yan and Zhengxing Chen and Zoe Papakipos and Aaditya Singh and Aaron Grattafiori and Abha Jain and Adam Kelsey and Adam Shajnfeld and Adithya Gangidi and Adolfo Victoria and Ahuva Goldstand and Ajay Menon and Ajay Sharma and Alex Boesenberg and Alex Vaughan and Alexei Baevski and Allie Feinstein and Amanda Kallet and Amit Sangani and Anam Yunus and Andrei Lupu and Andres Alvarado and Andrew Caples and Andrew Gu and Andrew Ho and Andrew Poulton and Andrew Ryan and Ankit Ramchandani and Annie Franco and Aparajita Saraf and Arkabandhu Chowdhury and Ashley Gabriel and Ashwin Bharambe and Assaf Eisenman and Azadeh Yazdan and Beau James and Ben Maurer and Benjamin Leonhardi and Bernie Huang and Beth Loyd and Beto De Paola and Bhargavi Paranjape and Bing Liu and Bo Wu and Boyu Ni and Braden Hancock and Bram Wasti and Brandon Spence and Brani Stojkovic and Brian Gamido and Britt Montalvo and Carl Parker and Carly Burton and Catalina Mejia and Changhan Wang and Changkyu Kim and Chao Zhou and Chester Hu and Ching-Hsiang Chu and Chris Cai and Chris Tindal and Christoph Feichtenhofer and Damon Civin and Dana Beaty and Daniel Kreymer and Daniel Li and Danny Wyatt and David Adkins and David Xu and Davide Testuggine and Delia David and Devi Parikh and Diana Liskovich and Didem Foss and Dingkang Wang and Duc Le and Dustin Holland and Edward Dowling and Eissa Jamil and Elaine Montgomery and Eleonora Presani and Emily Hahn and Emily Wood and Erik Brinkman and Esteban Arcaute and Evan Dunbar and Evan Smothers and Fei Sun and Felix Kreuk and Feng Tian and Firat Ozgenel and Francesco Caggioni and Francisco Guzmán and Frank Kanayet and Frank Seide and Gabriela Medina Florez and Gabriella Schwarz and Gada Badeer and Georgia Swee and Gil Halpern and Govind Thattai and Grant Herman and Grigory Sizov and Guangyi and Zhang and Guna Lakshminarayanan and Hamid Shojanazeri and Han Zou and Hannah Wang and Hanwen Zha and Haroun Habeeb and Harrison Rudolph and Helen Suk and Henry Aspegren and Hunter Goldman and Ibrahim Damlaj and Igor Molybog and Igor Tufanov and Irina-Elena Veliche and Itai Gat and Jake Weissman and James Geboski and James Kohli and Japhet Asher and Jean-Baptiste Gaya and Jeff Marcus and Jeff Tang and Jennifer Chan and Jenny Zhen and Jeremy Reizenstein and Jeremy Teboul and Jessica Zhong and Jian Jin and Jingyi Yang and Joe Cummings and Jon Carvill and Jon Shepard and Jonathan McPhie and Jonathan Torres and Josh Ginsburg and Junjie Wang and Kai Wu and Kam Hou U and Karan Saxena and Karthik Prasad and Kartikay Khandelwal and Katayoun Zand and Kathy Matosich and Kaushik Veeraraghavan and Kelly Michelena and Keqian Li and Kun Huang and Kunal Chawla and Kushal Lakhotia and Kyle Huang and Lailin Chen and Lakshya Garg and Lavender A and Leandro Silva and Lee Bell and Lei Zhang and Liangpeng Guo and Licheng Yu and Liron Moshkovich and Luca Wehrstedt and Madian Khabsa and Manav Avalani and Manish Bhatt and Maria Tsimpoukelli and Martynas Mankus and Matan Hasson and Matthew Lennie and Matthias Reso and Maxim Groshev and Maxim Naumov and Maya Lathi and Meghan Keneally and Michael 
L. Seltzer and Michal Valko and Michelle Restrepo and Mihir Patel and Mik Vyatskov and Mikayel Samvelyan and Mike Clark and Mike Macey and Mike Wang and Miquel Jubert Hermoso and Mo Metanat and Mohammad Rastegari and Munish Bansal and Nandhini Santhanam and Natascha Parks and Natasha White and Navyata Bawa and Nayan Singhal and Nick Egebo and Nicolas Usunier and Nikolay Pavlovich Laptev and Ning Dong and Ning Zhang and Norman Cheng and Oleg Chernoguz and Olivia Hart and Omkar Salpekar and Ozlem Kalinli and Parkin Kent and Parth Parekh and Paul Saab and Pavan Balaji and Pedro Rittner and Philip Bontrager and Pierre Roux and Piotr Dollar and Polina Zvyagina and Prashant Ratanchandani and Pritish Yuvraj and Qian Liang and Rachad Alao and Rachel Rodriguez and Rafi Ayub and Raghotham Murthy and Raghu Nayani and Rahul Mitra and Raymond Li and Rebekkah Hogan and Robin Battey and Rocky Wang and Rohan Maheswari and Russ Howes and Ruty Rinott and Sai Jayesh Bondu and Samyak Datta and Sara Chugh and Sara Hunt and Sargun Dhillon and Sasha Sidorov and Satadru Pan and Saurabh Verma and Seiji Yamamoto and Sharadh Ramaswamy and Shaun Lindsay and Shaun Lindsay and Sheng Feng and Shenghao Lin and Shengxin Cindy Zha and Shiva Shankar and Shuqiang Zhang and Shuqiang Zhang and Sinong Wang and Sneha Agarwal and Soji Sajuyigbe and Soumith Chintala and Stephanie Max and Stephen Chen and Steve Kehoe and Steve Satterfield and Sudarshan Govindaprasad and Sumit Gupta and Sungmin Cho and Sunny Virk and Suraj Subramanian and Sy Choudhury and Sydney Goldman and Tal Remez and Tamar Glaser and Tamara Best and Thilo Kohler and Thomas Robinson and Tianhe Li and Tianjun Zhang and Tim Matthews and Timothy Chou and Tzook Shaked and Varun Vontimitta and Victoria Ajayi and Victoria Montanez and Vijai Mohan and Vinay Satish Kumar and Vishal Mangla and Vítor Albiero and Vlad Ionescu and Vlad Poenaru and Vlad Tiberiu Mihailescu and Vladimir Ivanov and Wei Li and Wenchen Wang and Wenwen Jiang and Wes Bouaziz and Will Constable and Xiaocheng Tang and Xiaofang Wang and Xiaojian Wu and Xiaolan Wang and Xide Xia and Xilun Wu and Xinbo Gao and Yanjun Chen and Ye Hu and Ye Jia and Ye Qi and Yenda Li and Yilin Zhang and Ying Zhang and Yossi Adi and Youngjin Nam and Yu and Wang and Yuchen Hao and Yundi Qian and Yuzi He and Zach Rait and Zachary DeVito and Zef Rosnbrick and Zhaoduo Wen and Zhenyu Yang and Zhiwei Zhao},
+  year={2024},
+  eprint={2407.21783},
+  archivePrefix={arXiv},
+  primaryClass={cs.AI},
+  url={https://arxiv.org/abs/2407.21783},
+}
+@misc{accelerate,
+  title = {Accelerate: Training and inference at scale made simple, efficient and adaptable.},
+  author = {Sylvain Gugger and Lysandre Debut and Thomas Wolf and Philipp Schmid and Zachary Mueller and Sourab Mangrulkar and Marc Sun and Benjamin Bossan},
+  howpublished = {\url{https://github.com/huggingface/accelerate}},
+  year = {2022}
+}
+@misc{wang2022languagemodelarchitecturepretraining,
+  title={What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?},
+  author={Thomas Wang and Adam Roberts and Daniel Hesslow and Teven Le Scao and Hyung Won Chung and Iz Beltagy and Julien Launay and Colin Raffel},
+  year={2022},
+  eprint={2204.05832},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2204.05832},
+}
+@misc{kaddour2023minipilechallengedataefficientlanguage,
+  title={The MiniPile Challenge for Data-Efficient Language Models},
+  author={Jean Kaddour},
+  year={2023},
+  eprint={2304.08442},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2304.08442},
+}
+@article{Kocetkov2022TheStack,
+  title={The Stack: 3 TB of permissively licensed source code},
+  author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou, Chenghao and Muñoz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm},
+  journal={Preprint},
+  year={2022}
+}
+@misc{sennrich2016neuralmachinetranslationrare,
+  title={Neural Machine Translation of Rare Words with Subword Units},
+  author={Rico Sennrich and Barry Haddow and Alexandra Birch},
+  year={2016},
+  eprint={1508.07909},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1508.07909},
+}
+@online{wikidump,
+  author = {Wikimedia Foundation},
+  title = {Wikimedia Downloads},
+  url = {https://dumps.wikimedia.org}
+}
+@misc{justice_fr,
+  author = {ECKENDOERFFER, Guillaume},
+  title = {justice_fr},
+  year = 2023,
+  url = {https://huggingface.co/datasets/eckendoerffer/justice_fr},
+  publisher = {Hugging Face}
+}
+@software{unsloth,
+  author = {Daniel Han and Michael Han and Unsloth team},
+  title = {Unsloth},
+  url = {http://github.com/unslothai/unsloth},
+  year = {2023}
+}
+@misc{antoun2024camembert20smarterfrench,
+  title={CamemBERT 2.0: A Smarter French Language Model Aged to Perfection},
+  author={Wissam Antoun and Francis Kulumba and Rian Touchent and Éric de la Clergerie and Benoît Sagot and Djamé Seddah},
+  year={2024},
+  eprint={2411.08868},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2411.08868},
+}
+@article{wijmans2024cut,
+  author = {Erik Wijmans and Brody Huval and Alexander Hertzberg and Vladlen Koltun and Philipp Kr\"ahenb\"uhl},
+  title = {Cut Your Losses in Large-Vocabulary Language Models},
+  journal = {arXiv},
+  year = {2024},
+  url = {https://arxiv.org/abs/2411.09009},
+}
+@misc{rafailov2024directpreferenceoptimizationlanguage,
+  title={Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
+  author={Rafael Rafailov and Archit Sharma and Eric Mitchell and Stefano Ermon and Christopher D. Manning and Chelsea Finn},
+  year={2024},
+  eprint={2305.18290},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2305.18290},
+}
+@misc{muennighoff2023crosslingualgeneralizationmultitaskfinetuning,
+  title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
+  year={2023},
+  eprint={2211.01786},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2211.01786},
+}
+@misc{centre_aquitain_des_technologies_de_l'information_et_electroniques_2023,
+  author = {{BOURDOIS, Loïck}},
+  organization = {{Centre Aquitain des Technologies de l'Information et Electroniques}},
+  title = {Dataset of French Prompts (DFP) (Revision 1d24c09)},
+  year = 2023,
+  url = {https://huggingface.co/datasets/CATIE-AQ/DFP},
+  doi = {10.57967/hf/1200},
+  publisher = {Hugging Face}
+}
+@misc{zhang2024lolcatslowranklinearizinglarge,
+  title={LoLCATs: On Low-Rank Linearizing of Large Language Models},
+  author={Michael Zhang and Simran Arora and Rahul Chalamala and Alan Wu and Benjamin Spector and Aaryan Singhal and Krithik Ramesh and Christopher Ré},
+  year={2024},
+  eprint={2410.10254},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2410.10254},
+}
+@misc{hu2021loralowrankadaptationlarge,
+  title={LoRA: Low-Rank Adaptation of Large Language Models},
+  author={Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Lu Wang and Weizhu Chen},
+  year={2021},
+  eprint={2106.09685},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2106.09685},
+}
+@misc{zhang2024sageattention2efficientattentionthorough,
+  title={SageAttention2: Efficient Attention with Thorough Outlier Smoothing and Per-thread INT4 Quantization},
+  author={Jintao Zhang and Haofeng Huang and Pengle Zhang and Jia Wei and Jun Zhu and Jianfei Chen},
+  year={2024},
+  eprint={2411.10958},
+  archivePrefix={arXiv},
+  primaryClass={cs.LG},
+  url={https://arxiv.org/abs/2411.10958},
+}
+@software{penedo2024fineweb-2,
+  author = {Penedo, Guilherme and Kydlíček, Hynek and Sabolčec, Vinko and Messmer, Bettina and Foroutan, Negar and Jaggi, Martin and von Werra, Leandro and Wolf, Thomas},
+  title = {FineWeb2: A sparkling update with 1000s of languages},
+  month = dec,
+  year = 2024,
+  url = {https://huggingface.co/datasets/HuggingFaceFW/fineweb-2}
+}
+@misc{lewis2019bartdenoisingsequencetosequencepretraining,
+  title={BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
+  author={Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Ves Stoyanov and Luke Zettlemoyer},
+  year={2019},
+  eprint={1910.13461},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1910.13461},
+}
+@misc{gloeckle2024betterfasterlarge,
+  title={Better & Faster Large Language Models via Multi-token Prediction},
+  author={Fabian Gloeckle and Badr Youbi Idrissi and Baptiste Rozière and David Lopez-Paz and Gabriel Synnaeve},
+  year={2024},
+  eprint={2404.19737},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2404.19737},
+}
+@misc{he2023debertav3improvingdebertausing,
+  title={DeBERTaV3: Improving DeBERTa using ELECTRA-Style Pre-Training with Gradient-Disentangled Embedding Sharing},
+  author={Pengcheng He and Jianfeng Gao and Weizhu Chen},
+  year={2023},
+  eprint={2111.09543},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2111.09543},
+}
+@misc{liu2019robertarobustlyoptimizedbert,
+  title={RoBERTa: A Robustly Optimized BERT Pretraining Approach},
+  author={Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
+  year={2019},
+  eprint={1907.11692},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/1907.11692},
+}
dist/distill.bundle.js
ADDED
The diff for this file is too large to render.
dist/distill.bundle.js.map
ADDED
The diff for this file is too large to render.
dist/index.html
ADDED
@@ -0,0 +1,1380 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
+  <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <meta charset="utf8">
+  <base target="_blank">
+  <title>FAT5 : Flash Attention T5</title>
+  <link rel="stylesheet" href="style.css">
+</head>
+
+<body>
+<d-front-matter>
+  <script id='distill-front-matter' type="text/json">{
+    "title": "FAT5 : Flash Attention T5",
+    "description": "",
+    "published": "May 28, 2024",
+    "authors": [
+      {
+        "author":"Boris ALBAR",
+        "authorURL":"https://github.com/b-albar",
+        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
+      },
+      {
+        "author":"Loïck BOURDOIS",
+        "authorURL":"https://github.com/lbourdois",
+        "affiliation": [{"name": "CATIE", "url": "https://catie.fr"}]
+      }
+    ],
+    "color": "#9CA3AF",
+    "katex": {
+      "delimiters": [
+        {"left": "$$", "right": "$$", "display": false}
+      ]
+    }
+  }</script>
+</d-front-matter>
+<d-title>
+  <h1 class="l-page" style="text-align: center;">FAT5 : Flash Attention T5</h1>
+  <p><img src="./assets/FAT5_dark.gif" alt="FAT5" width="100%"></p>
+</d-title>
+
+<d-article>
+  <d-contents>
+  </d-contents>
+
+  <div class="note"> For a better experience, we do not recommend reading on a cell phone. </div>
+
+  <h2 id="motivation">Motivation</h2>
+  <p class="width_125">
+  While much effort has been devoted to optimising decoder transformers, thus abandoning the encoder, we believe it is essential to maintain an encoder-decoder architecture.<br>
+
+  Indeed, this architecture, which offers interesting performance for instruction tuning <d-cite bibtex-key="chia2023instructeval"></d-cite>, is suitable for distillation <d-cite bibtex-key="hsieh2023distilling"></d-cite> and seems superior to decoder models when finetuned <d-cite bibtex-key="fu2024tiny"></d-cite>.
+  It has also been shown that encoder-decoder models trained with masked language modelling achieve better zero-shot performance after multitasking finetuning compared with a decoder model <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite>.<br>
+  Beyond NLP, which is the focus of this blog post, encoder-decoder architecture is widely used in other fields such as audio or time series, for example. Note also that the encoder of such architecture is also used in some diffusion models.<br>
+
+  That's why we've decided to focus on the T5 <d-cite bibtex-key="JMLR:v21:20-074"></d-cite>.<br><br>
+
+  This article presents the optimisations we have implemented to efficiently pre-train a T5 with 147M French parameters in a reasonable time (1,461 H for 419B tokens) and with limited resources (1 single A100; i.e. a computing budget of around 2,200 euros).
+  To achieve this, we designed CUDA/Triton kernels to make Flash Attention compatible with T5 and provide linear inference, thus extending the context size that can be taken into account by the model.<br><br>
+  <strong>The pre-training code is available in our <a class="link" href="https://github.com/catie-aq/flashT5">GitHub repository</a> under Apache-2.0 license and weights on our <a class="link" href="https://hf.co/CATIE-AQ">Hugging Face</a> account.</strong>
+  <p class="width_125"><br><br><br></p>
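+  <p class="width_125">For scale, the figures above imply a pre-training throughput of roughly $$419 \times 10^9 \div (1{,}461 \times 3{,}600) \approx 80{,}000$$ tokens per second on the single A100.</p>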
+
+
+  <h2 id="vue-d-ensemble-de-notre-travail">Overview of our work</h2>
+  <p class="width_125">We therefore chose to work with a T5 and in practice with the nanoT5 <d-cite bibtex-key="nawrot2023nanot5"></d-cite>.<br>
+  For pretext tasks during pre-training, we followed the UL2 ones <d-cite bibtex-key='tay2023ul2'></d-cite> with the following 7 tasks:</p>
<pre><code class="lang-py">
|
71 |
+
denoiser_list=[
|
72 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
|
73 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[R]"</span>},
|
74 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">4.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">0</span>, <span class="hljs-string">"max_spans"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[S]"</span>},
|
75 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">3.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
76 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">8.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
77 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">15</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>},
|
78 |
+
{<span class="hljs-string">"mu"</span>: <span class="hljs-number">64.0</span>, <span class="hljs-string">"r"</span>: <span class="hljs-number">0</span>.<span class="hljs-number">5</span>, <span class="hljs-string">"max_spans"</span>: max_token_length, <span class="hljs-string">"prefix"</span>: <span class="hljs-string">"[X]"</span>}]
|
79 |
+
denoiser_proportions=[<span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">165</span>, <span class="hljs-number">0</span>.<span class="hljs-number">34</span>, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825, <span class="hljs-number">0</span>.0825]
|
80 |
+
</code></pre>
|
81 |
+
<p class="width_125">with <code>mu</code> the n-gram size, <code>r</code> the masking percentage in the n-gram and <code>prefix</code> the type of pretext task.
|
82 |
+
The meaning of letters <code>[R]</code>, <code>[S]</code> and <code>[X]</code> is described <a class="link" href="https://huggingface.co/google/ul2#mixture-of-denoisers">here</a>
|
83 |
+
and we invite you to take a look at the following <a class="link" href="https://raw.githubusercontent.com/google-research/google-research/master/ul2/figs/mod.png">image</a> in particular.</p>
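<p class="width_125">To make this more concrete, here is a minimal, purely illustrative sketch (not our actual <code>DataCollator</code>, and with a deliberately simplified span placement) of how a single denoiser turns a token sequence into an input/target pair with sentinel tokens:</p>
<pre><code class="lang-py">
import numpy as np

def corrupt(tokens, mu=3.0, r=0.15, prefix="[R]"):
    """Mask roughly a fraction r of the tokens in spans of average length mu (T5-style sentinels)."""
    n_spans = max(1, int(round(len(tokens) * r / mu)))
    span_len = max(1, int(round(mu)))
    # evenly spaced span starts, purely for readability of the example
    starts = np.linspace(0, len(tokens) - span_len, n_spans, dtype=int)
    inp, tgt, i = [prefix], [], 0
    for k, s in enumerate(starts):
        s = max(int(s), i)  # keep spans disjoint
        inp += tokens[i:s] + [f"&lt;extra_id_{k}&gt;"]
        tgt += [f"&lt;extra_id_{k}&gt;"] + tokens[s:s + span_len]
        i = s + span_len
    inp += tokens[i:]
    return inp, tgt

print(corrupt("le chat dort sur le canapé du salon depuis ce matin".split()))
</code></pre>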
|
84 |
+
<p class="width_125">For a quick training, we decided to focus on the Flash Attention <d-cite bibtex-key="dao2022flashattention"></d-cite>.
|
85 |
+
However, as it does not handle the (additive) attention biases of the T5, we had to extend it by developing a custom kernel.
|
86 |
+
More specifically, we successively developed two versions of this kernel.
|
87 |
+
In the first version, at the start of our work, we passed the full bias matrix to the kernel.
|
88 |
+
In the current version, inspired by TurboT5 <d-cite bibtex-key='turbot5'></d-cite>, we only pass a tensor containing the merged biases, from which the bias matrix is materialised on the fly.
|
89 |
+
This makes it possible to switch from a T5 with quadratic memory to a T5 with linear memory, and consequently greatly increases the size of context that the model can support.</p>
|
90 |
+
|
91 |
+
<p class="width_125">Our work resulted in the pre-training of a T5 in French with 147M parameters: the FAT5 <i>small</i>.<br>
|
92 |
+
The dataset we used is made up of the French part of the CulturaX corpus <d-cite bibtex-key='nguyen2023culturax'></d-cite> (the main source with over 1258 GB of text),
|
93 |
+
the French part of Wikipedia <d-cite bibtex-key="wikidump"></d-cite> (dump 20231101),
|
94 |
+
justice_fr (French legal texts) <d-cite bibtex-key="justice_fr"></d-cite>
|
95 |
+
and 25,000,000 lines from TheStack <d-cite bibtex-key="Kocetkov2022TheStack"></d-cite>
|
96 |
+
(the idea here is to show our model a bit of code, although this is not our main objective).<br>
|
97 |
+
|
98 |
+
This model was evaluated on five tasks: text summarization, binary classification, question answering, named entity recognition and sentence similarity.</p>
|
99 |
+
<p class="width_125"><br><br><br></p>
|
100 |
+
|
101 |
+
<h2 id="les-d-tails-de-la-recette">Recipe details</h2>
|
102 |
+
<p class="width_125">With only two A100 (one 80GB and one 40GB), we had to spend some time implementing optimisations to get the best out of our hardware.
|
103 |
+
Indeed, before even training a model, or even modifying its architecture, we need to ensure that we are optimising the use of our GPUs' computing capacity.
|
104 |
+
There are several factors that can explain sub-optimal training of a deep learning model:<br>
|
105 |
+
• Disk-bounded<br>
|
106 |
+
• Memory-bounded<br>
|
107 |
+
• Compute-bounded</p>
|
108 |
+
|
109 |
+
<p class="width_125">Ideally, we would like the model to be limited by the speed of calculation, i.e. the GPU to be used at full capacity.
|
110 |
+
With this in mind, we worked on three main points: <br>
|
111 |
+
• GPU disk optimisation <br>
|
112 |
+
• GPU memory bandwidth optimisation <br>
|
113 |
+
• Optimisation of the use of Tensor Cores<br>
|
114 |
+
</p>
|
115 |
+
|
116 |
+
<p class="width_125">So it's a combination of hardware and software issues.</p>
|
117 |
+
<p></p>
|
118 |
+
<p class="width_125">In the rest of this section, everything we have done/implemented to address the limitations encountered is available in a green box. Notes/comments can be found in a blue box.
|
119 |
+
<br><br></p>
|
120 |
+
<h3 id="optimisation-du-disque-du-gpu">GPU disk optimisation</h3>
|
121 |
+
<p class="width_125">Disk limitation occurs either during data loading or during pre-processing operations.
|
122 |
+
In both cases, the problem manifests itself as slowness.
|
123 |
+
<br></p>
|
124 |
+
<h4 id="acc-s-disques">Disk access</h4>
|
125 |
+
<p class="width_125">If the limitation comes from disk access, there are several possible solutions:</p>
|
126 |
+
<ul>
|
127 |
+
<li><p class="width_125"><u>Put data in RAM</u><br>
|
128 |
+
This solves the problem radically, but assumes that the database fits into RAM, which is far from obvious given its small size.</p>
|
129 |
+
<div class="tip"><p>So this is not the solution we have chosen.</p></div>
|
130 |
+
</li>
|
131 |
+
<li><p class="width_125"><u>Put data on a faster and/or less-used disk</u><br>
|
132 |
+
If you have physical access to your GPU server, it is very useful to integrate <a class="link" href="https://fr.wikipedia.org/wiki/NVM_Express">NVMe</a> in its configuration.</p>
|
133 |
+
<p class="width_125">You also need to be careful not to have too many processes from different training pulling on the same disc.
|
134 |
+
It is therefore preferable to have several small discs rather than one large one.</p>
|
135 |
+
<div class="note"><p>A beneficial indirect effect is that such a configuration costs less 😉</p></div>
|
136 |
+
</li>
|
137 |
+
</ul>
|
138 |
+
<ul>
|
139 |
+
<li class="width_125"><u>Use more efficient file formats, particularly in terms of random accesses</u><br>
|
140 |
+
For example <code>.parquet</code> files are more efficient than <code>.csv</code>.
|
141 |
+
We can also use formats specifically developed for this purpose, such as the <code>.beton</code> from ffcv <d-cite bibtex-key="leclerc2023ffcv"></d-cite>.</li>
|
142 |
+
<div class="tip"><p>We use the Datasets library <d-cite bibtex-key="lhoest2021datasets"></d-cite> to load and process the data we use.
|
143 |
+
With this library, the data is decompressed locally in the <code>Arrow</code> format.
|
144 |
+
Moreover, if the data loaded from the Hugging Face Hub has been added using the <code>push_to_hub()</code> function,
|
145 |
+
then the dataset is converted by default in <code>parquet</code>.</p></div>
|
146 |
+
</ul>
|
147 |
+
<ul>
|
148 |
+
<li class="width_125"><u>Pre-tokenise data</u><br>
|
149 |
+
The most effective option is probably to pre-tokenise the data in order to optimise access.
|
150 |
+
In other words, tokenisation takes place in a preliminary stage and not on the fly.
|
151 |
+
</li>
|
152 |
+
<div class="tip"><p>Readers are invited to consult the following
|
153 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/examples/minipile/pretokenize_minipile.py">code</a> which
|
154 |
+
illustrates how we proceed in our FAT5 tutorial applied to the Minipile dataset <d-cite bibtex-key="kaddour2023minipilechallengedataefficientlanguage"></d-cite>.</p></div>
|
155 |
+
</ul>
|
156 |
+
<p><br></p>
|
157 |
+
<h4 id="traitement-des-donn-es">Data processing</h4>
|
158 |
+
<p class="width_125">If the limitation comes from the processing of data after they have been uploaded:</p>
|
159 |
+
<ul>
|
160 |
+
<li><p class="width_125"><u>Several processes can be used to process data in parallel</u><br>
|
161 |
+
For example, the parameter <code>num_workers</code> of the <code>Dataloader</code> of PyTorch <d-cite bibtex-key="paszke2019pytorch"></d-cite>.</p></li>
|
162 |
+
<div class="tip"><p>You can find in our code the values we use for this parameter for our FAT5 small <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/configs/fr/fat5-fr-small.yaml#L42">small</a>.</div>
|
163 |
+
</ul>
|
164 |
+
<ul>
|
165 |
+
<li><p class="width_125"><u>The bottleneck can also come from the <code>DataCollator</code></u><br>
|
166 |
+
This is especially the case when there are complex tasks to perform (image masking or multiple denoisers on NLP tasks).<br>
|
167 |
+
We can then build a custom <code>DataCollator</code> for the task.
|
168 |
+
Traditional methods can then be applied to optimise its speed.
|
169 |
+
Similarly, using Numpy's vectorisation will allow lists to be processed more quickly than with <code>for</code> loops.
|
170 |
+
Generally speaking, Numpy is faster than PyTorch for this type of task.
|
171 |
+
You can also use compilation methods such as numba <d-cite bibtex-key="10.1145/2833157.2833162"></d-cite> for Python, for example.</p></li>
|
172 |
+
<div class="tip"><p>We followed this principle and developed a custom <code>DataCollator</code> for our FAT5.
|
173 |
+
You can find the code <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">here</a>.
|
174 |
+
It handles the UL2 pretext tasks and has a dynamic batch mechanism to reduce padding (more information in the next section).</p></div>
|
175 |
+
<div class="note"><p>As there was no implementation of UL2's <code>DataCollator</code> available in PyTorch until now,
|
176 |
+
we hope this may be useful for other work.</p></div>
|
177 |
+
</ul>
|
178 |
+
<ul>
|
179 |
+
<li><p class="width_125"><u>Effective padding</u><br>
|
180 |
+
<p class="width_125">When working with sequences, there is a natural tendency to pad a set of sequences in order to build batches.
|
181 |
+
The padding tokens then generate unnecessary calculations.<br>
|
182 |
+
The first thing to do is to limit padding to the maximum size sequence and not to a maximum value.
|
183 |
+
This is the <a class="link" href="https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt#dynamic-padding">dynamic padding</a> technique.<br>
|
184 |
+
With this approach, padding tokens may still remain. There are two ways of managing them:<br>
|
185 |
+
• use a method for grouping data of similar sizes
|
186 |
+
(for example, <a class="link" href="https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.group_by_length">this parameter</a>
|
187 |
+
in the Transformers library <d-cite bibtex-key="wolf2020huggingfaces"></d-cite> or
|
188 |
+
<a class="link" href="https://discuss.huggingface.co/t/how-to-implement-trainers-group-by-length-in-pytorch/9232">by retrieving this sampler</a> for PyTorch)<br>
|
189 |
+
• concatenate different examples in a custom DataCollator.</p>
|
190 |
+
<div class="tip"><p>We have opted for the second option and refer the reader back to the
|
191 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/data/data_collator_ul2.py">code</a> our DataCollator.</p></div>
|
192 |
+
<div class="note"><p>More optimised heuristics probably need to be put in place.
|
193 |
+
We carried out a test by proposing a
|
194 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/data/data_collator_ul2.py#L45">function</a>
|
195 |
+
in the <code>DataCollator</code> to sort <code>input_ids</code> and <code>labels</code> by descending length.
|
196 |
+
However, this is rather time-consuming for a minimal packing gain.
|
197 |
+
More work needs to be done on this point.
|
198 |
+
</p></div>
|
199 |
+
</ul>
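<p class="width_125">As a rough illustration of the concatenation idea announced above (a hypothetical helper, not our actual implementation, which additionally handles the UL2 denoisers and the labels):</p>
<pre><code class="lang-py">
def pack_examples(examples, max_len, pad_id=0):
    """Greedily concatenate tokenised examples so that each packed row stays close to max_len."""
    rows, current = [], []
    for ex in examples:              # ex: list of token ids
        if current and len(current) + len(ex) > max_len:
            rows.append(current)
            current = []
        current += ex[:max_len]
    if current:
        rows.append(current)
    # dynamic padding: pad to the longest packed row of the batch, not to a fixed value
    width = max(len(r) for r in rows)
    return [r + [pad_id] * (width - len(r)) for r in rows]
</code></pre>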
|
200 |
+
<p class="width_125"><br><br></p>
|
201 |
+
|
202 |
+
|
203 |
+
<h3 id="optimisation-de-la-bande-passante-de-la-m-moire-du-gpu">GPU memory bandwidth optimisation</h3>
|
204 |
+
<p class="width_125">Memory bandwidth limitation is more difficult to deal with.
|
205 |
+
A memory-limited operation is one whose overall execution time is restricted by memory accesses.
|
206 |
+
This is particularly the case for LLMs, especially at the inference level.
|
207 |
+
The diagnosis can be made from the <a class="link" href="https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html">PyTorch profiler</a>:</p>
|
208 |
+
<figure class="width_125">
|
209 |
+
<img src="https://pytorch.org/tutorials/_static/img/profiler_overview1.png" alt="profiler_overview1.png" width="100%">
|
210 |
+
<figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_overview1.png">https://pytorch.org/tutorials/_static/img/profiler_overview1.png</a></i></center></figcaption>
|
211 |
+
</figure>
|
212 |
+
<br><br><br>
|
213 |
+
<p class="width_125">Another way of establishing a diagnosis is to use a simple <code>nvidia-smi</code>:</p>
|
214 |
+
<figure class="width_125">
|
215 |
+
<img src="./assets/nvidiasmi.png" alt="nvidiasmi.png" width="100%">
|
216 |
+
</figure>
|
217 |
+
<br>
|
218 |
+
<p class="width_125">Useful for finding out if a problem is present, but gives limited information about the nature of the problem.
|
219 |
+
That's why we prefer the profiler.</p>
|
220 |
+
<p><br></p>
|
221 |
+
<h4 id="noyau-cuda">CUDA kernel</h4>
|
222 |
+
<p class="width_125">The main technique for optimising GPU memory bandwidth is to develop a CUDA kernel that merges several limiting operations into SRAM.
|
223 |
+
This avoids copying large matrices to the HBM only to immediately reload them into SRAM.
|
224 |
+
This is now a common feature of decoder transformers thanks to the <a class="link" href="https://github.com/Dao-AILab/flash-attention">Flash Attention</a>.</p>
|
225 |
+
<div class="tip"><p>
|
226 |
+
As Flash Attention does not handle the (additive) attention biases of the T5, we extended it by developing a custom CUDA kernel.
|
227 |
+
As mentioned in the introduction, we actually implemented two successive versions of this kernel.
|
228 |
+
Without going into the details of the 650 lines of code in the implementation of the first version (which can be consulted
|
229 |
+
<a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a>),
|
230 |
+
the general and simplified idea (for a forward pass) is as follows:</p>
|
231 |
+
<ul>
|
232 |
+
<li>The expected output O, initialised with 0's, is loaded from the HBM to the SRAM, as well as the query Q, the key K, the value V and the biases B.</li>
|
233 |
+
<li>Our CUDA kernel calculates the following steps:<br>
|
234 |
+
• Compute the matrix S using the matrix product of Q and the transpose of K<br>
|
235 |
+
• Compute S', which is the sum of the S matrix and the bias matrix B<br>
|
236 |
+
• Compute P, which is the softmax of S' (computed online, block by block, under the hood)<br>
|
237 |
+
• Compute the output O, which is the matrix product of P and V<br>
|
238 |
+
</li>
|
239 |
+
<li>Output O is loaded on the HBM and the SRAM is cleared.
|
240 |
+
<picture>
|
241 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/FAT5_dark.gif">
|
242 |
+
<img alt="FAT5 animation" src="./assets/FAT5.gif" width="100%">
|
243 |
+
</picture>
|
244 |
+
</ul>
|
245 |
+
<br>
|
246 |
+
<p>While the first version of the kernel is generic, the second (available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>)
|
247 |
+
is specific to the working of models with relative positional encoding (which is the case of the T5).
|
248 |
+
The general and simplified idea (for a forward pass) is as follows:</p>
|
249 |
+
<ul>
|
250 |
+
<li>In the HBM, we have the expected output O initialised with 0s, as well as the query Q, the key K and the value V.
|
251 |
+
However, instead of the bias matrix B as before, we have the bucket of tensors T.</li>
|
252 |
+
<li>O is loaded from the HBM to the SRAM, along with the query Q, the key K, the value V and the tensor bucket T.</li>
|
253 |
+
<li>Our CUDA kernel calculates the following steps:<br>
|
254 |
+
• Compute the matrix S using the matrix product of Q and the transpose of K<br>
|
255 |
+
• Compute S', which is the sum of the matrix S and a matrix filled with the elements of T<br>
|
256 |
+
• Compute P, which is the softmax of S' (computed online, block by block, under the hood)<br>
|
257 |
+
• Compute the output O, which is the matrix product of P and V<br>
|
258 |
+
</li>
|
259 |
+
<li>Output O is loaded on the HBM and the SRAM is cleared.
|
260 |
+
</ul>
|
261 |
+
<p>
|
262 |
+
In this way, whereas the first version required materialising the bias matrix B in quadratic memory,
|
263 |
+
here we are back to linear memory, enabling inference on tens of thousands of tokens.<br>
|
264 |
+
To design this second version, we were inspired by TurboT5's Triton kernel, which we ported to CUDA and extended to full BF16.
|
265 |
+
</p>
|
266 |
+
</div>
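<p class="width_125">For reference, the computation that these two kernels fuse corresponds to the following naive PyTorch code, which materialises the full attention and bias matrices in HBM; this is precisely what the fused kernels avoid:</p>
<pre><code class="lang-py">
import torch

def t5_attention_reference(q, k, v, bias):
    # q, k, v: (batch, heads, seq, head_dim); bias: broadcastable to (batch, heads, seq, seq)
    s = torch.matmul(q, k.transpose(-1, -2))                 # S = Q K^T (no 1/sqrt(d) scaling in T5)
    s_prime = s + bias                                       # S' = S + B (additive bias)
    p = torch.softmax(s_prime.float(), dim=-1).to(q.dtype)   # P = softmax(S')
    return torch.matmul(p, v)                                # O = P V
</code></pre>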
|
267 |
+
<br>
|
268 |
+
|
269 |
+
<div class="tip"><p>Note that the two versions developed can be used with several positional encodings.<br>
|
270 |
+
We invite the reader to consult this <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/positional_encoding.py">file</a>
|
271 |
+
containing classes compatible with Flash Attention for the
|
272 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L10">RelativePositionalEncoding</a>
|
273 |
+
<d-cite bibtex-key="shaw2018selfattention"></d-cite>,
|
274 |
+
the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L113">ALiBiPositionalEncoding</a>
|
275 |
+
<d-cite bibtex-key="press2022train"></d-cite>,
|
276 |
+
the <a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L205">RotaryPositionalEncoding</a>
|
277 |
+
<d-cite bibtex-key="su2023roformer"></d-cite> and
|
278 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/dfe10d498ae0b39082182f807acb509e91992360/src/utils/positional_encoding.py#L341">FIRE</a> <d-cite bibtex-key="li2024functional"></d-cite>.</p></div>
|
279 |
+
|
280 |
+
<div class="note"><p>At the time of writing, the two pull requests (one for each kernel version,
|
281 |
+
available <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/617">here</a>
|
282 |
+
and <a class="link" href="https://github.com/Dao-AILab/flash-attention/pull/956">here</a>)
|
283 |
+
opened on the official Flash Attention repository have not been merged.
|
284 |
+
Readers will therefore have to temporarily recompile our custom Flash Attention patches to be able to use our models.<br>
|
285 |
+
Readers are invited to consult the Benchmark section further below to see the improvements brought by these two kernels.</p></div>
|
286 |
+
<br>
|
287 |
+
<div class="note"><p>Although we didn't use them, it should be noted that some libraries contain merged implementations of common operators, for example Apex <d-cite bibtex-key="nvidiapex"></d-cite>.</p></div>
|
288 |
+
|
289 |
+
<p><br></p>
|
290 |
+
<h4 id="noyau-triton">Triton kernel</h4>
|
291 |
+
<p class="width_125">Triton <d-cite bibtex-key="10.1145/3315508.3329973"></d-cite> is a maintained programming language that allows Python code to be compiled efficiently, like CUDA, but with the advantage of being (from our point of view) easier to learn. Unlike CUDA, which requires an in-depth understanding of GPU hardware architecture, Triton ignores many low-level details such as memory coalescing, shared memory management and scheduling within CUDA thread blocks.</p>
|
292 |
+
|
293 |
+
<div class="tip"><p>A Triton implementation of the
|
294 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/flash_attention_v2_bias.py">Flash Attention 2 managing attention bias</a>
|
295 |
+
is provided for those who do not wish to recompile a custom patch for Flash Attention.
|
296 |
+
To do this, we based ourselves on the FlagAttention repository <d-cite bibtex-key="flagattention"></d-cite>.
|
297 |
+
<br>
|
298 |
+
<br>
|
299 |
+
In addition to this implementation (whose use is optional), other parts of the architecture have been optimised using ad hoc Triton kernels, namely:
|
300 |
+
<br>
|
301 |
+
• the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/cross_entropy_loss.py">cross entropy loss</a> (and the loss z <d-cite bibtex-key="debrébisson2016zloss"></d-cite>) <br>
|
302 |
+
• the <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/ops/rms_norm.py">RMSNorm layer</a> <d-cite bibtex-key="zhang2019root"></d-cite> <br>
|
303 |
+
<br>
|
304 |
+
We drew inspiration from <a class="link" href="https://github.com/unslothai/unsloth">Unsloth</a> <d-cite bibtex-key="unsloth"></d-cite>.<br>
|
305 |
+
<br>
|
306 |
+
Readers are invited to refer to the Benchmark section below to see the impact of this optimisation.</div>
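<p class="width_125">To give an idea of what such a kernel looks like, here is a minimal RMSNorm forward pass written in Triton. It is a simplified sketch in the spirit of our kernel (forward pass only, one program per row), not the exact code we use:</p>
<pre><code class="lang-py">
import torch
import triton
import triton.language as tl

@triton.jit
def rmsnorm_fwd(X, W, Y, stride, N, eps, BLOCK_SIZE: tl.constexpr):
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE)
    mask = cols &lt; N
    x = tl.load(X + row * stride + cols, mask=mask, other=0.0).to(tl.float32)
    ms = tl.sum(x * x, axis=0) / N              # mean of squares over the hidden dimension
    rstd = 1.0 / tl.sqrt(ms + eps)
    w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
    tl.store(Y + row * stride + cols, x * rstd * w, mask=mask)

def rmsnorm(x, weight, eps=1e-6):
    x2d = x.reshape(-1, x.shape[-1])
    y = torch.empty_like(x2d)
    M, N = x2d.shape
    rmsnorm_fwd[(M,)](x2d, weight, y, x2d.stride(0), N, eps,
                      BLOCK_SIZE=triton.next_power_of_2(N))
    return y.reshape_as(x)
</code></pre>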
|
307 |
+
|
308 |
+
<p><br></p>
|
309 |
+
<h4 id="utiliser-torch-compile-">Use <code>torch.compile</code></h4>
|
310 |
+
<p class="width_125">A simpler approach is to compile the models with <code>torch.compile</code>.
|
311 |
+
PyTorch then takes care of performing the possible fusions, possibly by reordering operations.
|
312 |
+
This involves hunting down breaks in the compilation graph, i.e. fallbacks to eager execution that hurt performance.</p>
|
313 |
+
<div class="note"><p>See the <a class="link" href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">official documentation</a> for more details.</p></div>
|
314 |
+
<p class="width_125">Another possibility is to use both a custom kernel and <code>torch.compile</code>.
|
315 |
+
The implementation of this option has been greatly simplified since the
|
316 |
+
<a class="link" href="https://github.com/pytorch/pytorch/releases/tag/v2.4.0">version 2.4 of PyTorch</a>.</p>
|
317 |
+
<p class="width_125">Readers are invited to refer to the benchmark section at the end of the article to measure the memory performance
|
318 |
+
performance of the various techniques described.</p>
|
319 |
+
<p class="width_125"><br><br></p>
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
<h3 id="optimisation-de-l-utilisation-des-tensor-cores">Optimisation of the use of Tensor Cores</h3>
|
324 |
+
<p class="width_125">Recent GPUs have units dedicated to tensorial operations: the TensorCore. Using them correctly is essential.</p>
|
325 |
+
<p class="width_125">Once again, to establish a diagnosis, it is advisable to refer to the PyTorch profiler, which indicates the proportion of TensorCore used for each CUDA kernel:</p>
|
326 |
+
<p><figure class="width_125">
|
327 |
+
<img src="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png" alt="profiler_kernel_view.png" width="100%">
|
328 |
+
<figcaption><center><i>Source: <a class="link" href="https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png">https://pytorch.org/tutorials/_static/img/profiler_kernel_view.png</a></i></center></figcaption>
|
329 |
+
</figure>
|
330 |
+
<br><br>
|
331 |
+
<p class="width_125">The optimisations that can be made are:<br></p>
|
332 |
+
<h4 id="puissances-de-2">Use multiples of 8 or 64</h4>
|
333 |
+
<p class="width_125">The first is to use tensor sizes that are multiples of 8 or 64.
|
334 |
+
Please refer to the Nvidia documentation,
|
335 |
+
in particular this <a class="link" href="https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc">article</a>
|
336 |
+
and this <a class="link" href="https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/">article</a>
|
337 |
+
to determine the multiple to select according to the desired precision.</p>
|
338 |
+
<div class="tip"><p>With this in mind, we trained a tokenizer of size 32 768 (8**5),
|
339 |
+
following <a class="link" href="https://twitter.com/karpathy/status/1621578354024677377">this observation by KARPATHY</a>.
|
340 |
+
This is a BPE tokenizer <d-cite bibtex-key="sennrich2016neuralmachinetranslationrare"></d-cite> trained on CulturaX and The Stack, using 256 extra tokens, and numbers are separated.<br>
|
341 |
+
Readers can find the code used <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/examples/fat5-fr/train_tokenizer.py">here</a>.
|
342 |
+
</p></div>
|
343 |
+
<p><br></p>
|
344 |
+
|
345 |
+
<h4 id="utiliser-le-bon-optimiseur">Use the right optimiser</h4>
|
346 |
+
<p class="width_125">Changing optimisers from the initial implementation of the model can be a good way of speeding up convergence of the model (although it may prevent the results of the original paper from being reproduced).<br>
|
347 |
+
Optimisers speed up convergence by allowing large batch sizes, as in the case of LAMB <d-cite bibtex-key="you2020large"></d-cite>
|
348 |
+
or the use of higher learning rates such as Sophia <d-cite bibtex-key="liu2024sophia"></d-cite>.<br>
|
349 |
+
More efficient versions of the optimisers can also be used, such as the <code>fused</code> option
|
350 |
+
in the <a class="link" href="https://pytorch.org/docs/stable/generated/torch.optim.Adam.html">Adam optimiser</a> available in PyTorch
|
351 |
+
or the optimisers available in <a class="link" href="https://github.com/NVIDIA/apex">Apex</a>.</p>
|
352 |
+
<div class="tip"><p>
|
353 |
+
We used the original T5 optimiser, <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">AdamWScale</a>.
|
354 |
+
For hyperparameter values, we use <code>lr = 5e-3</code>, <code>betas = (0.9, 0.999)</code>, <code>eps = 1e-6</code> and <code>weight_decay = 0.0</code>
|
355 |
+
based on the observations of <a class="link" href="https://github.com/PiotrNawrot/nanoT5/issues/25#issuecomment-1922731400">Wilson Wongso</a>.
|
356 |
+
Indeed, it turns out that not all the alternative optimisers tested converged.</p></div>
|
357 |
+
<div class="note"><p>We have added the parameter <code>foreach</code> in our version of AdamWScale.</p>
|
358 |
+
</div>
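<p class="width_125">As an illustration of the <code>fused</code> option mentioned above (purely illustrative: we actually use our AdamWScale rather than <code>torch.optim.AdamW</code>, and the hyperparameter values are simply the ones quoted in the tip):</p>
<pre><code class="lang-py">
import torch

optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-3, betas=(0.9, 0.999),
    eps=1e-6, weight_decay=0.0,
    fused=True,  # use the fused CUDA implementation of the update step
)
</code></pre>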
|
359 |
+
<p><br></p>
|
360 |
+
|
361 |
+
<h4 id="entra-ner-ses-mod-les-en-bf16-ou-fp16-">Training models in <code>bf16</code></h4>
|
362 |
+
<p class="width_125">Recent GPUs make it possible to full exploit the use of reduced precision
|
363 |
+
(enabling a gain of a factor of 2 in throughput compared to the <code>fp32</code> precision).
|
364 |
+
<code>bf16</code> is only available on Ampere or more recent architectures, but makes it possible to avoid the loss scaling
|
365 |
+
<d-cite bibtex-key="micikevicius2018mixed"></d-cite> method which is generally necessary in <code>fp16</code>
|
366 |
+
thanks to a wider dynamic range (the exponent is coded on 8 bits like the <code>fp32</code>).</p>
|
367 |
+
<div class="tip"><p>With this in mind, we train our models in <code>bf16</code>.
|
368 |
+
More specifically, while at the beginning of our experiments we used <code>bf16-mixed</code>, we have used the
|
369 |
+
<a class="link" href="https://en.wikipedia.org/wiki/Kahan_summation_algorithm">Kahan summation algorithm</a>
|
370 |
+
so that we can use <code>full bf16</code> in our optimizer.<br>
|
371 |
+
Once again, the code for our optimizer is accessible <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/utils/adamw_scaled.py">here</a>.
|
372 |
+
</p></div>
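<p class="width_125">The idea behind the Kahan summation is to carry over, in an extra buffer, the part of each update that is lost to <code>bf16</code> rounding. A minimal sketch of the principle (not our actual optimizer code):</p>
<pre><code class="lang-py">
import torch

@torch.no_grad()
def kahan_update_(param, update, compensation):
    """Apply `param += update` in low precision while keeping track of rounding errors."""
    corrected = update + compensation            # re-inject what was lost at previous steps
    new_param = param + corrected
    # store the part of `corrected` that did not survive the rounding of `new_param`
    compensation.copy_(corrected - (new_param - param))
    param.copy_(new_param)
</code></pre>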
|
373 |
+
<p><br></p>
|
374 |
+
|
375 |
+
<h4 id="utiliser-moins-de-m-moire-du-gpu">Use less GPU memory</h4>
|
376 |
+
<p class="width_125">Certain techniques exist to limit the use of GPU memory by the model, such as the
|
377 |
+
<a class="link" href="https://pytorch.org/docs/stable/checkpoint.html">gradient checkpointing</a>
|
378 |
+
or ZeRO-type methods <d-cite bibtex-key="rajbhandari2020zero"></d-cite> implemented in
|
379 |
+
<a class="link" href="https://github.com/microsoft/DeepSpeed">DeepSpeed</a>.
|
380 |
+
By limiting the amount of memory used, larger batch sizes can be used to speed up model training.</p>
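<p class="width_125">For instance, with a model implemented in Transformers, activation checkpointing can usually be enabled with a single call (whether it applies depends on the model implementation):</p>
<pre><code class="lang-py">
# trade compute for memory: activations are recomputed during the backward pass
model.gradient_checkpointing_enable()
</code></pre>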
|
381 |
+
<p class="width_125"><br><br></p>
|
382 |
+
|
383 |
+
|
384 |
+
<h3 id="autres">Other</h3>
|
385 |
+
<h4 id="le-parall-lisme">Parallelism</h4>
|
386 |
+
<p class="width_125">Using several GPUs is tricky.
|
387 |
+
Done naively, it can result in lower performance than implementation on a single GPU, wasting computing resources.
|
388 |
+
This is particularly the case when bottlenecks occur in communications between GPUs.
|
389 |
+
The aim is to ensure that the model is not limited by the bandwidth between the cards, or to ensure that the cards are connected with sufficient
|
390 |
+
bandwidths via techniques such as <a class="link" href="https://en.wikipedia.org/wiki/NVLink">NVLink</a> for example. </p>
|
391 |
+
<p class="width_125">It should also be noted that optimisation techniques generally require all the GPUs to be synchronised at the end of a batch.
|
392 |
+
As a result, if one GPU is slower than the others (or is being used by another process), the model is limited to the speed of the slowest GPU in the group. </p>
|
393 |
+
<div class="note"><p>
|
394 |
+
Having pre-trained our model on a single 80GB A100, we were unable to experiment with parallelism.</p>
|
395 |
+
</div>
|
396 |
+
<p><br></p>
|
397 |
+
|
398 |
+
<h4 id="les-t-tes-pour-le-finetuning">Finetuning heads</h4>
|
399 |
+
<p class="width_125">We looked at the elements listed above with a view to optimising the pre-training of our model.
|
400 |
+
In practice, we then need to fine-tune it to specialise on the final tasks that interest us.
|
401 |
+
To do this, we use heads. For the <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5">vanilla T5</a>,
|
402 |
+
five are available in Transformers to perform all feasible tasks:
|
403 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForConditionalGeneration"><code>T5ForConditionalGeneration</code></a>,
|
404 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForSequenceClassification"><code>T5ForSequenceClassification</code></a>,
|
405 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForTokenClassification"><code>T5ForTokenClassification</code></a>,
|
406 |
+
<a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5ForQuestionAnswering"><code>T5ForQuestionAnswering</code></a>
|
407 |
+
et <a class="link" href="https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel"><code>T5EncoderModel</code></a>.<br><br>
|
408 |
+
Here again, optimisation work can be carried out.<br>
|
409 |
+
For conditional generation, the main point is to ensure that the generation process is efficient.<br>
|
410 |
+
For heads involved in classification tasks (sequence, NER and QA), it is necessary to ensure that the encoder part
|
411 |
+
of the T5 is used, since the decoder is not essential for these tasks, as shown in EncT5 <d-cite bibtex-key="liu2022enct5"></d-cite>.
|
412 |
+
The decoder weights take up unnecessary memory space, and the execution time of the finetuning code is doubled unnecessarily.<br>
|
413 |
+
The last head is simply used to retain only the encoder part of an encoder-decoder model. It therefore does not need to be optimised.</p>
|
414 |
+
<div class="tip"><p>
|
415 |
+
Regarding the <code>ForConditionalGeneration</code> head, our
|
416 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/684d02640464ea8bd2339689ce37da2d4e3b5f0b/src/model/modeling_flash_t5.py#L593">implementation</a>
|
417 |
+
is based on the generation process available in the
|
418 |
+
<a class="link" href="https://github.com/PiotrNawrot/nanoT5/blob/1c82d67bf8dea635be68a3b2a68a43b68b665193/nanoT5/utils/t5_model.py#L407">nanoT5</a>
|
419 |
+
because it is 14% faster than the Hugging Face implementation.<br>
|
420 |
+
For classification heads, the implementation is available in this
|
421 |
+
<a class="link" href="https://github.com/catie-aq/flashT5/blob/main/src/model/custom_heads_flash_t5.py">file</a>.
|
422 |
+
This file is separate from the modelling file because our implementations differ from those available in Transformers.
|
423 |
+
Indeed, heads <code>T5ForSequenceClassification</code> and <code>T5ForQuestionAnswering</code> available in Transformers are based
|
424 |
+
on the T5 encoder and decoder, which is inefficient.
|
425 |
+
We therefore recoded these two heads to use only the encoder.
|
426 |
+
We then followed the same structure as the <code>T5ForTokenClassification</code> head available in Transformers,
|
427 |
+
which also only uses the encoder, and which we therefore used as is.</p>
|
428 |
+
</div>
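<p class="width_125">Schematically, an encoder-only classification head in the spirit of EncT5 looks like the following (a simplified sketch with mean pooling; our actual heads may differ in the details):</p>
<pre><code class="lang-py">
import torch
import torch.nn as nn

class EncoderOnlySequenceClassifier(nn.Module):
    def __init__(self, encoder, hidden_size, num_labels):
        super().__init__()
        self.encoder = encoder                       # e.g. the encoder stack of a T5
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        # mean-pool over non-padding positions only
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return self.classifier(pooled)
</code></pre>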
|
429 |
+
|
430 |
+
|
431 |
+
<p class="width_125"><br><br><br></p>
|
432 |
+
<h2 id="benchmark">Benchmark</h2>
|
433 |
+
|
434 |
+
<h3 id="TFLOPS">TFLOPS</h3>
|
435 |
+
<p class="width_125">
|
436 |
+
The number of TFLOPS (trillions of floating-point operations per second) is probably the most telling metric to demonstrate the impact of the optimisations carried out.<br>
|
437 |
+
We compare four approaches:<br>
|
438 |
+
• the SDPA (Scaled Dot Product Attention) implementation with the full bias matrix,<br>
|
439 |
+
• the same implementation but in Triton,<br>
|
440 |
+
• the Flash Attention RPE implementation, i.e. the second kernel we developed (can be seen as TurboT5 but in C++/CUDA),<br>
|
441 |
+
• the Flash Attention implementation, i.e. without bias. We've included it for reference, but it's unusable in practice for a T5.<br>
|
442 |
+
<br>
|
443 |
+
For the forward pass, we have:
|
444 |
+
</p>
|
445 |
+
<p class="width_125">
|
446 |
+
<picture>
|
447 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/FWD-causal-True_dark.png" width="100%">
|
448 |
+
<img alt="Benchmark memory backward pass" src="./assets/FWD-causal-True.png" width="100%">
|
449 |
+
</picture>
|
450 |
+
|
451 |
+
<div class="width_125"><p>For the forward pass, the Triton approach achieves 1.34 times more FLOPS than the SPDA approach, while the Flash Attention RPE approach achieves 1.99 times more FLOPS than the SPDA approach.<br>
|
452 |
+
We can also see that our bf16 implementation is equivalent to fp16 (doing even better at size 512).<br>
|
453 |
+
Following this benchmark, we decided to train our French model in bf16, head_dim = 128 and with a sequence of 1024.</p></div>
|
454 |
+
|
455 |
+
<br>
|
456 |
+
<p class="width_125">
|
457 |
+
For the backward pass, we have:
|
458 |
+
</p>
|
459 |
+
|
460 |
+
<p class="width_125">
|
461 |
+
<picture>
|
462 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/BWD-causal-True_dark.png" width="100%">
|
463 |
+
<img alt="Benchmark memory backward pass" src="./assets/BWD-causal-True.png" width="100%">
|
464 |
+
</picture>
|
465 |
+
|
466 |
+
<div class="width_125"><p>For the backward pass, the Triton implementation performed worse than SPDA, with 0.71 times the FLOPS of SPDA. The Flash Attention RPE implementation is more or less equivalent to SPDA (1.018 times more FLOPS).<br>
|
467 |
+
We can also observe that Triton in head_dim 64 is more efficient than Triton in head_dim 128.</p></div>
|
468 |
+
|
469 |
+
<p><br></p>
|
470 |
+
<h4 id="torchvstriton">Torch vs Triton</h4>
|
471 |
+
<p class="width_125">
|
472 |
+
We mentioned previously that we had optimised parts of the architecture using ad hoc Triton kernels, namely the cross-entropy loss and the RMSNorm layer.
|
473 |
+
The following benchmarks should illustrate why.<br>
|
474 |
+
|
475 |
+
For cross-entropy, we get a forward pass 7 to 11.4 times faster, a backward pass 3.26 to 3.75 times faster, and memory reduced by a factor of 4:</p>
|
476 |
+
|
477 |
+
<p class="width_125">
|
478 |
+
<picture>
|
479 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/CE_dark.png" width="100%">
|
480 |
+
<img alt="Benchmark memory backward pass" src="./assets/CE.png" width="100%">
|
481 |
+
</picture>
|
482 |
+
|
483 |
+
<p class="width_125">
|
484 |
+
For the RMSNorm layer, we get a forward pass 3 to 5 times faster, a backward pass 2.33 to 4.33 times faster, and memory reduced by a factor of 3.2:</p>
|
485 |
+
|
486 |
+
<p class="width_125">
|
487 |
+
<picture>
|
488 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/LN_dark.png" width="100%">
|
489 |
+
<img alt="Benchmark memory backward pass" src="./assets/BLN.png" width="100%">
|
490 |
+
</picture>
|
491 |
+
|
492 |
+
|
493 |
+
<p class="note">
|
494 |
+
Note that all the benchmark graphs can be generated automatically using the following <a href="https://github.com/catie-aq/flashT5/tree/main/benchmarks">code</a>.
|
495 |
+
</p>
|
496 |
+
|
497 |
+
|
498 |
+
<p><br><br></p>
|
499 |
+
<h3 id="mod-le-en-fran-ais">Model in French</h3>
|
500 |
+
<p class="width_125">We applied our work to French by pre-training a 147M parameter model. <br>
|
501 |
+
The dataset we used is a mixture of CulturaX, Wikipedia, justice_fr and The Stack. <br>
|
502 |
+
Our tokenizer of size 32,768 (8**5) is trained on CulturaX and The Stack.<br>
|
503 |
+
Our model is pre-trained on a sequence of 1,024 tokens.</p>
|
504 |
+
|
505 |
+
<p class="width_125">
|
506 |
+
We wanted to compare the performance of our model with other previously published French-language models, such as CamemBERT <d-cite bibtex-key="Martin_2020"></d-cite> for classification tasks and BARThez <d-cite bibtex-key="eddine2021barthez"></d-cite> for generation tasks.<br>
|
507 |
+
For this reason, we thought it important to make comparisons with an equivalent number of tokens seen.
|
508 |
+
We therefore tried to estimate the number of tokens seen by these two models using the formula number of steps × sequence size × batch size. We couldn't find the necessary information in the BARThez publication. For CamemBERT, we estimate a maximum of 419.4B tokens. The actual figure could be lower, as we don't know the number of padding tokens seen by this model (whereas in our case, we don't use any). We therefore pre-trained our model on the maximum number of tokens seen by CamemBERT.<br></p>
|
509 |
+
|
510 |
+
<p><br></p>
|
511 |
+
|
512 |
+
<p class="width_125">
|
513 |
+
<picture>
|
514 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/loss_train.png" width="49%">
|
515 |
+
<img alt="Convergence masked accuracy FAT5" src="./assets/loss_train.png" width="49%">
|
516 |
+
</picture>
|
517 |
+
<picture>
|
518 |
+
<source media="(prefers-color-scheme: dark)" srcset="./assets/loss_eval.png" width="49%">
|
519 |
+
<img alt="Convergence masked accuracy FAT5" src="./assets/loss_eval.png" width="49%">
|
520 |
+
</picture>
|
521 |
+
</p>
|
522 |
+
|
523 |
+
<p><br></p>
|
524 |
+
|
525 |
+
<p class="width_125">
|
526 |
+
We were also interested in comparing our model against itself, i.e. we evaluated its performance on downstream tasks every 100,000 steps (~26 billion tokens) during pre-training.<br>
|
527 |
+
In the table below, we have listed the number of tokens equivalent to each interval of 100,000 steps.<br>
|
528 |
+
</p>
|
529 |
+
|
530 |
+
<table class="width_125">
|
531 |
+
<thead>
|
532 |
+
<tr>
|
533 |
+
<th>Model</th>
|
534 |
+
<th>Number of tokens ✝</th>
|
535 |
+
</tr>
|
536 |
+
</thead>
|
537 |
+
<tbody>
|
538 |
+
<tr>
|
539 |
+
<td>FAT5-small-100K</td>
|
540 |
+
<td>26,214,400,000 (100,000 × 1,024 × 256)</td>
|
541 |
+
</tr>
|
542 |
+
<tr>
|
543 |
+
<td>FAT5-small-200K</td>
|
544 |
+
<td>52,428,800,000 (200,000 × 1,024 × 256)</td>
|
545 |
+
</tr>
|
546 |
+
<tr>
|
547 |
+
<td>FAT5-small-300K</td>
|
548 |
+
<td>78,643,200,000 (300,000 × 1,024 × 256)</td>
|
549 |
+
</tr>
|
550 |
+
<tr>
|
551 |
+
<td>FAT5-small-400K</td>
|
552 |
+
<td>104,857,600,000 (400,000 × 1,024 × 256)</td>
|
553 |
+
</tr>
|
554 |
+
<tr>
|
555 |
+
<td>FAT5-small-500K</td>
|
556 |
+
<td>131,072,000,000 (500,000 × 1,024 × 256)</td>
|
557 |
+
</tr>
|
558 |
+
<tr>
|
559 |
+
<td>FAT5-small-600K</td>
|
560 |
+
<td>157,286,400,000 (600,000 × 1,024 × 256)</td>
|
561 |
+
</tr>
|
562 |
+
<tr>
|
563 |
+
<td>FAT5-small-700K</td>
|
564 |
+
<td>183,500,800,000 (700,000 × 1,024 × 256)</td>
|
565 |
+
</tr>
|
566 |
+
<tr>
|
567 |
+
<td>FAT5-small-800K</td>
|
568 |
+
<td>209,715,200,000 (800,000 × 1,024 × 256)</td>
|
569 |
+
</tr>
|
570 |
+
<tr>
|
571 |
+
<td>FAT5-small-900K</td>
|
572 |
+
<td>235,929,600,000 (900,000 × 1,024 × 256)</td>
|
573 |
+
</tr>
|
574 |
+
<tr>
|
575 |
+
<td>FAT5-small-1000K</td>
|
576 |
+
<td>262,144,000,000 (1,000,000 × 1,024 × 256)</td>
|
577 |
+
</tr>
|
578 |
+
<tr>
|
579 |
+
<td>FAT5-small-1100K</td>
|
580 |
+
<td>288,358,400,000 (1,100,000× 1,024 × 256)</td>
|
581 |
+
</tr>
|
582 |
+
<tr>
|
583 |
+
<td>FAT5-small-1200K</td>
|
584 |
+
<td>314,572,800,000 (1,200,000 × 1,024 × 256)</td>
|
585 |
+
</tr>
|
586 |
+
<tr>
|
587 |
+
<td>FAT5-small-1300K</td>
|
588 |
+
<td>340,787,200,000 (1,300,000 × 1,024 × 256)</td>
|
589 |
+
</tr>
|
590 |
+
<tr>
|
591 |
+
<td>FAT5-small-1400K</td>
|
592 |
+
<td>367,001,600,000 (1,400,000 × 1,024 × 256)</td>
|
593 |
+
</tr>
|
594 |
+
<tr>
|
595 |
+
<td>FAT5-small-1500K</td>
|
596 |
+
<td>393,216,000,000 (1,500,000 × 1,024 × 256)</td>
|
597 |
+
</tr>
|
598 |
+
<tr>
|
599 |
+
<td>FAT5-small-1600K</td>
|
600 |
+
<td>419,430,400,000 (1,600,000 × 1,024 × 256)</td>
|
601 |
+
</tr>
|
602 |
+
<tr>
|
603 |
+
<td><a class="link" href="https://hf.co/almanach/camembert-base">camembert (base ou large)</a></td>
|
604 |
+
<td>419,430,400,000 (100,000 × 512 × 8,192)</td>
|
605 |
+
</tr>
|
606 |
+
</tbody>
|
607 |
+
</table>
|
608 |
+
<p class="width_125">✝ equivalent to number of steps × sequence size × batch size</p>
|
609 |
+
|
610 |
+
|
611 |
+
<p><br></p>
|
612 |
+
<h4 id="finetuning">Finetuning</h4>
|
613 |
+
<p class="width_125">We focused on five tasks:<br>
|
614 |
+
• Summarising texts to illustrate the use of the head <code>T5ForConditionalGeneration</code>,<br>
|
615 |
+
• Binary classification to illustrate the use of the head <code>T5ForSequenceClassification</code>,<br>
|
616 |
+
• Named entity recognition to illustrate the use of the head <code>T5ForTokenClassification</code>,<br>
|
617 |
+
• Question answering to illustrate the use of the head <code>T5ForQuestionAnswering</code>.<br>
|
618 |
+
• Sentence similarity to illustrate the use of the head <code>T5EncoderModel</code>.</p>
|
619 |
+
|
620 |
+
<p class="width_125"> Classification tasks seem to us important to evaluate, as they are generally ignored by benchmarks of generative language models, even though they are often used in practice by companies (document retrieval, classification for customer reviews, data anonymization, etc.).
|
621 |
+
Witness the fact that, six and a half years after its release, BERT <d-cite bibtex-key="devlin2019bert"></d-cite> alone is still downloaded more times per month than the <a class="link" href="https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads">30 most downloaded text generation models</a> on Hugging Face at the time of writing: 38.5M versus 31.3M.</p>
|
622 |
+
|
623 |
+
<p class="width_125">In the following tables, we underline for FAT5 the line with the best result for each task. We interpret the results of the generation part after the text summarization table. The classification results are interpreted after the binary classification, QA, NER and sentence-similarity tables.</p>
|
624 |
+
|
625 |
+
<p><br></p>
|
626 |
+
<h5>Summarization</h5>
|
627 |
+
<p class="width_125">For this task, we used the dataset <a class="link" href="https://huggingface.co/datasets/orange_sum">orange_sum</a><d-cite bibtex-key="eddine2021barthez"></d-cite>.</p>
|
628 |
+
<table class="width_125">
|
629 |
+
<thead>
|
630 |
+
<tr>
|
631 |
+
<th>Model</th>
|
632 |
+
<th>ROUGE-1</th>
|
633 |
+
<th>ROUGE-2</th>
|
634 |
+
<th>ROUGE-L</th>
|
635 |
+
</tr>
|
636 |
+
</thead>
|
637 |
+
<tbody>
|
638 |
+
<tr>
|
639 |
+
<td>FAT5-small-100K (147M)</td>
|
640 |
+
<td>28.17</td>
|
641 |
+
<td>10.60</td>
|
642 |
+
<td>20.62</td>
|
643 |
+
</tr>
|
644 |
+
<tr>
|
645 |
+
<td>FAT5-small-200K (147M)</td>
|
646 |
+
<td>28.72</td>
|
647 |
+
<td>10.86</td>
|
648 |
+
<td>20.68</td>
|
649 |
+
</tr>
|
650 |
+
<tr>
|
651 |
+
<td>FAT5-small-300K (147M)</td>
|
652 |
+
<td>28.76</td>
|
653 |
+
<td>10.85</td>
|
654 |
+
<td>20.63</td>
|
655 |
+
</tr>
|
656 |
+
<tr>
|
657 |
+
<td>FAT5-small-400K (147M)</td>
|
658 |
+
<td>28.59</td>
|
659 |
+
<td>10.76</td>
|
660 |
+
<td>20.60</td>
|
661 |
+
</tr>
|
662 |
+
<tr>
|
663 |
+
<td>FAT5-small-500K (147M)</td>
|
664 |
+
<td>28.98</td>
|
665 |
+
<td>10.97</td>
|
666 |
+
<td>20.72</td>
|
667 |
+
</tr>
|
668 |
+
<tr>
|
669 |
+
<td>FAT5-small-600K (147M)</td>
|
670 |
+
<td>29.04</td>
|
671 |
+
<td>11.20</td>
|
672 |
+
<td>20.89</td>
|
673 |
+
</tr>
|
674 |
+
<tr>
|
675 |
+
<td>FAT5-small-700K (147M)</td>
|
676 |
+
<td>28.72</td>
|
677 |
+
<td>10.87</td>
|
678 |
+
<td>20.77</td>
|
679 |
+
</tr>
|
680 |
+
<tr>
|
681 |
+
<td>FAT5-small-800K (147M)</td>
|
682 |
+
<td>29.00</td>
|
683 |
+
<td>10.91</td>
|
684 |
+
<td>20.78</td>
|
685 |
+
</tr>
|
686 |
+
<tr>
|
687 |
+
<td>FAT5-small-900K (147M)</td>
|
688 |
+
<td>29.30</td>
|
689 |
+
<td>11.34</td>
|
690 |
+
<td>21.22</td>
|
691 |
+
</tr>
|
692 |
+
<tr>
|
693 |
+
<td>FAT5-small-1000K (147M)</td>
|
694 |
+
<td>29.10</td>
|
695 |
+
<td>11.21</td>
|
696 |
+
<td>21.08</td>
|
697 |
+
</tr>
|
698 |
+
<tr>
|
699 |
+
<td>FAT5-small-1100K (147M)</td>
|
700 |
+
<td>29.43</td>
|
701 |
+
<td>11.40</td>
|
702 |
+
<td>21.15</td>
|
703 |
+
</tr>
|
704 |
+
<tr>
|
705 |
+
<td>FAT5-small-1200K (147M)</td>
|
706 |
+
<td>29.30</td>
|
707 |
+
<td>11.38</td>
|
708 |
+
<td>21.18</td>
|
709 |
+
</tr>
|
710 |
+
<tr>
|
711 |
+
<td>FAT5-small-1300K (147M)</td>
|
712 |
+
<td>29.38</td>
|
713 |
+
<td>11.38</td>
|
714 |
+
<td>21.18</td>
|
715 |
+
</tr>
|
716 |
+
<tr>
|
717 |
+
<td>FAT5-small-1400K (147M)</td>
|
718 |
+
<td>29.29</td>
|
719 |
+
<td>11.18</td>
|
720 |
+
<td>21.14</td>
|
721 |
+
</tr>
|
722 |
+
<tr>
|
723 |
+
<td>FAT5-small-1500K (147M)</td>
|
724 |
+
<td><u>29.48</u></td>
|
725 |
+
<td><u>11.48</u></td>
|
726 |
+
<td><u>21.22</u></td>
|
727 |
+
</tr>
|
728 |
+
<tr>
|
729 |
+
<td>FAT5-small-1600K (147M)</td>
|
730 |
+
<td>29.30</td>
|
731 |
+
<td>11.27</td>
|
732 |
+
<td>21.10</td>
|
733 |
+
</tr>
|
734 |
+
<tr>
|
735 |
+
<td><a class="link" href="https://huggingface.co/moussaKam/barthez">Barthez<d-cite bibtex-key="eddine2021barthez"></d-cite></a> (165M)</td>
|
736 |
+
<td>31.44</td>
|
737 |
+
<td>12.77</td>
|
738 |
+
<td>22.23</td>
|
739 |
+
</tr>
|
740 |
+
<tr>
|
741 |
+
<td><a class="link" href="https://huggingface.co/moussaKam/mbarthez">mBarthez</a> (458M)</td>
|
742 |
+
<td>32.67</td>
|
743 |
+
<td>13.73</td>
|
744 |
+
<td>23.18</td>
|
745 |
+
</tr>
|
746 |
+
</tbody>
|
747 |
+
</table>
|
748 |
+
|
749 |
+
<p><br></p>
|
750 |
+
<p class="width_125">We can see that our model performs worse than the Barthez. We can put forward a few hypotheses on this subject. <br>
|
751 |
+
Firstly, it's likely that our text generation process is not optimal. Not knowing the one used by the Barthez, we simply used the default parameters of the <a class="link" href="https://github.com/huggingface/transformers/blob/241c04d36867259cdf11dbb4e9d9a60f9cb65ebc/src/transformers/generation/utils.py#L1905">generate</a> function in transformers to avoid giving our model an advantage with a more sophisticated generation process.<br>
|
752 |
+
Secondly, we didn't use a prompt to condition the generation, which could have benefited our model since the T5 is the model that introduced this system.<br>
|
753 |
+
Thirdly, the Barthez surely saw more tokens than our model. Although we can't determine this number from the authors' publication, it is indicated that this is a BART model <d-cite bibtex-key="lewis2019bartdenoisingsequencetosequencepretraining"></d-cite> which received additional pre-training on French. However, BART's paper states that the model was trained on 500,000 steps × a sequence of 1,024 tokens × a batch of size 8000, i.e. 4,096,000,000,000 tokens, which is 9.76 times more than our model.
|
754 |
+
</p>
|
755 |
+
|
756 |
+
<p><br></p>
|
757 |
+
<h5 id="classification">Classification</h5>
|
758 |
+
<p class="width_125">We use a cleaned version of the allocine dataset <d-cite bibtex-key="allocine"></d-cite> : <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/allocine_clean">allocine_clean</a>. Specifically, 0.6% of the test sample was unreliable because it contained leaks or duplicate data. It is likely that the resulting dataset is still imperfect, with annotation problems requiring further proofreading/correction.
|
759 |
+
</p>
|
760 |
+
<table class="width_125">
|
761 |
+
<thead>
|
762 |
+
<tr>
|
763 |
+
<th>Model</th>
|
764 |
+
<th>Accuracy</th>
|
765 |
+
</tr>
|
766 |
+
</thead>
|
767 |
+
<tbody>
|
768 |
+
<tr>
|
769 |
+
<td>FAT5-small-100K (67.4M)</td>
|
770 |
+
<td>96.05</td>
|
771 |
+
</tr>
|
772 |
+
<tr>
|
773 |
+
<td>FAT5-small-200K (67.4M)</td>
|
774 |
+
<td>96.20</td>
|
775 |
+
</tr>
|
776 |
+
<tr>
|
777 |
+
<td>FAT5-small-300K (67.4M)</td>
|
778 |
+
<td>96.48</td>
|
779 |
+
</tr>
|
780 |
+
<tr>
|
781 |
+
<td>FAT5-small-400K (67.4M)</td>
|
782 |
+
<td>96.60</td>
|
783 |
+
</tr>
|
784 |
+
<tr>
|
785 |
+
<td>FAT5-small-500K (67.4M)</td>
|
786 |
+
<td>96.60</td>
|
787 |
+
</tr>
|
788 |
+
<tr>
|
789 |
+
<td>FAT5-small-600K (67.4M)</td>
|
790 |
+
<td>96.60</td>
|
791 |
+
</tr>
|
792 |
+
<tr>
|
793 |
+
<td>FAT5-small-700K (67.4M)</td>
|
794 |
+
<td>96.68</td>
|
795 |
+
</tr>
|
796 |
+
<tr>
|
797 |
+
<td>FAT5-small-800K (67.4M)</td>
|
798 |
+
<td>96.59</td>
|
799 |
+
</tr>
|
800 |
+
<tr>
|
801 |
+
<td>FAT5-small-900K (67.4M)</td>
|
802 |
+
<td><u>96.75</u></td>
|
803 |
+
</tr>
|
804 |
+
<tr>
|
805 |
+
<td>FAT5-small-1000K (67.4M)</td>
|
806 |
+
<td>96.62</td>
|
807 |
+
</tr>
|
808 |
+
<tr>
|
809 |
+
<td>FAT5-small-1100K (67.4M)</td>
|
810 |
+
<td>96.69</td>
|
811 |
+
</tr>
|
812 |
+
<tr>
|
813 |
+
<td>FAT5-small-1200K (67.4M)</td>
|
814 |
+
<td>96.71</td>
|
815 |
+
</tr>
|
816 |
+
<tr>
|
817 |
+
<td>FAT5-small-1300K (67.4M)</td>
|
818 |
+
<td>96.69</td>
|
819 |
+
</tr>
|
820 |
+
<tr>
|
821 |
+
<td>FAT5-small-1400K (67.4M)</td>
|
822 |
+
<td>96.65</td>
|
823 |
+
</tr>
|
824 |
+
<tr>
|
825 |
+
<td>FAT5-small-1500K (67.4M)</td>
|
826 |
+
<td>96.57</td>
|
827 |
+
</tr>
|
828 |
+
<tr>
|
829 |
+
<td>FAT5-small-1600K (67.4M)</td>
|
830 |
+
<td>96.69</td>
|
831 |
+
</tr>
|
832 |
+
<tr>
|
833 |
+
<td><a class="link" href="">distillcamembert</a> (68.1M)</td>
|
834 |
+
<td>96.74</td>
|
835 |
+
</tr>
|
836 |
+
<tr>
|
837 |
+
<td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_base_cls">camembert-base</a> (111M)</td>
|
838 |
+
<td>97.27</td>
|
839 |
+
</tr>
|
840 |
+
<tr>
|
841 |
+
<td><a class="link" href="https://huggingface.co/bourdoiscatie/camembert_large_cls">camembert-large</a> (337M)</td>
<td>97.15</td>
</tr>
</tbody>
</table>
<p class="width_125">Note: in this and the following tables, distillcamembert refers to a <a class="link" href="https://huggingface.co/cmarkea/distilcamembert-base">distilcamembert-base</a> <d-cite bibtex-key="delestre2022distilcamembert"></d-cite> that we have finetuned.</p>

<p><br></p>
<h5>Named entity recognition</h5>
<p class="width_125">For this task, we used frenchNER in its <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/frenchNER_4entities">4 entities</a> (PER, LOC, ORG, MISC) <d-cite bibtex-key="frenchNER2024"></d-cite> configuration.</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>F1 PER</th><th>F1 LOC</th><th>F1 ORG</th><th>F1 MISC</th></tr>
  </thead>
  <tbody>
    <tr><td>FAT5-small-100K (67.1M)</td><td>96.51</td><td>94.48</td><td>87.24</td><td>75.81</td></tr>
    <tr><td>FAT5-small-200K (67.1M)</td><td>96.90</td><td>94.83</td><td>88.78</td><td>76.82</td></tr>
    <tr><td>FAT5-small-300K (67.1M)</td><td>97.25</td><td>95.11</td><td>88.86</td><td><u>77.48</u></td></tr>
    <tr><td>FAT5-small-400K (67.1M)</td><td>97.18</td><td>95.08</td><td>89.11</td><td>77.42</td></tr>
    <tr><td>FAT5-small-500K (67.1M)</td><td>97.25</td><td>95.16</td><td>89.16</td><td>76.91</td></tr>
    <tr><td>FAT5-small-600K (67.1M)</td><td>97.19</td><td>95.19</td><td>88.85</td><td>76.88</td></tr>
    <tr><td>FAT5-small-700K (67.1M)</td><td>97.17</td><td>95.14</td><td>89.39</td><td>76.82</td></tr>
    <tr><td>FAT5-small-800K (67.1M)</td><td><u>97.34</u></td><td>95.20</td><td>89.18</td><td>77.27</td></tr>
    <tr><td>FAT5-small-900K (67.1M)</td><td>97.19</td><td>95.21</td><td>89.04</td><td>76.83</td></tr>
    <tr><td>FAT5-small-1000K (67.1M)</td><td>97.31</td><td>95.26</td><td>89.24</td><td>76.84</td></tr>
    <tr><td>FAT5-small-1100K (67.1M)</td><td>97.11</td><td>94.99</td><td>88.52</td><td>76.30</td></tr>
    <tr><td>FAT5-small-1200K (67.1M)</td><td>97.19</td><td>95.11</td><td>88.79</td><td>76.86</td></tr>
    <tr><td>FAT5-small-1300K (67.1M)</td><td>97.15</td><td>95.00</td><td>88.62</td><td>76.58</td></tr>
    <tr><td>FAT5-small-1400K (67.1M)</td><td>97.22</td><td>95.09</td><td>89.01</td><td>77.00</td></tr>
    <tr><td>FAT5-small-1500K (67.1M)</td><td>97.32</td><td><u>95.34</u></td><td><u>89.39</u></td><td>77.30</td></tr>
    <tr><td>FAT5-small-1600K (67.1M)</td><td>97.14</td><td>95.22</td><td>89.24</td><td>76.88</td></tr>
    <tr><td><a class="link" href="">distillcamembert</a> (67.5M)</td><td>97.26</td><td>95.24</td><td>89.10</td><td>79.88</td></tr>
    <tr><td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-base-4entities">camembert-base</a> (110M)</td><td>97.80</td><td>95.78</td><td>90.27</td><td>81.38</td></tr>
    <tr><td><a class="link" href="https://huggingface.co/CATIE-AQ/NERmembert-large-4entities">camembert-large</a> (336M)</td><td>98.17</td><td>96.37</td><td>91.87</td><td>83.35</td></tr>
  </tbody>
</table>

<p><br></p>
<h5 id="question-answering">Question Answering</h5>
<p class="width_125">
We wanted to finetune our model on this task but realized that our tokenizer has two problems.<br>
Firstly, we forgot to add the special token at the beginning of sentences.
Secondly, we decided to use a fast BPE tokenizer, and we only learned afterwards that the <code>add_special_tokens=True</code> argument doesn't work with this type of tokenizer.
Correcting these two points requires us to post-process the tokenizer's encodings before running our finetuning, which isn't elegant and requires time we don't have right now.
</p>
<p><br></p>
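<p class="width_125">
To give an idea of the post-processing this would involve, here is a minimal sketch. The checkpoint name and the name of the start token are illustrative, not necessarily those of our tokenizer:
</p>
<pre><code class="language-python">
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CATIE-AQ/FAT5-small")  # illustrative checkpoint name

def encode_with_special_tokens(text):
    # The fast BPE tokenizer ignores add_special_tokens=True, so the missing
    # start-of-sequence token is prepended and the end-of-sequence token appended by hand.
    bos_id = tokenizer.convert_tokens_to_ids("&lt;s&gt;")  # whichever start token the tokenizer was trained with
    eos_id = tokenizer.eos_token_id
    ids = tokenizer(text)["input_ids"]
    input_ids = [bos_id] + ids + [eos_id]
    return {"input_ids": input_ids, "attention_mask": [1] * len(input_ids)}
</code></pre>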
<h5><i>Sentence Similarity</i></h5>
<p class="width_125">
We invite the reader to take the results of this section with a grain of salt.<br>
We performed a finetuning on this task in order to verify that the <code>T5EncoderModel</code> head was working, but we are not focusing on the results obtained because we question the quality of the benchmark on which we evaluate the models, namely MTEB FR <d-cite bibtex-key="ciancone2024mtebfrenchresourcesfrenchsentence"></d-cite>, a French version of MTEB.<br>
Indeed, Nils Reimers, creator of MTEB, recently questioned the relevance of this benchmark in a <a class="link" href="https://x.com/Nils_Reimers/status/1870812625505849849">tweet</a>, declaring it "dead".
Earlier in the year, we had already observed data leaks and duplications in this benchmark
(see <a class="link" href="https://huggingface.co/datasets/lbourdois/MTEB_leaks_and_duplications">here</a> and
<a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1036">here</a>).
Alexey Vatolin then extended these observations to include empty lines (see <a class="link" href="https://github.com/embeddings-benchmark/mteb/issues/1049#issuecomment-2463095122">here</a>).
<br>
In the table below, we finetuned on a cleaned version of the <code>stsb_multi_mt</code> dataset <d-cite bibtex-key="huggingface:dataset:stsb_multi_mt"></d-cite> (0.653% of the test split was unreliable because it contained leaks or duplicated data) before evaluating on MTEB FR.
<br>
</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>Average</th><th>Classification</th><th>Clustering</th><th>PairClassification</th><th>Reranking</th><th>Retrieval</th><th>STS</th><th>Summary</th></tr>
  </thead>
  <tbody>
    <tr><td>FAT5-small-400K (67.1M)</td><td>52.2</td><td>59.8</td><td>39.1</td><td>77.5</td><td>56.1</td><td>29.1</td><td>74.0</td><td>29.8</td></tr>
    <tr><td>distillcamembert (68.1M)</td><td>51.3</td><td>60.7</td><td>37.4</td><td>77.0</td><td>51.1</td><td>25.2</td><td>76.4</td><td>31.3</td></tr>
  </tbody>
</table>
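<p class="width_125">
As an aside, the <code>T5EncoderModel</code> usage we verified boils down to mean pooling the encoder outputs to obtain one vector per sentence. A rough sketch follows, in which the FLAN-T5 checkpoint is only a stand-in:
</p>
<pre><code class="language-python">
import torch
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
encoder = T5EncoderModel.from_pretrained("google/flan-t5-small")

def embed(sentences):
    # Mean-pool the last hidden states, ignoring padding positions.
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state      # (batch, seq, dim)
    mask = batch["attention_mask"].unsqueeze(-1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)

emb = embed(["Le chat dort.", "Un chat est en train de dormir."])
score = torch.nn.functional.cosine_similarity(emb[0], emb[1], dim=0)
</code></pre>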
<p><br><br><br></p>

<p class="width_125">
We can see from the masked-accuracy convergence graph that the performance of the encoder part of the model progresses at first before flattening out.
</p>
<p><br></p>

<p class="width_125">
<picture>
  <source media="(prefers-color-scheme: dark)" srcset="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
  <img alt="Convergence masked accuracy FAT5" src="./assets/convergence_masked_accuracy_FAT5.png" width="100%">
</picture>
</p>

<p><br></p>

<p class="width_125">
This phenomenon can also be observed in the finetuning results: FAT5 matches the performance of distillcamembert at around 800K or 900K steps (except for the MISC entity in the NER task), but does no better beyond that. This is nevertheless encouraging with a view to scaling up, since distilled models derived from larger models usually perform better than models of the same size trained from scratch.<br>
Note that this apparent plateau in performance still needs to be confirmed by running several executions with different configurations (notably different seeds), so as to report results as an interval rather than a single number (for each step evaluated, we use a seed of 42).<br>
It should also be mentioned that this plateau for the encoder part has already been observed by other authors. One example is CamemBERT(a) 2.0 <d-cite bibtex-key="antoun2024camembert20smarterfrench"></d-cite>, which was also trained on the French-language part of CulturaX. CamemBERT 2.0 did not perform any better than CamemBERT 1.0 despite having seen more tokens, whereas the authors did obtain performance gains with CamemBERTa 2.0. This suggests that, for encoders, the most important thing is to focus on the architecture (CamemBERTa 2.0 is a DeBERTaV3 <d-cite bibtex-key="he2023debertav3improvingdebertausing"></d-cite> while CamemBERT 2.0 is a RoBERTa <d-cite bibtex-key="liu2019robertarobustlyoptimizedbert"></d-cite>) rather than the data. This result invites us to think about updating the T5 encoder architecture.<br>
A final observation is that, if performance plateaus, it is possible to stop the pre-training earlier and thus reduce costs.<br>

In the table below, we list cost estimates (in euros) for the pre-training of our model according to various cloud providers.
For each of them, we use the hourly price of an A100 80GB offered on December 20, 2024.<br>
We show two cases: pre-training on 262 billion tokens (the threshold at which performance on classification tasks begins to plateau and marginal gains become low) and on 419 billion tokens (the maximum number of tokens seen by CamemBERT).
<br>
</p>
<table class="width_125">
  <thead>
    <tr><th>Cloud provider</th><th>Hourly rate for an A100</th><th>Price for 262B tokens</th><th>Price for 419B tokens</th><th>Note</th></tr>
  </thead>
  <tbody>
    <tr><td>AWS</td><td>1.77</td><td>1,616</td><td>2,586</td><td></td></tr>
    <tr><td>OVH</td><td>2.75</td><td>2,475</td><td>3,960</td><td>By opting for monthly rather than hourly payment, the price in both cases is €2,200.</td></tr>
    <tr><td>Azure</td><td>3.31</td><td>3,021</td><td>4,833</td><td>The hourly price was calculated from the monthly price of 8 A100.</td></tr>
    <tr><td>Google Cloud</td><td>3.52</td><td>3,214</td><td>5,143</td><td></td></tr>
  </tbody>
</table>
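<p class="width_125">
These estimates come down to a rule of three. Assuming our full pre-training run of 1,461 GPU hours corresponds to the 419B-token case and that the time scales linearly with the number of tokens, the table can be reproduced roughly as follows (small differences, for example on the OVH row, come from rounding of the hours):
</p>
<pre><code class="language-python">
TOTAL_HOURS = 1461      # observed pre-training time on a single A100 80GB
TOTAL_TOKENS = 419e9    # tokens seen during that run

hourly_rates = {"AWS": 1.77, "OVH": 2.75, "Azure": 3.31, "Google Cloud": 3.52}  # EUR/hour, 2024-12-20

for provider, rate in hourly_rates.items():
    for tokens in (262e9, 419e9):
        hours = TOTAL_HOURS * tokens / TOTAL_TOKENS
        print(f"{provider}: {tokens / 1e9:.0f}B tokens ~ {hours:.0f} h ~ {hours * rate:,.0f} EUR")
</code></pre>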
<p><br><br></p>
<h4>Pre-training times and emissions</h4>
<p class="width_125">Carbon emissions were estimated using the <a class="link" href="https://mlco2.github.io/impact#compute">Machine Learning Impact calculator</a> <d-cite bibtex-key="lacoste2019quantifying"></d-cite>.<br>
Our model was pre-trained on a single A100 PCIe 80GB, on a private infrastructure.
For carbon efficiency, we relied on the daily figures given by <a class="link" href="https://app.electricitymaps.com/zone/FR">electricitymaps</a> for France during our pre-training period.
The finetunings were carried out on a single A100 PCIe 40GB.
As their execution time is generally counted in hours or even minutes, we used the electricitymaps figures for the hour in question rather than the daily figure for their carbon efficiency.<br>
We estimate the emissions of our model at 14.084 kg CO2 eq., including 13.5 kg CO2 eq. for the pre-training and 0.584 kg CO2 eq. for the 49 finetunings.<br>
To this, we must add additional emissions estimated at 6.24 kg CO2 eq.
They correspond to the finetuning of the models used as baselines for comparison (0.475 kg CO2 eq.), to our preliminary work in mixed-precision bf16 (4.735 kg CO2 eq. for the pre-training of three different models over 300K steps) and to the tests in full bf16 carried out before the training of our final model (1.03 kg CO2 eq. for the pre-training of a model half the size over 400K steps).<br>
In total, we estimate the carbon footprint of our work at 20.324 kg CO2 eq.</p>
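<p class="width_125">
For transparency, the 13.5 kg CO2 eq. figure for the pre-training boils down to the following calculation. It assumes the calculator's reference power draw of 250 W for an A100 PCIe and the average French carbon intensity over our training window:
</p>
<pre><code class="language-python">
# emissions = power draw (kW) x run time (h) x carbon intensity (kg CO2 eq. per kWh)
power_kw = 0.250                 # assumed reference draw of an A100 PCIe
hours = 1461                     # pre-training time on a single GPU
intensity_kg_per_kwh = 0.03696   # France, average between 2024-10-18 and 2024-12-19

pretraining_kg = power_kw * hours * intensity_kg_per_kwh
print(f"{pretraining_kg:.1f} kg CO2 eq.")  # ~13.5 kg CO2 eq.
</code></pre>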
<p class="width_125">For the pre-training phase (we don't have enough information to make estimates for the other phases), it is then possible to compare our model with the other French pre-trained models listed above:</p>
<table class="width_125">
  <thead>
    <tr><th>Model</th><th>Time (H)</th><th>Emissions (kg CO2 eq.)</th><th>Note</th></tr>
  </thead>
  <tbody>
    <tr><td>Camembert</td><td>6,144</td><td>106.91 ✝</td><td>24H × 256 Tesla V100-SXM2-32GB at 58 g (average over 2019).<br>The authors do not specify the numbers for the large version.</td></tr>
    <tr><td>Flaubert base <d-cite bibtex-key="le2020flaubert"></d-cite></td><td>13,120</td><td>190.24 to 228.29 ✝</td><td>410H × 32 V100 at 58 g (average over 2019).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>Flaubert large <d-cite bibtex-key="le2020flaubert"></d-cite></td><td>49,920</td><td>723.84 to 868.61 ✝</td><td>390H × 128 V100 at 58 g (average over 2019).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>Barthez</td><td>7,680 ★</td><td>107.52 to 129.02 ✝</td><td>60H × 128 V100 at 56 g (average over 2020).<br>The V100 type is not specified<br>(V100-SXM2-32GB? V100-SXM2-16GB? V100-PCIE-16GB?)</td></tr>
    <tr><td>FAT5-small</td><td>1,461</td><td>13.5</td><td>1,461H × 1 A100 at 36.96 g (average between 2024-10-18 and 2024-12-19)</td></tr>
  </tbody>
</table>
<p class="width_125">✝ The numbers given are estimates based on the information provided by the authors in their publication.<br>
★ We indicate only the hours of the French pre-training applied on top of the initial English pre-training on which the model is based.</p>
<p><br></p>

<h3 id="mod-les-en-anglais">Models in other languages</h3>
<p class="width_125">
Our contribution focuses on French, with the introduction of a new model. For other languages, we cannot afford to carry out work of the same magnitude.<br>
Nevertheless, we provide <a class="link" href="https://github.com/catie-aq/flashT5/blob/main/convert_huggingface_t5.py">code</a> for adapting already pre-trained (m)T5/FLAN-T5 weights <d-cite bibtex-key="chung2022scaling"></d-cite> to our method. We hope that users will be able to continue the pre-training of one of these models to adapt it to more recent data, for example.<br>
Please note, however, that this adaptation is limited, since the additional pre-training will have to be carried out in the precision of the original model. For example, if the model's weights are in fp32 (which is the case for the FLAN-T5), training will not be as fast as with the FAT5, which is in bf16.<br><br>

For English speakers, we have already adapted the weights of the various FLAN-T5 versions to our method. All the weights can be found in this <a class="link" href="https://huggingface.co/collections/CATIE-AQ/catie-english-fat5-flan-662b679a8e855c7c0137d69e">Hugging Face collection</a>.<br><br>

If you'd like to pre-train your own model (to specialize in a specific domain, for example, and thus benefit from a customized tokenizer), we refer you once again to the <a class="link" href="https://github.com/catie-aq/flashT5/tree/main/examples/minipile">tutorial</a> showing how to pre-train a model on minipile. Note that we have tested and trained the model in the tutorial on an A100; it may or may not work with other GPUs.</p>
<p class="width_125"><br><br><br></p>
<h2 id="la-suite">Next stage</h2>
<p class="width_125">Let's end this article by mentioning what we intend to do, or at least would like to do, as a follow-up to this work.<br></p>

<h3>Near future</h3>
<p class="width_125">These are things that should already have been in this article but took more time than expected.
Typically, we've finished building the datasets but haven't had time to run the finetunings.<br>
The aim is to complete these tasks in the near future, so that we can include the results in an update to this blog post.
</p>

<h4>Fix the tokenizer</h4>
<p class="width_125">
The current FAT5 is usable. However, due to the tokenizer problems described above, which result in inelegant post-processing for certain tasks, we don't exclude re-training a model (on 1M steps only) with a new tokenizer allowing simpler use of the model.
<br><br></p>

<h4>Instruct model</h4>
<p class="width_125">We'd like to test FAT5's text-generation capabilities in a more optimal way, in particular through the use of prompts, by developing an instruct model.<br>
For this, we have <a class="link" href="https://huggingface.co/datasets/CATIE-AQ/DFP">DFP</a> (<i>Dataset of French Prompts</i>) <d-cite bibtex-key="centre_aquitain_des_technologies_de_l'information_et_electroniques_2023"></d-cite>, a dataset of over 100M rows covering thirty NLP tasks. It follows the methodology of the <a class="link" href="https://huggingface.co/datasets/bigscience/xP3">xP3</a> dataset used for mT0 <d-cite bibtex-key="muennighoff2023crosslingualgeneralizationmultitaskfinetuning"></d-cite>. We could also take this opportunity to check BigScience's "Finding 2" <d-cite bibtex-key="wang2022languagemodelarchitecturepretraining"></d-cite> (page 9 of the publication), which indicates that encoder-decoder models would have better 0-shot capabilities than decoder models.<br>
Beyond NLP tasks, we also have over 2M open QA prompt rows, which should enable us to test FAT5 on more general tasks/knowledge.<br><br>

The development of this instruct model should also enable us to work on its alignment, in particular via a dataset of 12M rows for performing DPO in French.<br><br></p>

<h4>Long sequences</h4>
<p class="width_125">
Pre-training is performed on sequences of 1,024 tokens. However, the CUDA kernel we've developed supports positional encodings that greatly extend the context size, as well as linear inference.<br>
With this in mind, we've created two datasets of long sequences in French (one of QA, one of text summaries) on which we'd like to finetune our model.<br><br><br></p>
<h3>Distant future</h3>
<p class="width_125">The items listed below are longer-term ideas. In other words, they will take time to implement and will be the subject of a new blog post if necessary.</p>

<h4 id="calcul-lin-aire">Memory reduction</h4>
<p class="width_125">Although we're already satisfied with the memory optimisations achieved via our CUDA kernel, we think we can take these results further using other techniques. For example, we can cite the CCE (Cut Cross-Entropy) method <d-cite bibtex-key="wijmans2024cut"></d-cite>, with which we have already obtained interesting results on decoder models.<br>
In addition, while we have concentrated on pre-training, more work needs to be done on inference, which in practice consumes the most resources over time once the model is in production. We are thinking in particular of using SageAttention2 <d-cite bibtex-key="zhang2024sageattention2efficientattentionthorough"></d-cite>, released while our model was training.
<br><br></p>

<h4 id="calcul-lin-aire">Linear computation</h4>
<p class="width_125">In this work, we present a model with linear memory.
A further improvement would be for the model to also operate with linear computation.<br>
The idea is to replace traditional quadratic attention with another form of attention.<br>
We can think of some already applied to the T5, such as that of LongT5 <d-cite bibtex-key="guo2022longt5"></d-cite>.
It is also possible to test more recent forms such as Based <d-cite bibtex-key="arora2024simple"></d-cite>.
We are also interested in testing Hedgehog <d-cite bibtex-key="zhang2024hedgehog"></d-cite>.
In fact, it is possible to combine them with the optimised kernels available in <a class="link" href="https://github.com/HazyResearch/ThunderKittens/tree/main/kernels">ThunderKittens</a> <d-cite bibtex-key="thunderkittens"></d-cite>.
The benefit is that it is then possible to keep the pre-trained model and, via an additional finetuning, replace the standard softmax attention with Hedgehog's linear attention, as illustrated in the sketch below.
LoLCATs <d-cite bibtex-key="zhang2024lolcatslowranklinearizinglarge"></d-cite> performs this finetuning via LoRA <d-cite bibtex-key="hu2021loralowrankadaptationlarge"></d-cite>.
<br><br></p>
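<p class="width_125">
As a rough illustration of what such a swap involves, here is a sketch of (non-causal) kernelised linear attention. The feature map below is a simple elu + 1, whereas Hedgehog would instead learn the map during finetuning:
</p>
<pre><code class="language-python">
import torch

def feature_map(x):
    # Simple positive feature map; Hedgehog learns this map instead.
    return torch.nn.functional.elu(x) + 1

def linear_attention(q, k, v):
    # softmax(QK^T)V is replaced by phi(Q)(phi(K)^T V), linear in the sequence length.
    q, k = feature_map(q), feature_map(k)         # (batch, heads, seq, dim)
    kv = torch.einsum("bhnd,bhne->bhde", k, v)    # accumulate keys and values
    z = 1.0 / (torch.einsum("bhnd,bhd->bhn", q, k.sum(dim=2)) + 1e-6)
    return torch.einsum("bhnd,bhde,bhn->bhne", q, kv, z)
</code></pre>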
<h4 id="passage-l-chelle">Model size</h4>
<p class="width_125">T5/FLAN-T5 models have been trained up to 11 billion parameters, demonstrating that this architecture can scale.<br>
We would like to offer larger models, with a FAT5-base and a FAT5-large of 305M and 973M parameters respectively, which we would then like to distil. The aim is to offer models that consume as little as possible in routine use/inference.<br>
We also expect the distilled models to perform better than models of equivalent size trained from scratch.
<br><br></p>

<h4 id="modeles-specialises">Training data</h4>
<p class="width_125">
In this work, we used "generic" French data, mainly from CulturaX. During the training of our model, Hugging Face introduced the FineWeb2 dataset <d-cite bibtex-key="penedo2024fineweb-2"></d-cite>, which includes French. We would like to pre-train a new model on it so that we can compare the impact of the pre-training data on downstream performance.<br>
Beyond generic French, we particularly want to be able to apply our methodology to specific domains (medicine, regional variants of French, etc.).<br>
To do this, we would need to train a new dedicated tokenizer and perform a new pre-training for each of the chosen domains.
The advantage of the optimisations implemented and presented in this blog article is that they significantly reduce the cost of pre-training.<br>
We would then like to compare these small specialised models against large generic models.<br><br></p>

<h4 id="modeles-specialises">Update of the T5 architecture</h4>
<p class="width_125">The final direction we would like to explore is an update of the T5 architecture. As encoder-decoders have been neglected, they have not benefited from the improvements that decoder models have received in recent months (more recent activation or normalisation layers, multi-token prediction <d-cite bibtex-key="gloeckle2024betterfasterlarge"></d-cite>, etc.).</p>

<p class="width_125"><br><br><br></p>
<h2 id="conclusion">Conclusion</h2>
<p class="width_125">
We introduced the FAT5 (Flash Attention T5) model, detailing our approach to optimising the various elements of the pre-training and finetuning processes.
It is based on kernels that make it possible to use Flash Attention with a T5 and give the model a linear memory.
We applied our work to French in particular, and made sure that it can also be used in any other language.
We hope that our method, which enables a model with 147M parameters to be pre-trained from scratch for €1,600, will be useful for people with limited computational resources.
It also opens the way for a possible comeback of encoder-decoder models, rather than decoder-only models.<br>
</p>
<p class="width_125"><br><br></p>
|
1284 |
+
|
1285 |
+
|
1286 |
+
<style>
|
1287 |
+
d-appendix .citation {
|
1288 |
+
font-size: 11px;
|
1289 |
+
line-height: 15px;
|
1290 |
+
border-left: 1px solid rgba(0, 0, 0, 0.1);
|
1291 |
+
padding-left: 10px;
|
1292 |
+
border: 1px solid rgba(0,0,0,0.1);
|
1293 |
+
background: #0D1117;
|
1294 |
+
padding: 10px 10px;
|
1295 |
+
border-radius: 3px;
|
1296 |
+
color: rgba(150, 150, 150, 1);
|
1297 |
+
overflow: hidden;
|
1298 |
+
margin-top: -12px;
|
1299 |
+
white-space: pre-wrap;
|
1300 |
+
word-wrap: break-word;
|
1301 |
+
}
|
1302 |
+
</style>
|
1303 |
+
|
1304 |
+
<h3 id="citation">Citation</h3>
|
1305 |
+
<pre class="citation long">@misc{FAT5_blogpost,
|
1306 |
+
title={ FAT5: Flash Attention T5 },
|
1307 |
+
author={ Boris ALBAR and Loïck BOURDOIS },
|
1308 |
+
organization={ Centre Aquitain des Technologies de l'Information et Electroniques },
|
1309 |
+
year={2024},
|
1310 |
+
url={ https://huggingface.co/spaces/CATIE-AQ/FAT5-report },
|
1311 |
+
doi={ 10.57967/hf/0821 },
|
1312 |
+
publisher= { Hugging Face }
|
1313 |
+
}</pre>
|
1314 |
+
|
1315 |
+
<d-appendix style="color: #9CA3AF;" >
|
1316 |
+
<d-bibliography src="bibliography.bib"></d-bibliography>
|
1317 |
+
</d-appendix>
|
1318 |
+
</d-article>
|
1319 |
+
|
1320 |
+
<script>
|
1321 |
+
const article = document.querySelector('d-article');
|
1322 |
+
const toc = document.querySelector('d-contents');
|
1323 |
+
if (toc) {
|
1324 |
+
const headings = article.querySelectorAll('h2, h3, h4');
|
1325 |
+
let ToC = `<nav role="navigation" class="l-text figcaption" style="color: #9CA3AF;"><h3>Table des matières</h3>`;
|
1326 |
+
let prevLevel = 0;
|
1327 |
+
for (const el of headings) {
|
1328 |
+
// should element be included in TOC?
|
1329 |
+
const isInTitle = el.parentElement.tagName == 'D-TITLE';
|
1330 |
+
const isException = el.getAttribute('no-toc');
|
1331 |
+
if (isInTitle || isException) continue;
|
1332 |
+
el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
|
1333 |
+
const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';
|
1334 |
+
const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
|
1335 |
+
while (prevLevel < level) {
|
1336 |
+
ToC += '<ul>'
|
1337 |
+
prevLevel++;
|
1338 |
+
}
|
1339 |
+
while (prevLevel > level) {
|
1340 |
+
ToC += '</ul>'
|
1341 |
+
prevLevel--;
|
1342 |
+
}
|
1343 |
+
if (level === 0)
|
1344 |
+
ToC += '<div>' + link + '</div>';
|
1345 |
+
else
|
1346 |
+
ToC += '<li>' + link + '</li>';
|
1347 |
+
}
|
1348 |
+
while (prevLevel > 0) {
|
1349 |
+
ToC += '</ul>'
|
1350 |
+
prevLevel--;
|
1351 |
+
}
|
1352 |
+
ToC += '</nav>';
|
1353 |
+
toc.innerHTML = ToC;
|
1354 |
+
toc.setAttribute('prerendered', 'true');
|
1355 |
+
const toc_links = document.querySelectorAll('d-contents > nav a');
|
1356 |
+
window.addEventListener('scroll', (_event) => {
|
1357 |
+
if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
|
1358 |
+
// Then iterate forwards, on the first match highlight it and break
|
1359 |
+
find_active: {
|
1360 |
+
for (let i = headings.length - 1; i >= 0; i--) {
|
1361 |
+
if (headings[i].getBoundingClientRect().top - 50 <= 0) {
|
1362 |
+
if (!toc_links[i].classList.contains("active")) {
|
1363 |
+
toc_links.forEach((link, _index) => {
|
1364 |
+
link.classList.remove("active");
|
1365 |
+
});
|
1366 |
+
toc_links[i].classList.add('active');
|
1367 |
+
}
|
1368 |
+
break find_active;
|
1369 |
+
}
|
1370 |
+
}
|
1371 |
+
toc_links.forEach((link, _index) => {
|
1372 |
+
link.classList.remove("active");
|
1373 |
+
});
|
1374 |
+
}
|
1375 |
+
}
|
1376 |
+
});
|
1377 |
+
}
|
1378 |
+
</script>
|
1379 |
+
</body>
|
1380 |
+
</html>
|
dist/main.bundle.js
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dist/main.bundle.js.LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
1 |
+
/* @license
|
2 |
+
Papa Parse
|
3 |
+
v5.4.1
|
4 |
+
https://github.com/mholt/PapaParse
|
5 |
+
License: MIT
|
6 |
+
*/
|
7 |
+
|
8 |
+
/*! For license information please see plotly-basic.min.js.LICENSE.txt */
|
9 |
+
|
10 |
+
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
|
11 |
+
|
12 |
+
/**
|
13 |
+
* @license
|
14 |
+
* Lodash <https://lodash.com/>
|
15 |
+
* Copyright OpenJS Foundation and other contributors <https://openjsf.org/>
|
16 |
+
* Released under MIT license <https://lodash.com/license>
|
17 |
+
* Based on Underscore.js 1.8.3 <http://underscorejs.org/LICENSE>
|
18 |
+
* Copyright Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
|
19 |
+
*/
|
dist/main.bundle.js.map
ADDED
The diff for this file is too large to render.
See raw diff
|
|
dist/style.css
ADDED
@@ -0,0 +1,349 @@
1 |
+
/* style.css */
|
2 |
+
/* Define colors */
|
3 |
+
/* :root {
|
4 |
+
--distill-gray: rgb(107, 114, 128);
|
5 |
+
--distill-gray-light: rgb(185, 185, 185);
|
6 |
+
--distill-gray-lighter: rgb(228, 228, 228);
|
7 |
+
--distill-gray-lightest: rgb(245, 245, 245);
|
8 |
+
--distill-blue: #007BFF;
|
9 |
+
}
|
10 |
+
*/
|
11 |
+
|
12 |
+
|
13 |
+
/* Begin dark theme */
|
14 |
+
:root {
|
15 |
+
--bg-color: #0B0F19;
|
16 |
+
--text-color: #ffffff;
|
17 |
+
}
|
18 |
+
|
19 |
+
@media (prefers-color-scheme: dark) {
|
20 |
+
:root {
|
21 |
+
--bg-color: #0B0F19;
|
22 |
+
--text-color: #ffffff;
|
23 |
+
}
|
24 |
+
}
|
25 |
+
|
26 |
+
body {
|
27 |
+
background-color: var(--bg-color);
|
28 |
+
color: #9CA3AF;
|
29 |
+
}
|
30 |
+
|
31 |
+
aside {
|
32 |
+
background-color: var(--bg-color);
|
33 |
+
color: #9CA3AF;
|
34 |
+
}
|
35 |
+
|
36 |
+
d-article {
|
37 |
+
color: #9CA3AF;
|
38 |
+
}
|
39 |
+
|
40 |
+
d-bibliography {
|
41 |
+
color: #9CA3AF;
|
42 |
+
}
|
43 |
+
|
44 |
+
d-front-matter {
|
45 |
+
color: #9CA3AF;
|
46 |
+
}
|
47 |
+
|
48 |
+
d-contents {
|
49 |
+
color: #9CA3AF;
|
50 |
+
}
|
51 |
+
|
52 |
+
.active {
|
53 |
+
color: #9CA3AF;
|
54 |
+
}
|
55 |
+
|
56 |
+
.l-text figcaption
|
57 |
+
{
|
58 |
+
color: #9CA3AF;
|
59 |
+
}
|
60 |
+
|
61 |
+
.link {
|
62 |
+
color: #0284C7
|
63 |
+
}
|
64 |
+
|
65 |
+
/* End dark theme */
|
66 |
+
|
67 |
+
|
68 |
+
.width_125 {
|
69 |
+
width: 125%;
|
70 |
+
}
|
71 |
+
|
72 |
+
|
73 |
+
/* Container for the controls */
|
74 |
+
[id^="plot-"] {
|
75 |
+
display: flex;
|
76 |
+
flex-direction: column;
|
77 |
+
align-items: center;
|
78 |
+
gap: 15px; /* Adjust the gap between controls as needed */
|
79 |
+
}
|
80 |
+
[id^="plot-"] figure {
|
81 |
+
margin-bottom: 0px;
|
82 |
+
margin-top: 0px;
|
83 |
+
padding: 0px;
|
84 |
+
}
|
85 |
+
|
86 |
+
.plotly_caption {
|
87 |
+
font-style: italic;
|
88 |
+
margin-top: 10px;
|
89 |
+
}
|
90 |
+
|
91 |
+
.plotly_controls {
|
92 |
+
display: flex;
|
93 |
+
flex-wrap: wrap;
|
94 |
+
flex-direction: row;
|
95 |
+
justify-content: center;
|
96 |
+
align-items: flex-start;
|
97 |
+
gap: 30px;
|
98 |
+
}
|
99 |
+
|
100 |
+
|
101 |
+
.plotly_input_container {
|
102 |
+
display: flex;
|
103 |
+
align-items: center;
|
104 |
+
flex-direction: column;
|
105 |
+
gap: 10px;
|
106 |
+
}
|
107 |
+
|
108 |
+
/* Style for the select dropdown */
|
109 |
+
.plotly_input_container > select {
|
110 |
+
padding: 2px 4px;
|
111 |
+
/* border: 1px solid #ccc; */
|
112 |
+
line-height: 1.5em;
|
113 |
+
text-align: center;
|
114 |
+
border-radius: 4px;
|
115 |
+
font-size: 12px;
|
116 |
+
background-color: var(--distill-gray-lightest);
|
117 |
+
outline: none;
|
118 |
+
}
|
119 |
+
|
120 |
+
|
121 |
+
/* tags */
|
122 |
+
.note {
|
123 |
+
background-color: #0B1826;
|
124 |
+
color: #9CA3AF;
|
125 |
+
border-left: 4px solid #075985;
|
126 |
+
padding: 10px 10px 3px 5px;
|
127 |
+
margin: 10px 0;
|
128 |
+
width: 125%;
|
129 |
+
}
|
130 |
+
|
131 |
+
.tip {
|
132 |
+
background-color: #0A181E;
|
133 |
+
color: #9CA3AF;
|
134 |
+
border-left: 4px solid #065F46;
|
135 |
+
padding: 10px 10px 3px 5px;;
|
136 |
+
margin: 10px 0;
|
137 |
+
width: 125%;
|
138 |
+
}
|
139 |
+
|
140 |
+
.caution {
|
141 |
+
background-color: #1C111A;
|
142 |
+
color: #9CA3AF;
|
143 |
+
border-left: 4px solid #FF474C;
|
144 |
+
padding: 10px 10px 3px 5px;;
|
145 |
+
margin: 10px 0;
|
146 |
+
width: 125%;
|
147 |
+
}
|
148 |
+
|
149 |
+
/* Style for the range input */
|
150 |
+
|
151 |
+
.plotly_slider {
|
152 |
+
display: flex;
|
153 |
+
align-items: center;
|
154 |
+
gap: 10px;
|
155 |
+
}
|
156 |
+
|
157 |
+
.plotly_slider > input[type="range"] {
|
158 |
+
-webkit-appearance: none;
|
159 |
+
height: 2px;
|
160 |
+
background: var(--distill-gray-light);
|
161 |
+
border-radius: 5px;
|
162 |
+
outline: none;
|
163 |
+
}
|
164 |
+
|
165 |
+
.plotly_slider > span {
|
166 |
+
font-size: 14px;
|
167 |
+
line-height: 1.6em;
|
168 |
+
min-width: 16px;
|
169 |
+
}
|
170 |
+
|
171 |
+
.plotly_slider > input[type="range"]::-webkit-slider-thumb {
|
172 |
+
-webkit-appearance: none;
|
173 |
+
appearance: none;
|
174 |
+
width: 18px;
|
175 |
+
height: 18px;
|
176 |
+
border-radius: 50%;
|
177 |
+
background: var(--distill-blue);
|
178 |
+
cursor: pointer;
|
179 |
+
}
|
180 |
+
|
181 |
+
.plotly_slider > input[type="range"]::-moz-range-thumb {
|
182 |
+
width: 18px;
|
183 |
+
height: 18px;
|
184 |
+
border-radius: 50%;
|
185 |
+
background: var(--distill-blue);
|
186 |
+
cursor: pointer;
|
187 |
+
}
|
188 |
+
|
189 |
+
/* Style for the labels */
|
190 |
+
.plotly_input_container > label {
|
191 |
+
font-size: 14px;
|
192 |
+
font-weight: bold;
|
193 |
+
}
|
194 |
+
|
195 |
+
.main-plot-container {
|
196 |
+
margin-top: 21px;
|
197 |
+
margin-bottom: 35px;
|
198 |
+
}
|
199 |
+
|
200 |
+
.main-plot-container > figure {
|
201 |
+
display: block !important;
|
202 |
+
/* Let this be handled by graph-container */
|
203 |
+
margin-bottom: 0px;
|
204 |
+
margin-top: 0px;
|
205 |
+
}
|
206 |
+
.main-plot-container > div {
|
207 |
+
display: none !important;
|
208 |
+
}
|
209 |
+
|
210 |
+
|
211 |
+
@media (min-width: 768px) {
|
212 |
+
.main-plot-container > figure {
|
213 |
+
display: none !important;
|
214 |
+
}
|
215 |
+
.main-plot-container > div {
|
216 |
+
display: flex !important;
|
217 |
+
}
|
218 |
+
}
|
219 |
+
|
220 |
+
d-byline .byline {
|
221 |
+
grid-template-columns: 1fr;
|
222 |
+
grid-column: text;
|
223 |
+
font-size: 0.9rem;
|
224 |
+
line-height: 1.8em;
|
225 |
+
}
|
226 |
+
|
227 |
+
@media (min-width: 768px) {
|
228 |
+
d-byline .byline {
|
229 |
+
grid-template-columns: 4fr 1fr 1fr 1fr;
|
230 |
+
}
|
231 |
+
}
|
232 |
+
|
233 |
+
#title-plot {
|
234 |
+
margin-top: 0px;
|
235 |
+
margin-bottom: 0px;
|
236 |
+
}
|
237 |
+
|
238 |
+
d-contents > nav a.active {
|
239 |
+
text-decoration: underline;
|
240 |
+
}
|
241 |
+
|
242 |
+
@media (max-width: 1199px) {
|
243 |
+
d-contents {
|
244 |
+
display: none;
|
245 |
+
justify-self: start;
|
246 |
+
align-self: start;
|
247 |
+
padding-bottom: 0.5em;
|
248 |
+
margin-bottom: 1em;
|
249 |
+
padding-left: 0.25em;
|
250 |
+
border-bottom: 1px solid rgba(0, 0, 0, 0.1);
|
251 |
+
border-bottom-width: 1px;
|
252 |
+
border-bottom-style: solid;
|
253 |
+
border-bottom-color: rgba(0, 0, 0, 0.1);
|
254 |
+
}
|
255 |
+
}
|
256 |
+
|
257 |
+
d-contents a {
|
258 |
+
text-decoration: underline;
|
259 |
+
}
|
260 |
+
|
261 |
+
|
262 |
+
@media (min-width: 1200px) {
|
263 |
+
d-article {
|
264 |
+
/* Ensure d-article does not prevent sticky positioning */
|
265 |
+
overflow: visible;
|
266 |
+
}
|
267 |
+
|
268 |
+
d-contents {
|
269 |
+
align-self: start;
|
270 |
+
grid-column-start: 1 !important;
|
271 |
+
grid-column-end: 4 !important;
|
272 |
+
grid-row: auto / span 6;
|
273 |
+
justify-self: end;
|
274 |
+
margin-top: 0em;
|
275 |
+
padding-right: 2em;
|
276 |
+
padding-left: 2em;
|
277 |
+
border-right: 1px solid rgba(0, 0, 0, 0.1);
|
278 |
+
border-right-width: 1px;
|
279 |
+
border-right-style: solid;
|
280 |
+
border-right-color: rgba(0, 0, 0, 0.1);
|
281 |
+
position: -webkit-sticky; /* For Safari */
|
282 |
+
position: sticky;
|
283 |
+
top: 10px; /* Adjust this value if needed */
|
284 |
+
}
|
285 |
+
}
|
286 |
+
|
287 |
+
d-contents nav h3 {
|
288 |
+
margin-top: 0;
|
289 |
+
margin-bottom: 1em;
|
290 |
+
}
|
291 |
+
|
292 |
+
d-contents nav div {
|
293 |
+
color: #9CA3AF;
|
294 |
+
font-weight: bold;
|
295 |
+
}
|
296 |
+
|
297 |
+
d-contents nav a {
|
298 |
+
color: #9CA3AF;
|
299 |
+
border-bottom: none;
|
300 |
+
text-decoration: none;
|
301 |
+
}
|
302 |
+
|
303 |
+
d-contents li {
|
304 |
+
list-style-type: none;
|
305 |
+
}
|
306 |
+
|
307 |
+
d-contents ul, d-article d-contents ul {
|
308 |
+
padding-left: 1em;
|
309 |
+
}
|
310 |
+
|
311 |
+
d-contents nav ul li {
|
312 |
+
margin-bottom: .25em;
|
313 |
+
}
|
314 |
+
|
315 |
+
d-contents nav a:hover {
|
316 |
+
text-decoration: underline solid rgba(0, 0, 0, 0.6);
|
317 |
+
}
|
318 |
+
|
319 |
+
d-contents nav ul {
|
320 |
+
margin-top: 0;
|
321 |
+
margin-bottom: 6px;
|
322 |
+
}
|
323 |
+
|
324 |
+
|
325 |
+
d-contents nav > div {
|
326 |
+
display: block;
|
327 |
+
outline: none;
|
328 |
+
margin-bottom: 0.5em;
|
329 |
+
}
|
330 |
+
|
331 |
+
d-contents nav > div > a {
|
332 |
+
font-size: 13px;
|
333 |
+
font-weight: 600;
|
334 |
+
}
|
335 |
+
|
336 |
+
d-article aside {
|
337 |
+
margin-bottom: 1em;
|
338 |
+
}
|
339 |
+
|
340 |
+
@media (min-width: 768px) {
|
341 |
+
d-article aside {
|
342 |
+
margin-bottom: 0;
|
343 |
+
}
|
344 |
+
}
|
345 |
+
|
346 |
+
d-contents nav > div > a:hover,
|
347 |
+
d-contents nav > ul > li > a:hover {
|
348 |
+
text-decoration: none;
|
349 |
+
}
|
package-lock.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
package.json
ADDED
@@ -0,0 +1,32 @@
1 |
+
{
|
2 |
+
"dependencies": {
|
3 |
+
"lodash": "^4.17.21",
|
4 |
+
"papaparse": "^5.4.1",
|
5 |
+
"plotly.js-basic-dist-min": "^2.33.0"
|
6 |
+
},
|
7 |
+
"name": "blogpost",
|
8 |
+
"version": "1.0.0",
|
9 |
+
"description": "--- title: \"FAT5: Flash Attention T5\" emoji: ⚡ colorFrom: pink colorTo: red sdk: static pinned: false header: mini ---",
|
10 |
+
"main": "index.js",
|
11 |
+
"scripts": {
|
12 |
+
"dev": "webpack serve --open",
|
13 |
+
"build": "NODE_ENV=production webpack"
|
14 |
+
},
|
15 |
+
"author": "",
|
16 |
+
"license": "ISC",
|
17 |
+
"devDependencies": {
|
18 |
+
"@babel/preset-env": "^7.24.6",
|
19 |
+
"babel-loader": "^9.1.3",
|
20 |
+
"clean-webpack-plugin": "^4.0.0",
|
21 |
+
"compression-webpack-plugin": "^11.1.0",
|
22 |
+
"copy-webpack-plugin": "^12.0.2",
|
23 |
+
"css-loader": "^7.1.2",
|
24 |
+
"html-webpack-change-assets-extension-plugin": "^1.3.1",
|
25 |
+
"html-webpack-plugin": "^5.6.0",
|
26 |
+
"style-loader": "^4.0.0",
|
27 |
+
"webpack": "^5.91.0",
|
28 |
+
"webpack-bundle-analyzer": "^4.10.2",
|
29 |
+
"webpack-cli": "^5.1.4",
|
30 |
+
"webpack-dev-server": "^5.0.4"
|
31 |
+
}
|
32 |
+
}
|
webpack.config.js
ADDED
@@ -0,0 +1,96 @@
1 |
+
const path = require("path");
|
2 |
+
const { CleanWebpackPlugin } = require("clean-webpack-plugin");
|
3 |
+
const CopyPlugin = require("copy-webpack-plugin");
|
4 |
+
const BundleAnalyzerPlugin = require("webpack-bundle-analyzer").BundleAnalyzerPlugin;
|
5 |
+
|
6 |
+
const COLOR_KEYS = ["color", "bgColor", "fillcolor"];
|
7 |
+
|
8 |
+
const transformDataColors = async (data, path) => {
|
9 |
+
const {getNamedColor} = await import('./src/colors.mjs');
|
10 |
+
// if not json file, return
|
11 |
+
if (!path.endsWith(".json")) {
|
12 |
+
return data;
|
13 |
+
}
|
14 |
+
const parsedData = JSON.parse(data);
|
15 |
+
// Change the color of the data
|
16 |
+
const deepIterateAndSetColor = (key, val) => {
|
17 |
+
if (val === null) {
|
18 |
+
return null;
|
19 |
+
}
|
20 |
+
if (val == undefined) {
|
21 |
+
return undefined;
|
22 |
+
}
|
23 |
+
if (Array.isArray(val)) {
|
24 |
+
return val.map(item => deepIterateAndSetColor(key, item));
|
25 |
+
}
|
26 |
+
if (typeof val === "object") {
|
27 |
+
return Object.entries(val).reduce((newObj, [key, value]) => {
|
28 |
+
newObj[key] = deepIterateAndSetColor(key, value);
|
29 |
+
return newObj;
|
30 |
+
}, {});
|
31 |
+
}
|
32 |
+
if (COLOR_KEYS.includes(key)) {
|
33 |
+
const [colorName, opacity, ...rest] = val.trim().split(/\s+/);
|
34 |
+
const floatOpacity = parseFloat(opacity);
|
35 |
+
const newColor = getNamedColor(colorName, floatOpacity);
|
36 |
+
if (newColor !== undefined && rest.length === 0 && !isNaN(floatOpacity)) {
|
37 |
+
console.log(`key: ${key} in file ${path} changed from ${val} to ${newColor}`);
|
38 |
+
return newColor;
|
39 |
+
} else {
|
40 |
+
return val;
|
41 |
+
}
|
42 |
+
}
|
43 |
+
return val;
|
44 |
+
};
|
45 |
+
return JSON.stringify(deepIterateAndSetColor(undefined, parsedData))
|
46 |
+
};
|
47 |
+
|
48 |
+
module.exports = {
|
49 |
+
entry: {
|
50 |
+
distill: "./src/distill.js",
|
51 |
+
main: "./src/index.js",
|
52 |
+
},
|
53 |
+
output: {
|
54 |
+
filename: "[name].bundle.js", // The output file
|
55 |
+
path: path.resolve(__dirname, "dist"), // Output directory
|
56 |
+
},
|
57 |
+
module: {
|
58 |
+
rules: [
|
59 |
+
// { test: /\.css$/, use: ["style-loader", "css-loader"] },
|
60 |
+
{
|
61 |
+
test: /\.(js|mjs)$/,
|
62 |
+
exclude: /node_modules/,
|
63 |
+
use: {
|
64 |
+
loader: "babel-loader",
|
65 |
+
options: {
|
66 |
+
presets: ["@babel/preset-env"],
|
67 |
+
},
|
68 |
+
},
|
69 |
+
},
|
70 |
+
],
|
71 |
+
},
|
72 |
+
plugins: [
|
73 |
+
new CleanWebpackPlugin(),
|
74 |
+
new CopyPlugin({
|
75 |
+
patterns: [
|
76 |
+
{
|
77 |
+
from: "assets",
|
78 |
+
to: "assets",
|
79 |
+
transform: transformDataColors,
|
80 |
+
},
|
81 |
+
{ from: "src/style.css", to: "style.css" },
|
82 |
+
{ from: "src/bibliography.bib", to: "bibliography.bib" },
|
83 |
+
{ from: "src/index.html", to: "index.html" },
|
84 |
+
],
|
85 |
+
}),
|
86 |
+
],
|
87 |
+
devtool: process.env.NODE_ENV === 'production' ? 'source-map' : 'eval-source-map',
|
88 |
+
devServer: {
|
89 |
+
static: "./dist", // Serve files from the 'dist' directory
|
90 |
+
open: process.env.NODE_ENV !== 'production', // Automatically open the browser unless in production
|
91 |
+
hot: process.env.NODE_ENV !== 'production', // Enable hot module replacement unless in production
|
92 |
+
},
|
93 |
+
mode: process.env.NODE_ENV === 'production' ? 'production' : 'development',
|
94 |
+
};
|
95 |
+
|
96 |
+
console.log(process.env.NODE_ENV)
|