MariaFjodorowa committed on
Commit
eb9ee24
·
verified ·
1 Parent(s): a628add

Updating README

Browse files
Files changed (1) hide show
  1. README.md +9 -30
README.md CHANGED
@@ -89,38 +89,17 @@ print([b.name for b in out.branches])
89
  url = "https://aclanthology.org/2023.findings-eacl.146",
90
  doi = "10.18653/v1/2023.findings-eacl.146",
91
  pages = "1954--1974"
92
- })
93
  ```
94
 
95
  ```bibtex
96
- @inproceedings{de-gibert-etal-2024-new-massive,
97
- title = "A New Massive Multilingual Dataset for High-Performance Language Technologies",
98
- author = {de Gibert, Ona and
99
- Nail, Graeme and
100
- Arefyev, Nikolay and
101
- Ba{\~n}{\'o}n, Marta and
102
- van der Linde, Jelmer and
103
- Ji, Shaoxiong and
104
- Zaragoza-Bernabeu, Jaume and
105
- Aulamo, Mikko and
106
- Ram{\'\i}rez-S{\'a}nchez, Gema and
107
- Kutuzov, Andrey and
108
- Pyysalo, Sampo and
109
- Oepen, Stephan and
110
- Tiedemann, J{\"o}rg},
111
- editor = "Calzolari, Nicoletta and
112
- Kan, Min-Yen and
113
- Hoste, Veronique and
114
- Lenci, Alessandro and
115
- Sakti, Sakriani and
116
- Xue, Nianwen",
117
- booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
118
- month = may,
119
- year = "2024",
120
- address = "Torino, Italia",
121
- publisher = "ELRA and ICCL",
122
- url = "https://aclanthology.org/2024.lrec-main.100",
123
- pages = "1116--1128",
124
- abstract = "We present the HPLT (High Performance Language Technologies) language resources, a new massive multilingual dataset including both monolingual and bilingual corpora extracted from CommonCrawl and previously unused web crawls from the Internet Archive. We describe our methods for data acquisition, management and processing of large corpora, which rely on open-source software tools and high-performance computing. Our monolingual collection focuses on low- to medium-resourced languages and covers 75 languages and a total of {\mbox{$\approx$}} 5.6 trillion word tokens de-duplicated on the document level. Our Slovenian-centric parallel corpus is derived from its monolingual counterpart and covers 18 language pairs and more than 96 million aligned sentence pairs with roughly 1.4 billion Slovenian tokens. The HPLT language resources are one of the largest open text corpora ever released, providing a great resource for language modeling and machine translation training. We publicly release the corpora, the software, and the tools used in this work.",
125
  }
126
  ```
 
89
  url = "https://aclanthology.org/2023.findings-eacl.146",
90
  doi = "10.18653/v1/2023.findings-eacl.146",
91
  pages = "1954--1974"
92
+ }
93
  ```
94
 
95
  ```bibtex
96
+ @misc{burchell2025expandedmassivemultilingualdataset,
97
+ title={An Expanded Massive Multilingual Dataset for High-Performance Language Technologies},
98
+ author={Laurie Burchell and Ona de Gibert and Nikolay Arefyev and Mikko Aulamo and Marta Bañón and Pinzhen Chen and Mariia Fedorova and Liane Guillou and Barry Haddow and Jan Hajič and Jindřich Helcl and Erik Henriksson and Mateusz Klimaszewski and Ville Komulainen and Andrey Kutuzov and Joona Kytöniemi and Veronika Laippala and Petter Mæhlum and Bhavitvya Malik and Farrokh Mehryary and Vladislav Mikhailov and Nikita Moghe and Amanda Myntti and Dayyán O'Brien and Stephan Oepen and Proyag Pal and Jousia Piha and Sampo Pyysalo and Gema Ramírez-Sánchez and David Samuel and Pavel Stepachev and Jörg Tiedemann and Dušan Variš and Tereza Vojtěchová and Jaume Zaragoza-Bernabeu},
99
+ year={2025},
100
+ eprint={2503.10267},
101
+ archivePrefix={arXiv},
102
+ primaryClass={cs.CL},
103
+ url={https://arxiv.org/abs/2503.10267},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  }
105
  ```