| import io | |
| import gzip | |
| import json | |
| import sys | |
| import requests | |
| from tqdm import tqdm | |
| _DATA_URL_TRAIN = "https://huggingface.co/datasets/bertin-project/mc4-es-sampled/resolve/main/mc4-es-train-50M-{config}-shard-{index:04d}-of-{n_shards:04d}.json.gz" | |
def main(config="stepwise"):
    """Download all training shards for ``config`` and merge them into one file.

    Each shard URL is built from ``_DATA_URL_TRAIN``, fetched over HTTP,
    gunzipped in memory and read line by line. Every non-empty line is
    parsed as JSON and re-serialized, so malformed records fail loudly
    instead of being copied through. The result is written to
    ``mc4-es-train-50M-{config}.jsonl`` in the current working directory,
    one JSON document per line.

    Args:
        config: Name substituted into the shard URLs and the output file
            name. Defaults to ``"stepwise"``.

    Raises:
        requests.HTTPError: If a shard request returns an error status.
        requests.RequestException: On connection failures or timeouts.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    n_shards = 1024
    # Shard numbers in the URLs are 1-based: 0001 .. 1024.
    data_urls = [
        _DATA_URL_TRAIN.format(config=config, index=index, n_shards=n_shards)
        for index in range(1, n_shards + 1)
    ]
    with open(f"mc4-es-train-50M-{config}.jsonl", "w", encoding="utf-8") as f:
        # One session reuses the underlying connection across all requests
        # to the same host instead of reconnecting for every shard.
        with requests.Session() as session:
            for data_url in tqdm(data_urls):
                # (connect, read) timeouts so a stalled server cannot hang
                # the whole run indefinitely.
                response = session.get(data_url, timeout=(10, 60))
                # Fail on 4xx/5xx here rather than handing an HTML error
                # page to gzip and getting an obscure decompression error.
                response.raise_for_status()
                bio = io.BytesIO(response.content)
                with gzip.open(bio, "rt", encoding="utf8") as g:
                    for line in g:
                        line = line.strip()
                        if not line:
                            # Tolerate blank lines (e.g. a trailing newline).
                            continue
                        f.write(json.dumps(json.loads(line)) + "\n")
if __name__ == "__main__":
    # The first CLI argument, if present, selects the config. With no
    # argument the slice is empty and main() uses its own default, instead
    # of crashing with IndexError on sys.argv[1].
    main(*sys.argv[1:2])