|
{ |
|
"builder_name": "common_voice_17_0", |
|
"citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", |
|
"config_name": "ta", |
|
"dataset_name": "common_voice_17_0", |
|
"dataset_size": 221361139, |
|
"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak. The dataset currently consists of 20408 validated hours of speech in 124 languages, but more voices and languages are always added.", |
|
"download_checksums": { |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/n_shards.json": { |
|
"num_bytes": 17491, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/train/ta_train_0.tar": { |
|
"num_bytes": 1598955520, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/train/ta_train_1.tar": { |
|
"num_bytes": 224542720, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/dev/ta_dev_0.tar": { |
|
"num_bytes": 434257920, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/test/ta_test_0.tar": { |
|
"num_bytes": 454778880, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_0.tar": { |
|
"num_bytes": 1560514560, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_1.tar": { |
|
"num_bytes": 1515827200, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/other/ta_other_2.tar": { |
|
"num_bytes": 495831040, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/invalidated/ta_invalidated_0.tar": { |
|
"num_bytes": 231424000, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_0.tar": { |
|
"num_bytes": 1447434240, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_1.tar": { |
|
"num_bytes": 1530644480, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_2.tar": { |
|
"num_bytes": 1654978560, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ta/validated/ta_validated_3.tar": { |
|
"num_bytes": 652861440, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/train.tsv": { |
|
"num_bytes": 19608830, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/dev.tsv": { |
|
"num_bytes": 5203704, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/test.tsv": { |
|
"num_bytes": 4944646, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/other.tsv": { |
|
"num_bytes": 39470943, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/invalidated.tsv": { |
|
"num_bytes": 2499761, |
|
"checksum": null |
|
}, |
|
"https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/transcript/ta/validated.tsv": { |
|
"num_bytes": 56763398, |
|
"checksum": null |
|
} |
|
}, |
|
"download_size": 11930559333, |
|
"features": { |
|
"context": { |
|
"sampling_rate": 16000, |
|
"_type": "Audio" |
|
}, |
|
"instruction": { |
|
"dtype": "string", |
|
"_type": "Value" |
|
}, |
|
"answer": { |
|
"dtype": "string", |
|
"_type": "Value" |
|
}, |
|
"audio_length": { |
|
"dtype": "float64", |
|
"_type": "Value" |
|
}, |
|
"language": { |
|
"dtype": "string", |
|
"_type": "Value" |
|
} |
|
}, |
|
"homepage": "https://commonvoice.mozilla.org/en/datasets", |
|
"license": "https://creativecommons.org/publicdomain/zero/1.0/", |
|
"size_in_bytes": 12151920472, |
|
"splits": { |
|
"train": { |
|
"name": "train", |
|
"num_bytes": 33336098, |
|
"num_examples": 45587, |
|
"dataset_name": "common_voice_17_0" |
|
}, |
|
"validation": { |
|
"name": "validation", |
|
"num_bytes": 8797317, |
|
"num_examples": 12095, |
|
"dataset_name": "common_voice_17_0" |
|
}, |
|
"test": { |
|
"name": "test", |
|
"num_bytes": 8556167, |
|
"num_examples": 12074, |
|
"dataset_name": "common_voice_17_0" |
|
}, |
|
"other": { |
|
"name": "other", |
|
"num_bytes": 67773267, |
|
"num_examples": 93989, |
|
"dataset_name": "common_voice_17_0" |
|
}, |
|
"invalidated": { |
|
"name": "invalidated", |
|
"num_bytes": 4282268, |
|
"num_examples": 5693, |
|
"dataset_name": "common_voice_17_0" |
|
}, |
|
"validated": { |
|
"name": "validated", |
|
"num_bytes": 98616022, |
|
"num_examples": 135391, |
|
"dataset_name": "common_voice_17_0" |
|
} |
|
}, |
|
"version": { |
|
"version_str": "17.0.0", |
|
"major": 17, |
|
"minor": 0, |
|
"patch": 0 |
|
} |
|
} |