Automatic Speech Recognition
Transformers
TensorBoard
Safetensors
whisper
Generated from Trainer
cobrayyxx committed on
Commit
f1f0089
·
verified ·
1 Parent(s): 334b528

Modify Citation

Browse files
Files changed (1) hide show
  1. README.md +8 -36
README.md CHANGED
@@ -81,13 +81,14 @@ Performance of this model was evaluated using WER on the test split of Big-C dat
81
  ## Citation
82
 
83
  ```
84
- @inproceedings{nllb2022,
85
- title = {No Language Left Behind: Scaling Human-Centered Machine Translation},
86
- author = {Costa-jussà, Marta R. and Cross, James and et al.},
87
- booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
88
- year = {2022},
89
- publisher = {Association for Computational Linguistics},
90
- url = {https://aclanthology.org/2022.emnlp-main.9}
 
91
  }
92
 
93
  @inproceedings{sikasote-etal-2023-big,
@@ -110,19 +111,6 @@ Performance of this model was evaluated using WER on the test split of Big-C dat
110
  abstract = "We present BIG-C (Bemba Image Grounded Conversations), a large multimodal dataset for Bemba. While Bemba is the most populous language of Zambia, it exhibits a dearth of resources which render the development of language technologies or language processing research almost impossible. The dataset is comprised of multi-turn dialogues between Bemba speakers based on images, transcribed and translated into English. There are more than 92,000 utterances/sentences, amounting to more than 180 hours of audio data with corresponding transcriptions and English translations. We also provide baselines on speech recognition (ASR), machine translation (MT) and speech translation (ST) tasks, and sketch out other potential future multimodal uses of our dataset. We hope that by making the dataset available to the research community, this work will foster research and encourage collaboration across the language, speech, and vision communities especially for languages outside the {``}traditionally{''} used high-resourced ones. All data and code are publicly available: [\url{https://github.com/csikasote/bigc}](\url{https://github.com/csikasote/bigc}).",
111
  }
112
 
113
- @inproceedings{wang-etal-2024-afrimte,
114
- title = "{A}fri{MTE} and {A}fri{COMET}: Enhancing {COMET} to Embrace Under-resourced {A}frican Languages",
115
- author = "Wang, Jiayi and Adelani, David and Agrawal, Sweta and Masiak, Marek and Rei, Ricardo and Briakou, Eleftheria and Carpuat, Marine and He, Xuanli and others",
116
- booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
117
- month = "jun",
118
- year = "2024",
119
- address = "Mexico City, Mexico",
120
- publisher = "Association for Computational Linguistics",
121
- url = "https://aclanthology.org/2024.naacl-long.334/",
122
- doi = "10.18653/v1/2024.naacl-long.334",
123
- pages = "5997--6023"
124
- }
125
-
126
  @InProceedings{sikasote-anastasopoulos:2022:LREC,
127
  author = {Sikasote, Claytone and Anastasopoulos, Antonios},
128
  title = {BembaSpeech: A Speech Recognition Corpus for the Bemba Language},
@@ -135,22 +123,6 @@ @inproceedings{wang-etal-2024-afrimte,
135
  abstract = {We present a preprocessed, ready-to-use automatic speech recognition corpus, BembaSpeech, consisting over 24 hours of read speech in the Bemba language, a written but low-resourced language spoken by over 30\% of the population in Zambia. To assess its usefulness for training and testing ASR systems for Bemba, we explored different approaches; supervised pre-training (training from scratch), cross-lingual transfer learning from a monolingual English pre-trained model using DeepSpeech on the portion of the dataset and fine-tuning large scale self-supervised Wav2Vec2.0 based multilingual pre-trained models on the complete BembaSpeech corpus. From our experiments, the 1 billion XLS-R parameter model gives the best results. The model achieves a word error rate (WER) of 32.91\%, results demonstrating that model capacity significantly improves performance and that multilingual pre-trained models transfers cross-lingual acoustic representation better than monolingual pre-trained English model on the BembaSpeech for the Bemba ASR. Lastly, results also show that the corpus can be used for building ASR systems for Bemba language.},
136
  url = {https://aclanthology.org/2022.lrec-1.790}
137
  }
138
-
139
- @inproceedings{wang2024evaluating,
140
- title={Evaluating WMT 2024 Metrics Shared Task Submissions on AfriMTE (the African Challenge Set)},
141
- author={Wang, Jiayi and Adelani, David Ifeoluwa and Stenetorp, Pontus},
142
- booktitle={Proceedings of the Ninth Conference on Machine Translation},
143
- pages={505--516},
144
- year={2024}
145
- }
146
-
147
- @inproceedings{freitag2024llms,
148
- title={Are LLMs breaking MT metrics? results of the WMT24 metrics shared task},
149
- author={Freitag, Markus and Mathur, Nitika and Deutsch, Daniel and Lo, Chi-Kiu and Avramidis, Eleftherios and Rei, Ricardo and Thompson, Brian and Blain, Frederic and Kocmi, Tom and Wang, Jiayi and others},
150
- booktitle={Proceedings of the Ninth Conference on Machine Translation},
151
- pages={47--81},
152
- year={2024}
153
- }
154
  ```
155
  # Contact
156
 
 
81
  ## Citation
82
 
83
  ```
84
+ @misc{radford2022whisper,
85
+ doi = {10.48550/ARXIV.2212.04356},
86
+ url = {https://arxiv.org/abs/2212.04356},
87
+ author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
88
+ title = {Robust Speech Recognition via Large-Scale Weak Supervision},
89
+ publisher = {arXiv},
90
+ year = {2022},
91
+ copyright = {arXiv.org perpetual, non-exclusive license}
92
  }
93
 
94
  @inproceedings{sikasote-etal-2023-big,
 
111
  abstract = "We present BIG-C (Bemba Image Grounded Conversations), a large multimodal dataset for Bemba. While Bemba is the most populous language of Zambia, it exhibits a dearth of resources which render the development of language technologies or language processing research almost impossible. The dataset is comprised of multi-turn dialogues between Bemba speakers based on images, transcribed and translated into English. There are more than 92,000 utterances/sentences, amounting to more than 180 hours of audio data with corresponding transcriptions and English translations. We also provide baselines on speech recognition (ASR), machine translation (MT) and speech translation (ST) tasks, and sketch out other potential future multimodal uses of our dataset. We hope that by making the dataset available to the research community, this work will foster research and encourage collaboration across the language, speech, and vision communities especially for languages outside the {``}traditionally{''} used high-resourced ones. All data and code are publicly available: [\url{https://github.com/csikasote/bigc}](\url{https://github.com/csikasote/bigc}).",
112
  }
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  @InProceedings{sikasote-anastasopoulos:2022:LREC,
115
  author = {Sikasote, Claytone and Anastasopoulos, Antonios},
116
  title = {BembaSpeech: A Speech Recognition Corpus for the Bemba Language},
 
123
  abstract = {We present a preprocessed, ready-to-use automatic speech recognition corpus, BembaSpeech, consisting over 24 hours of read speech in the Bemba language, a written but low-resourced language spoken by over 30\% of the population in Zambia. To assess its usefulness for training and testing ASR systems for Bemba, we explored different approaches; supervised pre-training (training from scratch), cross-lingual transfer learning from a monolingual English pre-trained model using DeepSpeech on the portion of the dataset and fine-tuning large scale self-supervised Wav2Vec2.0 based multilingual pre-trained models on the complete BembaSpeech corpus. From our experiments, the 1 billion XLS-R parameter model gives the best results. The model achieves a word error rate (WER) of 32.91\%, results demonstrating that model capacity significantly improves performance and that multilingual pre-trained models transfers cross-lingual acoustic representation better than monolingual pre-trained English model on the BembaSpeech for the Bemba ASR. Lastly, results also show that the corpus can be used for building ASR systems for Bemba language.},
124
  url = {https://aclanthology.org/2022.lrec-1.790}
125
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  ```
127
  # Contact
128