|
|
from pathlib import Path

import pandas as pd
import pyperclip
import streamlit as st
import streamlit.components.v1 as components

st.set_page_config(
    page_title="Automatic Speech Recognition for African Languages",
    layout="wide"
)

st.title("ASR for African Languages Model Hub")

tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
    "About",
    "Benchmark Dataset",
    "Model Collections",
    "Evaluation Scenarios",
    "ASR models demo",
    "Results"
])

with tab5:
    st.header("Demo")

    # Embed the interactive ASR demo hosted as a Hugging Face Space.
    components.iframe(
        "https://asr-africa-asr-african-languages.hf.space/",
        height=800,
        scrolling=True
    )

with tab2:
    st.header("Benchmark Dataset")

    md_file = Path("src/benchmark.md")
    if md_file.exists():
        st.markdown(md_file.read_text())
    else:
        st.info("Benchmark description not found. Add src/benchmark.md to populate this tab.")

with tab3:
    st.header("Model Collections")
    st.write("Explore available ASR model collections, grouped by language:")

    languages = {
        "Ewe": "https://huggingface.co/collections/asr-africa/ewe-68d3d85e015eea82e1355e95",
        "Swahili": "https://huggingface.co/collections/asr-africa/swahili-new-676666b26fd924e18fa8781a",
        "Lingala": "https://huggingface.co/collections/asr-africa/lingala-new-676666a913beb149ccc22243",
        "Luganda": "https://huggingface.co/collections/asr-africa/luganda-new-67666690a7812f6a52248d66",
        "Wolof": "https://huggingface.co/collections/asr-africa/wolof-66fbeddd8f3b78428e0bdd57",
        "Hausa": "https://huggingface.co/collections/asr-africa/hausa-66e14b187658eb2032f2d80b",
        "Igbo": "https://huggingface.co/collections/asr-africa/igbo-66e14e30a533df3d8277334d",
        "Yoruba": "https://huggingface.co/collections/asr-africa/yoruba-66e15043c177114958255eaa",
        "Bambara": "https://huggingface.co/collections/asr-africa/bambara-66e152a56048d62cd8e6750b",
        "Zulu": "https://huggingface.co/collections/asr-africa/zulu-66e1d8c419ce4dfba1d500b1",
        "Xhosa": "https://huggingface.co/collections/asr-africa/xhosa-66e1da92a4fcbc413b4699eb",
        "Afrikaans": "https://huggingface.co/collections/asr-africa/afrikaans-66e1dc2e07da322da51ca415",
        "Bemba": "https://huggingface.co/collections/asr-africa/bemba-66e1dd3adce93c72498d12c3",
        "Shona": "https://huggingface.co/collections/asr-africa/shona-66e1de0a076e2b2237b7c5a8",
        "Kinyarwanda": "https://huggingface.co/collections/asr-africa/kinyarwanda-66e2e97e15879154e1f47fb7",
        "Fula": "https://huggingface.co/collections/asr-africa/fula-66e97b9370af82f2d163e80d",
        "Akan": "https://huggingface.co/collections/asr-africa/akan-66e97d0da2f86f17cad499f0"
    }

    base_models = {
        "Wav2Vec2 XLS-R (300M)": "https://huggingface.co/facebook/wav2vec2-xls-r-300m",
        "Whisper-Small": "https://huggingface.co/openai/whisper-small",
        "MMS-1B": "https://huggingface.co/facebook/mms-1b-all",
        "W2V2-BERT 2.0": "https://huggingface.co/facebook/w2v-bert-2.0"
    }

    st.subheader("Base Architectures")
    for name, link in base_models.items():
        st.markdown(f"- [{name}]({link})")
|
|
|
|
|
st.subheader("Language-Specific Collections") |
|
|
|
|
|
|
|
|
for lang in sorted(languages.keys()): |
|
|
link = languages[lang] |
|
|
with st.expander(f"{lang} Models"): |
|
|
st.markdown(f"[View full {lang} collection on Hugging Face]({link})") |
|
|
st.write( |
|
|
"Models fine-tuned from Wav2Vec2 XLS-R, Whisper, MMS-1B, and W2V2-BERT " |
|
|
"to support high-quality speech recognition in this language." |
|
|
) |
|
|
|
|
|
with tab4:
    st.header("Evaluation Scenarios")
    st.write(
        "To benchmark ASR models for African languages, we design evaluation scenarios "
        "that mimic real-world challenges such as limited training data, domain shift, "
        "and variation in speech style."
    )

    st.subheader("Scenario Overview")
    scenarios = pd.DataFrame([
        {
            "Scenario": "Data Efficiency Benchmark",
            "Focus": "Low-resource training (1 hour per language)",
            "Languages": "Multiple African languages",
            "Dataset": "asr-africa/ASRAfricaDataEfficiencyBenchmark"
        },
        {
            "Scenario": "Domain Adaptation Benchmark",
            "Focus": "Performance shift across domains",
            "Languages": "Akan (Finance), Wolof (Agriculture)",
            "Dataset": "asr-africa/African-ASR-Domain-Adaptation-Evaluation"
        },
        {
            "Scenario": "Speech Type Adaptation",
            "Focus": "Different speech types (read, conversational, etc.)",
            "Languages": "Luganda, Wolof",
            "Dataset": "asr-africa/African-ASR-Speech-Type-Adaptation"
        }
    ])

    st.dataframe(scenarios, width='stretch')

st.subheader("Explore Scenarios") |
|
|
|
|
|
with st.expander("Data Efficiency Benchmark"): |
|
|
st.markdown(""" |
|
|
- **Goal:** Evaluate ASR performance in low-resource conditions. |
|
|
- **Design:** 1 hour of transcribed audio per language. |
|
|
- **Includes:** audio + metadata. |
|
|
- **Use case:** Encourage data-efficient ASR systems. |
|
|
🔗 [View dataset](https://huggingface.co/datasets/asr-africa/ASRAfricaDataEfficiencyBenchmark) |
|
|
""") |
|
|
|
|
|
with st.expander("Domain Adaptation Benchmark"): |
|
|
st.markdown(""" |
|
|
- **Goal:** Test ASR generalization across domains. |
|
|
- **Languages:** |
|
|
- Akan → Financial domain testing. |
|
|
- Wolof → Agricultural domain testing. |
|
|
- **Challenge:** Many ASR systems degrade when tested on new domains. |
|
|
🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Domain-Adaptation-Evaluation) |
|
|
""") |
|
|
|
|
|
with st.expander("Speech Type Adaptation"): |
|
|
st.markdown(""" |
|
|
- **Goal:** Measure ASR performance on different types of speech. |
|
|
- **Types of Speech:** Read speech, conversational, spontaneous speech. |
|
|
🔗 [View dataset](https://huggingface.co/datasets/asr-africa/African-ASR-Speech-Type-Adaptation) |
|
|
""") |
|
|
|
|
|
with tab1:
    st.header("About")
    st.write(
        "Automatic Speech Recognition for African Languages: how much speech data "
        "is required for a good domain-specific ASR model?"
    )
st.markdown(""" |
|
|
Previous studies have led to the collection of a considerable number of hours of open-source ASR data, |
|
|
for example, the work done in India where over 1000’s of hours of data were collected for low-resource |
|
|
Indian languages. In this research, we would like to transfer the learnings from these successes and |
|
|
replicate the same model for low-resource African languages. For example, the aspects around the use of |
|
|
speech data from noisy and non-noisy environments. However, to ensure that we proceed in a cost-efficient |
|
|
and sustainable approach, we deem it necessary to understand the amount of data that we need to collect |
|
|
for African languages. Hence, we propose to leverage the Mozilla Common Voice (MCV) platform and other |
|
|
appropriate and openly available / open-source repositories of African language datasets (see Appendix One) |
|
|
to build automatic speech recognition models and test their performance to learn if the data collected was sufficient. |
|
|
|
|
|
### What is a “Good ASR Model”? |
|
|
We will need to define what a “good” ASR model is for African languages. The aspects to consider will include |
|
|
the performance measures of ASR models, the performance indicators and the vocabulary size for domain-specific ASR models. |
|
|
|
|
|
- **Performance metrics**: Word Error Rate (WER) and Character Error Rate (CER). |
|
|
- **Target benchmarks**: < 10% WER and < 5% CER in lab settings. |
|
|
- **Lab vs Non-Lab**: Lab data = controlled clean audio; Non-lab = noisy, diverse, real-world audio. |
|
|
|
|
|
### Performance Indicators |
|
|
WER is not always the best indicator, especially for languages with diacritics. We need to test models on |
|
|
different **domains, distributions, and languages** to avoid over/underestimation. |
|
|
Usefulness will be measured on **generalization** and **accuracy**. |
|
|
|
|
|
### Sectors of Interest |
|
|
We will also further investigate the performance of domain-specific ASR models. We would like to investigate three specific domains: Health, Agriculture, and Education, which are key focus areas for the Foundation and for which we can have usable end-user applications in the African context. The idea is to obtain open-source speech datasets for these specific domains and evaluate ASR model performance across these domains. |
|
|
|
|
|
|
|
|
### Benchmark Dataset |
|
|
We will build a test set that can be used for benchmarking ASR models in some of the 30 most spoken African languages. The benchmark dataset will be structured to consist of unique MP3 files and corresponding text files. We will ensure as much as possible that the benchmark datasets are as diverse as possible with dataset characteristics like gender, age, accent, variant, vocabulary, acoustic characteristics to help improve the accuracy of speech recognition models. The speech benchmark dataset will be reviewed, deemed highly quality, and split into dev, test and train sets. Due to the largely acoustic nature of African languages (mostly tonal, diacritical, etc.), a careful speech analysis of African languages is necessary and the benchmark dataset is important to spur more research in the African context. |
|
|
|
|
|
""") |
|
|
|
|
|
CITATION_TEXT = """@misc{asr-africa-2025,
    title = {Automatic Speech Recognition for African Languages},
    author = {Nakatumba-Nabende, Joyce and Nabende, Peter and Katumba, Andrew and Nahabwe, Alvin},
    year = 2025,
    publisher = {Hugging Face},
    howpublished = "\\url{https://huggingface.co/spaces/asr-africa/Automatic_Speech_Recognition_for_African_Languages}"
}"""

with st.expander("📙 Citation", expanded=False):
    st.text_area(
        "BibTeX snippet to cite this source",
        value=CITATION_TEXT,
        height=150,
        disabled=True
    )

# Note: pyperclip copies to the clipboard of the machine running the Streamlit
# server, so this button only works reliably when the app runs locally.
if st.button("📋 Copy to Clipboard"):
    try:
        pyperclip.copy(CITATION_TEXT)
        st.success("Citation copied to clipboard!")
    except pyperclip.PyperclipException:
        st.error("Could not copy automatically. Please copy manually.")

with tab6:
    st.header("Results: WER vs Dataset Size")

    st.subheader("Introduction")
    st.write("""
    Automatic Speech Recognition (ASR) for African languages remains challenging due to the scarcity of labeled data
    and limited methodological guidance for low-resource settings. While interest in multilingual and low-resource
    ASR is growing, there is still limited understanding of how different pretrained models perform across diverse
    African languages, data sizes, and decoding strategies.

    In this study, we benchmark four state-of-the-art ASR models (Wav2Vec2 XLS-R, Whisper, MMS, and W2V-BERT) across
    17 African languages representing East, West, and Southern Africa: Luganda, Swahili, Kinyarwanda, Wolof, Akan,
    Ewe, Xhosa, Lingala, Amharic, Bambara, Bemba, Zulu, Igbo, Shona, Afrikaans, Hausa, and Fula. Our findings
    contribute empirical insights into model robustness and data efficiency in low-resource scenarios.
    """)

st.subheader("Datasets") |
|
|
st.write(""" |
|
|
We trained each ASR model on 1, 5, 10, 20, 50, 100, 200 and 400-hour splits, based on labelled data available perlanguage. For Wav2Vec2-XLS-R and W2V-BERT, we also trained 5-gram language models using available textual data to assess the impact of language model integration |
|
|
|
|
|
""") |
|
|
|
|
|
|
|
|
st.subheader("Results") |
|
|
st.write(""" |
|
|
Overall, the Word Error Rate (WER) decreases as the number of training hours increases across all models and |
|
|
languages. This highlights the importance of dataset size in improving ASR performance, although the rate of |
|
|
improvement varies significantly between models. |
|
|
""") |
|
|
|
|
|
st.subheader("XLS-R") |
|
|
st.write(""" |
|
|
XLS-R shows a steep decline in log WER as the dataset size increases, especially in low-to-moderate data regimes. |
|
|
The improvement slows as the dataset becomes larger, suggesting diminishing returns in high-data settings. |
|
|
""") |
|
|
st.image("src/Images/xlsrlog.png", caption="Log WER vs Training Hours for XLS-R") |
|
|
|
|
|
|
|
|
st.subheader("W2v-BERT") |
|
|
st.write(""" |
|
|
W2v-BERT exhibits a more gradual decline in log WER. It performs well in low-data settings, showing stable reduction |
|
|
in WER as dataset size increases. This makes it suitable for low-resource languages. |
|
|
""") |
|
|
st.image("src/Images/bertlog.png", caption="Log WER vs Training Hours for W2v-BERT") |
|
|
|
|
|
|
|
|
st.subheader("Whisper") |
|
|
st.write(""" |
|
|
Whisper shows a consistent but moderate decline in log WER. Improvements are more linear compared to XLS-R, benefiting |
|
|
steadily from additional data, but it does not reach XLS-R’s high-data performance. |
|
|
""") |
|
|
st.image("src/Images/whisperlog.png", caption="Log WER vs Training Hours for Whisper") |
|
|
|
|
|
|
|
|
st.subheader("MMS") |
|
|
st.write(""" |
|
|
MMS shows significant improvement between 1–5 hours of training across multiple languages. However, the rate of |
|
|
improvement declines as more data is added. MMS performs strongly in both low- and high-data settings. |
|
|
""") |
|
|
st.image("src/Images/mmslog.png", caption="Log WER vs Training Hours for MMS") |
|
|
|
|
|
|
|
|
st.subheader("Takeaways") |
|
|
st.write(""" |
|
|
Model performance generally improves with more training data, but performance gains become smaller after 100 hours for some languages. Language model are more effective when training data is limited especially below 5o hours, but their impact reduces as data increases, with some variation across languages. |
|
|
|
|
|
""") |