# test_app / app.py
# rjzevallos's picture
# Update app.py
# 4e61db1 verified
# NOTE(review): the four lines above are Hugging Face page chrome captured
# when this file was copied from the Space's web view; they are kept as
# comments so the module remains valid Python.
import gradio as gr

# Human-readable date rendered in the page footer (see the final Markdown below).
LAST_UPDATED = "Nov 25th 2024"
####################################
# Static leaderboard data
####################################
# One entry per evaluated Catalan TTS model. Metric keys (definitions per
# the Metrics tab below):
#   STOI  - Short-Time Objective Intelligibility (higher is better)
#   PESQ  - Perceptual Evaluation of Speech Quality (higher is better)
#   WER   - Word Error Rate (lower is better)
#   UTMOS - UTokyo-SaruLab predicted Mean Opinion Score (higher is better)
leaderboard_data = [
    {'name': 'StyleTTS 2', 'STOI': 0.998, 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 3.47},
    {'name': 'Matxa-TTS', 'STOI': 0.996, 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50},
    {'name': 'Matxa-TTS-multiaccent', 'STOI': 0.996, 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98},
    {'name': 'StableTTS', 'STOI': 0.997, 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62},
]
# Markdown shown on the "Metrics" tab; rendered verbatim by gr.Markdown below.
METRICS_TAB_TEXT = """
## Metrics
Models in the leaderboard are evaluated using several key metrics:
* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **STOI** (Short-Time Objective Intelligibility),
* **PESQ** (Perceptual Evaluation of Speech Quality).
These metrics help evaluate both the accuracy and quality of the model.
### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a MOS prediction system. **A higher UTMOS indicates better quality** of the generated voice.
### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It measures the percentage of words in the generated transcript that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
Example:
| Reference | the | cat | sat | on | the | mat |
|-------------|------|-----|---------|-----|------|-----|
| Prediction | the | cat | **sit** | on | the | |
| Label | ✅ | ✅ | S | ✅ | ✅ | D |
The WER calculation is done as follows:
```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
### STOI (Short-Time Objective Intelligibility)[[Paper](https://ieeexplore.ieee.org/abstract/document/5495701?casa_token=PLtqLc8KNAgAAAAA:FOLuZ4dgMYsnGb1dQHgqVOouQzRJ3vA5yqj-sbwf8gs9Q-AIDCLkMZzAgzRrAogwwxULK9zsYeE)]
STOI measures the intelligibility of the synthesized speech signal compared to the original signal. **A higher STOI indicates better intelligibility**.
### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/rjzevallos/test_app/blob/main/bsc.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""
####################################
# Functions (static version)
####################################
def get_leaderboard(data=None):
    """
    Return leaderboard rows sorted by UTMOS, best first.

    Args:
        data: Optional list of model dicts with keys ``'name'``,
            ``'STOI'``, ``'PESQ'``, ``'WER'`` and ``'UTMOS'``. Defaults
            to the module-level ``leaderboard_data``.

    Returns:
        A list of rows ``[rank, name, UTMOS, WER, STOI, PESQ]`` matching
        the DataFrame headers, where rank is the 1-indexed position
        after sorting by UTMOS in descending order.
    """
    models = leaderboard_data if data is None else data
    # Sort by UTMOS (predicted perceived quality), highest first.
    # NOTE: the original docstring/comments claimed a PESQ-then-UTMOS
    # ordering, but the code only ever sorted by UTMOS; the documentation
    # now reflects the actual behavior.
    ranked = sorted(models, key=lambda m: m['UTMOS'], reverse=True)
    # Compute rank during row construction instead of writing a 'rank'
    # key into the input dicts (the original mutated the shared
    # module-level data as a side effect).
    return [
        [rank, m['name'], m['UTMOS'], m['WER'], m['STOI'], m['PESQ']]
        for rank, m in enumerate(ranked, start=1)
    ]
####################################
# Gradio interface
####################################
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

# Build the UI. The original file opened `gr.Blocks(theme=theme) as demo`
# twice in a row (and rendered the title Markdown twice), creating a
# throwaway first Blocks context and re-binding `demo`; a single context
# is sufficient and produces the same page.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# 🏆 Leaderboard\nVote to help the community determine the best Catalan TTS models.\n")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS", "WER", "STOI", "PESQ"],
                datatype=["str", "str", "str", "str", "str", "str"],
                value=get_leaderboard(),  # initial (static) table contents
            )
        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

# Launch the application (queue enabled, HTTP API hidden).
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)