import gradio as gr

LAST_UPDATED = "Nov 25th 2024"

####################################
# Static leaderboard data
####################################
leaderboard_data = [
    {'name': 'StyleTTS 2', 'STOI': 0.998, 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 3.47},
    {'name': 'Matxa-TTS', 'STOI': 0.996, 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50},
    {'name': 'Matxa-TTS-multiaccent', 'STOI': 0.996, 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98},
    {'name': 'StableTTS', 'STOI': 0.997, 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62},
]


# Markdown text for the metrics tab
METRICS_TAB_TEXT = """
## Metrics

Models in the leaderboard are evaluated using several key metrics: 
* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **STOI** (Short-Time Objective Intelligibility),
* **PESQ** (Perceptual Evaluation of Speech Quality).

Together they capture both the accuracy of the synthesized speech (WER) and its perceptual quality and intelligibility (UTMOS, PESQ, STOI).

### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a neural MOS prediction system: it estimates the Mean Opinion Score that human listeners would assign to a speech sample. **A higher UTMOS indicates better quality** of the generated voice.
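
As an illustration, a synthesized waveform could be scored with the community [SpeechMOS](https://github.com/tarepan/SpeechMOS) packaging of UTMOS22; the sketch below assumes that entry point, which may differ from the exact scoring setup used for this leaderboard:

```python
# pip install torch torchaudio
import torch
import torchaudio

# Assumption: the tarepan/SpeechMOS torch.hub entry point for UTMOS22
predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)

wave, sr = torchaudio.load("synthesized.wav")  # hypothetical mono file, shape [1, T]
score = predictor(wave, sr)                    # predicted MOS, roughly on a 1-5 scale
print(score.item())
```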


### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It counts the word substitutions (S), insertions (I), and deletions (D) needed to turn the generated transcript into the reference (correct) transcript, normalized by the number of words in the reference (N). **A lower WER value indicates higher accuracy**.

Example:
| Reference   | the  | cat | sat     | on  | the  | mat |
|-------------|------|-----|---------|-----|------|-----|
| Prediction  | the  | cat | **sit** | on  | the  |     |
| Label       | ✅   | ✅  | S       | ✅  | ✅   | D   |

With one substitution ("sit" for "sat") and one deletion (the missing "mat") over six reference words, the WER is:

```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
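
The same number can be reproduced with the open-source `jiwer` package (a minimal sketch; install with `pip install jiwer`):

```python
from jiwer import wer

reference = "the cat sat on the mat"
hypothesis = "the cat sit on the"

# One substitution ("sit") and one deletion ("mat") over 6 reference words
print(wer(reference, hypothesis))  # 0.333...
```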

### STOI (Short-Time Objective Intelligibility)[[Paper](https://ieeexplore.ieee.org/abstract/document/5495701)]
STOI measures the intelligibility of the synthesized speech signal compared to the original signal. **A higher STOI indicates better intelligibility**.
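
For reference, STOI can be computed with the open-source `pystoi` package (a sketch with hypothetical file names; both signals must be time-aligned and the same length):

```python
# pip install pystoi soundfile
import soundfile as sf
from pystoi import stoi

reference, fs = sf.read("reference.wav")     # hypothetical original recording
synthesized, _ = sf.read("synthesized.wav")  # hypothetical TTS output, same rate

# stoi(clean, degraded, sample_rate) returns a score roughly in [0, 1]
print(f"STOI: {stoi(reference, synthesized, fs, extended=False):.3f}")
```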

### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
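
Similarly, a sketch with the open-source `pesq` package (hypothetical file names; PESQ expects time-aligned speech at 8 kHz or 16 kHz):

```python
# pip install pesq soundfile
import soundfile as sf
from pesq import pesq

reference, fs = sf.read("reference.wav")     # hypothetical original recording
synthesized, _ = sf.read("synthesized.wav")  # hypothetical TTS output, same rate

# 'wb' = wide-band mode (16 kHz); use 'nb' for 8 kHz. Higher is better.
print(f"PESQ: {pesq(fs, reference, synthesized, 'wb'):.3f}")
```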


## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/rjzevallos/test_app/blob/main/bsc.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""



####################################
# Functions (static version)
####################################

def get_leaderboard():
    """
    Retorna el leaderboard en orden descendente por PESQ y luego por UTMOS.
    """
    # Ordenar primero por PESQ (calidad del habla) y luego por UTMOS (calidad percibida)
    sorted_leaderboard = sorted(leaderboard_data, key=lambda x: (x['UTMOS']), reverse=True)
    
    # Assign 1-indexed ranks based on the UTMOS ordering
    for rank, model in enumerate(sorted_leaderboard, start=1):
        model['rank'] = rank
    
    return [[model['rank'], model['name'], model['UTMOS'], model['WER'], model['STOI'], model['PESQ']] for model in sorted_leaderboard]

####################################
# Gradio interface
####################################

theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# 🏆 Leaderboard\nVote to help the community determine the best Catalan TTS models.\n")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS", "WER", "STOI", "PESQ"], 
                datatype=["number", "str", "number", "number", "number", "number"],
                value=get_leaderboard()  # Load the initial table data
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")



# Launch the application
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)