# test_app / app.py
# rjzevallos's picture
# Update app.py
# 4e61db1 verified
# NOTE(review): the four lines above are Hugging Face page chrome captured
# when this file was copied from the Space's web view; they are kept as
# comments so the module remains valid Python.
import gradio as gr

# Human-readable date rendered in the page footer (see the final Markdown below).
LAST_UPDATED = "Nov 25th 2024"
####################################
# Static leaderboard data
####################################
# One entry per evaluated Catalan TTS model. Metric keys (definitions per
# the Metrics tab below):
#   STOI  - Short-Time Objective Intelligibility (higher is better)
#   PESQ  - Perceptual Evaluation of Speech Quality (higher is better)
#   WER   - Word Error Rate (lower is better)
#   UTMOS - UTokyo-SaruLab predicted Mean Opinion Score (higher is better)
leaderboard_data = [
    {'name': 'StyleTTS 2', 'STOI': 0.998, 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 3.47},
    {'name': 'Matxa-TTS', 'STOI': 0.996, 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50},
    {'name': 'Matxa-TTS-multiaccent', 'STOI': 0.996, 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98},
    {'name': 'StableTTS', 'STOI': 0.997, 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62},
]
# Markdown shown on the "Metrics" tab; rendered verbatim by gr.Markdown below.
METRICS_TAB_TEXT = """
## Metrics
Models in the leaderboard are evaluated using several key metrics:
* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **STOI** (Short-Time Objective Intelligibility),
* **PESQ** (Perceptual Evaluation of Speech Quality).
These metrics help evaluate both the accuracy and quality of the model.
### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a MOS prediction system. **A higher UTMOS indicates better quality** of the generated voice.
### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It measures the percentage of words in the generated transcript that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
Example:
| Reference | the | cat | sat | on | the | mat |
|-------------|------|-----|---------|-----|------|-----|
| Prediction | the | cat | **sit** | on | the | |
| Label | ✅ | ✅ | S | ✅ | ✅ | D |
The WER calculation is done as follows:
```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
### STOI (Short-Time Objective Intelligibility)[[Paper](https://ieeexplore.ieee.org/abstract/document/5495701?casa_token=PLtqLc8KNAgAAAAA:FOLuZ4dgMYsnGb1dQHgqVOouQzRJ3vA5yqj-sbwf8gs9Q-AIDCLkMZzAgzRrAogwwxULK9zsYeE)]
STOI measures the intelligibility of the synthesized speech signal compared to the original signal. **A higher STOI indicates better intelligibility**.
### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/rjzevallos/test_app/blob/main/bsc.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""
####################################
# Functions (static version)
####################################
def get_leaderboard(data=None):
    """
    Return leaderboard rows sorted by UTMOS, best first.

    Args:
        data: Optional list of model dicts with keys ``'name'``,
            ``'STOI'``, ``'PESQ'``, ``'WER'`` and ``'UTMOS'``. Defaults
            to the module-level ``leaderboard_data``.

    Returns:
        A list of rows ``[rank, name, UTMOS, WER, STOI, PESQ]`` matching
        the DataFrame headers, where rank is the 1-indexed position
        after sorting by UTMOS in descending order.
    """
    models = leaderboard_data if data is None else data
    # Sort by UTMOS (predicted perceived quality), highest first.
    # NOTE: the original docstring/comments claimed a PESQ-then-UTMOS
    # ordering, but the code only ever sorted by UTMOS; the documentation
    # now reflects the actual behavior.
    ranked = sorted(models, key=lambda m: m['UTMOS'], reverse=True)
    # Compute rank during row construction instead of writing a 'rank'
    # key into the input dicts (the original mutated the shared
    # module-level data as a side effect).
    return [
        [rank, m['name'], m['UTMOS'], m['WER'], m['STOI'], m['PESQ']]
        for rank, m in enumerate(ranked, start=1)
    ]
####################################
# Gradio interface
####################################
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

# Build the UI. The original file opened `gr.Blocks(theme=theme) as demo`
# twice in a row (and rendered the title Markdown twice), creating a
# throwaway first Blocks context and re-binding `demo`; a single context
# is sufficient and produces the same page.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# 🏆 Leaderboard\nVote to help the community determine the best Catalan TTS models.\n")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS", "WER", "STOI", "PESQ"],
                datatype=["str", "str", "str", "str", "str", "str"],
                value=get_leaderboard(),  # initial (static) table contents
            )
        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

# Launch the application (queue enabled, HTTP API hidden).
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)