Spaces:
Runtime error
Runtime error
UPDATED app.py with català speaker embedding
Browse files
app.py
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import torch
|
4 |
from datasets import load_dataset
|
5 |
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
|
6 |
-
|
7 |
|
8 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
9 |
|
@@ -18,12 +20,59 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
|
|
18 |
"JanLilan/speecht5_finetuned_openslr-slr69-cat"
|
19 |
).to(device)
|
20 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
|
|
|
|
|
|
|
|
|
21 |
# we will try to translate with this voice embedding... Let's see what happens. else:
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
# etc.
|
25 |
-
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
26 |
-
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
27 |
|
28 |
|
29 |
def translate(audio):
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
import gradio as gr
|
4 |
import numpy as np
|
5 |
import torch
|
6 |
from datasets import load_dataset
|
7 |
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
|
8 |
+
from speechbrain.pretrained import EncoderClassifier
|
9 |
|
10 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
11 |
|
|
|
20 |
"JanLilan/speecht5_finetuned_openslr-slr69-cat"
|
21 |
).to(device)
|
22 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
|
23 |
+
|
24 |
+
######################################################################################
|
25 |
+
################################## SPEAKER EMBEDDING #################################
|
26 |
+
######################################################################################
|
27 |
# we will try to translate with this voice embedding... Let's see what happens. else:
|
28 |
+
dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
|
29 |
+
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
|
30 |
+
# LOAD
|
31 |
+
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
|
32 |
+
speaker_model = EncoderClassifier.from_hparams(
|
33 |
+
source=spk_model_name,
|
34 |
+
run_opts={"device": device},
|
35 |
+
savedir=os.path.join("/tmp", spk_model_name),
|
36 |
+
)
|
37 |
+
|
38 |
+
def create_speaker_embedding(waveform):
|
39 |
+
with torch.no_grad():
|
40 |
+
speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
|
41 |
+
speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
|
42 |
+
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
|
43 |
+
return speaker_embeddings
|
44 |
+
|
45 |
+
# we must take one speaker embedding
|
46 |
+
checkpoint = "microsoft/speecht5_tts"
|
47 |
+
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
48 |
+
|
49 |
+
# function to embed
|
50 |
+
def prepare_dataset(example):
|
51 |
+
audio = example["audio"]
|
52 |
+
|
53 |
+
example = processor(
|
54 |
+
text=example["transcription"],
|
55 |
+
audio_target=audio["array"],
|
56 |
+
sampling_rate=audio["sampling_rate"],
|
57 |
+
return_attention_mask=False,
|
58 |
+
)
|
59 |
+
|
60 |
+
# strip off the batch dimension
|
61 |
+
example["labels"] = example["labels"][0]
|
62 |
+
|
63 |
+
# use SpeechBrain to obtain x-vector
|
64 |
+
example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
|
65 |
+
|
66 |
+
return example
|
67 |
+
|
68 |
+
processed_example = prepare_dataset(dataset[0])
|
69 |
+
speaker_embeddings = torch.tensor(processed_example["speaker_embeddings"]).unsqueeze(0)
|
70 |
+
|
71 |
+
|
72 |
+
|
73 |
# etc.
|
74 |
+
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
75 |
+
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
76 |
|
77 |
|
78 |
def translate(audio):
|