JanLilan committed on
Commit
2a3194a
·
1 Parent(s): 9192af8

UPDATED app.py with Catalan speaker embedding

Browse files
Files changed (1) hide show
  1. app.py +54 -5
app.py CHANGED
@@ -1,9 +1,11 @@
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import torch
4
  from datasets import load_dataset
5
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
6
-
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
@@ -18,12 +20,59 @@ model = SpeechT5ForTextToSpeech.from_pretrained(
18
  "JanLilan/speecht5_finetuned_openslr-slr69-cat"
19
  ).to(device)
20
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
 
 
 
21
  # we will try to translate with this voice embedding... Let's see what happens. else:
22
- # dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
23
- # dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # etc.
25
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
26
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
27
 
28
 
29
  def translate(audio):
 
1
+ import os
2
+ import torch
3
  import gradio as gr
4
  import numpy as np
5
  import torch
6
  from datasets import load_dataset
7
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
8
+ from speechbrain.pretrained import EncoderClassifier
9
 
10
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
 
 
20
  "JanLilan/speecht5_finetuned_openslr-slr69-cat"
21
  ).to(device)
22
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
23
+
24
+ ######################################################################################
25
+ ################################## SPEAKER EMBEDDING #################################
26
+ ######################################################################################
27
  # we will try to translate with this voice embedding... Let's see what happens. else:
28
# `Audio` is not among the names pulled in by the visible import block
# (`from datasets import load_dataset` only), so import it here; without it
# the `cast_column` call below fails with NameError when the app starts.
from datasets import Audio

# Catalan speech corpus used as the source of the reference voice. Casting the
# "audio" column with a 16 kHz `Audio` feature makes rows decode at that rate.
dataset = load_dataset("projecte-aina/openslr-slr69-ca-trimmed-denoised", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Pretrained SpeechBrain x-vector speaker encoder, placed on the same device
# as the rest of the models. `savedir` is a directory under /tmp named after
# the model.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
37
+
38
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    The waveform is wrapped in a tensor and passed to the module-level
    ``speaker_model``; the result is normalized along dimension 2, squeezed,
    moved to the CPU and returned as a NumPy array. Gradient tracking is
    disabled for the whole computation.
    """
    with torch.no_grad():
        encoded = speaker_model.encode_batch(torch.tensor(waveform))
        unit_length = torch.nn.functional.normalize(encoded, dim=2)
        embedding = unit_length.squeeze().cpu().numpy()
    return embedding
44
+
45
+ # we must take one speaker embedding
46
# Checkpoint whose processor is used to build the reference example's inputs.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(
    checkpoint,
)
48
+
49
+ # function to embed one dataset example
50
def prepare_dataset(example):
    """Turn one dataset row into processor features plus a speaker embedding.

    Reads ``example["audio"]`` (its ``"array"`` and ``"sampling_rate"``
    entries) and ``example["transcription"]``, runs them through the
    module-level ``processor``, and returns the processor output with:

    * ``"labels"`` replaced by its first element (batch dimension removed);
    * ``"speaker_embeddings"`` set to the x-vector computed from the audio
      array by ``create_speaker_embedding``.
    """
    clip = example["audio"]
    samples = clip["array"]

    features = processor(
        text=example["transcription"],
        audio_target=samples,
        sampling_rate=clip["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor returns batched labels; keep only the single item.
    features["labels"] = features["labels"][0]

    # SpeechBrain x-vector for the same audio.
    features["speaker_embeddings"] = create_speaker_embedding(samples)

    return features
67
+
68
# Use the first row of the dataset as the reference voice: process it, then
# add a leading dimension of size 1 to its speaker embedding.
processed_example = prepare_dataset(dataset[0])
speaker_embeddings = torch.unsqueeze(
    torch.tensor(processed_example["speaker_embeddings"]),
    0,
)
70
+
71
+
72
+
73
  # etc.
74
+ # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
75
+ # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
76
 
77
 
78
  def translate(audio):