bharat-mukheja committed on
Commit
d90da61
·
1 Parent(s): 95b8e87

Create model.py

Browse files
Files changed (1) hide show
  1. model.py +71 -0
model.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import torch
3
+ from transformers import pipeline
4
+
5
# Run on the first CUDA GPU when PyTorch reports one; otherwise stay on the CPU.
# The same `device` string is reused for every model loaded in this script.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Speech-recognition pipeline backed by the "openai/whisper-base" checkpoint.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)
9
+
10
+
11
+ from datasets import load_dataset
12
+
13
# Open the validation split of the "it" configuration of VoxPopuli in
# streaming mode, then pull just the first example to use as a test input.
dataset = load_dataset(
    "facebook/voxpopuli",
    "it",
    split="validation",
    streaming=True,
)
sample = next(iter(dataset))
15
+
16
+
17
+ from IPython.display import Audio
18
+
19
# Build a playable widget for the source clip.
# NOTE(review): this is a bare expression; it presumably only renders when it
# is the last statement of a notebook cell -- in a plain script the object is
# created and discarded.
Audio(
    data=sample["audio"]["array"],
    rate=sample["audio"]["sampling_rate"],
)
20
+
21
def translate(audio):
    """Run the module-level Whisper pipeline in "translate" mode.

    Args:
        audio: Any input the speech-recognition pipeline accepts. In this
            script it is the ``sample["audio"]`` dict taken from the dataset.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # Some transformers releases pop keys from a dict input while
    # preprocessing it. Pass a shallow copy so the caller's dict is still
    # intact (and reusable) after this call.
    if isinstance(audio, dict):
        audio = dict(audio)
    # max_new_tokens caps the length of the generated transcription.
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]
24
+
25
+ # print(translate(sample["audio"].copy()))
26
+ # print(sample["raw_text"])
27
+
28
+
29
+ # Text to speech
30
+
31
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
32
+
33
# Text front-end for SpeechT5: turns a string into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Acoustic model and the HiFi-GAN vocoder passed to it when generating speech.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Place both networks on the device selected at the top of the script.
model.to(device)
vocoder.to(device)

# Pre-computed speaker x-vectors; row 7306 is the single voice used for all
# synthesis below. A leading batch dimension is added to the vector.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)
42
+
43
def synthesise(text):
    """Generate speech for ``text`` with SpeechT5 and the module-level voice.

    Args:
        text: The string to speak.

    Returns:
        The generated speech tensor, moved to the CPU.
    """
    encoded = processor(text=text, return_tensors="pt")
    # Inputs and speaker embedding must sit on the same device as the model.
    token_ids = encoded["input_ids"].to(device)
    speaker = speaker_embeddings.to(device)
    waveform = model.generate_speech(token_ids, speaker, vocoder=vocoder)
    return waveform.cpu()
49
+
50
# Smoke-test the text-to-speech half on a fixed English sentence.
speech = synthesise("Hey there! This is a test!")

# Preview the result at the 16 kHz rate used throughout this script.
# NOTE(review): a bare expression -- presumably only displayed in a notebook.
Audio(data=speech, rate=16000)
53
+
54
+
55
+ # Concatenate the two models
56
+
57
+ import numpy as np
58
+
59
# Integer sample format that synthesised audio is converted to before it is
# returned from the speech-to-speech function below.
target_dtype = np.int16

# Largest value representable in `target_dtype`; used as the scale factor
# when converting the synthesised waveform to integers.
max_range = np.iinfo(target_dtype).max
61
+
62
+
63
def speech_to_speech_translation(audio):
    """Translate spoken audio to text, then speak the translation.

    Chains ``translate`` and ``synthesise`` and converts the synthesised
    waveform to the module-level integer sample format.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``sampling_rate`` is 16000
        and ``samples`` is a NumPy array of dtype ``target_dtype``.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()
    # Assumes the vocoder output is nominally in [-1, 1] -- TODO confirm.
    # Clamp before scaling so an out-of-range sample saturates rather than
    # wrapping around when cast to a fixed-width integer.
    clamped = np.clip(waveform, -1.0, 1.0)
    # Use the shared `target_dtype` so the cast always matches `max_range`.
    synthesised_speech = (clamped * max_range).astype(target_dtype)
    return 16000, synthesised_speech
68
+
69
# End-to-end check: feed the dataset clip through translation + synthesis.
sampling_rate, synthesised_speech = speech_to_speech_translation(
    sample["audio"]
)

# Preview the translated speech at the rate reported by the pipeline.
# NOTE(review): a bare expression -- presumably only displayed in a notebook.
Audio(data=synthesised_speech, rate=sampling_rate)