Update README.md
Browse files
README.md
CHANGED
@@ -43,14 +43,11 @@ The model can generate audio on its own but it's better to use a voice to prompt
|
|
43 |
|
44 |
### Prompt YarnGPT2
|
45 |
```python
|
46 |
-
# clone the YarnGPT repo to get access to the `audiotokenizer`
|
47 |
-
!git clone https://github.com/saheedniyi02/yarngpt.git
|
48 |
|
|
|
49 |
|
50 |
-
|
51 |
-
!pip install outetts==0.2.3 uroman
|
52 |
|
53 |
-
#import some important packages
|
54 |
import os
|
55 |
import re
|
56 |
import json
|
@@ -63,174 +60,47 @@ import torchaudio
|
|
63 |
import IPython
|
64 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
65 |
from outetts.wav_tokenizer.decoder import WavTokenizer
|
66 |
-
from yarngpt.audiotokenizer import AudioTokenizerV2
|
67 |
|
68 |
|
69 |
-
# download the wavtokenizer weights and config (to encode and decode the audio)
|
70 |
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
71 |
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
|
72 |
|
73 |
-
|
74 |
-
|
|
|
|
|
75 |
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
76 |
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
77 |
|
78 |
-
|
79 |
audio_tokenizer=AudioTokenizerV2(
|
80 |
-
|
81 |
-
)
|
82 |
|
83 |
-
#load the model weights
|
84 |
|
85 |
-
model = AutoModelForCausalLM.from_pretrained(
|
86 |
|
87 |
-
#
|
88 |
-
text="
|
89 |
|
90 |
-
#
|
91 |
-
prompt=audio_tokenizer.create_prompt(text,"idera")
|
92 |
|
93 |
-
# tokenize the prompt
|
94 |
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
95 |
|
96 |
-
# generate output from the model, you can tune the `.generate` parameters as you wish
|
97 |
output = model.generate(
|
98 |
input_ids=input_ids,
|
99 |
temperature=0.1,
|
100 |
repetition_penalty=1.1,
|
101 |
max_length=4000,
|
|
|
102 |
)
|
103 |
|
104 |
-
# convert the output to "audio codes"
|
105 |
codes=audio_tokenizer.get_codes(output)
|
106 |
-
|
107 |
-
# converts the codes to audio
|
108 |
audio=audio_tokenizer.get_audio(codes)
|
109 |
-
|
110 |
-
# play the audio
|
111 |
IPython.display.Audio(audio,rate=24000)
|
|
|
112 |
|
113 |
-
# save the audio
|
114 |
-
torchaudio.save(f"audio.wav", audio, sample_rate=24000)
|
115 |
-
```
|
116 |
-
|
117 |
-
### Simple Nigerian Accented-NewsReader
|
118 |
-
```python
|
119 |
-
!git clone https://github.com/saheedniyi02/yarngpt.git
|
120 |
-
|
121 |
-
pip install outetts uroman trafilatura pydub
|
122 |
-
|
123 |
-
import os
|
124 |
-
import re
|
125 |
-
import json
|
126 |
-
import torch
|
127 |
-
import inflect
|
128 |
-
import random
|
129 |
-
import requests
|
130 |
-
import trafilatura
|
131 |
-
import inflect
|
132 |
-
import uroman as ur
|
133 |
-
import numpy as np
|
134 |
-
import torchaudio
|
135 |
-
import IPython
|
136 |
-
from pydub import AudioSegment
|
137 |
-
from pydub.effects import normalize
|
138 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
139 |
-
from outetts.wav_tokenizer.decoder import WavTokenizer
|
140 |
-
|
141 |
-
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
142 |
-
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
|
143 |
-
|
144 |
-
from yarngpt.audiotokenizer import AudioTokenizerV2
|
145 |
-
|
146 |
-
tokenizer_path="saheedniyi/YarnGPT2"
|
147 |
-
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
148 |
-
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
149 |
-
|
150 |
-
audio_tokenizer=AudioTokenizerV2(
|
151 |
-
tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
152 |
-
)
|
153 |
-
|
154 |
-
model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
|
155 |
-
|
156 |
-
# Split text into chunks
|
157 |
-
def split_text_into_chunks(text, word_limit=25):
|
158 |
-
sentences=[sentence.strip() for sentence in text.split('.') if sentence.strip()]
|
159 |
-
chunks=[]
|
160 |
-
for sentence in sentences:
|
161 |
-
chunks.append(".")
|
162 |
-
sentence_splitted=sentence.split(" ")
|
163 |
-
num_words=len(sentence_splitted)
|
164 |
-
|
165 |
-
if (num_words>word_limit) and (num_words<=word_limit*2):
|
166 |
-
chunks.append(" ".join(sentence_splitted[:int(num_words/2)]))
|
167 |
-
chunks.append(" ".join(sentence_splitted[int(num_words/2):]))
|
168 |
-
elif (num_words>word_limit*2) and (num_words<=word_limit*3):
|
169 |
-
chunks.append(" ".join(sentence_splitted[:int(num_words/3)]))
|
170 |
-
chunks.append(" ".join(sentence_splitted[int(num_words/3):int(2*num_words/3)]))
|
171 |
-
chunks.append(" ".join(sentence_splitted[int(2*num_words/3):]))
|
172 |
-
elif (num_words>word_limit*3) and (num_words<=word_limit*4):
|
173 |
-
chunks.append(" ".join(sentence_splitted[:int(num_words/4)]))
|
174 |
-
chunks.append(" ".join(sentence_splitted[int(num_words/4):word_limit*2]))
|
175 |
-
chunks.append(" ".join(sentence_splitted[int(2*num_words/4):int(3*num_words/4)]))
|
176 |
-
chunks.append(" ".join(sentence_splitted[int(3*num_words/4):]))
|
177 |
-
elif (num_words>word_limit*4) and (num_words<=word_limit*5):
|
178 |
-
chunks.append(" ".join(sentence_splitted[:int(num_words/5)]))
|
179 |
-
chunks.append(" ".join(sentence_splitted[int(num_words/5):int(2*num_words/5)]))
|
180 |
-
chunks.append(" ".join(sentence_splitted[int(2*num_words/5):int(3*num_words/5)]))
|
181 |
-
chunks.append(" ".join(sentence_splitted[int(3*num_words/5):int(4*num_words/5)]))
|
182 |
-
chunks.append(" ".join(sentence_splitted[int(4*num_words/5):]))
|
183 |
-
else:
|
184 |
-
chunks.append(sentence)
|
185 |
-
return chunks
|
186 |
-
|
187 |
-
def speed_change(sound, speed=0.9):
|
188 |
-
# Manually override the frame_rate. This tells the computer how many
|
189 |
-
# samples to play per second
|
190 |
-
sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
|
191 |
-
"frame_rate": int(sound.frame_rate * speed)
|
192 |
-
})
|
193 |
-
# convert the sound with altered frame rate to a standard frame rate
|
194 |
-
# so that regular playback programs will work right. They often only
|
195 |
-
# know how to play audio at standard frame rate (like 44.1k)
|
196 |
-
return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
|
197 |
-
|
198 |
-
#change the url
|
199 |
-
url="https://punchng.com/im-not-desperate-for-2027-presidential-ticket-obi/"
|
200 |
-
|
201 |
-
page=requests.get(url)
|
202 |
-
content=trafilatura.extract(page.text)
|
203 |
-
chunks=split_text_into_chunks(content)
|
204 |
-
|
205 |
-
all_codes=[]
|
206 |
-
# Loop over the chunks, extending one large `all_codes` list with each chunk's codes
|
207 |
-
for i,chunk in enumerate(chunks):
|
208 |
-
print(i)
|
209 |
-
print("\n")
|
210 |
-
print(chunk)
|
211 |
-
if chunk==".":
|
212 |
-
#add silence for 0.5 seconds if we encounter a full stop
|
213 |
-
all_codes.extend([453]*38)
|
214 |
-
else:
|
215 |
-
# Change the language and voice here
|
216 |
-
prompt=audio_tokenizer.create_prompt(chunk,lang="english",speaker_name="jude")
|
217 |
-
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
218 |
-
output = model.generate(
|
219 |
-
input_ids=input_ids,
|
220 |
-
temperature=0.1,
|
221 |
-
repetition_penalty=1.1,
|
222 |
-
max_length=4000,
|
223 |
-
#num_beams=5,
|
224 |
-
)
|
225 |
-
codes=audio_tokenizer.get_codes(output)
|
226 |
-
all_codes.extend(codes)
|
227 |
-
|
228 |
-
audio=audio_tokenizer.get_audio(all_codes)
|
229 |
-
IPython.display.Audio(audio,rate=24000)
|
230 |
-
torchaudio.save(f"news1.wav",
|
231 |
-
audio,
|
232 |
-
sample_rate=24000,
|
233 |
-
)
|
234 |
```
|
235 |
|
236 |
## Model Description
|
|
|
43 |
|
44 |
### Prompt YarnGPT2
|
45 |
```python
|
|
|
|
|
46 |
|
47 |
+
!git clone https://github.com/saheedniyi02/yarngpt.git
|
48 |
|
49 |
+
pip install outetts uroman
|
|
|
50 |
|
|
|
51 |
import os
|
52 |
import re
|
53 |
import json
|
|
|
60 |
import IPython
|
61 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
62 |
from outetts.wav_tokenizer.decoder import WavTokenizer
|
|
|
63 |
|
64 |
|
|
|
65 |
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
|
66 |
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
|
67 |
|
68 |
+
|
69 |
+
from yarngpt.audiotokenizer import AudioTokenizerV2
|
70 |
+
|
71 |
+
tokenizer_path="saheedniyi/YarnGPT2"
|
72 |
wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
|
73 |
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
|
74 |
|
75 |
+
|
76 |
audio_tokenizer=AudioTokenizerV2(
|
77 |
+
tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
|
78 |
+
)
|
79 |
|
|
|
80 |
|
81 |
+
model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
|
82 |
|
83 |
+
#change the text
|
84 |
+
text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."
|
85 |
|
86 |
+
# change the language and voice
|
87 |
+
prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")
|
88 |
|
|
|
89 |
input_ids=audio_tokenizer.tokenize_prompt(prompt)
|
90 |
|
|
|
91 |
output = model.generate(
|
92 |
input_ids=input_ids,
|
93 |
temperature=0.1,
|
94 |
repetition_penalty=1.1,
|
95 |
max_length=4000,
|
96 |
+
#num_beams=5,# using beam search helps for the local languages but not for English
|
97 |
)
|
98 |
|
|
|
99 |
codes=audio_tokenizer.get_codes(output)
|
|
|
|
|
100 |
audio=audio_tokenizer.get_audio(codes)
|
|
|
|
|
101 |
IPython.display.Audio(audio,rate=24000)
|
102 |
+
torchaudio.save(f"Sample.wav", audio, sample_rate=24000)
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
```
|
105 |
|
106 |
## Model Description
|