saheedniyi committed
Commit 30c7d13 · verified · Parent(s): 343d40a

Update README.md

Files changed (1)
  1. README.md +16 -146
README.md CHANGED
@@ -43,14 +43,11 @@ The model can generate audio on its own, but it's better to use a voice to prompt

 ### Prompt YarnGPT2
 ```python
- # clone the YarnGPT repo to get access to the `audiotokenizer` module
- !git clone https://github.com/saheedniyi02/yarngpt.git

- # install the required libraries
- !pip install outetts==0.2.3 uroman

- # import the required packages
 import os
 import re
 import json
@@ -63,174 +60,47 @@ import torchaudio
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
- from yarngpt.audiotokenizer import AudioTokenizerV2

- # download the WavTokenizer config and weights (used to encode and decode the audio)
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

- # model path and WavTokenizer weight paths (these assume Google Colab; a different environment may save the weights elsewhere)
- hf_path="saheedniyi/YarnGPT"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"

- # create the AudioTokenizer object
 audio_tokenizer=AudioTokenizerV2(
-     hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
- )

- # load the model weights
- model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)

- # your input text
- text="Uhm, so, what was the inspiration behind your latest project? Like, was there a specific moment where you were like, 'Yeah, this is it!' Or, you know, did it just kind of, uh, come together naturally over time?"

- # create a prompt; `create_prompt` takes an optional `speaker_name` parameter. The available speakers are "idera", "emma", "jude", "osagie", "tayo", "zainab", "joke", "regina", "remi", "umar" and "chinenye"; if no speaker is given, one is chosen at random
- prompt=audio_tokenizer.create_prompt(text,"idera")
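# note: per the comment above, omitting `speaker_name` should pick one of the
# listed voices at random, e.g.:
# prompt=audio_tokenizer.create_prompt(text)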

- # tokenize the prompt
 input_ids=audio_tokenizer.tokenize_prompt(prompt)

- # generate output from the model; the `.generate` parameters can be tuned as you wish
 output = model.generate(
     input_ids=input_ids,
     temperature=0.1,
     repetition_penalty=1.1,
     max_length=4000,
 )

- # convert the output to "audio codes"
 codes=audio_tokenizer.get_codes(output)

- # convert the codes to audio
 audio=audio_tokenizer.get_audio(codes)

- # play the audio
 IPython.display.Audio(audio,rate=24000)

- # save the audio
- torchaudio.save("audio.wav", audio, sample_rate=24000)
- ```

- ### Simple Nigerian-Accented NewsReader
- ```python
- !git clone https://github.com/saheedniyi02/yarngpt.git

- !pip install outetts uroman trafilatura pydub

- import os
- import re
- import json
- import math
- import torch
- import inflect
- import random
- import requests
- import trafilatura
- import uroman as ur
- import numpy as np
- import torchaudio
- import IPython
- from pydub import AudioSegment
- from pydub.effects import normalize
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from outetts.wav_tokenizer.decoder import WavTokenizer

- !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
- !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

- from yarngpt.audiotokenizer import AudioTokenizerV2

- tokenizer_path="saheedniyi/YarnGPT2"
- wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
- wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"

- audio_tokenizer=AudioTokenizerV2(
-     tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
- )

- model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)

- # split text into chunks of at most `word_limit` words, emitting a "." marker
- # ahead of each sentence so a pause can be inserted at sentence boundaries
- def split_text_into_chunks(text, word_limit=25):
-     sentences=[sentence.strip() for sentence in text.split('.') if sentence.strip()]
-     chunks=[]
-     for sentence in sentences:
-         chunks.append(".")
-         words=sentence.split(" ")
-         # cut the sentence into the fewest roughly equal parts of <= word_limit words
-         num_parts=math.ceil(len(words)/word_limit)
-         part_size=math.ceil(len(words)/num_parts)
-         for i in range(0, len(words), part_size):
-             chunks.append(" ".join(words[i:i+part_size]))
-     return chunks
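# illustrative output, assuming a 30-word second sentence:
# split_text_into_chunks("Lagos is big. <30 words>")
# -> [".", "Lagos is big", ".", "<first 15 words>", "<last 15 words>"]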

- # change the playback speed of a pydub AudioSegment by overriding its frame rate
- def speed_change(sound, speed=0.9):
-     # manually override the frame_rate; this tells the player how many
-     # samples to play per second
-     sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
-         "frame_rate": int(sound.frame_rate * speed)
-     })
-     # convert back to a standard frame rate so that regular playback programs
-     # work correctly (they often only handle standard rates like 44.1 kHz)
-     return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)
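# `speed_change` is defined but never used below; a hypothetical way to apply
# it to the saved file:
# segment = AudioSegment.from_wav("news1.wav")
# slowed = speed_change(segment, speed=0.9)
# slowed.export("news1_slow.wav", format="wav")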

- # change the URL to the article you want read out
- url="https://punchng.com/im-not-desperate-for-2027-presidential-ticket-obi/"

- page=requests.get(url)
- content=trafilatura.extract(page.text)
- chunks=split_text_into_chunks(content)

- all_codes=[]
- # loop over the chunks, building one long `all_codes` list
- for i,chunk in enumerate(chunks):
-     print(i)
-     print("\n")
-     print(chunk)
-     if chunk==".":
-         # add roughly 0.5 seconds of silence at each full stop
-         all_codes.extend([453]*38)
-     else:
-         # change the language and voice here
-         prompt=audio_tokenizer.create_prompt(chunk,lang="english",speaker_name="jude")
-         input_ids=audio_tokenizer.tokenize_prompt(prompt)
-         output = model.generate(
-             input_ids=input_ids,
-             temperature=0.1,
-             repetition_penalty=1.1,
-             max_length=4000,
-             #num_beams=5,
-         )
-         codes=audio_tokenizer.get_codes(output)
-         all_codes.extend(codes)

- audio=audio_tokenizer.get_audio(all_codes)
- IPython.display.Audio(audio,rate=24000)
- torchaudio.save("news1.wav", audio, sample_rate=24000)
 ```
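The `[453]*38` above is the pause: the WavTokenizer checkpoint used here emits 75 codes per second (the "frame75" in its filename), so 38 copies of code 453, which decodes to silence, last roughly half a second. A small helper can make that intent explicit; this is a sketch that assumes 453 remains the silence code for this tokenizer:

```python
# Hypothetical helper: express pauses in seconds rather than raw code counts.
# Assumes code 453 decodes to silence and the tokenizer runs at 75 codes/second.
def silence_codes(seconds, silence_code=453, codes_per_second=75):
    return [silence_code] * int(seconds * codes_per_second)

# e.g. all_codes.extend(silence_codes(0.5)) instead of all_codes.extend([453]*38)
```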
 ### Prompt YarnGPT2
 ```python
+ !git clone https://github.com/saheedniyi02/yarngpt.git

+ !pip install outetts uroman

 import os
 import re
 import json

 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer

 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

+ from yarngpt.audiotokenizer import AudioTokenizerV2

+ tokenizer_path="saheedniyi/YarnGPT2"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"

 audio_tokenizer=AudioTokenizerV2(
+     tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
+ )

+ model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)

+ # change the text to whatever you want read out
+ text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."

+ # change the language and voice here
+ prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")

 input_ids=audio_tokenizer.tokenize_prompt(prompt)

 output = model.generate(
     input_ids=input_ids,
     temperature=0.1,
     repetition_penalty=1.1,
     max_length=4000,
+     #num_beams=5,  # a beam size helps for the local languages but not for English
 )
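# hypothetical example for a local language (assuming a `lang` value such as
# "yoruba" from the languages the model card lists), where beams tend to help:
# prompt=audio_tokenizer.create_prompt(text,lang="yoruba",speaker_name="idera")
# output=model.generate(input_ids=audio_tokenizer.tokenize_prompt(prompt),
#                       temperature=0.1, repetition_penalty=1.1,
#                       max_length=4000, num_beams=5)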

 codes=audio_tokenizer.get_codes(output)

 audio=audio_tokenizer.get_audio(codes)

 IPython.display.Audio(audio,rate=24000)
+ torchaudio.save("Sample.wav", audio, sample_rate=24000)
 ```
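To audition voices before settling on one, you can loop over several speakers and save a clip of each. A minimal sketch reusing the objects created above; the speaker names come from the list in the earlier YarnGPT section, so check the model card for the exact set YarnGPT2 ships:

```python
# Generate the same text with a few different voices and save one file each.
for name in ["idera", "jude", "zainab"]:  # names assumed from the v1 speaker list
    prompt = audio_tokenizer.create_prompt(text, lang="english", speaker_name=name)
    input_ids = audio_tokenizer.tokenize_prompt(prompt)
    output = model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
    )
    audio = audio_tokenizer.get_audio(audio_tokenizer.get_codes(output))
    torchaudio.save(f"sample_{name}.wav", audio, sample_rate=24000)
```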
  ## Model Description