saheedniyi
/

YarnGPT2

@@ -43,14 +43,11 @@ The model can generate audio on its own, but it's better to use a voice to prompt
 ### Prompt YarnGPT2
 ```python
-# clone the YarnGPT repo to get access to the `audiotokenizer`
-!git clone https://github.com/saheedniyi02/yarngpt.git
-# install some necessary libraries
-!pip install outetts==0.2.3 uroman
-#import some important packages
 import os
 import re
 import json
@@ -63,174 +60,47 @@ import torchaudio
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
-from yarngpt.audiotokenizer import AudioTokenizerV2
-# download the wavtokenizer weights and config (to encode and decode the audio)
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
-# model path and wavtokenizer weight paths (these paths assume Google Colab; a different environment might save the weights to a different location).
-hf_path="saheedniyi/YarnGPT"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
-# create the AudioTokenizer object
 audio_tokenizer=AudioTokenizerV2(
-    hf_path,wav_tokenizer_model_path,wav_tokenizer_config_path
-)
-#load the model weights
-model = AutoModelForCausalLM.from_pretrained(hf_path,torch_dtype="auto").to(audio_tokenizer.device)
-# your input text
-text="Uhm, so, what was the inspiration behind your latest project? Like, was there a specific moment where you were like, 'Yeah, this is it!' Or, you know, did it just kind of, uh, come together naturally over time?"
-# create a prompt; the optional `speaker_name` parameter selects the voice. The possible speakers are "idera","emma","jude","osagie","tayo","zainab","joke","regina","remi","umar","chinenye". If no speaker is selected, one is chosen at random.
-prompt=audio_tokenizer.create_prompt(text,"idera")
-# tokenize the prompt
 input_ids=audio_tokenizer.tokenize_prompt(prompt)
-# generate output from the model, you can tune the `.generate` parameters as you wish
 output  = model.generate(
             input_ids=input_ids,
             temperature=0.1,
             repetition_penalty=1.1,
             max_length=4000,
         )
-# convert the output to "audio codes"
 codes=audio_tokenizer.get_codes(output)
-# converts the codes to audio
 audio=audio_tokenizer.get_audio(codes)
-# play the audio
 IPython.display.Audio(audio,rate=24000)
-# save the audio
-torchaudio.save(f"audio.wav", audio, sample_rate=24000)
-```
-### Simple Nigerian Accented-NewsReader
-```python
-!git clone https://github.com/saheedniyi02/yarngpt.git
-pip install outetts uroman trafilatura pydub
-import os
-import re
-import json
-import torch
-import inflect
-import random
-import requests
-import trafilatura
-import inflect
-import uroman as ur
-import numpy as np
-import torchaudio
-import IPython
-from pydub import AudioSegment
-from pydub.effects import normalize
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from outetts.wav_tokenizer.decoder import WavTokenizer
-!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
-!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
-from yarngpt.audiotokenizer import AudioTokenizerV2
-tokenizer_path="saheedniyi/YarnGPT2"
-wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
-wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
-audio_tokenizer=AudioTokenizerV2(
-    tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
-       )
-model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
-# Split text into chunks
def split_text_into_chunks(text, word_limit=25):
    """Split text into sentence pieces of at most ``word_limit`` words each.

    The text is cut on full stops. For every non-empty sentence the result
    holds a "." marker entry (the calling loop turns that marker into a short
    silence) followed by the sentence itself. A sentence with more than
    ``word_limit`` space-separated words is divided into the smallest number
    of consecutive, near-equal pieces that keeps every piece within the limit.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        word_limit: Maximum number of space-separated words per piece. A
            non-positive value leaves every sentence unsplit.

    Returns:
        A list of strings: a "." marker before each sentence, then that
        sentence's piece(s) in their original order.
    """
    # Nothing to split (also covers None, e.g. when upstream text extraction
    # produced no content).
    if not text:
        return []

    chunks = []
    for raw_sentence in text.split("."):
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        chunks.append(".")
        words = sentence.split(" ")
        num_words = len(words)

        # Ceiling division: the fewest pieces that keep each one <= word_limit.
        num_parts = -(-num_words // word_limit) if word_limit > 0 else 1
        if num_parts <= 1:
            chunks.append(sentence)
            continue

        # Cut points at i/num_parts of the sentence. Consecutive pairs tile the
        # word list exactly, so no word is dropped or repeated.
        bounds = [i * num_words // num_parts for i in range(num_parts + 1)]
        chunks.extend(
            " ".join(words[start:stop])
            for start, stop in zip(bounds, bounds[1:])
        )
    return chunks
def speed_change(sound, speed=0.9):
    """Return ``sound`` re-timed by ``speed`` while keeping its frame rate.

    The raw samples are first re-labelled with a frame rate scaled by
    ``speed``, which changes how many samples are played per second. The
    result is then converted back to the original frame rate so that regular
    playback programs, which often only handle standard rates, work correctly.

    Args:
        sound: Audio object exposing ``raw_data``, ``frame_rate``,
            ``_spawn`` and ``set_frame_rate``.
        speed: Multiplier applied to the frame rate (default ``0.9``).

    Returns:
        The object returned by ``set_frame_rate`` on the re-timed sound.
    """
    # Same samples, different declared frame rate.
    overrides = {"frame_rate": int(sound.frame_rate * speed)}
    retimed = sound._spawn(sound.raw_data, overrides=overrides)
    # Convert back to the source's own frame rate for normal playback.
    return retimed.set_frame_rate(sound.frame_rate)
-#change the url
-url="https://punchng.com/im-not-desperate-for-2027-presidential-ticket-obi/"
-page=requests.get(url)
-content=trafilatura.extract(page.text)
-chunks=split_text_into_chunks(content)
-all_codes=[]
-# Loop over the chunks, extending one large `all_codes` list with the codes generated for each chunk
-for i,chunk in enumerate(chunks):
-  print(i)
-  print("\n")
-  print(chunk)
-  if chunk==".":
-    #add silence for 0.5 seconds if we encounter a full stop
-    all_codes.extend([453]*38)
-  else:
-    # Change the language and voice here
-    prompt=audio_tokenizer.create_prompt(chunk,lang="english",speaker_name="jude")
-    input_ids=audio_tokenizer.tokenize_prompt(prompt)
-    output  = model.generate(
-            input_ids=input_ids,
-            temperature=0.1,
-            repetition_penalty=1.1,
-            max_length=4000,
-            #num_beams=5,
-        )
-    codes=audio_tokenizer.get_codes(output)
-    all_codes.extend(codes)
-audio=audio_tokenizer.get_audio(all_codes)
-IPython.display.Audio(audio,rate=24000)
-torchaudio.save(f"news1.wav",
-                audio,
-                sample_rate=24000,
-)
 ```
 ## Model Description

 ### Prompt YarnGPT2
 ```python
+!git clone https://github.com/saheedniyi02/yarngpt.git
+pip install outetts uroman
 import os
 import re
 import json
 import IPython
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer
 !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
 !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
+from yarngpt.audiotokenizer import AudioTokenizerV2
+tokenizer_path="saheedniyi/YarnGPT2"
 wav_tokenizer_config_path="/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
 wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"
 audio_tokenizer=AudioTokenizerV2(
+    tokenizer_path,wav_tokenizer_model_path,wav_tokenizer_config_path
+    )
+model = AutoModelForCausalLM.from_pretrained(tokenizer_path,torch_dtype="auto").to(audio_tokenizer.device)
+#change the text
+text="The election was won by businessman and politician, Moshood Abiola, but Babangida annulled the results, citing concerns over national security."
+# change the language and voice
+prompt=audio_tokenizer.create_prompt(text,lang="english",speaker_name="idera")
 input_ids=audio_tokenizer.tokenize_prompt(prompt)
 output  = model.generate(
             input_ids=input_ids,
             temperature=0.1,
             repetition_penalty=1.1,
             max_length=4000,
+            #num_beams=5,  # beam search helps for the local languages but not for English
         )
 codes=audio_tokenizer.get_codes(output)
 audio=audio_tokenizer.get_audio(codes)
 IPython.display.Audio(audio,rate=24000)
+torchaudio.save(f"Sample.wav", audio, sample_rate=24000)
 ```
 ## Model Description