Updated the inference code. Added a notebook and a demo audio clip.
README.md CHANGED
````diff
@@ -42,29 +42,36 @@ python -m mlx_audio.tts.generate --model Marvis-AI/marvis-tts-250m-v0.1 --strea
 
 ## Using transformers
 
-**Without Voice Cloning**
+**Without Voice Cloning** ([Colab Notebook](https://colab.research.google.com/drive/1m9pdNFGlWMZW8gyXwkN9MNgbBEWP5lfO?usp=sharing))
 ```python
 import torch
 from transformers import AutoTokenizer, AutoProcessor, CsmForConditionalGeneration
 from tokenizers.processors import TemplateProcessing
 import soundfile as sf
 
-model_id = "Marvis-AI/marvis-tts-
+model_id = "Marvis-AI/marvis-tts-0.25m-v0.1-transformers"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # load the model and the processor
 processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id
+model = CsmForConditionalGeneration.from_pretrained(model_id).to(device)
 
 # prepare the inputs
 text = "[0]Marvis TTS is a new text-to-speech model that provides fast streaming on edge devices." # `[0]` for speaker id 0
-inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device)
+inputs = processor(text, add_special_tokens=True, return_tensors="pt").to(device)
 # infer the model
-audio = model.generate(
+audio = model.generate(input_ids=inputs['input_ids'], output_audio=True)
 sf.write("example_without_context.wav", audio[0].cpu(), samplerate=24_000, subtype="PCM_16")
 
 ```
 
+**Output:**
+
+<audio controls>
+  <source src="https://audio.jukehost.co.uk/gqWAk28VaBoRaX3UPdnMBedGWgXLJ8Mt" type="audio/mpeg">
+</audio>
+
+---
 
 # Model Description
 
````
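The updated example writes `example_without_context.wav` at 24 kHz. As a quick sanity check on the generated clip, a minimal sketch along these lines (reusing the `soundfile` package already imported in the snippet, and the filename hard-coded there) reloads the file and reports its basic properties:

```python
import soundfile as sf

# Reload the clip produced by the example above and print basic properties.
data, sample_rate = sf.read("example_without_context.wav")
duration_s = len(data) / sample_rate
print(f"sample rate: {sample_rate} Hz, duration: {duration_s:.2f} s, shape: {data.shape}")
```

Expect a reported sample rate of 24,000 Hz, matching the `samplerate=24_000` passed to `sf.write` in the example.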