try it out
#20
by
dmaran
- opened
- README.md +6 -12
- config.json +43 -47
- model.safetensors +0 -3
README.md
CHANGED
@@ -1,25 +1,19 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
-
pipeline_tag: text-to-speech
|
4 |
language:
|
5 |
- en
|
6 |
tags:
|
7 |
-
-
|
8 |
-
-
|
9 |
-
|
10 |
-
- text: "[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."
|
11 |
-
example_title: "Dia intro"
|
12 |
-
- text: "[S1] Oh fire! Oh my goodness! What's the procedure? What do we do, people? The smoke could be coming through an air duct! [S2] Oh my god! Okay.. it's happening. Everybody stay calm! [S1] What's the procedure... [S2] Everybody stay fucking calm!!!... Everybody fucking calm down!!!!! [S1] No! No! If you touch the handle, if it's hot there might be a fire down the hallway!"
|
13 |
-
example_title: "Panic protocol"
|
14 |
---
|
15 |
-
|
16 |
<center>
|
17 |
<a href="https://github.com/nari-labs/dia">
|
18 |
<img src="https://github.com/nari-labs/dia/raw/main/dia/static/images/banner.png">
|
19 |
</a>
|
20 |
</center>
|
21 |
|
22 |
-
Dia is a 1.6B parameter text to speech model created by Nari Labs.
|
23 |
|
24 |
Dia **directly generates highly realistic dialogue from a transcript**. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal sounds such as laughter, coughing, and throat clearing.
|
25 |
|
@@ -28,7 +22,7 @@ To accelerate research, we are providing access to pretrained model checkpoints
|
|
28 |
We also provide a [demo page](https://yummy-fir-7a4.notion.site/dia) comparing our model to [ElevenLabs Studio](https://elevenlabs.io/studio) and [Sesame CSM-1B](https://github.com/SesameAILabs/csm).
|
29 |
|
30 |
- (Update) We have a ZeroGPU Space running! Try it now [here](https://huggingface.co/spaces/nari-labs/Dia-1.6B). Thanks to the HF team for the support :)
|
31 |
-
- Join our [discord server](https://discord.gg/
|
32 |
- Play with a larger version of Dia: generate fun conversations, remix content, and share with friends. 🔮 Join the [waitlist](https://tally.so/r/meokbo) for early access.
|
33 |
|
34 |
## ⚡️ Quickstart
|
@@ -120,7 +114,7 @@ By using this model, you agree to uphold relevant legal standards and ethical re
|
|
120 |
## 🤝 Contributing
|
121 |
|
122 |
We are a tiny team of one full-time and one part-time research engineer. We warmly welcome any contributions!
|
123 |
-
Join our [Discord Server](https://discord.gg/
|
124 |
|
125 |
## 🤗 Acknowledgements
|
126 |
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
|
|
3 |
language:
|
4 |
- en
|
5 |
tags:
|
6 |
+
- Text-to-Speech
|
7 |
+
pipeline_tag: text-to-speech
|
8 |
+
library_name: dia-tts
|
|
|
|
|
|
|
|
|
9 |
---
|
|
|
10 |
<center>
|
11 |
<a href="https://github.com/nari-labs/dia">
|
12 |
<img src="https://github.com/nari-labs/dia/raw/main/dia/static/images/banner.png">
|
13 |
</a>
|
14 |
</center>
|
15 |
|
16 |
+
Dia is a 1.6B parameter text to speech model created by Nari Labs.
|
17 |
|
18 |
Dia **directly generates highly realistic dialogue from a transcript**. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal sounds such as laughter, coughing, and throat clearing.
|
19 |
|
|
|
22 |
We also provide a [demo page](https://yummy-fir-7a4.notion.site/dia) comparing our model to [ElevenLabs Studio](https://elevenlabs.io/studio) and [Sesame CSM-1B](https://github.com/SesameAILabs/csm).
|
23 |
|
24 |
- (Update) We have a ZeroGPU Space running! Try it now [here](https://huggingface.co/spaces/nari-labs/Dia-1.6B). Thanks to the HF team for the support :)
|
25 |
+
- Join our [discord server](https://discord.gg/pgdB5YRe) for community support and access to new features.
|
26 |
- Play with a larger version of Dia: generate fun conversations, remix content, and share with friends. 🔮 Join the [waitlist](https://tally.so/r/meokbo) for early access.
|
27 |
|
28 |
## ⚡️ Quickstart
|
|
|
114 |
## 🤝 Contributing
|
115 |
|
116 |
We are a tiny team of one full-time and one part-time research engineer. We warmly welcome any contributions!
|
117 |
+
Join our [Discord Server](https://discord.gg/pgdB5YRe) for discussions.
|
118 |
|
119 |
## 🤗 Acknowledgements
|
120 |
|
config.json
CHANGED
@@ -1,50 +1,46 @@
|
|
1 |
{
|
2 |
-
|
3 |
-
"
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
"cross_head_dim": 128,
|
25 |
-
"cross_query_heads": 16,
|
26 |
-
"gqa_head_dim": 128,
|
27 |
-
"gqa_query_heads": 16,
|
28 |
-
"kv_heads": 4,
|
29 |
-
"n_embd": 2048,
|
30 |
-
"n_hidden": 8192,
|
31 |
-
"n_layer": 18
|
32 |
},
|
33 |
-
"
|
34 |
-
"
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
50 |
}
|
|
|
1 |
{
|
2 |
+
"version": "0.1",
|
3 |
+
"model": {
|
4 |
+
"encoder": {
|
5 |
+
"n_layer": 12,
|
6 |
+
"n_embd": 1024,
|
7 |
+
"n_hidden": 4096,
|
8 |
+
"n_head": 16,
|
9 |
+
"head_dim": 128
|
10 |
+
},
|
11 |
+
"decoder": {
|
12 |
+
"n_layer": 18,
|
13 |
+
"n_embd": 2048,
|
14 |
+
"n_hidden": 8192,
|
15 |
+
"gqa_query_heads": 16,
|
16 |
+
"cross_query_heads": 16,
|
17 |
+
"kv_heads": 4,
|
18 |
+
"gqa_head_dim": 128,
|
19 |
+
"cross_head_dim": 128
|
20 |
+
},
|
21 |
+
"src_vocab_size": 256,
|
22 |
+
"tgt_vocab_size": 1028,
|
23 |
+
"dropout": 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
},
|
25 |
+
"training": {},
|
26 |
+
"data": {
|
27 |
+
"text_length": 1024,
|
28 |
+
"audio_length": 3072,
|
29 |
+
"channels": 9,
|
30 |
+
"text_pad_value": 0,
|
31 |
+
"audio_eos_value": 1024,
|
32 |
+
"audio_pad_value": 1025,
|
33 |
+
"audio_bos_value": 1026,
|
34 |
+
"delay_pattern": [
|
35 |
+
0,
|
36 |
+
8,
|
37 |
+
9,
|
38 |
+
10,
|
39 |
+
11,
|
40 |
+
12,
|
41 |
+
13,
|
42 |
+
14,
|
43 |
+
15
|
44 |
+
]
|
45 |
+
}
|
46 |
}
|
model.safetensors
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:caba289b60f6d7d1e58fc744f4dc25aae88995fcca46be3d05e220b971486a26
|
3 |
-
size 6444682848
|
|
|
|
|
|
|
|