Files changed (3) hide show
  1. README.md +6 -12
  2. config.json +43 -47
  3. model.safetensors +0 -3
README.md CHANGED
@@ -1,25 +1,19 @@
1
  ---
2
  license: apache-2.0
3
- pipeline_tag: text-to-speech
4
  language:
5
  - en
6
  tags:
7
- - model_hub_mixin
8
- - pytorch_model_hub_mixin
9
- widget:
10
- - text: "[S1] Dia is an open weights text to dialogue model. [S2] You get full control over scripts and voices. [S1] Wow. Amazing. (laughs) [S2] Try it now on Git hub or Hugging Face."
11
- example_title: "Dia intro"
12
- - text: "[S1] Oh fire! Oh my goodness! What's the procedure? What to we do people? The smoke could be coming through an air duct! [S2] Oh my god! Okay.. it's happening. Everybody stay calm! [S1] What's the procedure... [S2] Everybody stay fucking calm!!!... Everybody fucking calm down!!!!! [S1] No! No! If you touch the handle, if its hot there might be a fire down the hallway!"
13
- example_title: "Panic protocol"
14
  ---
15
-
16
  <center>
17
  <a href="https://github.com/nari-labs/dia">
18
  <img src="https://github.com/nari-labs/dia/raw/main/dia/static/images/banner.png">
19
  </a>
20
  </center>
21
 
22
- Dia is a 1.6B parameter text to speech model created by Nari Labs. It was pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration.
23
 
24
  Dia **directly generates highly realistic dialogue from a transcript**. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal communications like laughter, coughing, clearing throat, etc.
25
 
@@ -28,7 +22,7 @@ To accelerate research, we are providing access to pretrained model checkpoints
28
  We also provide a [demo page](https://yummy-fir-7a4.notion.site/dia) comparing our model to [ElevenLabs Studio](https://elevenlabs.io/studio) and [Sesame CSM-1B](https://github.com/SesameAILabs/csm).
29
 
30
  - (Update) We have a ZeroGPU Space running! Try it now [here](https://huggingface.co/spaces/nari-labs/Dia-1.6B). Thanks to the HF team for the support :)
31
- - Join our [discord server](https://discord.gg/yBrqQ9Dd) for community support and access to new features.
32
  - Play with a larger version of Dia: generate fun conversations, remix content, and share with friends. 🔮 Join the [waitlist](https://tally.so/r/meokbo) for early access.
33
 
34
  ## ⚡️ Quickstart
@@ -120,7 +114,7 @@ By using this model, you agree to uphold relevant legal standards and ethical re
120
  ## 🤝 Contributing
121
 
122
  We are a tiny team of 1 full-time and 1 part-time research engineers. Contributions of any kind are extra welcome!
123
- Join our [Discord Server](https://discord.gg/yBrqQ9Dd) for discussions.
124
 
125
  ## 🤗 Acknowledgements
126
 
 
1
  ---
2
  license: apache-2.0
 
3
  language:
4
  - en
5
  tags:
6
+ - Text-to-Speech
7
+ pipeline_tag: text-to-speech
8
+ library_name: dia-tts
 
 
 
 
9
  ---
 
10
  <center>
11
  <a href="https://github.com/nari-labs/dia">
12
  <img src="https://github.com/nari-labs/dia/raw/main/dia/static/images/banner.png">
13
  </a>
14
  </center>
15
 
16
+ Dia is a 1.6B-parameter text-to-speech model created by Nari Labs.
17
 
18
  Dia **directly generates highly realistic dialogue from a transcript**. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal communications like laughter, coughing, clearing throat, etc.
19
 
 
22
  We also provide a [demo page](https://yummy-fir-7a4.notion.site/dia) comparing our model to [ElevenLabs Studio](https://elevenlabs.io/studio) and [Sesame CSM-1B](https://github.com/SesameAILabs/csm).
23
 
24
  - (Update) We have a ZeroGPU Space running! Try it now [here](https://huggingface.co/spaces/nari-labs/Dia-1.6B). Thanks to the HF team for the support :)
25
+ - Join our [discord server](https://discord.gg/pgdB5YRe) for community support and access to new features.
26
  - Play with a larger version of Dia: generate fun conversations, remix content, and share with friends. 🔮 Join the [waitlist](https://tally.so/r/meokbo) for early access.
27
 
28
  ## ⚡️ Quickstart
 
114
  ## 🤝 Contributing
115
 
116
  We are a tiny team of 1 full-time and 1 part-time research engineers. Contributions of any kind are extra welcome!
117
+ Join our [Discord Server](https://discord.gg/pgdB5YRe) for discussions.
118
 
119
  ## 🤗 Acknowledgements
120
 
config.json CHANGED
@@ -1,50 +1,46 @@
1
  {
2
- "data": {
3
- "audio_bos_value": 1026,
4
- "audio_eos_value": 1024,
5
- "audio_length": 3072,
6
- "audio_pad_value": 1025,
7
- "channels": 9,
8
- "delay_pattern": [
9
- 0,
10
- 8,
11
- 9,
12
- 10,
13
- 11,
14
- 12,
15
- 13,
16
- 14,
17
- 15
18
- ],
19
- "text_length": 1024,
20
- "text_pad_value": 0
21
- },
22
- "model": {
23
- "decoder": {
24
- "cross_head_dim": 128,
25
- "cross_query_heads": 16,
26
- "gqa_head_dim": 128,
27
- "gqa_query_heads": 16,
28
- "kv_heads": 4,
29
- "n_embd": 2048,
30
- "n_hidden": 8192,
31
- "n_layer": 18
32
  },
33
- "dropout": 0.0,
34
- "encoder": {
35
- "head_dim": 128,
36
- "n_embd": 1024,
37
- "n_head": 16,
38
- "n_hidden": 4096,
39
- "n_layer": 12
40
- },
41
- "normalization_layer_epsilon": 1e-05,
42
- "rope_max_timescale": 10000,
43
- "rope_min_timescale": 1,
44
- "src_vocab_size": 256,
45
- "tgt_vocab_size": 1028,
46
- "weight_dtype": "float32"
47
- },
48
- "training": {},
49
- "version": "0.1"
 
 
 
 
50
  }
 
1
  {
2
+ "version": "0.1",
3
+ "model": {
4
+ "encoder": {
5
+ "n_layer": 12,
6
+ "n_embd": 1024,
7
+ "n_hidden": 4096,
8
+ "n_head": 16,
9
+ "head_dim": 128
10
+ },
11
+ "decoder": {
12
+ "n_layer": 18,
13
+ "n_embd": 2048,
14
+ "n_hidden": 8192,
15
+ "gqa_query_heads": 16,
16
+ "cross_query_heads": 16,
17
+ "kv_heads": 4,
18
+ "gqa_head_dim": 128,
19
+ "cross_head_dim": 128
20
+ },
21
+ "src_vocab_size": 256,
22
+ "tgt_vocab_size": 1028,
23
+ "dropout": 0.0
 
 
 
 
 
 
 
 
24
  },
25
+ "training": {},
26
+ "data": {
27
+ "text_length": 1024,
28
+ "audio_length": 3072,
29
+ "channels": 9,
30
+ "text_pad_value": 0,
31
+ "audio_eos_value": 1024,
32
+ "audio_pad_value": 1025,
33
+ "audio_bos_value": 1026,
34
+ "delay_pattern": [
35
+ 0,
36
+ 8,
37
+ 9,
38
+ 10,
39
+ 11,
40
+ 12,
41
+ 13,
42
+ 14,
43
+ 15
44
+ ]
45
+ }
46
  }
model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:caba289b60f6d7d1e58fc744f4dc25aae88995fcca46be3d05e220b971486a26
3
- size 6444682848