Uses 6.7 GB of VRAM during inference. It is not hard to get it to produce toxic behavior when instructed, but it will no longer do so by accident.
Tested and working up to 4K context, since the model is currently being trained at 4K; the context window will be extended once the model's validation loss drops to an acceptable value.
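For anyone wanting to reproduce the VRAM figure, here is a minimal sketch that loads the checkpoint 4-bit quantized with bitsandbytes and reports peak usage. The local path is a placeholder assumption, and the quantized load itself is an assumption: the bf16 shards below total roughly 22 GB, so ~6.7 GB presumably reflects a quantized configuration rather than a full-precision one.

```python
# Minimal sketch: load the checkpoint 4-bit quantized and report peak VRAM.
# Assumption: "./BlackSheep-MoE" is a local checkout of this repo.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

repo_id = "./BlackSheep-MoE"  # placeholder path, substitute the real repo id

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, quantization_config=bnb, device_map="cuda:0"
)

prompt = tokenizer("Hello,", return_tensors="pt").to("cuda:0")
with torch.no_grad():
    model.generate(**prompt, max_new_tokens=32)

print(f"peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```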
- config.json +35 -35
- generation_config.json +1 -1
- model-00001-of-00005.safetensors +2 -2
- model-00002-of-00005.safetensors +2 -2
- model-00003-of-00005.safetensors +2 -2
- model-00004-of-00005.safetensors +2 -2
- model-00005-of-00005.safetensors +2 -2
- tokenizer_config.json +1 -0
config.json
CHANGED
@@ -1,35 +1,35 @@
-{
-  "_name_or_path": ".\\BlackSheep",
-  "architectures": [
-    "MixtralForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 1,
-  "eos_token_id": 32000,
-  "hidden_act": "silu",
-  "hidden_size": 3072,
-  "initializer_range": 0.02,
-  "intermediate_size": 8192,
-  "max_position_embeddings":
-  "mlp_bias": false,
-  "model_type": "mixtral",
-  "num_attention_heads": 32,
-  "num_experts_per_tok":
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 32,
-  "num_local_experts": 4,
-  "output_router_logits": false,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "router_aux_loss_coef": 0.001,
-  "router_jitter_noise": 0.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "
-  "transformers_version": "4.
-  "use_cache": true,
-  "vocab_size": 32064
-}
+{
+  "_name_or_path": ".\\BlackSheep-MoE",
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 16384,
+  "mlp_bias": false,
+  "model_type": "mixtral",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "num_local_experts": 4,
+  "output_router_logits": false,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 32064
+}
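As a quick sanity check of the routing setup, the updated config can be read back with transformers' AutoConfig; a minimal sketch, assuming a local checkout of the repo:

```python
# Sketch: read the updated MoE settings back from config.json.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("./BlackSheep-MoE")  # assumption: local checkout

# 4 local experts with 2 active per token: the router sends each token
# through 2 of the 4 expert MLPs in every layer.
print(config.num_local_experts)        # 4
print(config.num_experts_per_tok)      # 2
print(config.max_position_embeddings)  # 16384 (only 4K tested so far)
```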
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 32000,
-  "transformers_version": "4.
+  "transformers_version": "4.44.2"
 }
model-00001-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1741f46daa03ca1ffcd1ba8eb64dfb6ca8f8af30f463fdec5b0e4b9a6a9efe40
+size 4991365712
model-00002-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e445ac7171a8c08b455d0b312d03c3a2051a560aee995c8b415846a684074fe1
+size 4995716272
model-00003-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8a13c12a1303974c3f07cfe8657749a4bfcdb43c28e244f69d37c7e1c73d14ff
+size 4957942672
model-00004-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b774a2ca21f24c8571c88292a6f46bf477bac000919874e3842942c604dfa29e
+size 4995704152
model-00005-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9dae58c3df48a77e7598340fa4078ec9a63fdb4256949fcbd655d0c555cf6a86
+size 2197808096
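Each shard is stored as a Git LFS pointer that records the blob's sha256 and byte size, so a download can be verified locally. A minimal sketch, assuming the first shard sits in the working directory:

```python
# Sketch: hash a downloaded shard and compare it to its LFS pointer.
import hashlib

def file_sha256(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the pointer for the first shard (see diff above)
expected = "1741f46daa03ca1ffcd1ba8eb64dfb6ca8f8af30f463fdec5b0e4b9a6a9efe40"
digest = file_sha256("model-00001-of-00005.safetensors")
assert digest == expected, f"shard mismatch: {digest}"
```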
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
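The new `add_prefix_space` key is something newer transformers releases serialize for Llama-family fast tokenizers, and loading the tokenizer should pick it up transparently. A minimal sketch, assuming a local checkout:

```python
# Sketch: confirm the tokenizer loads cleanly with the updated config.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./BlackSheep-MoE")  # assumption: local checkout
print(tok.add_bos_token, tok.add_eos_token)  # expected: True False
```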