Uses 6.7 GB of VRAM during inference. It is not hard to get it to produce toxic behavior when instructed, but it will no longer do so by accident.
Tested and working up to 4K context, since the model is currently being trained at 4K; the context window will be extended once the model's validation loss drops to an acceptable value.
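For anyone wanting to reproduce the VRAM figure, here is a minimal sketch that loads the checkpoint 4-bit quantized with bitsandbytes and reports peak usage. The local path is a placeholder assumption, and the quantized load itself is an assumption: the bf16 shards below total roughly 22 GB, so ~6.7 GB presumably reflects a quantized configuration rather than a full-precision one.

```python
# Minimal sketch: load the checkpoint 4-bit quantized and report peak VRAM.
# Assumption: "./BlackSheep-MoE" is a local checkout of this repo.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

repo_id = "./BlackSheep-MoE"  # placeholder path, substitute the real repo id

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, quantization_config=bnb, device_map="cuda:0"
)

prompt = tokenizer("Hello,", return_tensors="pt").to("cuda:0")
with torch.no_grad():
    model.generate(**prompt, max_new_tokens=32)

print(f"peak VRAM: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```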
- config.json +35 -35
- generation_config.json +1 -1
- model-00001-of-00005.safetensors +2 -2
- model-00002-of-00005.safetensors +2 -2
- model-00003-of-00005.safetensors +2 -2
- model-00004-of-00005.safetensors +2 -2
- model-00005-of-00005.safetensors +2 -2
- tokenizer_config.json +1 -0
config.json
CHANGED
@@ -1,35 +1,35 @@
-{
-  "_name_or_path": ".\\BlackSheep",
-  "architectures": [
-    "MixtralForCausalLM"
-  ],
-  "attention_bias": false,
-  "attention_dropout": 0.0,
-  "bos_token_id": 1,
-  "eos_token_id": 32000,
-  "hidden_act": "silu",
-  "hidden_size": 3072,
-  "initializer_range": 0.02,
-  "intermediate_size": 8192,
-  "max_position_embeddings":
-  "mlp_bias": false,
-  "model_type": "mixtral",
-  "num_attention_heads": 32,
-  "num_experts_per_tok":
-  "num_hidden_layers": 32,
-  "num_key_value_heads": 32,
-  "num_local_experts": 4,
-  "output_router_logits": false,
-  "pretraining_tp": 1,
-  "rms_norm_eps": 1e-05,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "router_aux_loss_coef": 0.001,
-  "router_jitter_noise": 0.0,
-  "sliding_window": null,
-  "tie_word_embeddings": false,
-  "torch_dtype": "
-  "transformers_version": "4.
-  "use_cache": true,
-  "vocab_size": 32064
-}
+{
+  "_name_or_path": ".\\BlackSheep-MoE",
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 32000,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 16384,
+  "mlp_bias": false,
+  "model_type": "mixtral",
+  "num_attention_heads": 32,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "num_local_experts": 4,
+  "output_router_logits": false,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 32064
+}
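As a quick sanity check of the routing setup, the updated config can be read back with transformers' AutoConfig; a minimal sketch, assuming a local checkout of the repo:

```python
# Sketch: read the updated MoE settings back from config.json.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("./BlackSheep-MoE")  # assumption: local checkout

# 4 local experts with 2 active per token: the router sends each token
# through 2 of the 4 expert MLPs in every layer.
print(config.num_local_experts)        # 4
print(config.num_experts_per_tok)      # 2
print(config.max_position_embeddings)  # 16384 (only 4K tested so far)
```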
generation_config.json
CHANGED
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 32000,
-  "transformers_version": "4.
+  "transformers_version": "4.44.2"
 }
model-00001-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1741f46daa03ca1ffcd1ba8eb64dfb6ca8f8af30f463fdec5b0e4b9a6a9efe40
+size 4991365712
model-00002-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:e445ac7171a8c08b455d0b312d03c3a2051a560aee995c8b415846a684074fe1
+size 4995716272
model-00003-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:8a13c12a1303974c3f07cfe8657749a4bfcdb43c28e244f69d37c7e1c73d14ff
+size 4957942672
model-00004-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b774a2ca21f24c8571c88292a6f46bf477bac000919874e3842942c604dfa29e
+size 4995704152
model-00005-of-00005.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:9dae58c3df48a77e7598340fa4078ec9a63fdb4256949fcbd655d0c555cf6a86
+size 2197808096
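Each shard is stored as a Git LFS pointer that records the blob's sha256 and byte size, so a download can be verified locally. A minimal sketch, assuming the first shard sits in the working directory:

```python
# Sketch: hash a downloaded shard and compare it to its LFS pointer.
import hashlib

def file_sha256(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the pointer for the first shard (see diff above)
expected = "1741f46daa03ca1ffcd1ba8eb64dfb6ca8f8af30f463fdec5b0e4b9a6a9efe40"
digest = file_sha256("model-00001-of-00005.safetensors")
assert digest == expected, f"shard mismatch: {digest}"
```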
tokenizer_config.json
CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
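The new `add_prefix_space` key is something newer transformers releases serialize for Llama-family fast tokenizers, and loading the tokenizer should pick it up transparently. A minimal sketch, assuming a local checkout:

```python
# Sketch: confirm the tokenizer loads cleanly with the updated config.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./BlackSheep-MoE")  # assumption: local checkout
print(tok.add_bos_token, tok.add_eos_token)  # expected: True False
```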