ferxalb committed
Commit 29d4e2a · verified · 1 parent: 7c95e57

Upload trained model folder

Files changed (4)
  1. README.md +17 -3
  2. config.json +27 -0
  3. pytorch_model.bin +3 -0
  4. training_metrics.json +0 -0
README.md CHANGED
@@ -1,3 +1,17 @@
- ---
- license: apache-2.0
- ---
+ ---
+ library_name: moeob
+ license: mit
+ pipeline_tag: text-generation
+ tags:
+ - byte-level
+ - experimental
+ - mixture-of-experts
+ - model_hub_mixin
+ - pytorch_model_hub_mixin
+ - summary-then-generate
+ ---
+
+ This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+ - Code: https://github.com/enosislabs/moeob
+ - Paper: [More Information Needed]
+ - Docs: [More Information Needed]
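
The README points to the `PyTorchModelHubMixin` integration, which attaches `from_pretrained()` and `push_to_hub()` to a plain `nn.Module`. A minimal loading sketch, assuming the package from the linked repo exposes a `MoEOB` class and that this commit lives at a repo id like `ferxalb/moeob` (neither is confirmed by the commit itself):

```python
# Hypothetical usage sketch; the import path `moeob.MoEOB` and the repo id
# "ferxalb/moeob" are assumptions, not confirmed by this commit.
from moeob import MoEOB  # assumed export of https://github.com/enosislabs/moeob

# PyTorchModelHubMixin.from_pretrained() downloads config.json and the
# weights, instantiates the module with the stored config, and loads state.
model = MoEOB.from_pretrained("ferxalb/moeob")
model.eval()
```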
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "vocab_size": 256,
+ "hidden_dim": 64,
+ "patch_encoder_dim": 32,
+ "n_layers": 3,
+ "n_heads": 4,
+ "n_experts": 4,
+ "top_k_experts": 2,
+ "expert_hidden_mult": 2,
+ "patch_encoder_layers": 1,
+ "max_seq_length": 512,
+ "max_patches": 64,
+ "learning_rate": 0.001,
+ "weight_decay": 0.01,
+ "beta1": 0.9,
+ "beta2": 0.98,
+ "epsilon": 1e-08,
+ "batch_size": 256,
+ "micro_batch_size": 4,
+ "gradient_clip": 0.5,
+ "load_balance_coefficient": 0.01,
+ "dropout": 0.0,
+ "use_prenorm": true,
+ "entropy_threshold": 0.4,
+ "min_patch_size": 4,
+ "max_patch_size": 32
+ }
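
A few relationships are implied by the values above: the expert FFN width follows from `hidden_dim * expert_hidden_mult`, and the gap between `batch_size` and `micro_batch_size` suggests gradient accumulation. A small sketch reading the config as committed (the accumulation reading is an assumption about the training loop, not something the commit states):

```python
import json

# Parse the config.json added in this commit.
with open("config.json") as f:
    cfg = json.load(f)

# Expert FFN width: hidden_dim scaled by expert_hidden_mult (64 * 2 = 128).
expert_hidden = cfg["hidden_dim"] * cfg["expert_hidden_mult"]

# If micro-batching means gradient accumulation, 256 / 4 = 64 accumulation
# steps per optimizer update (assumption about the training loop).
accum_steps = cfg["batch_size"] // cfg["micro_batch_size"]

# Byte-level model: one vocabulary entry per possible byte value.
assert cfg["vocab_size"] == 256

print(f"expert FFN width: {expert_hidden}, accumulation steps: {accum_steps}")
```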
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e675277d54fabd402bbdda525961c5371a7e2a928fce4baec3a377be98ecc917
+ size 1254791
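
What the commit stores is the Git LFS pointer, not the weights themselves; the actual ~1.25 MB file resolves to the sha256 oid above. After downloading, the checksum can be verified locally (the file path is illustrative):

```python
import hashlib

# Expected digest from the LFS pointer committed above.
EXPECTED = "e675277d54fabd402bbdda525961c5371a7e2a928fce4baec3a377be98ecc917"

# Stream the file in 1 MiB chunks so large checkpoints need not fit in memory.
h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED, "sha256 mismatch: not the file from this commit"
```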
training_metrics.json ADDED
The diff for this file is too large to render; see the raw diff.