Text Generation · scaling

GregorZiegltrumAA committed
Commit a1c7328 · 0 parents

Initial commit

Files changed (40)
  1. .gitattributes +35 -0
  2. LICENSE +31 -0
  3. README.md +50 -0
  4. config.yml +101 -0
  5. model_state_layer_0_EmbeddingInput.pt +3 -0
  6. model_state_layer_10_TransformerLayer.pt +3 -0
  7. model_state_layer_11_TransformerLayer.pt +3 -0
  8. model_state_layer_12_TransformerLayer.pt +3 -0
  9. model_state_layer_13_TransformerLayer.pt +3 -0
  10. model_state_layer_14_TransformerLayer.pt +3 -0
  11. model_state_layer_15_TransformerLayer.pt +3 -0
  12. model_state_layer_16_TransformerLayer.pt +3 -0
  13. model_state_layer_17_TransformerLayer.pt +3 -0
  14. model_state_layer_18_TransformerLayer.pt +3 -0
  15. model_state_layer_19_TransformerLayer.pt +3 -0
  16. model_state_layer_1_TransformerLayer.pt +3 -0
  17. model_state_layer_20_TransformerLayer.pt +3 -0
  18. model_state_layer_21_TransformerLayer.pt +3 -0
  19. model_state_layer_22_TransformerLayer.pt +3 -0
  20. model_state_layer_23_TransformerLayer.pt +3 -0
  21. model_state_layer_24_TransformerLayer.pt +3 -0
  22. model_state_layer_25_TransformerLayer.pt +3 -0
  23. model_state_layer_26_TransformerLayer.pt +3 -0
  24. model_state_layer_27_TransformerLayer.pt +3 -0
  25. model_state_layer_28_TransformerLayer.pt +3 -0
  26. model_state_layer_29_TransformerLayer.pt +3 -0
  27. model_state_layer_2_TransformerLayer.pt +3 -0
  28. model_state_layer_30_TransformerLayer.pt +3 -0
  29. model_state_layer_31_TransformerLayer.pt +3 -0
  30. model_state_layer_32_TransformerLayer.pt +3 -0
  31. model_state_layer_33_LayerNormWrapper.pt +3 -0
  32. model_state_layer_34_TransformerLMHead.pt +3 -0
  33. model_state_layer_3_TransformerLayer.pt +3 -0
  34. model_state_layer_4_TransformerLayer.pt +3 -0
  35. model_state_layer_5_TransformerLayer.pt +3 -0
  36. model_state_layer_6_TransformerLayer.pt +3 -0
  37. model_state_layer_7_TransformerLayer.pt +3 -0
  38. model_state_layer_8_TransformerLayer.pt +3 -0
  39. model_state_layer_9_TransformerLayer.pt +3 -0
  40. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,31 @@
+ The following applies to all files in this repository, unless otherwise noted:
+
+ Copyright (c) 2024 IPAI Aleph Alpha Research GmbH. All rights reserved.
+
+ This project is licensed under the terms of the Open Aleph License 1.0, available at
+ https://github.com/Aleph-Alpha/.github/blob/main/oal.pdf
+
+ ---
+ Excerpt from the license text:
+
+ Subject to the terms and conditions of this License, the Licensor grants you a non-exclusive, worldwide,
+ non-transferable, non-sublicensable, and royalty-free limited right to use, copy, modify, distribute, make
+ otherwise publicly available, and reproduce the Works and Derivative Works under Licensor’s copyright,
+ for any Non-Commercial and Non-Administrative purpose.
+ You may not use, copy, modify, distribute, make otherwise publicly available, reproduce, or sublicense the
+ Works or Derivative Works except as expressly provided under and in accordance with this License.
+ Your rights granted under this License will automatically terminate if you fail to comply with any of the
+ terms of this License.
+
+ EXCEPT FOR DAMAGES CAUSED BY INTENT OR FRAUDULENTLY CONCEALED
+ DEFECTS, AND EXCEPT FOR DAMAGES RESULTING FROM BREACH OF ANY
+ WARRANTY OR GUARANTEE EXPRESSLY GIVEN BY LICENSOR IN THE OPEN ALEPH LICENSE,
+ IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY
+ DAMAGES ARISING OUT OF THE OPEN ALEPH LICENSE OR THE USE OF THE WORK. ANY
+ MANDATORY STATUTORY LIABILITY UNDER APPLICABLE LAW REMAINS
+ UNAFFECTED.
+
+ EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR REQUIRED BY APPLICABLE
+ LAW, THE WORKS ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES
+ OF ANY KIND INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING
+ THE CONTENTS, ACCURACY, OR FITNESS FOR A PARTICULAR PURPOSE.
README.md ADDED
@@ -0,0 +1,50 @@
+ ---
+ license: other
+ license_name: open-aleph-license
+ license_link: LICENSE
+ library_name: scaling
+ pipeline_tag: text-generation
+ ---
+
+
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/671a0238b080a748c29b8fea/v1rfcKVaL8vnjuCqWUmI-.png)
+
+
+ # u-μP: Stable training in low precision for a significant speed-up and memory reduction during training
+
+
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/671a0238b080a748c29b8fea/F1-zbAXF5LGvxpIRrYfU4.png)
+
+
+ This repository holds the model weights for the 7B u-μP models trained at Aleph Alpha Research for 72k steps (300B tokens). Please note that the released checkpoints are not fully converged models and are intended for research use only.
+
+ You can find all model weights and their corresponding safetensors conversions at the following links:
+ - [umup-research-7b-bf16](https://huggingface.co/Aleph-Alpha/umup-research-7b-bf16)
+ - [umup-research-7b-fp8](https://huggingface.co/Aleph-Alpha/umup-research-7b-fp8)
+ - [sp-baseline-research-7b-bf16](https://huggingface.co/Aleph-Alpha/sp-baseline-research-7b-bf16)
+
+ The Maximal Update Parametrization (μP) aims to make the optimal hyperparameters (HPs) of a model independent of its size, allowing them to be swept on a cheap proxy model rather than the full-size target model. We present a new scheme, u-μP, which improves upon μP by combining it with Unit Scaling, a method for designing models that makes them easy to train in low precision. The two techniques have a natural affinity: μP ensures that the scale of activations is independent of model size, and Unit Scaling ensures that activations, weights, and gradients begin training with a scale of one. This synthesis opens the door to a simpler scheme whose default values are near-optimal, which in turn enables a more efficient sweeping strategy, with u-μP models reaching a lower loss than comparable μP models and working out of the box in FP8.
+
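To make the Unit Scaling idea concrete, here is a minimal, hypothetical sketch of a unit-scaled linear op (illustrative only, not the Scaling codebase's implementation): the weight is initialized with unit variance, and the usual 1/√fan_in factor moves from the initializer into the op, so the output also starts training at unit scale.

```python
import torch

def unit_scaled_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Weight is initialized ~ N(0, 1); the 1/sqrt(fan_in) factor lives in the
    # op rather than the init, so weights, activations, and (with a separate
    # scale in the backward pass) gradients all begin training near scale 1.
    fan_in = weight.shape[1]
    return (x @ weight.t()) / fan_in**0.5

w = torch.randn(128, 64)  # unit-variance init, shape (out_features, in_features)
x = torch.randn(32, 64)   # unit-scale input activations
print(unit_scaled_linear(x, w).std())  # ~1.0 at initialization
```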
+
+
+ If you want to learn more about u-μP, check out our [blog post](https://aleph-alpha.com/in-awe-at-the-scale-of-these-tensors-a-gentle-introduction-to-unit-scaled-maximal-update-parametrization/) and our [paper](https://arxiv.org/abs/2407.17465).
+
+ Unit-Scaled Maximal Update Parametrization (u-μP) is available in [Scaling](https://github.com/Aleph-Alpha/scaling), our official large-scale training codebase. Please note that FP8-trained checkpoints only run on chips with FP8 support, such as NVIDIA's Hopper architecture.
+
+ # Usage
+ You can generate tokens with the [Scaling](https://github.com/Aleph-Alpha/scaling) inference implementation:
+
+ ```python
+ from pathlib import Path
+
+ from scaling.transformer.inference import TransformerInferenceModule
+
+ # Point this at your local copy of this checkpoint repository.
+ ckpt_path = Path("<path_to_repo>/7B_umup_fp8")
+
+ model = TransformerInferenceModule.from_checkpoint(ckpt_path)
+
+ prompt = "Once upon a time"
+
+ # Generate up to 100 new tokens conditioned on the prompt.
+ output = model.generate(max_tokens=100, input_text=prompt)
+
+ print(output.completion_text)
+ ```
config.yml ADDED
@@ -0,0 +1,101 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 4
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 2
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
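For orientation, the topology block above implies the following layout, assuming the conventional identity global_batch = micro_batch × gradient_accumulation_steps × data_parallel_size (a hedged reading; Scaling's internals may differ):

```python
micro_batch, grad_accum, global_batch = 2, 4, 1024
data_parallel = global_batch // (micro_batch * grad_accum)  # 128 model replicas
gpus = data_parallel * 2 * 1  # x pipe_parallel_size x model_parallel_size = 256

# With train_iterations = 72000 and sequence_length = 4096 (both further down),
# the token budget matches the README's "300B tokens":
tokens = 72_000 * global_batch * 4096  # 301,989,888,000
print(data_parallel, gpus, tokens)
```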
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+   training_groups:
+   - group_name: param_group
+     independent_weight_decay: true
+     learning_rate_scheduler:
+       learning_rate: 11.313708498984761
+       learning_rate_decay_iters: 72000
+       learning_rate_decay_style: cosine
+       learning_rate_minimum: 1.131370849898476
+       learning_rate_warmup_steps: 500
+     parameters_exclude:
+     - norm
+     weight_decay: 0.0001221
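Two observations on the scheduler above. The base learning rate is exactly 8√2 ≈ 11.31 with the floor at a tenth of it; this is far larger than typical standard-parametrization values, but learning rates are not directly comparable across parametrizations under u-μP. The fields describe a 500-step linear warmup followed by cosine decay over the remaining iterations; a minimal sketch of that conventional reading (Scaling's exact boundary handling may differ):

```python
import math

LR_MAX = 11.313708498984761  # = 8 * sqrt(2)
LR_MIN = 1.131370849898476   # = LR_MAX / 10
WARMUP = 500
DECAY_ITERS = 72000

def lr_at(step: int) -> float:
    # Linear warmup to LR_MAX, then cosine decay down to LR_MIN.
    if step < WARMUP:
        return LR_MAX * step / WARMUP
    progress = min(1.0, (step - WARMUP) / (DECAY_ITERS - WARMUP))
    return LR_MIN + 0.5 * (LR_MAX - LR_MIN) * (1.0 + math.cos(math.pi * progress))

print(lr_at(500), lr_at(36_000), lr_at(72_000))  # peak, mid-decay, floor
```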
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   fp8_config_attention:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   fp8_config_mlp:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   hidden_size: 4096
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.66796875
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 32
+   num_layers: 32
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
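Two notes on the transformer_architecture block above, as hedged sketches rather than statements about Scaling's internals. First, the fp8_config entries assign e4m3 to every forward operand and reserve e5m2 for operands of the backward matmuls: e4m3 spends its bits on mantissa (precision), e5m2 on exponent (range), which suits gradients that can spike. PyTorch's FP8 dtypes make the trade-off easy to inspect:

```python
import torch

# e4m3 trades exponent range for mantissa precision; e5m2 does the reverse.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(dtype, "max:", info.max, "eps:", info.eps)
# float8_e4m3fn  max: 448.0    eps: 0.125  -> more precision, less range
# float8_e5m2    max: 57344.0  eps: 0.25   -> more range, less precision
```

Second, the listed dimensions are enough for a back-of-the-envelope parameter count, assuming the usual SwiGLU layout (gate, up, and down projections) and the fused QKV implied by `attention_qkv_in_one`; the total matches the "7B" name, and the per-layer size matches the ~403 MB bf16 shards below:

```python
# Norms and rotary buffers are ignored; they are negligible at this scale.
hidden, layers, vocab = 4096, 32, 65536
mlp_inner = int(hidden * 2.66796875)               # 10928, SwiGLU inner width

attn = 3 * hidden * hidden + hidden * hidden       # fused QKV + output projection
mlp = 2 * hidden * mlp_inner + mlp_inner * hidden  # gate + up, then down
per_layer = attn + mlp                             # ~201M parameters per layer

total = layers * per_layer + 2 * vocab * hidden    # untied embedding + LM head
print(f"{total / 1e9:.2f}B parameters")            # ~6.98B
print(f"{per_layer * 2 / 1e6:.0f} MB per bf16 layer shard")  # ~403 MB
```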
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a231ae564fec7d0538547295cd6a650f82e1907da3c2cfdb0025ab38cb241a7
+ size 536872395
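Each `.pt` entry in this commit is a Git LFS pointer, not the weights themselves: three lines giving the spec version, the SHA-256 of the actual blob, and its size in bytes. A small, hypothetical verification sketch using only the standard library (the helper name is ours):

```python
import hashlib
from pathlib import Path

def verify_lfs_blob(pointer_path: Path, blob_path: Path) -> bool:
    # Parse the three-line pointer: "version ...", "oid sha256:<hex>", "size <bytes>".
    fields = dict(line.split(" ", 1) for line in pointer_path.read_text().splitlines() if line)
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    # Hash in 1 MiB chunks so ~400 MB shards never sit in memory at once.
    digest, size = hashlib.sha256(), 0
    with blob_path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size
```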
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7faa4cee189672fd4370b7a4567d95f428a6ba155812a53cb6435068f089744
+ size 402803885
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33d381483ac987da4f1def484f8e7d838c8e8044d18c641c45d79aedd1a72854
+ size 402803885
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:076b074089a7dd7c700f306e94be5db9389642562937b91b6afd65bc3a0fe4dd
+ size 402803885
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d30169d26deb717b0e443aeb85846e82e531ae959b8a2814b2bf7cf44c15ae3
+ size 402803885
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:917156f9fef24f5159fae1dad1defa3bc1bae042dc31d365fd39f50af0f463c6
+ size 402803885
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c82364c46d7b3d67fac05474c46b0dc65abf836ee53e14c4e31fa4b17358995
+ size 402803885
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7f65efab7e7296718512f8f272f0c32ce0c06e1d31394e40e63e17fd9bf1db0
+ size 402803885
model_state_layer_17_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76cb8a177cf9a77c105bc424a4a3f6df112abb6f53c74e9a741826cb0183199e
+ size 402803885
model_state_layer_18_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec06f64d40378137477c2da30503aec73f6e1489a385476ef16d94c504917b9e
+ size 402803885
model_state_layer_19_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ff91b52269e4e124b52b10bf34ce15e8d23167e211a99813918dca1bf2b6112
+ size 402803885
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e14a6e1c964c8da62a343b7da2d45404f8b35f83b6afea7b987cb6d0c2a91b81
+ size 402803874
model_state_layer_20_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f7c72d971db74f628064286fb0f7ecf416e8ccb893c4074aeddb993d89d30bc
+ size 402803885
model_state_layer_21_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62dddec6d330e9f61406cab3a3fbc1570094f22714ed0cb3e11a9338726becd9
+ size 402803885
model_state_layer_22_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79fb6dfeb425ac8b57b50214d8dc795ce6cc91b58ad357e4c715959eb995b148
+ size 402803885
model_state_layer_23_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:19f399896711cf4f0b1e59ca75982102eb35720071c606920a64364ed985167b
+ size 402803885
model_state_layer_24_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c54ef4d647ae58c490591d44974048349e4380b22ff9a9274fa64ad62ddf0a5d
+ size 402803885
model_state_layer_25_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5395e624161287112d852bf445cccd45fe161ffdc9f39a943ee6be96dc8c6c5d
+ size 402803885
model_state_layer_26_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad9cfc0b9f9210abc5f243c1e5b54614c6aa43dbdf9e314823295c86f16991ac
+ size 402803885
model_state_layer_27_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0d8704348468da90d998fe364fd797ad1bb8c241f057da77f51abb09ea89359
+ size 402803885
model_state_layer_28_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41b864c7ff06f2f86abf422b4dc2711a684f513920771382a6aff6f8ef2e3bb2
+ size 402803885
model_state_layer_29_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:491bef02815b5acfa1cacf9af7e1d0503fe49cc54393c1203169d818eb76876b
+ size 402803885
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b17079c4dd7c1adbc6b5cf907a51241d0f32d5cc95adae2f5a4d2b3e8a8e82e9
+ size 402803874
model_state_layer_30_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a99c0b0b3bd02ce5b8026951b44b644e9451b938c310ba30b33c755ae92d0f63
+ size 402803885
model_state_layer_31_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f82a2f6b819c97ee133d7fba1e761f761a6c2342330416da6cdc1847815ccbd7
+ size 402803885
model_state_layer_32_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:605f545510dc2b8fa66c102da27fe2f15f3325e0452c3480e4b869874cc396c2
+ size 402803885
model_state_layer_33_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fef3ad9a684ad078dc8af6a7034775dfb3fb66550f2a44945928150a3c162ed
+ size 9650
model_state_layer_34_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4001f5f525c856dbbc663e937ca635a73f20fa3469b8ba0701f6207ce50f7338
+ size 536872360
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc16f965d119a504c06a73ddbafe4dad49203c9d3ef0d888f35148b96b6b9882
+ size 402803874
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1df0912a91b8a4eca662a6ee06c46f211b27269357ba4547ce5f506190a5444f
+ size 402803874
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edeb1640ac3eebec2577490b14a5285d265d0c0a2940e2dd74c2b4ce828fcd5c
+ size 402803874
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2180db549479bdf9deaa373c0f2844c1df3a5083e9e277e241871bda5794a82
+ size 402803874
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12c8db34868983038b6a3b6189f2e8a7d875b59b86f06077cd765b97fb023ad6
+ size 402803874
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5655718812e3ae200b3cae538d2b20325fbe43f905bab66cfc7c7885e8ed20a9
+ size 402803874
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a148acb4ef6d0054a3289dc80c99145e7f6459f2aeffc0f5daca91a2dbbb575
+ size 402803874
vocab.json ADDED
The diff for this file is too large to render.