GregorZiegltrumAA committed on
Commit a1c7328
Parent(s): none
Initial commit
Browse files
- .gitattributes +35 -0
- LICENSE +31 -0
- README.md +50 -0
- config.yml +101 -0
- model_state_layer_0_EmbeddingInput.pt +3 -0
- model_state_layer_10_TransformerLayer.pt +3 -0
- model_state_layer_11_TransformerLayer.pt +3 -0
- model_state_layer_12_TransformerLayer.pt +3 -0
- model_state_layer_13_TransformerLayer.pt +3 -0
- model_state_layer_14_TransformerLayer.pt +3 -0
- model_state_layer_15_TransformerLayer.pt +3 -0
- model_state_layer_16_TransformerLayer.pt +3 -0
- model_state_layer_17_TransformerLayer.pt +3 -0
- model_state_layer_18_TransformerLayer.pt +3 -0
- model_state_layer_19_TransformerLayer.pt +3 -0
- model_state_layer_1_TransformerLayer.pt +3 -0
- model_state_layer_20_TransformerLayer.pt +3 -0
- model_state_layer_21_TransformerLayer.pt +3 -0
- model_state_layer_22_TransformerLayer.pt +3 -0
- model_state_layer_23_TransformerLayer.pt +3 -0
- model_state_layer_24_TransformerLayer.pt +3 -0
- model_state_layer_25_TransformerLayer.pt +3 -0
- model_state_layer_26_TransformerLayer.pt +3 -0
- model_state_layer_27_TransformerLayer.pt +3 -0
- model_state_layer_28_TransformerLayer.pt +3 -0
- model_state_layer_29_TransformerLayer.pt +3 -0
- model_state_layer_2_TransformerLayer.pt +3 -0
- model_state_layer_30_TransformerLayer.pt +3 -0
- model_state_layer_31_TransformerLayer.pt +3 -0
- model_state_layer_32_TransformerLayer.pt +3 -0
- model_state_layer_33_LayerNormWrapper.pt +3 -0
- model_state_layer_34_TransformerLMHead.pt +3 -0
- model_state_layer_3_TransformerLayer.pt +3 -0
- model_state_layer_4_TransformerLayer.pt +3 -0
- model_state_layer_5_TransformerLayer.pt +3 -0
- model_state_layer_6_TransformerLayer.pt +3 -0
- model_state_layer_7_TransformerLayer.pt +3 -0
- model_state_layer_8_TransformerLayer.pt +3 -0
- model_state_layer_9_TransformerLayer.pt +3 -0
- vocab.json +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE
ADDED
@@ -0,0 +1,31 @@
The following applies to all files in this repository, unless otherwise noted:

Copyright (c) 2024 IPAI Aleph Alpha Research GmbH. All rights reserved.

This project is licensed under the terms of the Open Aleph License 1.0, available at
https://github.com/Aleph-Alpha/.github/blob/main/oal.pdf

---
Excerpt from the license text:

Subject to the terms and conditions of this License, the Licensor grants you a non-exclusive, worldwide,
non-transferable, non-sublicensable, and royalty-free limited right to use, copy, modify, distribute, make
otherwise publicly available, and reproduce the Works and Derivative Works under Licensor’s copyright,
for any Non-Commercial and Non-Administrative purpose.
You may not use, copy, modify, distribute, make otherwise publicly available, reproduce, or sublicense the
Works or Derivative Works except as expressly provided under and in accordance with this License.
Your rights granted under this License will automatically terminate if you fail to comply with any of the
terms of this License.

EXCEPT FOR DAMAGES CAUSED BY INTENT OR FRAUDULENTLY CONCEALED
DEFECTS, AND EXCEPT FOR DAMAGES RESULTING FROM BREACH OF ANY
WARRANTY OR GUARANTEE EXPRESSLY GIVEN BY LICENSOR IN THE OPEN ALEPH LICENSE,
IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY
DAMAGES ARISING OUT OF THE OPEN ALEPH LICENSE OR THE USE OF THE WORK. ANY
MANDATORY STATUTORY LIABILITY UNDER APPLICABLE LAW REMAINS
UNAFFECTED.

EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR REQUIRED BY APPLICABLE
LAW, THE WORKS ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES
OF ANY KIND INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING
THE CONTENTS, ACCURACY, OR FITNESS FOR A PARTICULAR PURPOSE.
README.md
ADDED
@@ -0,0 +1,50 @@
---
license: other
license_name: open-aleph-license
license_link: LICENSE
library_name: scaling
pipeline_tag: text-generation
---

![image/png](https://cdn-uploads.huggingface.co/production/uploads/671a0238b080a748c29b8fea/v1rfcKVaL8vnjuCqWUmI-.png)

# u-μP: Stable training in low precision for a significant speed-up and memory reduction during training

![image/png](https://cdn-uploads.huggingface.co/production/uploads/671a0238b080a748c29b8fea/F1-zbAXF5LGvxpIRrYfU4.png)

This repository holds the model weights for the 7B u-μP models trained at Aleph Alpha Research for 72k steps (300B tokens). Please note that the released checkpoints are not fully converged and are intended for research use only.

You can find all model weights and their corresponding safetensors conversions at the following links:
- [umup-research-7b-bf16](https://huggingface.co/Aleph-Alpha/umup-research-7b-bf16)
- [umup-research-7b-fp8](https://huggingface.co/Aleph-Alpha/umup-research-7b-fp8)
- [sp-baseline-research-7b-bf16](https://huggingface.co/Aleph-Alpha/sp-baseline-research-7b-bf16)

The Maximal Update Parametrization (μP) aims to make a model's optimal hyperparameters (HPs) independent of its size, allowing them to be swept using a cheap proxy model rather than the full-size target model. We present a new scheme, u-μP, which improves upon μP by combining it with Unit Scaling, a method for designing models that makes them easy to train in low precision. The two techniques have a natural affinity: μP ensures that the scale of activations is independent of model size, and Unit Scaling ensures that activations, weights, and gradients begin training with a scale of one. This synthesis opens the door to a simpler scheme, whose default values are near-optimal. This in turn facilitates a more efficient sweeping strategy, with u-μP models reaching a lower loss than comparable μP models and working out-of-the-box in FP8.
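
For intuition, here is a minimal sketch of the Unit Scaling side of this recipe in plain PyTorch (an illustration only, not the Scaling library's implementation): weights start at unit variance and the matmul output is rescaled by 1/sqrt(fan_in), so activations also start at unit scale, which is what keeps FP8 numerics comfortable.

```python
import torch

def unit_scaled_linear(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    """Toy unit-scaled linear: weight ~ N(0, 1), output rescaled by 1/sqrt(fan_in)."""
    fan_in = weight.shape[1]
    return (x @ weight.t()) / fan_in**0.5

x = torch.randn(1024, 4096)       # unit-variance activations
weight = torch.randn(4096, 4096)  # unit-variance weights (the scale is moved out of the init)
y = unit_scaled_linear(x, weight)
print(y.std())                    # ~1.0 at initialization
```

The full method additionally applies separate scale factors on the backward pass so that gradients also start at unit scale; see the paper for the exact rules.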

If you want to learn more details about u-μP, check out our [blog post](https://aleph-alpha.com/in-awe-at-the-scale-of-these-tensors-a-gentle-introduction-to-unit-scaled-maximal-update-parametrization/) and our [paper](https://arxiv.org/abs/2407.17465).

Unit-Scaled Maximal Update Parametrization (u-μP) is available in [Scaling](https://github.com/Aleph-Alpha/scaling), our official large-scale training codebase. Please note that FP8-trained checkpoints only work on chips with FP8 support, such as the Hopper architecture.

# Usage
You can generate tokens with the [Scaling](https://github.com/Aleph-Alpha/scaling) inference implementation:

```python
from scaling.transformer.inference import TransformerInferenceModule
from pathlib import Path

# Path to the downloaded checkpoint files
ckpt_path = Path("<path_to_repo>/7B_umup_fp8")

model = TransformerInferenceModule.from_checkpoint(ckpt_path)

prompt = "Once upon a time"

output = model.generate(max_tokens=100, input_text=prompt)

print(output.completion_text)
```
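
If you do not have a local copy of the checkpoint yet, one option (a sketch assuming the `huggingface_hub` client; set `repo_id` to whichever of the variants listed above you want) is to download the files first:

```python
from pathlib import Path

from huggingface_hub import snapshot_download

# Fetches config.yml, vocab.json and the LFS-tracked model_state_*.pt shards
local_dir = snapshot_download(repo_id="Aleph-Alpha/umup-research-7b-fp8")  # repo_id is an example
ckpt_path = Path(local_dir)
```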
config.yml
ADDED
@@ -0,0 +1,101 @@
optimizer:
  allreduce_bucket_size: 500000000
  beta1: 0.9
  beta2: 0.95
  debug_log: false
  eps: 1e-08
  gradient_clipping: 0.0
  zero: true
  zero_save_static: false
topology:
  activation_checkpointing_type: disabled
  global_batch_size: 1024
  gradient_accumulation_steps: 4
  micro_batch_size: 2
  model_parallel_size: 1
  pipe_parallel_size: 2
  pipe_partition_method: balanced
  pipe_partition_overwrite: null
  sequence_parallel: false
trainer:
  seed: 42
  train_iterations: 72000
training:
  allow_missing_params_in_optimizer: true
  training_groups:
  - group_name: param_group
    independent_weight_decay: true
    learning_rate_scheduler:
      learning_rate: 11.313708498984761
      learning_rate_decay_iters: 72000
      learning_rate_decay_style: cosine
      learning_rate_minimum: 1.131370849898476
      learning_rate_warmup_steps: 500
    parameters_exclude:
    - norm
    weight_decay: 0.0001221
transformer_architecture:
  attention_bias: false
  attention_num_kv_heads: null
  attention_qkv_in_one: true
  dropout_after_attention: 0.0
  dropout_after_mlp: 0.0
  dropout_attention_probs: 0.0
  dropout_embedding: 0.0
  dropout_image_encoder: 0.0
  fp8_config_attention:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  fp8_config_mlp:
    dtypes_forward:
      left_dtype: e4m3
      right_dtype: e4m3
    dtypes_grad_input:
      left_dtype: e5m2
      right_dtype: e4m3
    dtypes_grad_weight:
      left_dtype: e4m3
      right_dtype: e5m2
  hidden_size: 4096
  image_encoder: false
  key_query_norm: false
  layernorm:
    layernorm_epsilon: 1e-05
    optimization_type: torch
  local_attention_window_size: null
  masked_softmax:
    kernel: flash_attention
    scale: 1.0
    softmax_in_fp32: false
  mlp_bias: false
  mlp_factor: 2.66796875
  mlp_type: swiglu
  norm_type: rms
  num_attention_heads: 32
  num_layers: 32
  num_local_attention_heads: 0
  precision: bfloat16
  relative_position_embedding_type: rotary_complex
  reset_attention_mask: false
  reset_position_ids: false
  rotary_embedding_base: 10000
  rotary_percentage: 1.0
  sequence_length: 4096
  umup:
    act_mult: 1.0
    attn_mult: 1.0
    enable: true
    loss_mult: 1.0
    normalize_depth_to_num_layers: true
    residual_attn_ratio: 0.25
    residual_mult: 1.0
  vocab_file: null
  vocab_size: 65536
  weight_tying: false
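
For a quick look at these hyperparameters outside of the training stack, a minimal sketch with PyYAML (an assumption; the Scaling codebase ships its own typed config loader) is:

```python
import yaml  # requires: pip install pyyaml

with open("config.yml") as f:
    config = yaml.safe_load(f)

arch = config["transformer_architecture"]
print(arch["hidden_size"], arch["num_layers"], arch["num_attention_heads"])  # 4096 32 32
print(arch["umup"])  # u-μP multipliers and the enable flag
```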
model_state_layer_0_EmbeddingInput.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4a231ae564fec7d0538547295cd6a650f82e1907da3c2cfdb0025ab38cb241a7
size 536872395
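
Each `model_state_*.pt` entry in this commit is a Git LFS pointer stub (spec version, SHA-256 object id, byte size) rather than the tensor data itself; `git lfs pull` or the Hub download tools replace the stubs with the real blobs. As a hedged sketch with hypothetical file names, a downloaded shard can be checked against its pointer like this:

```python
import hashlib
from pathlib import Path

# Hypothetical paths: the raw pointer text (as shown above) and the downloaded blob
pointer_lines = Path("layer_0_pointer.txt").read_text().splitlines()
expected_oid = next(l.split("sha256:")[1] for l in pointer_lines if l.startswith("oid"))
expected_size = int(next(l.split()[1] for l in pointer_lines if l.startswith("size")))

blob = Path("model_state_layer_0_EmbeddingInput.pt")
assert blob.stat().st_size == expected_size
assert hashlib.sha256(blob.read_bytes()).hexdigest() == expected_oid
```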
model_state_layer_10_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7faa4cee189672fd4370b7a4567d95f428a6ba155812a53cb6435068f089744
size 402803885
model_state_layer_11_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33d381483ac987da4f1def484f8e7d838c8e8044d18c641c45d79aedd1a72854
size 402803885
model_state_layer_12_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:076b074089a7dd7c700f306e94be5db9389642562937b91b6afd65bc3a0fe4dd
size 402803885
model_state_layer_13_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d30169d26deb717b0e443aeb85846e82e531ae959b8a2814b2bf7cf44c15ae3
size 402803885
model_state_layer_14_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:917156f9fef24f5159fae1dad1defa3bc1bae042dc31d365fd39f50af0f463c6
size 402803885
model_state_layer_15_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c82364c46d7b3d67fac05474c46b0dc65abf836ee53e14c4e31fa4b17358995
size 402803885
model_state_layer_16_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c7f65efab7e7296718512f8f272f0c32ce0c06e1d31394e40e63e17fd9bf1db0
size 402803885
model_state_layer_17_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76cb8a177cf9a77c105bc424a4a3f6df112abb6f53c74e9a741826cb0183199e
size 402803885
model_state_layer_18_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ec06f64d40378137477c2da30503aec73f6e1489a385476ef16d94c504917b9e
size 402803885
model_state_layer_19_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4ff91b52269e4e124b52b10bf34ce15e8d23167e211a99813918dca1bf2b6112
size 402803885
model_state_layer_1_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e14a6e1c964c8da62a343b7da2d45404f8b35f83b6afea7b987cb6d0c2a91b81
size 402803874
model_state_layer_20_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f7c72d971db74f628064286fb0f7ecf416e8ccb893c4074aeddb993d89d30bc
size 402803885
model_state_layer_21_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:62dddec6d330e9f61406cab3a3fbc1570094f22714ed0cb3e11a9338726becd9
size 402803885
model_state_layer_22_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79fb6dfeb425ac8b57b50214d8dc795ce6cc91b58ad357e4c715959eb995b148
size 402803885
model_state_layer_23_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19f399896711cf4f0b1e59ca75982102eb35720071c606920a64364ed985167b
size 402803885
model_state_layer_24_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c54ef4d647ae58c490591d44974048349e4380b22ff9a9274fa64ad62ddf0a5d
size 402803885
model_state_layer_25_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5395e624161287112d852bf445cccd45fe161ffdc9f39a943ee6be96dc8c6c5d
size 402803885
model_state_layer_26_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ad9cfc0b9f9210abc5f243c1e5b54614c6aa43dbdf9e314823295c86f16991ac
size 402803885
model_state_layer_27_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b0d8704348468da90d998fe364fd797ad1bb8c241f057da77f51abb09ea89359
size 402803885
model_state_layer_28_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41b864c7ff06f2f86abf422b4dc2711a684f513920771382a6aff6f8ef2e3bb2
size 402803885
model_state_layer_29_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:491bef02815b5acfa1cacf9af7e1d0503fe49cc54393c1203169d818eb76876b
size 402803885
model_state_layer_2_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b17079c4dd7c1adbc6b5cf907a51241d0f32d5cc95adae2f5a4d2b3e8a8e82e9
size 402803874
model_state_layer_30_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a99c0b0b3bd02ce5b8026951b44b644e9451b938c310ba30b33c755ae92d0f63
size 402803885
model_state_layer_31_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f82a2f6b819c97ee133d7fba1e761f761a6c2342330416da6cdc1847815ccbd7
size 402803885
model_state_layer_32_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:605f545510dc2b8fa66c102da27fe2f15f3325e0452c3480e4b869874cc396c2
size 402803885
model_state_layer_33_LayerNormWrapper.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fef3ad9a684ad078dc8af6a7034775dfb3fb66550f2a44945928150a3c162ed
size 9650
model_state_layer_34_TransformerLMHead.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4001f5f525c856dbbc663e937ca635a73f20fa3469b8ba0701f6207ce50f7338
size 536872360
model_state_layer_3_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc16f965d119a504c06a73ddbafe4dad49203c9d3ef0d888f35148b96b6b9882
size 402803874
model_state_layer_4_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1df0912a91b8a4eca662a6ee06c46f211b27269357ba4547ce5f506190a5444f
size 402803874
model_state_layer_5_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:edeb1640ac3eebec2577490b14a5285d265d0c0a2940e2dd74c2b4ce828fcd5c
size 402803874
model_state_layer_6_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d2180db549479bdf9deaa373c0f2844c1df3a5083e9e277e241871bda5794a82
size 402803874
model_state_layer_7_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:12c8db34868983038b6a3b6189f2e8a7d875b59b86f06077cd765b97fb023ad6
size 402803874
model_state_layer_8_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5655718812e3ae200b3cae538d2b20325fbe43f905bab66cfc7c7885e8ed20a9
size 402803874
model_state_layer_9_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a148acb4ef6d0054a3289dc80c99145e7f6459f2aeffc0f5daca91a2dbbb575
size 402803874
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff