Spaces:
Sleeping
Sleeping
Upload 33 files
Browse files
- aglib/meliad/transformer/configs/options/debug_mode.gin +5 -0
- aglib/meliad/transformer/configs/options/enable_scan.gin +5 -0
- aglib/meliad/transformer/configs/options/external_memory_32k.gin +17 -0
- aglib/meliad/transformer/configs/options/external_memory_8k.gin +17 -0
- aglib/meliad/transformer/configs/options/final_mlp.gin +14 -0
- aglib/meliad/transformer/configs/options/lr_cosine_decay.gin +13 -0
- aglib/meliad/transformer/configs/options/lr_cosine_decay_spike.gin +13 -0
- aglib/meliad/transformer/configs/options/lr_linear_decay.gin +10 -0
- aglib/meliad/transformer/configs/options/lr_max_steps_125k.gin +3 -0
- aglib/meliad/transformer/configs/options/lr_max_steps_250k.gin +3 -0
- aglib/meliad/transformer/configs/options/lr_max_steps_500k.gin +3 -0
- aglib/meliad/transformer/configs/options/lr_rsqrt_decay.gin +9 -0
- aglib/meliad/transformer/configs/options/lr_rsqrt_decay_std.gin +11 -0
- aglib/meliad/transformer/configs/options/lr_scale_05.gin +3 -0
- aglib/meliad/transformer/configs/options/lr_scale_2.gin +3 -0
- aglib/meliad/transformer/configs/options/no_norm.gin +3 -0
- aglib/meliad/transformer/configs/options/positions_absolute.gin +7 -0
- aglib/meliad/transformer/configs/options/positions_fourier.gin +7 -0
- aglib/meliad/transformer/configs/options/positions_rotary.gin +7 -0
- aglib/meliad/transformer/configs/options/positions_t5.gin +7 -0
- aglib/meliad/transformer/configs/options/seq_1024.gin +7 -0
- aglib/meliad/transformer/configs/options/seq_1024_nocache.gin +8 -0
- aglib/meliad/transformer/configs/options/seq_2048.gin +7 -0
- aglib/meliad/transformer/configs/options/seq_2048_nocache.gin +8 -0
- aglib/meliad/transformer/configs/options/seq_4096.gin +7 -0
- aglib/meliad/transformer/configs/options/seq_512.gin +7 -0
- aglib/meliad/transformer/configs/options/seq_512_nocache.gin +8 -0
- aglib/meliad/transformer/configs/options/stack_window_512.gin +6 -0
- aglib/meliad/transformer/configs/options/window_1024.gin +4 -0
- aglib/meliad/transformer/configs/options/window_128.gin +4 -0
- aglib/meliad/transformer/configs/options/window_2048.gin +4 -0
- aglib/meliad/transformer/configs/options/window_256.gin +4 -0
- aglib/meliad/transformer/configs/options/window_512.gin +4 -0
aglib/meliad/transformer/configs/options/debug_mode.gin
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
replicate_mode = False
|
4 |
+
trace_debug_mode = True
|
5 |
+
print_variables = True
|
aglib/meliad/transformer/configs/options/enable_scan.gin
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
DTYPE="float32" # Required for stability when using scan.
|
3 |
+
|
4 |
+
transformer_layer.TransformerLayer:
|
5 |
+
max_unrolled_windows = 0 # Never unroll.
|
aglib/meliad/transformer/configs/options/external_memory_32k.gin
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
include "memory_configuration.gin"
|
3 |
+
|
4 |
+
from transformer import attention
|
5 |
+
from transformer import memory_factory
|
6 |
+
|
7 |
+
NUM_MEMORY_HEADS = %NUM_HEADS
|
8 |
+
|
9 |
+
decoder_stack.DecoderStack:
|
10 |
+
memory_factory = @memory_factory.memory_on_tpu_factory
|
11 |
+
memory_layer_indices = (-4,)
|
12 |
+
|
13 |
+
transformer_layer.TransformerLayer:
|
14 |
+
memory_num_neighbors = 128
|
15 |
+
|
16 |
+
memory_factory.memory_on_tpu_factory:
|
17 |
+
database_size = 32768
|
aglib/meliad/transformer/configs/options/external_memory_8k.gin
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
include "memory_configuration.gin"
|
3 |
+
|
4 |
+
from transformer import attention
|
5 |
+
from transformer import memory_factory
|
6 |
+
|
7 |
+
NUM_MEMORY_HEADS = %NUM_HEADS
|
8 |
+
|
9 |
+
decoder_stack.DecoderStack:
|
10 |
+
memory_factory = @memory_factory.memory_on_tpu_factory
|
11 |
+
memory_layer_indices = (-4,)
|
12 |
+
|
13 |
+
transformer_layer.TransformerLayer:
|
14 |
+
memory_num_neighbors = 128
|
15 |
+
|
16 |
+
memory_factory.memory_on_tpu_factory:
|
17 |
+
database_size = 8192
|
aglib/meliad/transformer/configs/options/final_mlp.gin
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformer import decoder_stack
|
3 |
+
from transformer import nn_components
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
final_mlp_factory = @decoder_final_mlp/nn_components.MLP
|
7 |
+
|
8 |
+
# Add a final MLP for token prediction after the last transformer layer.
|
9 |
+
decoder_final_mlp/nn_components.MLP:
|
10 |
+
num_hidden_units = %MLP_DIM
|
11 |
+
num_layers = 2
|
12 |
+
activation_function = "relu"
|
13 |
+
use_bias = False
|
14 |
+
dtype = %DTYPE
|
aglib/meliad/transformer/configs/options/lr_cosine_decay.gin
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import optimizer_config
|
3 |
+
|
4 |
+
training_loop.Trainer:
|
5 |
+
learning_rate_schedule = @optimizer_config.lr_cosine_decay
|
6 |
+
|
7 |
+
optimizer_config.lr_cosine_decay:
|
8 |
+
max_lr = 0.01
|
9 |
+
min_lr = 0.001
|
10 |
+
decay_after = True
|
11 |
+
spike_steps = 0
|
12 |
+
spike_lr = 0.0
|
13 |
+
|
aglib/meliad/transformer/configs/options/lr_cosine_decay_spike.gin
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import optimizer_config
|
3 |
+
|
4 |
+
training_loop.Trainer:
|
5 |
+
learning_rate_schedule = @optimizer_config.lr_cosine_decay
|
6 |
+
|
7 |
+
optimizer_config.lr_cosine_decay:
|
8 |
+
max_lr = 0.01
|
9 |
+
min_lr = 0.001
|
10 |
+
decay_after = True
|
11 |
+
spike_steps = 10_000
|
12 |
+
spike_lr = 0.04
|
13 |
+
|
aglib/meliad/transformer/configs/options/lr_linear_decay.gin
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import optimizer_config
|
3 |
+
|
4 |
+
training_loop.Trainer:
|
5 |
+
learning_rate_schedule = @optimizer_config.lr_linear_decay
|
6 |
+
|
7 |
+
optimizer_config.lr_linear_decay:
|
8 |
+
max_lr = 0.01
|
9 |
+
min_lr = 0.001
|
10 |
+
decay_after = True
|
aglib/meliad/transformer/configs/options/lr_max_steps_125k.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
max_scheduled_steps = 125_000
|
aglib/meliad/transformer/configs/options/lr_max_steps_250k.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
max_scheduled_steps = 250_000
|
aglib/meliad/transformer/configs/options/lr_max_steps_500k.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
max_scheduled_steps = 500_000
|
aglib/meliad/transformer/configs/options/lr_rsqrt_decay.gin
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import optimizer_config
|
3 |
+
|
4 |
+
training_loop.Trainer:
|
5 |
+
learning_rate_schedule = @optimizer_config.lr_rsqrt_decay
|
6 |
+
|
7 |
+
optimizer_config.lr_rsqrt_decay:
|
8 |
+
max_lr = 0.05
|
9 |
+
min_lr = 0.001
|
aglib/meliad/transformer/configs/options/lr_rsqrt_decay_std.gin
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import optimizer_config
|
3 |
+
|
4 |
+
# Implement standard rsqrt decay as used in the memorizing and block-recurrent
|
5 |
+
# transformer papers, which does not decay to a specified minimum learning
|
6 |
+
# rate over max_steps.
|
7 |
+
training_loop.Trainer:
|
8 |
+
learning_rate_schedule = @optimizer_config.lr_rsqrt_decay_std
|
9 |
+
|
10 |
+
optimizer_config.lr_rsqrt_decay_std:
|
11 |
+
max_lr = None
|
aglib/meliad/transformer/configs/options/lr_scale_05.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
learning_rate_multiplier = 0.5
|
aglib/meliad/transformer/configs/options/lr_scale_2.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
training_loop.Trainer:
|
3 |
+
learning_rate_multiplier = 2.0
|
aglib/meliad/transformer/configs/options/no_norm.gin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_base.TransformerBase:
|
3 |
+
normalize_keys = False
|
aglib/meliad/transformer/configs/options/positions_absolute.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_layer.TransformerLayer:
|
3 |
+
relative_position_type = None
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
use_absolute_positions = True
|
7 |
+
|
aglib/meliad/transformer/configs/options/positions_fourier.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_layer.TransformerLayer:
|
3 |
+
relative_position_type = "fourier"
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
use_absolute_positions = False
|
7 |
+
|
aglib/meliad/transformer/configs/options/positions_rotary.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_layer.TransformerLayer:
|
3 |
+
relative_position_type = "rotary"
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
use_absolute_positions = False
|
7 |
+
|
aglib/meliad/transformer/configs/options/positions_t5.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_layer.TransformerLayer:
|
3 |
+
relative_position_type = "t5"
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
use_absolute_positions = False
|
7 |
+
|
aglib/meliad/transformer/configs/options/seq_1024.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 1024
|
4 |
+
batch_size = 4
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
use_long_xl_architecture = True
|
aglib/meliad/transformer/configs/options/seq_1024_nocache.gin
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 1024
|
4 |
+
batch_size = 4
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
window_length = 1024
|
8 |
+
use_long_xl_architecture = False
|
aglib/meliad/transformer/configs/options/seq_2048.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 2048
|
4 |
+
batch_size = 2
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
use_long_xl_architecture = True
|
aglib/meliad/transformer/configs/options/seq_2048_nocache.gin
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 2048
|
4 |
+
batch_size = 2
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
window_length = 2048
|
8 |
+
use_long_xl_architecture = False
|
aglib/meliad/transformer/configs/options/seq_4096.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 4096
|
4 |
+
batch_size = 1
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
use_long_xl_architecture = True
|
aglib/meliad/transformer/configs/options/seq_512.gin
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 512
|
4 |
+
batch_size = 8
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
use_long_xl_architecture = True
|
aglib/meliad/transformer/configs/options/seq_512_nocache.gin
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
decoder_stack.TransformerTaskConfig:
|
3 |
+
sequence_length = 512
|
4 |
+
batch_size = 8
|
5 |
+
|
6 |
+
transformer_layer.TransformerLayer:
|
7 |
+
window_length = 512
|
8 |
+
use_long_xl_architecture = False
|
aglib/meliad/transformer/configs/options/stack_window_512.gin
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
transformer_layer.TransformerLayer:
|
3 |
+
window_length = 512
|
4 |
+
|
5 |
+
decoder_stack.DecoderStack:
|
6 |
+
dstack_window_length = 512
|
aglib/meliad/transformer/configs/options/window_1024.gin
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Sequence length must be larger than window_length.
|
3 |
+
transformer_layer.TransformerLayer:
|
4 |
+
window_length = 1024
|
aglib/meliad/transformer/configs/options/window_128.gin
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Sequence length must be larger than window_length.
|
3 |
+
transformer_layer.TransformerLayer:
|
4 |
+
window_length = 128
|
aglib/meliad/transformer/configs/options/window_2048.gin
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Sequence length must be larger than window_length.
|
3 |
+
transformer_layer.TransformerLayer:
|
4 |
+
window_length = 2048
|
aglib/meliad/transformer/configs/options/window_256.gin
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Sequence length must be larger than window_length.
|
3 |
+
transformer_layer.TransformerLayer:
|
4 |
+
window_length = 256
|
aglib/meliad/transformer/configs/options/window_512.gin
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Sequence length must be larger than window_length.
|
3 |
+
transformer_layer.TransformerLayer:
|
4 |
+
window_length = 512
|