HugoVoxx committed
Commit 2e4e57e · verified · 1 Parent(s): 4e5ee36

Upload 33 files

Files changed (33)
  1. aglib/meliad/transformer/configs/options/debug_mode.gin +5 -0
  2. aglib/meliad/transformer/configs/options/enable_scan.gin +5 -0
  3. aglib/meliad/transformer/configs/options/external_memory_32k.gin +17 -0
  4. aglib/meliad/transformer/configs/options/external_memory_8k.gin +17 -0
  5. aglib/meliad/transformer/configs/options/final_mlp.gin +14 -0
  6. aglib/meliad/transformer/configs/options/lr_cosine_decay.gin +13 -0
  7. aglib/meliad/transformer/configs/options/lr_cosine_decay_spike.gin +13 -0
  8. aglib/meliad/transformer/configs/options/lr_linear_decay.gin +10 -0
  9. aglib/meliad/transformer/configs/options/lr_max_steps_125k.gin +3 -0
  10. aglib/meliad/transformer/configs/options/lr_max_steps_250k.gin +3 -0
  11. aglib/meliad/transformer/configs/options/lr_max_steps_500k.gin +3 -0
  12. aglib/meliad/transformer/configs/options/lr_rsqrt_decay.gin +9 -0
  13. aglib/meliad/transformer/configs/options/lr_rsqrt_decay_std.gin +11 -0
  14. aglib/meliad/transformer/configs/options/lr_scale_05.gin +3 -0
  15. aglib/meliad/transformer/configs/options/lr_scale_2.gin +3 -0
  16. aglib/meliad/transformer/configs/options/no_norm.gin +3 -0
  17. aglib/meliad/transformer/configs/options/positions_absolute.gin +7 -0
  18. aglib/meliad/transformer/configs/options/positions_fourier.gin +7 -0
  19. aglib/meliad/transformer/configs/options/positions_rotary.gin +7 -0
  20. aglib/meliad/transformer/configs/options/positions_t5.gin +7 -0
  21. aglib/meliad/transformer/configs/options/seq_1024.gin +7 -0
  22. aglib/meliad/transformer/configs/options/seq_1024_nocache.gin +8 -0
  23. aglib/meliad/transformer/configs/options/seq_2048.gin +7 -0
  24. aglib/meliad/transformer/configs/options/seq_2048_nocache.gin +8 -0
  25. aglib/meliad/transformer/configs/options/seq_4096.gin +7 -0
  26. aglib/meliad/transformer/configs/options/seq_512.gin +7 -0
  27. aglib/meliad/transformer/configs/options/seq_512_nocache.gin +8 -0
  28. aglib/meliad/transformer/configs/options/stack_window_512.gin +6 -0
  29. aglib/meliad/transformer/configs/options/window_1024.gin +4 -0
  30. aglib/meliad/transformer/configs/options/window_128.gin +4 -0
  31. aglib/meliad/transformer/configs/options/window_2048.gin +4 -0
  32. aglib/meliad/transformer/configs/options/window_256.gin +4 -0
  33. aglib/meliad/transformer/configs/options/window_512.gin +4 -0
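
These are composable gin "option" files: each one overrides a handful of bindings on top of a base configuration. A minimal sketch of how they layer together, assuming the usual gin-config entry point (the file names below, including the base config, are illustrative; meliad's launch script does the real wiring):

    import gin

    # Later files override earlier ones, so options stack on the base config.
    gin.parse_config_files_and_bindings(
        config_files=[
            "base_htrans.gin",              # assumed base config
            "options/seq_1024.gin",         # 1024-token sequences, batch 4
            "options/positions_t5.gin",     # T5-style relative positions
            "options/lr_cosine_decay.gin",  # cosine LR schedule
        ],
        bindings=None,
    )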
aglib/meliad/transformer/configs/options/debug_mode.gin ADDED
@@ -0,0 +1,5 @@
+
+training_loop.Trainer:
+  replicate_mode = False
+  trace_debug_mode = True
+  print_variables = True
aglib/meliad/transformer/configs/options/enable_scan.gin ADDED
@@ -0,0 +1,5 @@
+
+DTYPE = "float32"  # Required when using scan for stability.
+
+transformer_layer.TransformerLayer:
+  max_unrolled_windows = 0  # Never unroll.
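
With scan enabled, successive attention windows are processed by jax.lax.scan instead of being unrolled, which is why max_unrolled_windows is pinned to 0. For intuition, a minimal stand-in for scanning a layer over windows (the step function here is hypothetical, not meliad's):

    import jax
    import jax.numpy as jnp

    def window_step(cache, window):
        # cache: state carried from the previous window; window: [512, 64] tokens.
        new_cache = cache + window.sum()  # stand-in for a real cache update
        return new_cache, window * 2.0    # stand-in for the layer's output

    windows = jnp.ones((8, 512, 64), dtype=jnp.float32)  # 8 windows of 512 tokens
    cache, outputs = jax.lax.scan(window_step, jnp.float32(0.0), windows)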
aglib/meliad/transformer/configs/options/external_memory_32k.gin ADDED
@@ -0,0 +1,17 @@
+
+include "memory_configuration.gin"
+
+from transformer import attention
+from transformer import memory_factory
+
+NUM_MEMORY_HEADS = %NUM_HEADS
+
+decoder_stack.DecoderStack:
+  memory_factory = @memory_factory.memory_on_tpu_factory
+  memory_layer_indices = (-4,)
+
+transformer_layer.TransformerLayer:
+  memory_num_neighbors = 128
+
+memory_factory.memory_on_tpu_factory:
+  database_size = 32768
aglib/meliad/transformer/configs/options/external_memory_8k.gin ADDED
@@ -0,0 +1,17 @@
+
+include "memory_configuration.gin"
+
+from transformer import attention
+from transformer import memory_factory
+
+NUM_MEMORY_HEADS = %NUM_HEADS
+
+decoder_stack.DecoderStack:
+  memory_factory = @memory_factory.memory_on_tpu_factory
+  memory_layer_indices = (-4,)
+
+transformer_layer.TransformerLayer:
+  memory_num_neighbors = 128
+
+memory_factory.memory_on_tpu_factory:
+  database_size = 8192
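
The two external-memory options are identical except for database_size (32768 vs. 8192 entries): both attach the memory to the fourth layer from the end (memory_layer_indices = (-4,), Python-style negative indexing) and retrieve 128 neighbors per query. Roughly, the lookup such a memory performs is a dense top-k search (a sketch, not memory_on_tpu_factory's actual implementation):

    import jax
    import jax.numpy as jnp

    def knn_lookup(queries, keys, values, num_neighbors=128):
        # queries: [q, d]; keys/values: [database_size, d]
        scores = queries @ keys.T                          # similarity to every entry
        top_scores, top_idx = jax.lax.top_k(scores, num_neighbors)
        return top_scores, values[top_idx]                 # [q, k] and [q, k, d]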
aglib/meliad/transformer/configs/options/final_mlp.gin ADDED
@@ -0,0 +1,14 @@
+
+from transformer import decoder_stack
+from transformer import nn_components
+
+decoder_stack.DecoderStack:
+  final_mlp_factory = @decoder_final_mlp/nn_components.MLP
+
+# Add a final MLP for token prediction after the last transformer layer.
+decoder_final_mlp/nn_components.MLP:
+  num_hidden_units = %MLP_DIM
+  num_layers = 2
+  activation_function = "relu"
+  use_bias = False
+  dtype = %DTYPE
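
This option inserts an extra two-layer ReLU MLP (no biases) between the last transformer layer and the logits. A rough flax.linen equivalent of what the binding configures (meliad's nn_components.MLP is the real implementation; the hidden width comes from %MLP_DIM, and the output-projection details here are assumptions):

    import flax.linen as nn

    class FinalMLP(nn.Module):
        num_hidden_units: int  # bound to %MLP_DIM in the gin file

        @nn.compact
        def __call__(self, x):
            x = nn.relu(nn.Dense(self.num_hidden_units, use_bias=False)(x))
            return nn.Dense(self.num_hidden_units, use_bias=False)(x)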
aglib/meliad/transformer/configs/options/lr_cosine_decay.gin ADDED
@@ -0,0 +1,13 @@
+
+import optimizer_config
+
+training_loop.Trainer:
+  learning_rate_schedule = @optimizer_config.lr_cosine_decay
+
+optimizer_config.lr_cosine_decay:
+  max_lr = 0.01
+  min_lr = 0.001
+  decay_after = True
+  spike_steps = 0
+  spike_lr = 0.0
+
aglib/meliad/transformer/configs/options/lr_cosine_decay_spike.gin ADDED
@@ -0,0 +1,13 @@
+
+import optimizer_config
+
+training_loop.Trainer:
+  learning_rate_schedule = @optimizer_config.lr_cosine_decay
+
+optimizer_config.lr_cosine_decay:
+  max_lr = 0.01
+  min_lr = 0.001
+  decay_after = True
+  spike_steps = 10_000
+  spike_lr = 0.04
+
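
lr_cosine_decay.gin and lr_cosine_decay_spike.gin configure the same schedule; the spike variant additionally holds the learning rate at 0.04 for the first 10,000 steps before the cosine takes over. One plausible reading of the schedule (the real optimizer_config.lr_cosine_decay may differ, e.g. in how decay_after and the spike interact):

    import jax.numpy as jnp

    def lr_cosine_decay(step, max_steps, max_lr=0.01, min_lr=0.001,
                        spike_steps=0, spike_lr=0.0):
        frac = jnp.clip(step / max_steps, 0.0, 1.0)         # progress in [0, 1]
        lr = min_lr + 0.5 * (max_lr - min_lr) * (1.0 + jnp.cos(jnp.pi * frac))
        return jnp.where(step < spike_steps, spike_lr, lr)  # optional initial spike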
aglib/meliad/transformer/configs/options/lr_linear_decay.gin ADDED
@@ -0,0 +1,10 @@
+
+import optimizer_config
+
+training_loop.Trainer:
+  learning_rate_schedule = @optimizer_config.lr_linear_decay
+
+optimizer_config.lr_linear_decay:
+  max_lr = 0.01
+  min_lr = 0.001
+  decay_after = True
aglib/meliad/transformer/configs/options/lr_max_steps_125k.gin ADDED
@@ -0,0 +1,3 @@
+
+training_loop.Trainer:
+  max_scheduled_steps = 125_000
aglib/meliad/transformer/configs/options/lr_max_steps_250k.gin ADDED
@@ -0,0 +1,3 @@
+
+training_loop.Trainer:
+  max_scheduled_steps = 250_000
aglib/meliad/transformer/configs/options/lr_max_steps_500k.gin ADDED
@@ -0,0 +1,3 @@
+
+training_loop.Trainer:
+  max_scheduled_steps = 500_000
aglib/meliad/transformer/configs/options/lr_rsqrt_decay.gin ADDED
@@ -0,0 +1,9 @@
+
+import optimizer_config
+
+training_loop.Trainer:
+  learning_rate_schedule = @optimizer_config.lr_rsqrt_decay
+
+optimizer_config.lr_rsqrt_decay:
+  max_lr = 0.05
+  min_lr = 0.001
aglib/meliad/transformer/configs/options/lr_rsqrt_decay_std.gin ADDED
@@ -0,0 +1,11 @@
+
+import optimizer_config
+
+# Implement standard rsqrt decay as used in the memorizing and block-recurrent
+# transformer papers, which does not decay to a specified minimum learning
+# rate over max_steps.
+training_loop.Trainer:
+  learning_rate_schedule = @optimizer_config.lr_rsqrt_decay_std
+
+optimizer_config.lr_rsqrt_decay_std:
+  max_lr = None
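
Unlike lr_rsqrt_decay.gin above, the _std variant reproduces the schedule from the Memorizing Transformers and Block-Recurrent Transformers papers: pure reciprocal-square-root decay with no floor at max_steps (hence max_lr = None). In its assumed form (warmup handling in meliad's optimizer_config may differ):

    import jax.numpy as jnp

    def lr_rsqrt_decay_std(step, warmup_steps=1000):
        # Constant during warmup, then decays as 1/sqrt(step) indefinitely.
        return 1.0 / jnp.sqrt(jnp.maximum(step, warmup_steps))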
aglib/meliad/transformer/configs/options/lr_scale_05.gin ADDED
@@ -0,0 +1,3 @@
+
+training_loop.Trainer:
+  learning_rate_multiplier = 0.5
aglib/meliad/transformer/configs/options/lr_scale_2.gin ADDED
@@ -0,0 +1,3 @@
+
+training_loop.Trainer:
+  learning_rate_multiplier = 2.0
aglib/meliad/transformer/configs/options/no_norm.gin ADDED
@@ -0,0 +1,3 @@
+
+transformer_base.TransformerBase:
+  normalize_keys = False
aglib/meliad/transformer/configs/options/positions_absolute.gin ADDED
@@ -0,0 +1,7 @@
+
+transformer_layer.TransformerLayer:
+  relative_position_type = None
+
+decoder_stack.DecoderStack:
+  use_absolute_positions = True
+
aglib/meliad/transformer/configs/options/positions_fourier.gin ADDED
@@ -0,0 +1,7 @@
+
+transformer_layer.TransformerLayer:
+  relative_position_type = "fourier"
+
+decoder_stack.DecoderStack:
+  use_absolute_positions = False
+
aglib/meliad/transformer/configs/options/positions_rotary.gin ADDED
@@ -0,0 +1,7 @@
+
+transformer_layer.TransformerLayer:
+  relative_position_type = "rotary"
+
+decoder_stack.DecoderStack:
+  use_absolute_positions = False
+
aglib/meliad/transformer/configs/options/positions_t5.gin ADDED
@@ -0,0 +1,7 @@
+
+transformer_layer.TransformerLayer:
+  relative_position_type = "t5"
+
+decoder_stack.DecoderStack:
+  use_absolute_positions = False
+
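
The four positions_*.gin files are mutually exclusive: pick one to select absolute, Fourier, rotary, or T5-style relative position handling. As a reminder of what the rotary option selects, a minimal rotary-embedding sketch (for intuition only; meliad's attention module has its own implementation):

    import jax.numpy as jnp

    def rotary(x, positions):
        # x: [seq, dim] with even dim; rotate feature pairs by position-scaled angles.
        dim = x.shape[-1]
        freqs = 1.0 / (10000.0 ** (jnp.arange(0, dim, 2) / dim))  # [dim/2]
        angles = positions[:, None] * freqs[None, :]              # [seq, dim/2]
        x1, x2 = x[:, 0::2], x[:, 1::2]
        rx1 = x1 * jnp.cos(angles) - x2 * jnp.sin(angles)
        rx2 = x1 * jnp.sin(angles) + x2 * jnp.cos(angles)
        return jnp.stack([rx1, rx2], axis=-1).reshape(x.shape)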
aglib/meliad/transformer/configs/options/seq_1024.gin ADDED
@@ -0,0 +1,7 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 1024
+  batch_size = 4
+
+transformer_layer.TransformerLayer:
+  use_long_xl_architecture = True
aglib/meliad/transformer/configs/options/seq_1024_nocache.gin ADDED
@@ -0,0 +1,8 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 1024
+  batch_size = 4
+
+transformer_layer.TransformerLayer:
+  window_length = 1024
+  use_long_xl_architecture = False
aglib/meliad/transformer/configs/options/seq_2048.gin ADDED
@@ -0,0 +1,7 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 2048
+  batch_size = 2
+
+transformer_layer.TransformerLayer:
+  use_long_xl_architecture = True
aglib/meliad/transformer/configs/options/seq_2048_nocache.gin ADDED
@@ -0,0 +1,8 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 2048
+  batch_size = 2
+
+transformer_layer.TransformerLayer:
+  window_length = 2048
+  use_long_xl_architecture = False
aglib/meliad/transformer/configs/options/seq_4096.gin ADDED
@@ -0,0 +1,7 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 4096
+  batch_size = 1
+
+transformer_layer.TransformerLayer:
+  use_long_xl_architecture = True
aglib/meliad/transformer/configs/options/seq_512.gin ADDED
@@ -0,0 +1,7 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 512
+  batch_size = 8
+
+transformer_layer.TransformerLayer:
+  use_long_xl_architecture = True
aglib/meliad/transformer/configs/options/seq_512_nocache.gin ADDED
@@ -0,0 +1,8 @@
+
+decoder_stack.TransformerTaskConfig:
+  sequence_length = 512
+  batch_size = 8
+
+transformer_layer.TransformerLayer:
+  window_length = 512
+  use_long_xl_architecture = False
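
Across the seq_*.gin options, batch size halves as sequence length doubles, so every configuration feeds the same 4096 tokens per batch; the _nocache variants additionally set window_length equal to sequence_length and disable the Transformer-XL cache, so attention is ordinary full attention within each sequence. A quick check of the invariant:

    # sequence_length * batch_size is constant across the seq options:
    for seq_len, batch in [(512, 8), (1024, 4), (2048, 2), (4096, 1)]:
        assert seq_len * batch == 4096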
aglib/meliad/transformer/configs/options/stack_window_512.gin ADDED
@@ -0,0 +1,6 @@
+
+transformer_layer.TransformerLayer:
+  window_length = 512
+
+decoder_stack.DecoderStack:
+  dstack_window_length = 512
aglib/meliad/transformer/configs/options/window_1024.gin ADDED
@@ -0,0 +1,4 @@
+
+# Sequence length must be larger than window_length.
+transformer_layer.TransformerLayer:
+  window_length = 1024
aglib/meliad/transformer/configs/options/window_128.gin ADDED
@@ -0,0 +1,4 @@
+
+# Sequence length must be larger than window_length.
+transformer_layer.TransformerLayer:
+  window_length = 128
aglib/meliad/transformer/configs/options/window_2048.gin ADDED
@@ -0,0 +1,4 @@
+
+# Sequence length must be larger than window_length.
+transformer_layer.TransformerLayer:
+  window_length = 2048
aglib/meliad/transformer/configs/options/window_256.gin ADDED
@@ -0,0 +1,4 @@
+
+# Sequence length must be larger than window_length.
+transformer_layer.TransformerLayer:
+  window_length = 256
aglib/meliad/transformer/configs/options/window_512.gin ADDED
@@ -0,0 +1,4 @@
+
+# Sequence length must be larger than window_length.
+transformer_layer.TransformerLayer:
+  window_length = 512
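
The window_*.gin options set only the attention window; as each file's comment notes, they are meant to be combined with a seq_*.gin option whose sequence length exceeds the window. What such a window implies, sketched as a causal sliding-window mask (illustrative; not meliad's attention code):

    import jax.numpy as jnp

    def sliding_window_mask(seq_len, window_length):
        i = jnp.arange(seq_len)[:, None]            # query positions
        j = jnp.arange(seq_len)[None, :]            # key positions
        return (j <= i) & (j > i - window_length)   # causal, within the window

    mask = sliding_window_mask(seq_len=2048, window_length=512)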