Spaces:
Runtime error
Runtime error
| import optimizer_config | |
| import training_loop | |
| from transformer import models | |
| from transformer import text_dataset | |
| # Training setup. | |
| training_loop.Trainer: | |
| model_definition = @models.DecoderOnlyLanguageModel | |
| num_steps = 250_000 | |
| status_every_steps = 10 | |
| log_every_steps = 1000 | |
| test_every_steps = 1000 | |
| num_test_steps = 400 | |
| generate_every_steps = 5000 | |
| print_input_every_steps = 5000 | |
| checkpoint_every_steps = 5000 | |
| save_checkpoints = True | |
| restore_checkpoints = True | |
| use_separate_metric_directories = False | |
| optimizer_factory = @optimizer_config.FlaxAdafactorConfig() | |
| learning_rate_schedule = @optimizer_config.lr_cosine_decay | |
| max_scheduled_steps = 0 # Use num_steps as max_scheduled_steps. | |
| warmup_steps = 1000 | |
| learning_rate_multiplier = 1.0 | |
| rng_key_names = ("dropout", "sample") | |
| text_dataset.load_text_dataset: | |
| verbose = False # if true, prints the start of every book/repo read from disk | |
| # Use cosine decay to max_scheduled_steps, as described in Chinchilla: | |
| # https://arxiv.org/abs/2203.15556 | |
| optimizer_config.lr_cosine_decay: | |
| max_lr = 0.01 | |
| min_lr = 0.001 | |
| decay_after = True | |
| spike_steps = 0 | |
| spike_lr = 0.0 | |
| # Adam optimizer configuration. | |
| # optimizer_config.AdamConfig: | |
| # learning_rate = 0.05 # Will be multiplied by the LR schedule. | |
| # beta1 = 0.9 | |
| # beta2 = 0.98 | |
| # weight_decay_rate = 0.0 | |
| # Adafactor optimizer configuration. | |
| optimizer_config.FlaxAdafactorConfig: | |
| learning_rate = 1.0 # Will be multiplied by the LR schedule. | |
| beta1 = 0.9 # Can be "None". | |