Upload 5 files
- aglib/meliad/metrics_summary.py +309 -0
- aglib/meliad/optimizer_config.py +281 -0
- aglib/meliad/requirements.txt +11 -0
- aglib/meliad/training_loop.py +757 -0
- aglib/meliad/training_task.py +216 -0
aglib/meliad/metrics_summary.py
ADDED
@@ -0,0 +1,309 @@
# Copyright 2022 Google.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Class to handle summarizing of metrics over multiple training steps."""

import abc
from typing import Any, Dict, Mapping, Optional, Tuple, Union

from absl import logging
from clu import metric_writers
import gin
import jax
from jax import numpy as jnp
import numpy as np


Array = Union[jnp.ndarray, np.ndarray]


class Aggregator(abc.ABC):  # Superclass for type checks.

  @abc.abstractmethod
  def add(self, value: Any):
    pass

  @abc.abstractmethod
  def is_valid(self) -> bool:
    pass

  @abc.abstractmethod
  def to_value(self):
    pass


class _MeanAggregator(Aggregator):
  """Maintains the mean of incoming values."""
  mean: float = 0.0
  weight: float = 0.0

  def add(self, new_value: Any):
    """Aggregates a new value into the mean."""
    if np.ndim(new_value) == 0:  # Is a scalar; works with int, float, Array.
      val, weight = new_value, 1.0  # Assume weight 1 by default.
    else:
      val, weight = new_value
    if weight < 0.0:
      raise ValueError("Adding value with negative weight.")
    total_weight = self.weight + weight
    if total_weight != 0.0 and weight > 0.0:
      delta = (val - self.mean) * weight / total_weight
      self.mean += delta
      self.weight = total_weight

  def is_valid(self) -> bool:
    return self.weight > 0.0

  def to_value(self):
    assert self.weight > 0.0
    return self.mean


class _SumAggregator(_MeanAggregator):
  # We aggregate sum and mean in the same way, as a tuple of the form
  # (weighted mean, total weight). "sum" can then be computed by
  # multiplying the two values.

  def is_valid(self) -> bool:
    return True

  def to_value(self):
    return self.mean * self.weight


class _LastAggregator(Aggregator):
  """Remembers the last value given."""
  last_value: Optional[float] = None

  def add(self, new_value: Any):
    self.last_value = new_value

  def is_valid(self) -> bool:
    return self.last_value is not None

  def to_value(self):
    assert self.last_value is not None
    return self.last_value


@gin.configurable
class MetricsSummary:
  """Summarizes a set of metrics over multiple training steps."""

  def __init__(self,
               metric_types: Mapping[str, str],
               upscale_images: bool = True,
               remove_outliers: bool = False):
    """Creates a MetricsSummary.

    Args:
      metric_types: Map from metrics to the type of summary. Types are:
        "mean" = Compute the cumulative moving average.
        "sum" = Compute the sum.
        "last" = No summary, just return the last value.
      upscale_images: Upscale small images for easier viewing.
      remove_outliers: Remove outliers from histograms.
    """
    self.metric_dict = {}  # type: Dict[str, Aggregator]
    self.text_dict = {}
    self.metric_types = metric_types
    self.upscale_images = upscale_images
    self.remove_outliers = remove_outliers
    self.constructor_map = {
        "mean": _MeanAggregator,
        "sum": _SumAggregator,
        "last": _LastAggregator,
    }
    logging.debug("Registered metrics: %r", metric_types)

  def current_metric_dict(self) -> Mapping[str, Aggregator]:
    return self.metric_dict

  def _is_image(self, image: Array) -> bool:
    if image.ndim != 4:
      return False
    # Greyscale or RGB image.
    return image.shape[-1] == 1 or image.shape[-1] == 3

  def _upscale_image(self, image: Array) -> Array:
    """Upscale small images to more pixels, for easier viewing."""
    if not self.upscale_images:
      return image
    assert image.ndim == 4  # (num_images, ysize, xsize, num_channels)
    ys = image.shape[1]
    xs = image.shape[2]
    if xs > 512 or ys > 512:
      return image  # No scaling.
    elif xs > 256 or ys > 256:
      scale = 2
    else:
      scale = 4
    yidx = np.arange(ys * scale) // scale
    xidx = np.arange(xs * scale) // scale
    scaled_image = image[:, yidx, :, :][:, :, xidx, :]
    return scaled_image

  def _remove_outliers(self, v, std_range: float = 4):
    if not self.remove_outliers:
      return v
    v_mean = np.mean(v)
    v_std = np.std(v)
    return np.where(np.abs(v) > (v_std * std_range), v_mean, v)

  @staticmethod
  def merge_replicated_metrics(device_metrics: Mapping[str, Any],
                               metric_types: Mapping[str, str]):
    """Merge metrics across devices by psum over the "batch" axis.

    Args:
      device_metrics: Dictionary of device metrics.
      metric_types: Map from the metric name to { "mean", "sum" }.

    Returns:
      A dictionary of metrics.
    """
    logging.info("Merging metrics across devices %r: ",
                 [(k, metric_types[k] if k in metric_types else None)
                  for k in device_metrics.keys()])

    def aggregate_sum(value: Array) -> Array:
      assert not isinstance(value, tuple), (
          "Weighted sums are not supported when aggregating over devices.")
      return jax.lax.psum(value, axis_name="batch")

    def aggregate_mean(value: Array, weight: Array) -> Tuple[Array, Array]:
      weighted_value = value * weight
      weighted_value = jax.lax.psum(weighted_value, axis_name="batch")
      weight = jax.lax.psum(weight, axis_name="batch")
      return weighted_value / (weight + 1.0e-6), weight

    aggregated_metrics = dict(device_metrics)
    for k, value in aggregated_metrics.items():
      if k not in metric_types:
        # If no metric type is given, the metric remains untouched.
        continue
      if metric_types[k] == "sum":
        aggregated_metrics[k] = aggregate_sum(value)
      elif metric_types[k] == "mean":
        if not isinstance(aggregated_metrics[k], tuple):
          logging.info("Metric '%s' has no weight; assuming 1.0.", k)
          value = (value, jnp.array(1.0))
        aggregated_metrics[k] = aggregate_mean(*value)
      else:
        raise ValueError("Can only aggregate 'sum' and 'mean' over devices. "
                         f"Got {metric_types[k]}.")
    return aggregated_metrics

  def _new_aggregator(self, key) -> Aggregator:
    if key in self.metric_types:
      return self.constructor_map[self.metric_types[key]]()
    else:
      # TODO(mrabe): The default to last_value is not obvious. Force all metric
      # types to be given explicitly.
      logging.debug("No metric type for accumulator: %s", key)
      return _LastAggregator()

  def add(self, metrics: Mapping[str, Any]):
    """Add metrics from the current training step to the summary.

    Args:
      metrics: Dictionary of metrics.
    """
    for k, new_value in metrics.items():
      if k not in self.metric_dict:
        self.metric_dict[k] = self._new_aggregator(k)
      self.metric_dict[k].add(new_value)

  def add_text(self, text_metrics: Mapping[str, str]):
    """Add text metrics from the current step to the summary."""
    for (k, v) in text_metrics.items():
      self.text_dict[k] = str(v)

  def empty(self):
    """Return true if there are no summaries to write."""
    return not (self.metric_dict or self.text_dict)

  def clear(self):
    """Clear accumulated summaries."""
    self.metric_dict = {}
    self.text_dict = {}

  def write(self, writer: metric_writers.MetricWriter, step: int, prefix: str):
    """Write metrics using summary_writer, and clear all summaries."""
    if self.empty():
      return

    # Special logic for organizing metrics under tensorboard.
    # Tensorboard has top-level groups, but doesn't have subgroups.
    # Scalars are put into separate top-level groups for easier viewing,
    # e.g. all scalars in "train", "test", etc.
    # For images, each set of images should be a different top-level group,
    # otherwise all images will get tossed into a single group under,
    # e.g. "generate".
    if prefix:
      s_prefix = prefix + "/"
      i_prefix = prefix + "_"
    else:
      # Each prefix is stored in a separate subdirectory already.
      s_prefix = ""
      i_prefix = ""

    # Split metrics into different types.
    scalars = {}
    images = {}
    histograms = {}
    text_dict = {}

    # Sort metrics into scalars, images, text, and histograms.
    for k, aggregator in self.metric_dict.items():
      if not isinstance(aggregator, Aggregator):
        raise ValueError("Internal error: metric_dict should contain only "
                         "_Aggregator objects; contained %s" % aggregator)
      if not aggregator.is_valid():
        raise ValueError(f"No valid value for metric {k}.")

      v = aggregator.to_value()

      s_key = s_prefix + k
      i_key = i_prefix + k

      finite_mask = np.isfinite(v)
      if not np.all(finite_mask):
        logging.warning("Item %s contains non-finite elements.", k)
        v = np.where(finite_mask, v, np.zeros_like(v))
      if v is None:
        logging.warning("Invalid value for %s", k)
      elif np.ndim(v) == 0:
        scalars[s_key] = v
      elif self._is_image(v):
        images[i_key] = self._upscale_image(v)
      else:
        histograms[s_key] = self._remove_outliers(v)

    # Handle text data.
    for (k, v) in self.text_dict.items():
      s_key = s_prefix + k
      text_dict[s_key] = v

    # Write metrics.
    if scalars:
      writer.write_scalars(step, scalars)
    if images:
      writer.write_images(step, images)
    if histograms:
      writer.write_histograms(step, histograms)
    if text_dict:
      writer.write_texts(step, text_dict)

    # Clear accumulated summaries.
    self.clear()
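
A minimal usage sketch for MetricsSummary (the metric names, values, weight of
32.0, and the ./logs directory below are illustrative assumptions, not part of
this commit): accumulate per-step metrics, then flush them to a clu metric
writer.

# Sketch: accumulate metrics over several steps, then write and clear them.
from clu import metric_writers

import metrics_summary

summary = metrics_summary.MetricsSummary({"loss": "mean", "tokens": "sum"})
for step_loss in (2.0, 1.5, 1.0):
  # "mean" metrics accept either a scalar or a (value, weight) tuple.
  summary.add({"loss": (step_loss, 32.0), "tokens": 1024})

writer = metric_writers.create_default_writer("./logs")  # Assumed log dir.
summary.write(writer, step=100, prefix="train")  # Writes scalars and clears.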
aglib/meliad/optimizer_config.py
ADDED
@@ -0,0 +1,281 @@
# Copyright 2022 Google.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Gin configurable optimizer definitions."""

from typing import Any, Optional

from absl import logging
from flax import optim
from flax import struct
import gin
import jax.numpy as jnp
import numpy as np


OptimizerDef = Any


@struct.dataclass
class OptimizerConfig:
  """Base class for optimizer configurations."""

  learning_rate: float = 0.01  # All optimizers have a learning rate.

  def create_optimizer_def(self) -> OptimizerDef:
    raise ValueError("Not implemented.")


@gin.configurable
@struct.dataclass
class AdamConfig(OptimizerConfig):
  """Creates and configures the Adam optimizer."""

  # Adam does not use parameter scale, and thus requires a smaller lrate.
  # This will be multiplied by the learning rate schedule.
  learning_rate: float = 0.05

  beta1: float = 0.9  # For moving average of gradient.
  beta2: float = 0.98  # For moving average of gradient magnitude.
  weight_decay_rate: float = 0.0  # Relative to learning rate.

  def create_optimizer_def(self) -> optim.OptimizerDef:
    logging.info("Using Adam Optimizer. lr=%f, b1=%f, b2=%f",
                 self.learning_rate, self.beta1, self.beta2)
    return optim.Adam(beta1=self.beta1,
                      beta2=self.beta2,
                      weight_decay=self.weight_decay_rate)


@gin.configurable
@struct.dataclass
class FlaxAdafactorConfig(OptimizerConfig):
  """Creates and configures the Adafactor optimizer."""

  # Adafactor scales gradients according to parameter scale.
  # This will be multiplied by the learning rate schedule.
  learning_rate: float = 1.0
  beta1: Optional[float] = 0.9  # Enables momentum with extra memory cost.

  def create_optimizer_def(self) -> optim.OptimizerDef:
    # Use wd_lr_exponent to get weight_decay relative to learning rate.
    logging.info("Using Flax Adafactor Optimizer. lr=%f, b1=%f",
                 self.learning_rate, self.beta1)
    return optim.Adafactor(beta1=self.beta1)


# ----------------------------------------------------------------------------
# Learning rate schedules for use with any optimizer.
#
# In keeping with the Chinchilla model (https://arxiv.org/abs/2203.15556),
# a learning rate schedule is a function that decays the learning rate from
# step zero to max_steps. The desired maximum number of steps must be set at
# the start of training.
# ----------------------------------------------------------------------------


@gin.configurable
def lr_constant(step: jnp.ndarray, max_steps: int,
                learning_rate: float = 0.01) -> jnp.ndarray:
  """Returns constant_lr on each step.

  Args:
    step: The current training step (unused).
    max_steps: Unused.
    learning_rate: The constant learning rate to use.

  Returns:
    The learning rate for the current step.
  """
  del step
  del max_steps
  return jnp.asarray(learning_rate, dtype=jnp.float32)


@gin.configurable
def lr_rsqrt_decay_std(step: jnp.ndarray, max_steps: int,
                       max_lr: Optional[float] = None) -> jnp.ndarray:
  """Inverse square root decay function: LR = 1/sqrt(step).

  Provided for compatibility. No min_lr, and it ignores max_steps.
  Should be used with warmup: pass step = max(step, warmup_steps).
  Maximum learning rate is 1/sqrt(warmup_steps) ~= 0.03 for 1000 warmup steps.

  Args:
    step: The current training step.
    max_steps: Unused.
    max_lr: If specified, learning rate will be clipped to the maximum value.

  Returns:
    The learning rate for the current step.
  """
  # This function implements standard rsqrt decay as used in the memorizing
  # and block-recurrent transformer papers (https://arxiv.org/abs/2203.08913,
  # https://arxiv.org/abs/2203.07852), which does not decay to a specified
  # minimum learning rate over max_steps.
  del max_steps

  # Avoid divide by zero; force at least 100 warmup steps and a max LR of 0.1.
  step = jnp.maximum(step, 100.0)
  lrate = 1.0 / jnp.sqrt(step)
  if max_lr is not None:
    lrate = jnp.minimum(lrate, max_lr)  # Clip to max_lr.
  return lrate


@gin.configurable
def lr_rsqrt_decay(step: jnp.ndarray, max_steps: int,
                   max_lr: float = 0.05,
                   min_lr: float = 0.001) -> jnp.ndarray:
  """Inverse sqrt decay from max_lr to min_lr over max_steps.

  This function implements rsqrt decay, but adjusts the decay rate so that
  min_lr is reached at max_steps.

  Note: with a warmup period, the maximum LR produced by the schedule is
  min_lr / sqrt(warmup_steps / max_steps), which may be less than max_lr.
  E.g. if min_lr is 0.001, then the maximum LR will be 0.01 for
  warmup_steps=1000 and max_steps=100_000.

  Args:
    step: The current training step.
    max_steps: The step value at the end of training.
    max_lr: LR will be clipped to max at the start of training.
    min_lr: LR to output at max_steps.

  Returns:
    The learning rate for the current step.
  """
  assert max_lr > min_lr

  # Avoid divide by zero; force at least 100 warmup steps and a max LR of 0.1.
  step = jnp.maximum(step, 100.0)
  lrate = min_lr / jnp.sqrt(step / float(max_steps))
  lrate = jnp.minimum(lrate, max_lr)  # Clip to max_lr.
  return lrate


@gin.configurable
def lr_exponential_decay(step: jnp.ndarray, max_steps: int,
                         max_lr: float = 0.01,
                         min_lr: float = 0.001) -> jnp.ndarray:
  """Exponential decay from max_lr to min_lr over max_steps.

  Continues to decay at the same rate after max_steps.

  Args:
    step: The current training step.
    max_steps: The step value at the end of training.
    max_lr: LR to output at step 0.
    min_lr: LR to output at max_steps.

  Returns:
    The learning rate for the current step.
  """
  assert max_lr > min_lr

  lrate = max_lr * jnp.power(min_lr / max_lr, step / float(max_steps))
  return lrate


@gin.configurable
def lr_linear_decay(step: jnp.ndarray, max_steps: int,
                    max_lr: float = 0.01,
                    min_lr: float = 0.001,
                    decay_after: bool = True) -> jnp.ndarray:
  """Linear decay from max_lr to min_lr over max_steps.

  If decay_after, then LR will continue to decay exponentially by a factor
  of 2 every max_steps after the linear decay.

  Args:
    step: The current training step.
    max_steps: The step value at the end of training.
    max_lr: LR to output at step 0.
    min_lr: LR to output at max_steps.
    decay_after: If true, do exponential decay after the linear decay,
      by a factor of 2 every max_steps.

  Returns:
    The learning rate for the current step.
  """
  assert max_lr > min_lr

  lrate = min_lr + (max_lr - min_lr) * ((max_steps - step) / max_steps)
  lrate = jnp.maximum(lrate, min_lr)

  if decay_after:
    exp_lrate = lr_exponential_decay(step, max_steps,
                                     max_lr=2 * min_lr, min_lr=min_lr)
    lrate = jnp.where(step < max_steps, lrate, exp_lrate)

  return lrate


@gin.configurable
def lr_cosine_decay(step: jnp.ndarray, max_steps: int,
                    max_lr: float = 0.01,
                    min_lr: float = 0.001,
                    decay_after: bool = True,
                    spike_steps: int = 0,
                    spike_lr: float = 0.0) -> jnp.ndarray:
  """Cosine decay function from max_lr to min_lr over max_steps.

  Used in the Chinchilla model: https://arxiv.org/abs/2203.15556.

  If decay_after, then LR will continue to decay exponentially by a factor
  of 2 every max_steps after the original ramp.

  If spike_steps > 0, there will be an initial linear decay from spike_lr
  down to max_lr over the first spike_steps steps. This implements a brief
  period of higher LR early in training, similar to the curve for rsqrt_decay.
  The model can generally tolerate a high LR early in training, and make a
  lot of progress very quickly. Try spike_steps=10_000, spike_lr=0.04.

  Args:
    step: The current training step.
    max_steps: The number of training steps to decay over.
    max_lr: The maximum learning rate at the start of training.
    min_lr: The minimum learning rate at the end of training.
    decay_after: If true, do exponential decay after the cosine decay,
      by a factor of 2 every max_steps.
    spike_steps: The number of steps for the initial spike.
    spike_lr: The maximum LR during the initial spike.

  Returns:
    The learning rate for the current step.
  """
  assert max_lr > min_lr

  pi = float(np.pi)
  step_ramp = jnp.minimum(step, max_steps) / max_steps  # Ramp: 0 to 1.0.

  lrate = (1 + jnp.cos(pi * step_ramp)) * 0.5  # Ranges from 1 to 0.
  lrate = min_lr + lrate * (max_lr - min_lr)

  if spike_steps > 0 and spike_lr > 0.0:
    assert spike_lr > max_lr
    spike_lrate = spike_lr * ((spike_steps - step) / spike_steps)
    lrate = jnp.maximum(lrate, spike_lrate)

  if decay_after:
    exp_lrate = lr_exponential_decay(step, max_steps,
                                     max_lr=2 * min_lr, min_lr=min_lr)
    lrate = jnp.where(step < max_steps, lrate, exp_lrate)

  return lrate
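
As a quick sanity check of the schedule shapes (the step values below are
arbitrary; the expected numbers follow from the defaults max_lr=0.01,
min_lr=0.001, decay_after=True):

# Sketch: evaluate lr_cosine_decay at a few steps with its default arguments.
import jax.numpy as jnp

import optimizer_config

for step in (0, 50_000, 100_000, 200_000):
  lr = optimizer_config.lr_cosine_decay(
      jnp.asarray(step, dtype=jnp.float32), max_steps=100_000)
  print(step, float(lr))
# Expected: 0.01 at step 0, 0.0055 at the halfway point, 0.001 at max_steps,
# then halving every max_steps thereafter (0.0005 at step 200_000).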
aglib/meliad/requirements.txt
ADDED
@@ -0,0 +1,11 @@
absl-py>=1.0.0
clu>=0.0.7
gin-config>=0.5.0
flax>=0.5.0
jax>=0.3.13
optax>=0.1.2
numpy>=1.22.4
sentencepiece>=0.1.96
seqio>=0.0.7
tensorflow>=2.9.1
tensorflow-datasets>=4.5.2
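
With a standard pip workflow, these pins would be installed from the
repository root with "pip install -r aglib/meliad/requirements.txt". Note that
the flax.optim API used by optimizer_config.py and training_loop.py was
removed in later flax releases, so a flax 0.5.x install is the safest match
for this pin.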
aglib/meliad/training_loop.py
ADDED
@@ -0,0 +1,757 @@
| 1 |
+
# Copyright 2022 Google.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""Generic JAX training loop for experiments."""
|
| 16 |
+
|
| 17 |
+
import functools
|
| 18 |
+
import os
|
| 19 |
+
from typing import (Any, Callable, Dict, Optional, Sequence, Tuple)
|
| 20 |
+
|
| 21 |
+
from absl import logging
|
| 22 |
+
from clu import metric_writers
|
| 23 |
+
import flax
|
| 24 |
+
from flax import jax_utils
|
| 25 |
+
from flax import linen as nn
|
| 26 |
+
from flax import struct
|
| 27 |
+
from flax.training import checkpoints
|
| 28 |
+
import gin
|
| 29 |
+
import jax
|
| 30 |
+
import jax.numpy as jnp
|
| 31 |
+
import metrics_summary
|
| 32 |
+
import optimizer_config as opt_config
|
| 33 |
+
import training_task
|
| 34 |
+
import numpy as np
|
| 35 |
+
import tensorflow.compat.v2 as tf
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
PRNGKeys = training_task.PRNGKeys
|
| 39 |
+
TrainState = training_task.TrainState
|
| 40 |
+
TrainingTask = training_task.TrainingTask
|
| 41 |
+
StepFunction = training_task.StepFunction
|
| 42 |
+
Metrics = training_task.Metrics
|
| 43 |
+
MetricWriter = metric_writers.MetricWriter
|
| 44 |
+
MetricsSummary = metrics_summary.MetricsSummary
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
gfile = tf.io.gfile
|
| 48 |
+
unfreeze = flax.core.unfreeze
|
| 49 |
+
flatten_dict = flax.traverse_util.flatten_dict
|
| 50 |
+
should_run = training_task.should_run
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# TODO(cstaats): Use a Protocol to specify that it must be possible to call
|
| 54 |
+
# the function with parameters (step: int, mode: str). This won't be feasible
|
| 55 |
+
# until we start using Python 3.8 or later.
|
| 56 |
+
StepModeCallable = Callable[..., None]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# This variable should *only* be set from register_interstep_callbacks.
|
| 60 |
+
_interstep_callbacks: Optional[Tuple[StepModeCallable, ...]] = None
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@gin.configurable
|
| 64 |
+
def register_interstep_callbacks(**kwargs: StepModeCallable) -> None:
|
| 65 |
+
"""Populates _interstep_callbacks from gin.
|
| 66 |
+
|
| 67 |
+
This function should be called exactly ONCE and that call should happen AFTER
|
| 68 |
+
flag initialization (and more specifically, after gin parsing). And the caller
|
| 69 |
+
should NOT specify any arguments.
|
| 70 |
+
|
| 71 |
+
In gin configurations, a callback can be specified with an arbitrary name
|
| 72 |
+
like so:
|
| 73 |
+
|
| 74 |
+
register_interstep_callbacks.my_callback_name = @my_callback_function
|
| 75 |
+
|
| 76 |
+
Multiple callbacks can be registered without overriding each other as long as
|
| 77 |
+
they all have different names. Conversely, if you *want* to override a
|
| 78 |
+
callback, you need to give that callback the same name.
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
**kwargs: Specified by gin. Each argument should be a function (callable)
|
| 82 |
+
that can be called as my_function(step, mode), where step is an int and
|
| 83 |
+
mode is a str.
|
| 84 |
+
|
| 85 |
+
Raises:
|
| 86 |
+
ValueError: Raised on the second (and any subsequent) function call.
|
| 87 |
+
"""
|
| 88 |
+
global _interstep_callbacks
|
| 89 |
+
logging.info("registering functions: %s", kwargs.keys())
|
| 90 |
+
if _interstep_callbacks is not None:
|
| 91 |
+
raise ValueError("register_interstep_callbacks may only be called once.")
|
| 92 |
+
_interstep_callbacks = tuple(kwargs.values())
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def clear_interstep_callbacks():
|
| 96 |
+
"""Clear all registered callbacks, so that new ones can be registered."""
|
| 97 |
+
global _interstep_callbacks
|
| 98 |
+
_interstep_callbacks = None
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def run_interstep_callbacks(mode: str, step: int, sub_step: int = 0):
|
| 102 |
+
"""Run the registered callbacks.
|
| 103 |
+
|
| 104 |
+
Args:
|
| 105 |
+
mode: mode of the task to execute callbacks for.
|
| 106 |
+
step: training step number.
|
| 107 |
+
sub_step: For tasks that execute multiple iterations within a step.
|
| 108 |
+
E.g. a test cycle that runs multiple testing steps.
|
| 109 |
+
"""
|
| 110 |
+
for func in _interstep_callbacks:
|
| 111 |
+
func(sub_step or step, mode)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@gin.configurable
|
| 115 |
+
@struct.dataclass
|
| 116 |
+
class Trainer:
|
| 117 |
+
"""Implements a JAX training loop."""
|
| 118 |
+
|
| 119 |
+
# Returns a Flax module for the model.
|
| 120 |
+
# Takes a single argument mode, which can be "test", "train", or "generate".
|
| 121 |
+
model_definition: Any = gin.REQUIRED
|
| 122 |
+
|
| 123 |
+
# Iterator over trainining data.
|
| 124 |
+
get_training_dataset_iterator: Callable[[], Any] = gin.REQUIRED
|
| 125 |
+
|
| 126 |
+
# Iterator over test data.
|
| 127 |
+
get_test_dataset_iterator: Optional[Callable[[], Any]] = None
|
| 128 |
+
|
| 129 |
+
workdir: str = "" # Working directory for checkpoints.
|
| 130 |
+
load_dir: str = "" # Optional directory to load model.
|
| 131 |
+
num_steps: int = 100000 # Number of steps to train.
|
| 132 |
+
status_every_steps: int = 10 # Log step number every N steps.
|
| 133 |
+
log_every_steps: int = 100 # Log scalar data every N steps.
|
| 134 |
+
test_every_steps: int = 10 # Test model every N steps.
|
| 135 |
+
num_test_steps: int = 1 # Number of iterations to test.
|
| 136 |
+
generate_every_steps: int = 1000 # Generate examples every N steps.
|
| 137 |
+
print_input_every_steps: int = 1000 # Print example data every N steps.
|
| 138 |
+
|
| 139 |
+
save_checkpoints: bool = True # Save training checkpoints
|
| 140 |
+
checkpoint_every_steps: int = 5000 # Save checkpoints every N steps.
|
| 141 |
+
restore_checkpoints: bool = True # Restore from previous checkpoint.
|
| 142 |
+
restore_state_variables: bool = True # Restore TrainState.state from chkpt.
|
| 143 |
+
|
| 144 |
+
# Record metrics for "train", "test", etc. in separate directories.
|
| 145 |
+
# Otherwise they will be saved with separate prefixes.
|
| 146 |
+
use_separate_metric_directories: bool = True
|
| 147 |
+
|
| 148 |
+
# Optimizer options.
|
| 149 |
+
optimizer_factory: opt_config.OptimizerConfig = gin.REQUIRED
|
| 150 |
+
learning_rate_schedule: Callable[[jnp.ndarray, int], jnp.ndarray] = (
|
| 151 |
+
opt_config.lr_cosine_decay)
|
| 152 |
+
|
| 153 |
+
# Maximum steps for the LR schedule. Zero means use num_steps.
|
| 154 |
+
max_scheduled_steps: int = 0
|
| 155 |
+
warmup_steps: int = 1000 # Number of warmup steps.
|
| 156 |
+
learning_rate_multiplier: float = 1.0 # Used to scale the learning rate.
|
| 157 |
+
|
| 158 |
+
random_seed: int = 42 # Initial random seed.
|
| 159 |
+
|
| 160 |
+
# Names of random number generators used by the model.
|
| 161 |
+
rng_key_names: Optional[Sequence[str]] = ("dropout",)
|
| 162 |
+
|
| 163 |
+
# Debug options.
|
| 164 |
+
replicate_mode: bool = True # pmap over multiple replicas.
|
| 165 |
+
trace_debug_mode: bool = False # Run in eager mode to trace results.
|
| 166 |
+
print_variables: bool = False # Dump parameters/variables to stdout.
|
| 167 |
+
|
| 168 |
+
# Function to compute additional summary information.
|
| 169 |
+
# Takes a MetricsSummary object and a mode string (e.g. "test") as arguments,
|
| 170 |
+
# returns a MetricsSummary object.
|
| 171 |
+
process_summaries_function: Optional[Callable[[Any, str], Any]] = None
|
| 172 |
+
|
| 173 |
+
# Function to pretty print the input for each training step.
|
| 174 |
+
pretty_print_input_function: Optional[Callable[[Any], Any]] = None
|
| 175 |
+
|
| 176 |
+
# Classes to use for summarizing metrics.
|
| 177 |
+
metrics_summary_factory: Any = metrics_summary.MetricsSummary
|
| 178 |
+
extra_summaries_fn: training_task.ExtraSummariesFunction = (
|
| 179 |
+
lambda mode, step: dict())
|
| 180 |
+
|
| 181 |
+
post_save_checkpoint_fn: Callable[[str, int], None] = lambda mode, step: None
|
| 182 |
+
post_load_checkpoint_fn: Callable[[str, int], None] = lambda mode, step: None
|
| 183 |
+
|
| 184 |
+
def learning_rate_schedule_fn(self, step):
|
| 185 |
+
"""Returns the learning rate for the given step."""
|
| 186 |
+
|
| 187 |
+
# There are four components to the learning rate.
|
| 188 |
+
#
|
| 189 |
+
# The base_lrate is defined by the optimizer, and different optimizers have
|
| 190 |
+
# different relative rates, e.g. Adafactor requires a higher LR than Adam.
|
| 191 |
+
# By default, the base_lrate is 1.0 for Adafactor.
|
| 192 |
+
#
|
| 193 |
+
# The base_lrate is then multiplied by the learning rate decay schedule,
|
| 194 |
+
# which typically starts at a maximum value and decays over time.
|
| 195 |
+
# Each schedule can be individually configured, e.g. from 0.01 to 0.001.
|
| 196 |
+
# The max_scheduled_steps parameter controls the decay rate of the schedule.
|
| 197 |
+
#
|
| 198 |
+
# Finally, the LR is scaled by the learning_rate_multiplier, which provides
|
| 199 |
+
# an easy way to scale the LR for hyperparameter tuning in a way that is
|
| 200 |
+
# independent of the choice of schedule or optimizer. The default is 1.0.
|
| 201 |
+
#
|
| 202 |
+
# During the warmp period, the learning rate ramps up linearly from zero.
|
| 203 |
+
|
| 204 |
+
step = jnp.asarray(step, dtype=jnp.float32)
|
| 205 |
+
if self.max_scheduled_steps == 0:
|
| 206 |
+
max_steps = self.num_steps
|
| 207 |
+
else:
|
| 208 |
+
max_steps = self.max_scheduled_steps
|
| 209 |
+
|
| 210 |
+
base_lrate = float(self.optimizer_factory.learning_rate)
|
| 211 |
+
lr_multiplier = float(self.learning_rate_multiplier)
|
| 212 |
+
|
| 213 |
+
# Linear increase in learning rate up to warmup_steps.
|
| 214 |
+
warmup_steps = float(self.warmup_steps)
|
| 215 |
+
lr_warmup_ramp = jnp.minimum(step, warmup_steps) / warmup_steps
|
| 216 |
+
|
| 217 |
+
# Hold step at a constant value during the warmup period.
|
| 218 |
+
# Required for some schedules, like rsqrt_decay.
|
| 219 |
+
step = jnp.maximum(step, warmup_steps)
|
| 220 |
+
|
| 221 |
+
# Get the scheduled learning rate.
|
| 222 |
+
lrate = self.learning_rate_schedule(step, max_steps)
|
| 223 |
+
|
| 224 |
+
# Multiply lrate by the base, warmup and multiplier factors.
|
| 225 |
+
lrate = lrate * base_lrate * lr_warmup_ramp * lr_multiplier
|
| 226 |
+
return jnp.asarray(lrate, dtype=jnp.float32)
|
| 227 |
+
|
| 228 |
+
def _init_rngs(self, rngs: PRNGKeys, step: int) -> PRNGKeys:
|
| 229 |
+
# Get a new random number generator for each step
|
| 230 |
+
rngs = jax.random.fold_in(rngs, step)
|
| 231 |
+
rngs = jax.random.split(rngs, len(self.rng_key_names))
|
| 232 |
+
rngs = {key: rngs[i] for i, key in enumerate(self.rng_key_names)}
|
| 233 |
+
return rngs
|
| 234 |
+
|
| 235 |
+
def train_step(self, model: nn.Module, tstate: TrainState, x: Any,
|
| 236 |
+
rngs: PRNGKeys) -> Tuple[TrainState, Metrics]:
|
| 237 |
+
"""Perform a training step, pmapped over multiple devices.
|
| 238 |
+
|
| 239 |
+
Args:
|
| 240 |
+
model: The model to use for the step function.
|
| 241 |
+
tstate: Values for state variables, and the optimizer.
|
| 242 |
+
x: A batch of inputs to train on.
|
| 243 |
+
rngs: PRNGKey (possibly replicated).
|
| 244 |
+
|
| 245 |
+
Returns:
|
| 246 |
+
Tuple of (new_tstate, metrics: dictionary of scalar values)
|
| 247 |
+
"""
|
| 248 |
+
|
| 249 |
+
mutable_keys = [k for (k, _) in tstate.state.items()]
|
| 250 |
+
step = tstate.optimizer.state.step
|
| 251 |
+
rngs = self._init_rngs(rngs, step)
|
| 252 |
+
|
| 253 |
+
# Refactor the model as a loss function from trainable params to loss, so
|
| 254 |
+
# that we can differentiate with jax and get {d}loss/{d}params.
|
| 255 |
+
# Inputs and non-trainable params are bound within the closure.
|
| 256 |
+
# model:: x, { state_params } -> (loss, metrics), { new_state_params }
|
| 257 |
+
# loss_fn:: params -> (loss, (metrics, new_state))
|
| 258 |
+
def loss_fn(params):
|
| 259 |
+
"""Loss function."""
|
| 260 |
+
(loss, mets), nstate = model.apply({"params": params, **tstate.state},
|
| 261 |
+
x,
|
| 262 |
+
rngs=rngs,
|
| 263 |
+
mutable=mutable_keys)
|
| 264 |
+
return loss, (mets, nstate)
|
| 265 |
+
|
| 266 |
+
# grad_fn:: params -> ((loss, (aux, nstate)), param_gradients)
|
| 267 |
+
grad_fn = jax.value_and_grad(loss_fn, has_aux=True)
|
| 268 |
+
|
| 269 |
+
# Run forward and backward pass.
|
| 270 |
+
(loss, (metrics, new_state)), param_grads = grad_fn(tstate.optimizer.target)
|
| 271 |
+
del loss # loss is only recorded if it is part of the metrics
|
| 272 |
+
if self.replicate_mode:
|
| 273 |
+
param_grads = jax.lax.pmean(param_grads, axis_name="batch")
|
| 274 |
+
lrate = self.learning_rate_schedule_fn(step)
|
| 275 |
+
new_optimizer = tstate.optimizer.apply_gradient(
|
| 276 |
+
param_grads, learning_rate=lrate)
|
| 277 |
+
|
| 278 |
+
# Metrics are summary values that will be logged.
|
| 279 |
+
if self.replicate_mode:
|
| 280 |
+
# Merge metrics (take mean/sum etc.) over replicas on-device.
|
| 281 |
+
summary_class = self.metrics_summary_factory
|
| 282 |
+
metrics = summary_class.merge_replicated_metrics(
|
| 283 |
+
metrics, model.metrics_summary_operations(aggregate_over="devices"))
|
| 284 |
+
|
| 285 |
+
metrics["learning_rate"] = lrate
|
| 286 |
+
return (TrainState(new_optimizer, new_state), metrics)
|
| 287 |
+
|
| 288 |
+
def other_step(self, model: nn.Module, tstate: TrainState, x: Any,
|
| 289 |
+
rngs: PRNGKeys) -> Tuple[TrainState, Metrics]:
|
| 290 |
+
"""Perform a test or generate step, pmapped over multiple devices.
|
| 291 |
+
|
| 292 |
+
Args:
|
| 293 |
+
model: The model to use for the step function.
|
| 294 |
+
tstate: Values for state variables, and the optimizer.
|
| 295 |
+
x: A batch of inputs to train on.
|
| 296 |
+
rngs: PRNGKey (possibly replicated).
|
| 297 |
+
|
| 298 |
+
Returns:
|
| 299 |
+
Tuple of (new_tstate, metrics: dictionary of scalar values)
|
| 300 |
+
"""
|
| 301 |
+
|
| 302 |
+
mutable_keys = [k for (k, _) in tstate.state.items()]
|
| 303 |
+
step = tstate.optimizer.state.step
|
| 304 |
+
rngs = self._init_rngs(rngs, step)
|
| 305 |
+
|
| 306 |
+
params = tstate.optimizer.target
|
| 307 |
+
(loss, metrics), new_state = model.apply({"params": params, **tstate.state},
|
| 308 |
+
x,
|
| 309 |
+
rngs=rngs,
|
| 310 |
+
mutable=mutable_keys)
|
| 311 |
+
del loss # loss is only recorded if it is part of the metrics
|
| 312 |
+
|
| 313 |
+
# Metrics are summary values that will be logged.
|
| 314 |
+
if self.replicate_mode:
|
| 315 |
+
# Merge metrics (take mean/sum etc.) over replicas on-device.
|
| 316 |
+
summary_class = self.metrics_summary_factory
|
| 317 |
+
metrics = summary_class.merge_replicated_metrics(
|
| 318 |
+
metrics, model.metrics_summary_operations(aggregate_over="devices"))
|
| 319 |
+
|
| 320 |
+
return (TrainState(tstate.optimizer, new_state), metrics)
|
| 321 |
+
|
| 322 |
+
def initialize_model(self) -> Tuple[TrainState, int, nn.Module, PRNGKeys]:
|
| 323 |
+
"""Initialize the model and/or load it from a checkpoint.
|
| 324 |
+
|
| 325 |
+
Returns:
|
| 326 |
+
(tstate: TrainState, -- The parameters and state for the the model.
|
| 327 |
+
start_step: int, -- The step number, when restoring from checkpoint.
|
| 328 |
+
imodel: nn.Module, -- A model object (created with mode "init").
|
| 329 |
+
rngs: PRNGkeys) -- Initial random numbers.
|
| 330 |
+
"""
|
| 331 |
+
|
| 332 |
+
# Set up random number generators.
|
| 333 |
+
# ---------------------------------
|
| 334 |
+
logging.info("==== Training loop: initializing model ====")
|
| 335 |
+
logging.info("Process %d of %d", jax.process_index(), jax.process_count())
|
| 336 |
+
logging.info("Local device count = %d", jax.local_device_count())
|
| 337 |
+
logging.info("Number of replicas = %d",
|
| 338 |
+
jax.process_count() * jax.local_device_count())
|
| 339 |
+
logging.info("Using random number seed %d", self.random_seed)
|
| 340 |
+
|
| 341 |
+
prng = jax.random.PRNGKey(self.random_seed)
|
| 342 |
+
prng, init_rng = jax.random.split(prng)
|
| 343 |
+
|
| 344 |
+
# Grab rngs, which provide different random numbers for each replica.
|
| 345 |
+
if self.replicate_mode:
|
| 346 |
+
prngs = jax.random.split(prng, jax.local_device_count())
|
| 347 |
+
else:
|
| 348 |
+
prngs = prng
|
| 349 |
+
del prng
|
| 350 |
+
|
| 351 |
+
# Create a dictionary of prng keys for initialization.
|
| 352 |
+
rng_key_names_init = list(self.rng_key_names) + ["params"]
|
| 353 |
+
init_rngs = jax.random.split(init_rng, len(rng_key_names_init))
|
| 354 |
+
init_rngs = {key: init_rngs[i] for i, key in enumerate(rng_key_names_init)}
|
| 355 |
+
del init_rng
|
| 356 |
+
|
| 357 |
+
# Build Model
|
| 358 |
+
# -------------------------------------------------------------------------
|
| 359 |
+
logging.info("Initializing the model.")
|
| 360 |
+
|
| 361 |
+
# Create a model, which will be used to initialize trainable parameters.
|
| 362 |
+
imodel = self.model_definition(mode="init")
|
| 363 |
+
|
| 364 |
+
# The init function will lazily initialize the model, given a fake input.
|
| 365 |
+
# It returns initialized variables, without doing a fwd pass.
|
| 366 |
+
model_init_fn = jax.jit(imodel.init)
|
| 367 |
+
variables = model_init_fn(init_rngs, imodel.get_fake_input())
|
| 368 |
+
|
| 369 |
+
# Split variables into trainable and non-trainable sets.
|
| 370 |
+
mstate, params = variables.pop("params")
|
| 371 |
+
del variables # Delete to avoid wasting resources.
|
| 372 |
+
|
| 373 |
+
# Create an optimizer for params.
|
| 374 |
+
optimizer_def = self.optimizer_factory.create_optimizer_def()
|
| 375 |
+
optimizer = optimizer_def.create(params)
|
| 376 |
+
|
| 377 |
+
# tstate holds the full training state of the model.
|
| 378 |
+
tstate = TrainState(optimizer, mstate)
|
| 379 |
+
if self.print_variables:
|
| 380 |
+
logging.info("params = %s", tstate.optimizer.target)
|
| 381 |
+
logging.info("state = %s", tstate.state)
|
| 382 |
+
|
| 383 |
+
# Load a pre-trained model or restore it from checkpoint.
|
| 384 |
+
if self.workdir or self.load_dir:
|
| 385 |
+
restore_checkpoints = self.restore_checkpoints
|
| 386 |
+
else:
|
| 387 |
+
restore_checkpoints = False
|
| 388 |
+
|
| 389 |
+
start_step = 0
|
| 390 |
+
if restore_checkpoints:
|
| 391 |
+
tstate = self.restore_checkpoint(tstate)
|
| 392 |
+
start_step = int(tstate.optimizer.state.step)
|
| 393 |
+
|
| 394 |
+
# Log info on trainable parameters (before replicating them).
|
| 395 |
+
self._write_parameter_info(tstate)
|
| 396 |
+
# raise ValueError("That's all folks!")
|
| 397 |
+
|
| 398 |
+
# Replicate the training state across local devices.
|
| 399 |
+
if self.replicate_mode:
|
| 400 |
+
tstate = jax_utils.replicate(tstate)
|
| 401 |
+
|
| 402 |
+
return (tstate, start_step, imodel, prngs)
|
| 403 |
+
|
| 404 |
+
def restore_checkpoint(self, train_state: TrainState) -> TrainState:
|
| 405 |
+
"""Load a pre-trained model or restore it from a checkpoint."""
|
| 406 |
+
|
| 407 |
+
# Figure out if we have an existing checkpoint.
|
| 408 |
+
if not self.workdir:
|
| 409 |
+
logging.info("No working directory specified.")
|
| 410 |
+
existing_checkpoint = False
|
| 411 |
+
elif not gfile.exists(self.workdir):
|
| 412 |
+
logging.info("No existing checkpoint directory %s", self.workdir)
|
| 413 |
+
existing_checkpoint = False
|
| 414 |
+
elif not gfile.isdir(self.workdir):
|
| 415 |
+
raise ValueError(f"workdir {self.workdir} must be a directory.")
|
| 416 |
+
else:
|
| 417 |
+
ckpath = checkpoints.latest_checkpoint(self.workdir, "checkpoint_")
|
| 418 |
+
if ckpath:
|
| 419 |
+
logging.info("Found existing checkpoint in %s", self.workdir)
|
| 420 |
+
existing_checkpoint = True
|
| 421 |
+
else:
|
| 422 |
+
logging.info("No existing checkpoint in %s", self.workdir)
|
| 423 |
+
existing_checkpoint = False
|
| 424 |
+
|
| 425 |
+
# If any checkpoints exist in workdir, then use those first.
|
| 426 |
+
# This will ensure that the task will restore properly if it's preempted.
|
| 427 |
+
if existing_checkpoint:
|
| 428 |
+
logging.info("Restoring model from last checkpoint %s:", self.workdir)
|
| 429 |
+
load_dir = self.workdir
|
| 430 |
+
elif self.load_dir:
|
| 431 |
+
logging.info("Loading pre-trained model from %s:", self.load_dir)
|
| 432 |
+
load_dir = self.load_dir
|
| 433 |
+
else:
|
| 434 |
+
logging.warning("Unable to load model.")
|
| 435 |
+
return train_state
|
| 436 |
+
loaded_train_state = checkpoints.restore_checkpoint(load_dir, train_state)
|
| 437 |
+
step = int(loaded_train_state.optimizer.state.step)
|
| 438 |
+
self.post_load_checkpoint_fn(load_dir, step)
|
| 439 |
+
|
| 440 |
+
if self.restore_state_variables:
|
| 441 |
+
# Restore complete state.
|
| 442 |
+
logging.info("Restoring all variables and state.")
|
| 443 |
+
train_state = loaded_train_state
|
| 444 |
+
del loaded_train_state
|
| 445 |
+
else:
|
| 446 |
+
# Restore trainable variables, but not other state.
|
| 447 |
+
logging.info("Only restoring trainable parameters.")
|
| 448 |
+
train_state = TrainState(loaded_train_state.optimizer, train_state.state)
|
| 449 |
+
del loaded_train_state
|
| 450 |
+
|
| 451 |
+
return train_state
|
| 452 |
+
|
| 453 |
+
def save_checkpoint(self, tstate: TrainState, step: int,
|
| 454 |
+
param_summary: Optional[MetricsSummary]):
|
| 455 |
+
"""Save a checkpoint with the model state.
|
| 456 |
+
|
| 457 |
+
Args:
|
| 458 |
+
tstate: The training state.
|
| 459 |
+
step: The current step number.
|
| 460 |
+
param_summary: Optional metrics summary to write parameter statistics.
|
| 461 |
+
"""
|
| 462 |
+
|
| 463 |
+
logging.info("Saving checkpoint in directory %s", self.workdir)
|
| 464 |
+
if self.replicate_mode:
|
| 465 |
+
save_state = jax_utils.unreplicate(tstate)
|
| 466 |
+
else:
|
| 467 |
+
save_state = tstate
|
| 468 |
+
checkpoints.save_checkpoint(self.workdir, save_state, step)
|
| 469 |
+
|
| 470 |
+
# While we're at it, record distributions of trainable parameters.
|
| 471 |
+
if param_summary is not None:
|
| 472 |
+
logging.info("Recording parameter distributions.")
|
| 473 |
+
params_dict = jax.device_get(
|
| 474 |
+
_flatten_dict_string_keys(save_state.optimizer.target))
|
| 475 |
+
param_distribs = self._compute_parameter_distributions(params_dict)
|
| 476 |
+
param_summary.add(param_distribs)
|
| 477 |
+
|
| 478 |
+
  def create_training_task(self, mode: str, imodel: nn.Module, prngs: PRNGKeys,
                           writers: Dict[str, MetricWriter]) -> TrainingTask:
    """Create a new TrainingTask for the given mode.

    Args:
      mode: The mode for the task, e.g. "train", "test", "generate".
      imodel: The model object from initialize_model.
      prngs: The PRNGKeys from initialize_model.
      writers: A dictionary of summary writers.

    Returns:
      A TrainingTask object.
    """

    logging.info("Training loop: creating task for mode %s", mode)
    if self.use_separate_metric_directories:
      prefix = ""
    else:
      prefix = mode

    if mode == "train":
      ds = self.get_training_dataset_iterator
    elif mode == "test":
      ds = self.get_test_dataset_iterator
    else:
      ds = None

    # We summarize metrics over multiple training steps.
    # These types control how the summary is computed.
    metric_summary_ops = {
        "step_time": "mean",
        "learning_rate": "last",
        **imodel.metrics_summary_operations(aggregate_over="steps")
    }
    summary = self.metrics_summary_factory(metric_summary_ops)
    extra_summary = self.metrics_summary_factory({})
    summary_writer = self._get_summary_writer(mode, writers)

    return TrainingTask(
        mode=mode,
        dataset=ds,
        step_function=self._compile_step_function(mode),
        prng_keys=prngs,
        summary=summary,
        extra_summary=extra_summary,
        summary_writer=summary_writer,
        summary_prefix=prefix,
        # --- options ---
        replicate_mode=self.replicate_mode,
        print_input_every_steps=self.print_input_every_steps,
        pretty_print_input_function=self.pretty_print_input_function,
        process_summaries_function=self.process_summaries_function,
        extra_summaries_function=self.extra_summaries_fn)
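
  # Editor's sketch (not part of the original file): the intended meaning of
  # the aggregation ops registered above, independent of the actual
  # MetricsSummary implementation.  "mean" averages a metric over the steps
  # between flushes (e.g. "step_time"); "last" keeps only the most recent
  # value (e.g. "learning_rate").
  @staticmethod
  def _aggregate_sketch(values, op):
    if op == "mean":
      return float(np.mean(values))
    if op == "last":
      return values[-1]
    raise ValueError(f"Unknown aggregation op: {op}")
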
  def train(self):
    """Runs the training and evaluation loop."""

    # The master process saves checkpoints and summaries to disk.
    is_master_process = jax.process_index() == 0
    if self.workdir:
      save_checkpoints = self.save_checkpoints
    else:
      save_checkpoints = False

    # --- Create and initialize the model. ---
    (tstate, start_step, imodel, prngs) = self.initialize_model()

    # Log experiment hyper-parameters.
    writers = {}
    train_writer = self._get_summary_writer("train", writers)
    if start_step == 0:
      self._write_config(train_writer)

    # Additional summary objects.
    param_summary = self.metrics_summary_factory({})  # Parameter statistics.

    # --- Create task objects for test, train, and generate. ---
    tasks = {}
    train_task = self.create_training_task("train", imodel, prngs, writers)
    tasks["train"] = train_task

    if (self.get_test_dataset_iterator is not None and
        self.test_every_steps != 0):
      test_task = self.create_training_task("test", imodel, prngs, writers)
      tasks["test"] = test_task
      if self.generate_every_steps != 0:
        gen_task = self.create_training_task("generate", imodel, prngs,
                                             writers)
        tasks["generate"] = gen_task

    # Register any additional actions.
    register_interstep_callbacks()

    # Main Training Loop
    # --------------------------------------------------------------------
    logging.info("==== Training loop: starting main loop ====")
    with metric_writers.ensure_flushes(*writers.values()):
      for step in range(start_step, self.num_steps):
        # Log status every so often to monitor progress.
        if should_run(step, self.status_every_steps):
          logging.info("Step: %d", step)

        # Train.
        train_x = train_task.get_next_input()
        (tstate, _) = train_task.run_step(tstate, train_x, step)
        run_interstep_callbacks("train", step)
        del train_x

        # Test.
        if should_run(step, self.test_every_steps):
          if self.num_test_steps > 1:
            logging.info("Test cycle: %d iterations.", self.num_test_steps)
          for sub_step in range(0, self.num_test_steps):
            test_x = test_task.get_next_input()

            # TODO(delesley): This is an ugly hack to run generate steps.
            # Run a generate step using test data.
            # Generate is run just *before* the last test iteration.
            if ((sub_step == self.num_test_steps - 1) and
                should_run(step, self.generate_every_steps)):
              logging.info("Generate cycle.")
              (tstate, _) = gen_task.run_step(tstate, test_x, step)
              run_interstep_callbacks("generate", step)

            (tstate, _) = test_task.run_step(tstate, test_x, step,
                                             sub_step=sub_step)
            run_interstep_callbacks("test", step, sub_step)
            del test_x

        # --- Save checkpoints on the master host. ---
        is_last_step = (step == self.num_steps - 1)
        checkpoint_current_step = (
            save_checkpoints and
            (should_run(step, self.checkpoint_every_steps) or is_last_step))
        if checkpoint_current_step:
          if is_master_process:
            self.save_checkpoint(tstate, step, param_summary)
          self.post_save_checkpoint_fn(self.workdir, step)

        # --- Flush summaries to disk. ---
        if should_run(step, self.log_every_steps):
          for tsk in tasks.values():
            tsk.flush(step)
          param_summary.write(train_writer, step, prefix="params")

    logging.info("Training Finished.")
    if self.replicate_mode:
      tstate = jax_utils.unreplicate(tstate)
    if self.print_variables:
      logging.info("params = %s", tstate.optimizer.target)
      logging.info("state = %s", tstate.state)

  def _compile_step_function(self, mode: str) -> StepFunction:
    """Compile a step function (training or test)."""

    # Create a model object, and a step function that is a closure over the
    # object.  Flax modules are supposed to be "stateless", in that all state
    # is contained in the TrainState object that is passed as an input
    # parameter.  However, creating the model object may involve allocating
    # expensive data structures, or launching processes, and should only be
    # done once.
    model = self.model_definition(mode=mode)
    if mode == "train":
      step_fn = functools.partial(self.train_step, model)
    else:
      step_fn = functools.partial(self.other_step, model)

    if self.replicate_mode:
      assert not self.trace_debug_mode
      logging.info("Compiling mode %s with pmap.", mode)
      p_fn = jax.pmap(step_fn, donate_argnums=(0,), axis_name="batch")
    elif self.trace_debug_mode:
      logging.info("Compiling mode %s with trace_debug.", mode)
      p_fn = step_fn
    else:
      logging.info("Compiling mode %s with jit.", mode)
      p_fn = jax.jit(step_fn, donate_argnums=(0,))
    return p_fn
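
  # Editor's sketch (not part of the original file): the same pmap-vs-jit
  # dispatch as _compile_step_function, reduced to a toy step function.
  # donate_argnums=(0,) lets XLA reuse the buffers of the old state.
  @staticmethod
  def _compile_dispatch_sketch(replicate: bool):
    def step_fn(state, x):
      return state + x, {"loss": x.sum()}
    if replicate:
      # One entry per local device; outputs gain a leading device axis.
      return jax.pmap(step_fn, donate_argnums=(0,), axis_name="batch")
    return jax.jit(step_fn, donate_argnums=(0,))
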
  def _get_summary_writer(self, mode: str,
                          writers: Dict[str, MetricWriter]) -> MetricWriter:
    """Create a summary writer for the given mode.

    Args:
      mode: the mode for the summaries, e.g. "test", "train".
      writers: a dictionary which caches previously-created writers.

    Returns:
      A writer for the given mode.
    """

    if self.use_separate_metric_directories:
      # Create a separate writer & directory for each mode.
      w_mode = mode
      summary_dir = os.path.join(self.workdir, mode)
    else:
      # Create a single default writer for all modes.
      w_mode = "train"
      summary_dir = self.workdir

    if w_mode in writers:
      # Return previously created and cached writer.
      logging.info("Returning cached summary writer (%s) for mode %s",
                   w_mode, mode)
      return writers[w_mode]

    if not self.workdir:
      # No working directory, so log only.
      logging.info("Creating logging writer (%s) for mode %s", w_mode, mode)
      writer = metric_writers.LoggingWriter()
    else:
      # Create a new writer for workdir.
      # Only the master will actually write summaries to workdir.
      logging.info("Creating summary writer (%s) for mode %s in directory %s",
                   w_mode, mode, summary_dir)
      is_master = jax.process_index() == 0
      gfile.makedirs(summary_dir)
      writer = metric_writers.create_default_writer(summary_dir,
                                                    just_logging=not is_master)
    writers[w_mode] = writer
    return writer

  def _write_config(self, writer):
    """Write the configuration file to the working directory."""

    is_master = jax.process_index() == 0
    config_str = gin.operative_config_str()
    logging.info("Gin config: \n%s", config_str)

    # Write configuration to workdir.
    if is_master and self.workdir:
      config_file_name = os.path.join(self.workdir, "config.gin")
      with gfile.GFile(config_file_name, "w") as f:
        f.write(config_str)

    # Write config string text to tensorboard.
    writer.write_texts(0, {"config": gin.markdown(config_str)})

  def _write_parameter_info(self, tstate: TrainState):
    """Write information on state and trainable parameters to the log."""

    # Write information on parameters to the log file.
    params_dict = _flatten_dict_string_keys(tstate.optimizer.target)
    total_nparams = 0
    for (k, v) in params_dict.items():
      nparams = np.prod(v.shape)
      total_nparams += nparams
      logging.info("parameter: %s, shape %s, size %d", k, v.shape, nparams)
    logging.info("Total parameters: %d", total_nparams)

    # Write information on state variables to the log file.
    state_dict = _flatten_dict_string_keys(tstate.state)
    state_size = 0
    total_state = 0
    for (k, v) in state_dict.items():
      if hasattr(v, "shape"):
        state_size = np.prod(v.shape)
        total_state += state_size
        logging.info("state: %s, shape %s, size %d", k, v.shape, state_size)
      else:
        # Some other stuff may be stored in the state.
        logging.info("state: %s [unknown]", k)
    logging.info("Total state size: %d", total_state)

  def _compute_parameter_distributions(self, params_dict):
    """Compute info on distributions of parameters."""

    scalar_params_dict = {}
    for (k, v) in params_dict.items():
      # Convert from bfloat16, which crashes when serializing a NaN.
      v = np.asarray(v, dtype=jnp.float32)
      scalar_params_dict[k + "_mean"] = np.mean(v)
      scalar_params_dict[k + "_stddev"] = np.std(v)
      # scalar_params_dict[k + "_min"] = np.min(v)
      # scalar_params_dict[k + "_max"] = np.max(v)
    return scalar_params_dict


def _flatten_dict_string_keys(params):
  """Flattens a nested dictionary to have string keys and '/' separators."""
  return {"/".join(k): v for k, v in flatten_dict(unfreeze(params)).items()}
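
The helper above relies on Flax's flatten_dict/unfreeze to turn nested
parameter pytrees into flat, string-keyed dicts for logging. A minimal
illustration of the same pattern (an editor's sketch, not part of the
uploaded files; the parameter values are made up):

from flax.core import freeze, unfreeze
from flax.traverse_util import flatten_dict

params = freeze({"dense": {"kernel": [[1.0]], "bias": [0.0]}})
flat = {"/".join(k): v for k, v in flatten_dict(unfreeze(params)).items()}
assert set(flat) == {"dense/kernel", "dense/bias"}
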
aglib/meliad/training_task.py
ADDED
@@ -0,0 +1,216 @@
# Copyright 2022 Google.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TrainingTask encapsulates the state associated with a single model step."""

import time
from typing import (Any, Callable, Dict, Iterator, Mapping, Optional, Tuple)

from absl import logging
from clu import metric_writers
from flax import optim
from flax import struct
import jax
import metrics_summary
import numpy as np


@struct.dataclass
class TrainState:
  optimizer: optim.Optimizer  # Trainable parameters.
  state: Any  # Other state, e.g. XL cache or memory.


PRNGKeys = Any
Metrics = Dict[str, Any]
MetricsSummary = metrics_summary.MetricsSummary

Dataset = Callable[[], Iterator[Any]]
StepFunction = Callable[[TrainState, Any, Any], Tuple[TrainState, Metrics]]
PrettyPrintInputFunction = Optional[Callable[[Any], str]]
ProcessSummariesFunction = Optional[Callable[[Any, str], Any]]
ExtraSummariesFunction = Optional[Callable[[str, int], Mapping[str, Any]]]
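

# Editor's sketch (not part of the original file): TrainState is a
# flax.struct dataclass, so instances are immutable pytrees and "updates"
# go through .replace(), which returns a new instance.  The tiny legacy
# flax.optim optimizer below is an assumption, used only to build a valid
# TrainState.
def _train_state_sketch() -> TrainState:
  opt_def = optim.GradientDescent(learning_rate=0.1)
  optimizer = opt_def.create({"w": np.zeros(3)})
  tstate = TrainState(optimizer=optimizer, state={})
  return tstate.replace(state={"cache": None})  # tstate itself is unchanged.

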
def should_run(step: int, every_steps: int) -> bool:
  """Returns true if a periodic action should be run."""
  return (step > 0) and (every_steps > 0) and (step % every_steps == 0)
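

def _should_run_examples() -> None:
  # Editor's sketch (not part of the original file): should_run gates every
  # periodic action in the training loop.
  assert not should_run(0, 100)    # Never fires at step 0.
  assert should_run(200, 100)      # Fires on exact multiples of the period.
  assert not should_run(250, 100)  # Silent between multiples.
  assert not should_run(200, 0)    # every_steps == 0 disables the action.

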
class TrainingTask:
  """A TrainingTask encapsulates the state associated with a training task.

  Examples of tasks include training steps, test or validation runs,
  or inference (generation).  State includes the input pipeline, and
  summary information that is averaged over multiple steps.
  """

  def __init__(
      self,
      *,  # Pass arguments by keyword only.
      mode: str,
      dataset: Dataset,
      step_function: StepFunction,
      prng_keys: PRNGKeys,
      summary: MetricsSummary,
      extra_summary: MetricsSummary,
      summary_writer: metric_writers.MetricWriter,
      summary_prefix: str = "",
      # --- Options from TrainingLoop ---
      replicate_mode: bool = True,
      print_input_every_steps: int = 0,
      pretty_print_input_function: PrettyPrintInputFunction = None,
      process_summaries_function: ProcessSummariesFunction = None,
      extra_summaries_function: ExtraSummariesFunction = None):
    # Local state.
    self.mode = mode
    self.dataset = dataset
    self.step_function = step_function
    self.prng_keys = prng_keys
    self.summary = summary
    self.extra_summary = extra_summary
    self.summary_writer = summary_writer
    self.summary_prefix = summary_prefix

    # Options carried over from TrainingLoop.
    self.replicate_mode = replicate_mode
    self.print_input_every_steps = print_input_every_steps
    self.pretty_print_input_fn = pretty_print_input_function
    self.process_summaries_fn = process_summaries_function
    self.extra_summaries_fn = extra_summaries_function

    # Input pipeline state.
    if self.dataset is not None:
      self.ds_iterator = self.dataset()
    self.epoch = 0

  def _get_metrics(self, device_metrics: Metrics) -> Metrics:
    """Read a dictionary of metrics from device."""
    if self.replicate_mode:
      # x[0] gets the metric from device 0 -- the first replica.
      # We assume that merge_replicated_metrics has already combined the
      # metrics from multiple devices.
      device_metrics = jax.tree_map(lambda x: x[0], device_metrics)
    metrics_np = jax.device_get(device_metrics)  # Get numpy arrays.
    return metrics_np

  def get_next_input(self) -> Any:
    """Grab the next input from the data pipeline."""
    if self.dataset is None:
      logging.warning("No dataset for mode %s", self.mode)
      return None

    try:
      x = next(self.ds_iterator)
    except StopIteration:
      logging.info("End of epoch %d for mode %s.", self.epoch, self.mode)
      self.ds_iterator = self.dataset()  # Restart the input pipeline.
      x = next(self.ds_iterator)
      self.epoch += 1
    return x

  def run_step(self, tstate: TrainState, x: Any,
               step: int, sub_step: int = 0) -> Tuple[TrainState, Metrics]:
    """Run the model for a single step.

    Args:
      tstate: The current model state.
      x: The input for the model -- from get_next_input.
      step: The training step number.
      sub_step: For tasks that run multiple iterations within a step.
        E.g. a test cycle will call run_step multiple times to cover the
        test set.  The step counter will not increment, but sub_step will.

    Returns:
      An updated model state, and a dictionary of metrics for the step.
    """

    start_time = time.perf_counter()

    # Split a batch of inputs among local replicas.
    if self.replicate_mode:
      x = split_batch_dimension(x, jax.local_device_count())

    # Pretty-print the input to the summary and log file every so often.
    if (sub_step == 0 and self.pretty_print_input_fn is not None and
        should_run(step, self.print_input_every_steps)):
      x_first = jax.tree_map(lambda x: x[0], x) if self.replicate_mode else x
      x_strs = self.pretty_print_input_fn(x_first)
      logging.info("[%d] Input (%s) = %s", step, self.mode, x_strs)
      self.summary.add_text({"input": x_strs})

    # Run the step function on the input.
    with jax.profiler.StepTraceAnnotation(self.mode, step_num=step):
      (tstate, metrics) = self.step_function(tstate, x, self.prng_keys)

    # Read metrics from device.
    metrics_np = self._get_metrics(metrics)
    end_time = time.perf_counter()
    metrics_np["step_time"] = end_time - start_time
    if "epoch" not in metrics_np.keys():
      metrics_np["epoch"] = self.epoch

    # Add metrics to the current summary.
    self.summary.add(metrics_np)
    return (tstate, metrics_np)

  def flush(self, step: int):
    """Flush accumulated metric summaries to disk."""

    if self.summary_writer is None:
      self.summary.clear()  # Clear summary if we can't write it.
      return

    if self.summary.empty():
      return

    # Do post-processing of the summaries.
    if self.process_summaries_fn is not None:
      self.summary = self.process_summaries_fn(self.summary, self.mode)  # pylint: disable=not-callable

    # Write and clear summary data.
    logging.info("Writing summaries for mode %s.", self.mode)
    self.summary.write(self.summary_writer, step, prefix=self.summary_prefix)

    # Add extra summaries that are not computed by the step function.
    if self.extra_summaries_fn is not None:
      self.extra_summary.add(self.extra_summaries_fn(self.mode, step))
      self.extra_summary.write(self.summary_writer, step, prefix="")


def split_batch_dimension(inputs: Any, num_replicas: int) -> Any:
  """Splits the leading batch dimension.

  Given inputs of shape [num_replicas * batch_size, ...], it will reshape
  them to [num_replicas, batch_size, ...].  This operation is intended to be
  used right before calling pmap, which will eliminate the num_replicas
  dimension.

  Args:
    inputs: Tuple of inputs to split.
    num_replicas: Number of replicas.

  Returns:
    inputs with an extra leading replica dimension.
  """

  def split_batch_dim(x):
    assert x.ndim > 0
    if (x.shape[0] % num_replicas) != 0:
      raise ValueError(f"Can't split {x.shape} into {num_replicas} replicas.")
    batch_size = x.shape[0] // num_replicas
    split_shape = [num_replicas, batch_size] + list(x.shape[1:])
    return np.reshape(x, split_shape)

  return jax.tree_map(split_batch_dim, inputs)
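

# Editor's sketch (not part of the original file): splitting a global batch
# of 8 examples across 4 local replicas yields [4, 2, ...] arrays, ready for
# jax.pmap to map over the leading device axis.
def _split_batch_dimension_example() -> None:
  x = {"tokens": np.arange(24).reshape(8, 3)}
  y = split_batch_dimension(x, num_replicas=4)
  assert y["tokens"].shape == (4, 2, 3)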