feat: scan layers + gradient checkpointing (#161)
* scan layers for faster compilation
* support gradient checkpointing
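
For background on the second bullet: gradient checkpointing (rematerialization) trades compute for memory by recomputing a module's activations during the backward pass instead of storing them. A minimal, self-contained Flax sketch of the idea (generic modules, not the DalleBart code changed below):

import jax
import jax.numpy as jnp
import flax.linen as nn

class Block(nn.Module):
    features: int = 128

    @nn.compact
    def __call__(self, x):
        return nn.gelu(nn.Dense(self.features)(x))

class Model(nn.Module):
    n_layers: int = 4

    @nn.compact
    def __call__(self, x):
        # nn.remat wraps a module so its intermediates are recomputed on the
        # backward pass instead of being stored (gradient checkpointing)
        RematBlock = nn.remat(Block)
        for _ in range(self.n_layers):
            x = RematBlock(features=x.shape[-1])(x)
        return x

x = jnp.ones((8, 128))
params = Model().init(jax.random.PRNGKey(0), x)
grads = jax.grad(lambda p: Model().apply(p, x).sum())(params)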
src/dalle_mini/model/configuration.py
CHANGED

@@ -51,7 +51,8 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
        activation_dropout=0.0,
        init_std=0.02,
        scale_embedding=False,
-        gradient_checkpointing=
+        gradient_checkpointing=True,
+        use_scan=None,
        use_cache=True,
        is_encoder_decoder=True,
        forced_eos_token_id=None,
@@ -59,7 +60,7 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
        do_sample=True,
        # transformer variants
        use_bias=False, # use bias in attention and dense layers (except for lm_head)
-        ln_type="
+        ln_type="rmsnorm", # layer normalization type, "rmsnorm", "layernorm"
        ln_positions="normformer", # layer normalization positions, "normformer", "swinv2", "cogview", "postln", "preln", "deepnet" (same as postln)
        use_head_scale=False, # used in NormFormer
        use_cosine_attention=False, # used in Swin v2
@@ -67,7 +68,7 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
        use_absolute_position_embeddings=True, # default
        use_swin_position_embeddings=False, # used in Swin v1/v2
        use_deepnet_scaling=False, # used in Deepnet
-        use_glu=
+        use_glu=True, # "GLU Variants Improve Transformer"
        use_alibi=False, # Not implemented yet - from "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation"
        sinkhorn_iters=1, # used in SinkFormers
        use_final_ln_encoder=True, # final layer normalization in encoder
@@ -136,6 +137,11 @@ class DalleBartConfig(PretrainedFromWandbMixin, PretrainedConfig):
        self.init_std = init_std
        self.use_cache = use_cache
        self.gradient_checkpointing = gradient_checkpointing
+        # all layers are the same in most configurations
+        self.use_scan = use_scan if use_scan is not None else ln_positions != "swinv2"
+        assert not (
+            self.use_scan and ln_positions == "swinv2"
+        ), "scan cannot be used with 'swinv2'"
        self.scale_embedding = (
            scale_embedding # scale factor will be sqrt(d_model) if True
        )
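
A hedged usage sketch based only on the parameters visible in this diff (the import path is assumed from this repo's layout): leaving use_scan at None enables scanning unless ln_positions is "swinv2", whose layers differ every 6 blocks and therefore cannot share one scanned body.

from dalle_mini.model import DalleBartConfig  # import path assumed, not shown in this diff

config = DalleBartConfig(
    gradient_checkpointing=True,  # rematerialize layer activations during the backward pass
    use_scan=None,                # None resolves to (ln_positions != "swinv2")
    ln_positions="normformer",
)
assert config.use_scan  # enabled, since the layout is not "swinv2"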
src/dalle_mini/model/modeling.py
CHANGED

@@ -619,6 +619,9 @@ class FlaxBartEncoderLayer(nn.Module):
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:

+        if self.config.use_scan:
+            hidden_states = hidden_states[0]
+
        res_gain = (
            deepnet_gain["encoder"]["alpha"](self.config)
            if self.config.use_deepnet_scaling
@@ -679,12 +682,8 @@
        )
        hidden_states = ff_block(hidden_states, deterministic=deterministic)
        hidden_states = residual * res_gain + hidden_states
-        if self.add_norm:
-            use_scale = (
-                self.use_scale
-                or self.config.ln_positions == "postln"
-                or self.config.force_ln_scale
-            )
+        if self.add_norm:
+            use_scale = self.use_scale or self.config.force_ln_scale
            hidden_states = norm(
                self.config.ln_type,
                dtype=self.dtype,
@@ -697,6 +696,9 @@
        if output_attentions:
            outputs += (attn_weights,)

+        if self.config.use_scan:
+            outputs = (outputs, None)
+
        return outputs


@@ -710,7 +712,7 @@ class FlaxBartDecoderLayer(nn.Module):
    config: DalleBartConfig
    dtype: jnp.dtype = jnp.float32
    add_norm: bool = False
-    use_scale: bool =
+    use_scale: bool = True

    @nn.compact
    def __call__(
@@ -724,6 +726,9 @@
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:

+        if self.config.use_scan:
+            hidden_states = hidden_states[0]
+
        res_gain = (
            deepnet_gain["decoder"]["alpha"](self.config)
            if self.config.use_deepnet_scaling
@@ -831,12 +836,8 @@
        )
        hidden_states = ff_block(hidden_states, deterministic=deterministic)
        hidden_states = residual * res_gain + hidden_states
-        if self.add_norm:
-            use_scale = (
-                self.use_scale
-                or self.config.ln_positions == "postln"
-                or self.config.force_ln_scale
-            )
+        if self.add_norm:
+            use_scale = self.use_scale or self.config.force_ln_scale
            hidden_states = norm(
                self.config.ln_type,
                dtype=self.dtype,
@@ -849,6 +850,9 @@
        if output_attentions:
            outputs += (attn_weights, cross_attn_weights)

+        if self.config.use_scan:
+            outputs = (outputs, None)
+
        return outputs


@@ -876,35 +880,80 @@ class FlaxBartEncoderLayerCollection(nn.Module):

        n_layers = self.config.encoder_layers
        layer = (
-            remat(
+            remat(
+                FlaxBartEncoderLayer,
+                static_argnums=(2, 3),
+                prevent_cse=not self.config.use_scan,
+            )
            if self.config.gradient_checkpointing
            else FlaxBartEncoderLayer
        )
+
+        if self.config.use_scan:
+            # all blocks are the same so we use nn.scan
+            assert not output_attentions, "cannot scan with output_attentions"
+            assert not output_hidden_states, "cannot scan with output_hidden_states"
+            hidden_states = (hidden_states,)
+            # we use a scale on all norms (even last layer) to allow scanning
+            hidden_states, _ = nn.scan(
+                layer,
+                variable_axes={"params": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(nn.broadcast, nn.broadcast, nn.broadcast),
+                length=n_layers,
+            )(
+                self.config,
+                dtype=self.dtype,
+                add_norm=self.config.ln_positions == "postln",
+                name="FlaxBartEncoderLayers",
            )(
                hidden_states,
                attention_mask,
                output_attentions,
                deterministic,
            )
+            hidden_states = hidden_states[0]
+        else:
+            for i in range(n_layers):
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+                # final layernorm on the output of the last layer
+                # or every 6 layers for Swin v2
+                add_norm = self.config.ln_positions == "postln" or (
+                    self.config.ln_positions == "swinv2"
+                    and ((i + 1) % 6 == 0)
+                    and (i != n_layers - 1)
+                )
+                # we don't need to scale the norm for the last layer
+                use_scale = i != n_layers - 1
+                layer_outputs = layer(
+                    self.config,
+                    dtype=self.dtype,
+                    add_norm=add_norm,
+                    use_scale=use_scale,
+                    name=f"FlaxBartEncoderLayer_{i}",
+                )(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                    deterministic,
+                )
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)

+        # add hidden states from the last layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # postln is already applied in every layer
+        if self.config.use_final_ln_encoder and self.config.ln_positions != "postln":
+            hidden_states = norm(
+                self.config.ln_type,
+                dtype=self.dtype,
+                epsilon=1e-05,
+                use_scale=self.config.force_ln_scale,
+            )(hidden_states)

        outputs = [
            hidden_states,
@@ -953,22 +1002,39 @@ class FlaxBartDecoderLayerCollection(nn.Module):

        n_layers = self.config.decoder_layers
        layer = (
-            remat(
+            remat(
+                FlaxBartDecoderLayer,
+                static_argnums=(4, 5, 6),
+                prevent_cse=not self.config.use_scan,
+            )
            if self.config.gradient_checkpointing
            else FlaxBartDecoderLayer
        )
+
+        if self.config.use_scan:
+            # all blocks are the same so we use nn.scan
+            assert not output_attentions, "cannot scan with output_attentions"
+            assert not output_hidden_states, "cannot scan with output_hidden_states"
+            hidden_states = (hidden_states,)
+            # we use a scale on all norms (even last layer) to allow scanning
+            hidden_states, _ = nn.scan(
+                layer,
+                variable_axes={"params": 0},
+                split_rngs={"params": True, "dropout": True},
+                in_axes=(
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                    nn.broadcast,
+                ),
+                length=n_layers,
+            )(
+                self.config,
+                dtype=self.dtype,
+                add_norm=self.config.ln_positions == "postln",
+                name="FlaxBartEncoderLayers",
            )(
                hidden_states,
                attention_mask,
@@ -978,17 +1044,56 @@
                output_attentions,
                deterministic,
            )
+            hidden_states = hidden_states[0]

+        else:
+            for i in range(n_layers):
+                if output_hidden_states:
+                    all_hidden_states += (hidden_states,)
+                # final layernorm on the output of the last layer
+                # or every 6 layers for Swin v2
+                add_norm = self.config.ln_positions == "postln" or (
+                    self.config.ln_positions == "swinv2"
+                    and ((i + 1) % 6 == 0)
+                    and (i != n_layers - 1)
+                )
+                # we don't need to scale the norm for the last layer
+                use_scale = i != n_layers - 1
+                layer_outputs = layer(
+                    self.config,
+                    dtype=self.dtype,
+                    add_norm=add_norm,
+                    use_scale=use_scale,
+                    name=f"FlaxBartDecoderLayer_{i}",
+                )(
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    init_cache,
+                    output_attentions,
+                    deterministic,
+                )
+
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attns += (layer_outputs[1],)
+
+                    if encoder_hidden_states is not None:
+                        all_cross_attentions += (layer_outputs[2],)

+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)

+        # postln is already applied in every layer
+        if self.config.use_final_ln_decoder and self.config.ln_positions != "postln":
+            hidden_states = norm(
+                self.config.ln_type,
+                dtype=self.dtype,
+                epsilon=1e-05,
+                use_scale=self.config.force_ln_scale,
+            )(hidden_states)

        outputs = [
            hidden_states,
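
The core pattern in the layer collections above is nn.scan over identical layers: the parameters of all layers are stacked along a new leading axis and the layer body is traced and compiled once instead of once per layer, which is what makes compilation faster. Below is a minimal, generic Flax sketch of scan combined with remat (plain modules, not the DalleBart classes); prevent_cse=False is the recommended setting for remat inside a scan, which is why the diff passes prevent_cse=not self.config.use_scan.

import jax
import jax.numpy as jnp
import flax.linen as nn

class Layer(nn.Module):
    @nn.compact
    def __call__(self, carry, _):
        # scanned modules take (carry, x) and return (carry, y)
        x = carry
        x = x + nn.Dense(x.shape[-1])(x)
        return x, None

class Stack(nn.Module):
    n_layers: int = 4
    gradient_checkpointing: bool = True

    @nn.compact
    def __call__(self, x):
        # rematerialize each layer; prevent_cse=False is safe inside scan
        layer = (
            nn.remat(Layer, prevent_cse=False)
            if self.gradient_checkpointing
            else Layer
        )
        ScannedLayers = nn.scan(
            layer,
            variable_axes={"params": 0},   # stack params along a leading layer axis
            split_rngs={"params": True},   # independent init per layer
            length=self.n_layers,
        )
        x, _ = ScannedLayers()(x, None)
        return x

x = jnp.ones((2, 16))
params = Stack().init(jax.random.PRNGKey(0), x)
# every kernel now carries the layer axis: (n_layers, 16, 16)
print(jax.tree_util.tree_map(jnp.shape, params))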
src/dalle_mini/model/partitions.py
CHANGED

@@ -55,7 +55,7 @@ def _get_partition_rules():
    ]


-def set_partitions(in_dict):
+def set_partitions(in_dict, use_scan):
    rules = _get_partition_rules()
    replace = _replacement_rules(rules)
    initd = {k: _unmatched for k in flatten_dict(in_dict)}
@@ -63,5 +63,14 @@ def set_partitions(in_dict):
    for k, v in result.items():
        if v == _unmatched:
            print(f"Unmatched -> {k}")
+    l = list(result.keys())
+    if use_scan:
+        # add None dimension to scanned layers
+        result = {
+            k: (P(*(None,) + v) if v is not None else None)
+            if any(x in k for x in ["FlaxBartEncoderLayers", "FlaxBartDecoderLayers"])
+            else v
+            for k, v in result.items()
+        }
    assert _unmatched not in result.values(), "Incomplete partition spec."
    return freeze(unflatten_dict(result))
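
The comprehension added above just prepends a None dimension to the PartitionSpec of every scanned parameter, since nn.scan gives those parameters a leading layer axis that should stay unsharded. A small illustration (the "mp" axis name here is only an example of a model-parallel mesh axis):

from jax.experimental import PartitionSpec as P  # same import used by tools/train/train.py

kernel_spec = P("mp", None)               # spec of a single, unscanned kernel
scanned_spec = P(*(None,) + kernel_spec)  # PartitionSpec subclasses tuple
print(scanned_spec)                       # PartitionSpec(None, 'mp', None)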
tools/train/config/mega/config.json
CHANGED

@@ -7,14 +7,14 @@
  "decoder_attention_heads": 32,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
-  "decoder_layers":
+  "decoder_layers": 26,
  "decoder_start_token_id": 16384,
  "do_sample": true,
  "dropout": 0.0,
  "encoder_attention_heads": 32,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
-  "encoder_layers":
+  "encoder_layers": 26,
  "encoder_vocab_size": 50272,
  "eos_token_id": 16385,
  "force_ln_scale": false,
tools/train/train.py
CHANGED

@@ -42,6 +42,7 @@ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.serialization import from_bytes, to_bytes
from flax.training import train_state
from flax.training.common_utils import onehot
+from jax import ShapeDtypeStruct
from jax.experimental import PartitionSpec, maps
from jax.experimental.compilation_cache import compilation_cache as cc
from jax.experimental.pjit import pjit, with_sharding_constraint
@@ -531,6 +532,54 @@ class TrainState(train_state.TrainState):
    train_time: float = 0.0 # total time the model trained
    train_samples: int = 0 # number of samples seen

+    def apply_gradients(self, *, grads, **kwargs):
+        params = self.unscan(self.params)
+        updates, new_opt_state = self.tx.update(
+            self.unscan(grads), self.opt_state, params
+        )
+        params = optax.apply_updates(params, updates)
+        return self.replace(
+            step=self.step + 1,
+            params=self.rescan(params),
+            opt_state=new_opt_state,
+            **kwargs,
+        )
+
+    @classmethod
+    def create(cls, *, apply_fn, params, tx, **kwargs):
+        opt_state = tx.init(cls.unscan(params))
+        return cls(
+            step=0,
+            apply_fn=apply_fn,
+            params=params,
+            tx=tx,
+            opt_state=opt_state,
+            **kwargs,
+        )
+
+    @staticmethod
+    def unscan(params):
+        params = unfreeze(params)
+        for l in ["encoder", "decoder"]:
+            params["model"][l]["layers"] = jax.tree_map(
+                lambda x: {f"{i}": x[i] for i in range(len(x))},
+                params["model"][l]["layers"],
+            )
+        params = freeze(params)
+        return params
+
+    @staticmethod
+    def rescan(params):
+        params = unfreeze(params)
+        for l in ["encoder", "decoder"]:
+            params["model"][l]["layers"] = jax.tree_map(
+                lambda x: jnp.stack([x[f"{i}"] for i in range(len(x))]),
+                params["model"][l]["layers"],
+                is_leaf=lambda x: "0" in x,
+            )
+        params = freeze(params)
+        return params
+

def main():
    # See all possible arguments by passing the --help flag to this script.
@@ -618,7 +667,7 @@ def main():
    model_metadata = model_args.get_metadata()

    # get PartitionSpec for model params (required to be a dict)
-    param_spec = set_partitions(model.params)
+    param_spec = set_partitions(model.params, model.config.use_scan)

    # convert params to frozen dict
    model._params = freeze(model.params)
@@ -743,6 +792,23 @@ def main():

    learning_rate_fn = create_learning_rate_fn()

+    # reshape params to split scanned layers for optimizers
+    if model.config.use_scan:
+        params_struct = unfreeze(model.params)
+        for l in ["encoder", "decoder"]:
+            params_struct["model"][l]["layers"] = jax.tree_map(
+                lambda x: {
+                    f"{i}": ShapeDtypeStruct(shape=x.shape[1:], dtype=x.dtype)
+                    for i in range(len(x))
+                },
+                params_struct["model"][l]["layers"],
+            )
+        params_struct = freeze(params_struct)
+
+    else:
+        params_struct = model.params
+    opt_param_spec = set_partitions(params_struct, False)
+
    # create adam optimizer
    if training_args.optim == "distributed_shampoo":
        # parameters from https://github.com/tensorflow/lingvo/blob/03ee9d7cd50764b0424c7c863733c91fc0b053ec/lingvo/jax/optimizers.py#L729
@@ -795,10 +861,12 @@ def main():
        )
        # get the real optimizer and helper functions
        update_fn = optimizer.update
+
+        optimizer = optimizer.init(params_struct)
        opt_fn = NamedTuple("opt_fn", pspec_fn=Any, shape_and_dtype_fn=Any)(
            optimizer.pspec_fn, optimizer.shape_and_dtype_fn
        )
+
        optimizer = optax.GradientTransformation(optimizer.init_fn, update_fn)

    elif training_args.optim == "adam":
@@ -819,7 +887,7 @@ def main():
    # get PartitionSpec for optimizer state
    def get_opt_state_spec_and_shape(param_spec):
        # get opt_state shape without actual init
-        opt_state_shape = jax.eval_shape(optimizer.init,
+        opt_state_shape = jax.eval_shape(optimizer.init, params_struct)

        if training_args.optim == "adam":

@@ -844,7 +912,7 @@ def main():

        elif training_args.optim == "distributed_shampoo":
            opt_state_spec = opt_fn.pspec_fn(
-                params=
+                params=params_struct,
                params_partition_spec=param_spec,
                partition_spec_for_statistics=PartitionSpec(None, "dp", None),
            )
@@ -852,7 +920,7 @@ def main():
            raise NotImplementedError
        return opt_state_spec, opt_state_shape

-    opt_state_spec, opt_state_shape = get_opt_state_spec_and_shape(
+    opt_state_spec, opt_state_shape = get_opt_state_spec_and_shape(opt_param_spec)

    # create a mesh
    mesh_shape = (training_args.dp_devices, training_args.mp_devices)
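
The unscan/rescan helpers on TrainState exist because scanned parameters are stored stacked along a leading layer axis, while the optimizer and its partition spec are built against one entry per layer. A standalone sketch of the round trip on a toy tree (not the real DalleBart parameter structure):

import jax
import jax.numpy as jnp

# toy stand-in for params["model"]["encoder"]["layers"]: 4 scanned layers
stacked = {"kernel": jnp.zeros((4, 16, 16)), "bias": jnp.zeros((4, 16))}

# unscan: split the leading layer axis into a dict of per-layer arrays
unscanned = jax.tree_map(
    lambda x: {f"{i}": x[i] for i in range(len(x))}, stacked
)
assert unscanned["kernel"]["0"].shape == (16, 16)

# rescan: stack the per-layer arrays back into one array with a layer axis
rescanned = jax.tree_map(
    lambda x: jnp.stack([x[f"{i}"] for i in range(len(x))]),
    unscanned,
    is_leaf=lambda x: isinstance(x, dict) and "0" in x,
)
assert rescanned["kernel"].shape == (4, 16, 16)

In the training script the same per-layer view is built from jax.ShapeDtypeStruct leaves, so the optimizer shapes can be derived with jax.eval_shape without materializing a second copy of the parameters.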