Warn about megablocks more clearly and less often (#20)
- warn about megablocks more clearly and less often (8a4d4d9a7f96bf4ffe71c72251432824ebfd90d4)
Co-authored-by: Cebtenzzre <[email protected]>
modeling_hf_nomic_bert.py CHANGED (+11 -6)
@@ -3,13 +3,15 @@
 # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
 # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
 
+# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
+
 import collections
+import inspect
 import logging
-
-# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
 import math
 import os
 import re
+import warnings
 from collections import OrderedDict
 from functools import partial
 from typing import List, Optional, Tuple, Union
@@ -54,8 +56,9 @@ try:
     from megablocks.layers import dmoe
     from megablocks.layers.arguments import Arguments
 except ImportError:
-    logger.warning("!!!!!!!!!!!!megablocks not available, using torch.matmul instead")
     dmoe = None
+else:
+    dmoe_is_nomic = 'attention_mask' in inspect.signature(dmoe.dMoE.forward).parameters
 
 
 
@@ -1612,7 +1615,7 @@ class NomicBertBlock(NomicBertPreTrainedModel):
         )
         self.moe = moe
         if moe:
-            if dmoe is not None:
+            if dmoe is not None and dmoe_is_nomic:
                 megablocks_args = Arguments(
                     moe_num_experts=config.num_experts,
                     moe_top_k=config.moe_top_k,
@@ -1628,6 +1631,8 @@ class NomicBertBlock(NomicBertPreTrainedModel):
                 )
                 self.mlp = dmoe.dMoE(megablocks_args)
             else:
+                warnings.warn("Install Nomic's megablocks fork for better speed: " +
+                              "`pip install git+https://github.com/nomic-ai/megablocks.git`")
                 self.mlp = NomicMoELayer(
                     config
                 )
@@ -1698,7 +1703,7 @@ class NomicBertBlock(NomicBertPreTrainedModel):
             residual = (dropped + residual) if residual is not None else dropped
             hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
             if self.moe:
-                hidden_states = self.mlp(hidden_states, torch.where(attention_mask.squeeze() == 0, 1, 0))
+                hidden_states = self.mlp(hidden_states, attention_mask=torch.where(attention_mask.squeeze() == 0, 1, 0))
             else:
                 hidden_states = self.mlp(hidden_states)
 
@@ -1715,7 +1720,7 @@ class NomicBertBlock(NomicBertPreTrainedModel):
             )
             hidden_states = self.norm1((self.dropout1(attn_outputs) + hidden_states).to(dtype=self.norm1.weight.dtype))
             if self.moe:
-                mlp_out = self.mlp(hidden_states, torch.where(attention_mask.squeeze() == 0, 1, 0))
+                mlp_out = self.mlp(hidden_states, attention_mask=torch.where(attention_mask.squeeze() == 0, 1, 0))
             else:
                 mlp_out = self.mlp(hidden_states)
 
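The new `else:` branch after the import guard is a small piece of feature detection: Nomic's megablocks fork adds an `attention_mask` parameter to `dMoE.forward`, so inspecting the signature tells the model whether the installed megablocks can accept the mask, without parsing version strings. A minimal, self-contained sketch of the same technique (the `UpstreamMoE` and `NomicStyleMoE` classes below are hypothetical stand-ins, not real megablocks classes):

import inspect

class UpstreamMoE:
    # Upstream-style forward: no mask argument.
    def forward(self, x):
        return x

class NomicStyleMoE:
    # Fork-style forward: accepts an attention mask.
    def forward(self, x, attention_mask=None):
        return x

def accepts_attention_mask(moe_cls) -> bool:
    # Same check as dmoe_is_nomic: look for the parameter by name.
    return "attention_mask" in inspect.signature(moe_cls.forward).parameters

print(accepts_attention_mask(UpstreamMoE))    # False
print(accepts_attention_mask(NomicStyleMoE))  # True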
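The call sites now pass the mask as a keyword argument (`attention_mask=...`), which only the fork's signature accepts; when the signature check fails, the constructor falls back to `NomicMoELayer` and emits the new warning instead. The `torch.where(attention_mask.squeeze() == 0, 1, 0)` expression converts the block's mask into a binary 0/1 mask. A small sketch of that conversion, assuming a BERT-style additive mask of shape [batch, 1, 1, seq] with 0 for real tokens and a large negative value for padding (an assumption about the mask convention, not something this diff states):

import torch

# Hypothetical additive mask for one sequence of length 4:
# 0.0 -> attend, -10000.0 -> padding (common BERT-style convention).
additive_mask = torch.tensor([[[[0.0, 0.0, -10000.0, -10000.0]]]])

# Same expression as in the diff: 1 where the mask is exactly 0 (real tokens),
# 0 everywhere else (padding).
binary_mask = torch.where(additive_mask.squeeze() == 0, 1, 0)
print(binary_mask)  # tensor([1, 1, 0, 0])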