Uploading patch

modeling_gpt_bert.py (changed: +67 -3)
@@ -17,6 +17,58 @@ from transformers.modeling_outputs import (
 from typing import Optional, Union
 
 
+# From https://github.com/epfml/DenseFormer
+class InPlaceSetSlice(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, full_tensor, last_slice, x_idx, x_val):
+        full_tensor[x_idx] = x_val
+        ctx.x_idx = x_idx
+        ret = torch.Tensor().to(full_tensor.device)
+        ret.set_(full_tensor[:x_idx + 1])
+        return ret
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        if ctx.x_idx == 0:
+            return None, None, None, grad_out[ctx.x_idx]
+        else:
+            return None, grad_out[:ctx.x_idx], None, grad_out[ctx.x_idx]
+
+
+def apply_inplace_set(x_acc, x_idx, x_val):
+    full_tensor, last_slice = x_acc
+    new_slice = InPlaceSetSlice.apply(full_tensor, last_slice, x_idx, x_val)
+    return full_tensor, new_slice
+
+
+class DWAModules(torch.nn.Module):
+    def __init__(self, hidden_size, n_blocks):
+        super().__init__()
+        self.n_blocks = n_blocks
+        self.alphas = nn.ParameterList([nn.Parameter(torch.zeros(i + 2)) for i in range(n_blocks)])
+        self.accumulator = None
+        self._init_weights()
+
+    def _init_weights(self):
+        for module in self.alphas:
+            module.data.zero_()
+            module.data[-1] = 1.0
+
+    def init_accumulator(self, x):
+        self.accumulator = (torch.zeros((self.n_blocks + 1, *x.shape), device=x.device, dtype=x.dtype), None)
+        self.accumulator = apply_inplace_set(self.accumulator, 0, x)
+
+    def forward(self, x, block_idx):
+        assert self.accumulator is not None, "`init_accumulator(x)` needs to be called first"
+        self.accumulator = apply_inplace_set(
+            self.accumulator,
+            block_idx + 1,
+            x
+        )
+        x = torch.tensordot(self.alphas[block_idx], self.accumulator[1], dims=1)
+        return x
+
+
 class Layer(nn.Module):
 
     def __init__(self: Layer, config: ModelConfig, layer_idx: int = 0):
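The hunk above vendors the depth-weighted-average (DWA) machinery from DenseFormer (https://github.com/epfml/DenseFormer): InPlaceSetSlice writes each block's output into a pre-allocated buffer while its custom backward still routes gradients to every earlier entry, apply_inplace_set threads that buffer through the forward pass, and DWAModules holds one learnable mixing vector per block. Below is a minimal usage sketch, assuming the three definitions above are pasted into the same script; the sizes and the toy block update are invented for illustration and are not part of the patch.

import torch
import torch.nn as nn  # DWAModules needs nn.Parameter / nn.ParameterList in scope

# Hypothetical sizes, chosen only for this demo.
hidden_size, n_blocks, seq_len, batch = 16, 4, 5, 2

dwa = DWAModules(hidden_size, n_blocks)
x0 = torch.randn(seq_len, batch, hidden_size, requires_grad=True)

# Allocates the (n_blocks + 1, seq_len, batch, hidden_size) buffer once and
# stores the initial embeddings at index 0.
dwa.init_accumulator(x0)

x = x0
for block_idx in range(n_blocks):
    x = x + 0.1 * torch.tanh(x)   # stand-in for an attention / MLP sub-block
    x = dwa(x, block_idx)         # write x into the buffer, return the learned mix

# Gradients reach the initial input through InPlaceSetSlice.backward, even
# though the buffer itself was filled in place.
x.sum().backward()
print(x0.grad.shape)              # torch.Size([5, 2, 16])

The point of the custom autograd function is that the accumulator tensor is allocated once up front instead of being re-stacked at every block, while backward still hands grad_out[:x_idx] to all earlier entries.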
@@ -284,7 +336,14 @@ class GPTBERT(GPTBERTPreTrainedModel):
         self.hidden_size = config.hidden_size
 
         self.embedding = Embedding(config)
-        self.
+        self.attention_layers = nn.ModuleList([Attention(config) for _ in range(config.num_layers)])
+        self.mlp_layers = nn.ModuleList([FeedForward(config) for _ in range(config.num_layers)])
+        self.dwa_modules = DWAModules(config.hidden_size, config.num_hidden_layers * 2)
+
+        for i, layer in enumerate(self.mlp_layers):
+            layer.mlp[1].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
+            layer.mlp[-2].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
+
         self.is_causal = is_causal
 
     def get_input_embeddings(self):
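In the constructor, the former single layer stack (the removed, truncated `self.` line above) is split into separate attention and feed-forward module lists sized by config.num_layers, plus one DWAModules instance with config.num_hidden_layers * 2 mixing steps, i.e. two DWA steps per transformer layer (one after attention, one after the MLP). The loop then rescales two linear weights inside each feed-forward block (mlp[1] and mlp[-2], presumably the up- and down-projections of its mlp stack) by sqrt(1 / (2 * (i + 1))), so deeper layers start out contributing less to the residual stream, in the spirit of GPT-2's depth-scaled initialization. A small sketch of that schedule only; the 12-layer depth is a made-up example, not read from the real config.

import math

num_layers = 12  # hypothetical depth, not taken from the config

# Multiplier applied to layer i's mlp[1] / mlp[-2] weights in the patch.
factors = [math.sqrt(1.0 / (2.0 * (1 + i))) for i in range(num_layers)]
print([round(f, 3) for f in factors[:4]])   # [0.707, 0.5, 0.408, 0.354]

One thing to note: the module lists are sized with config.num_layers while the DWA module count uses config.num_hidden_layers; if those two config fields could ever differ, the block indices used in forward would fall out of range, so they are presumably aliases for the same value.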
@@ -316,8 +375,13 @@ class GPTBERT(GPTBERTPreTrainedModel):
         static_embeddings, relative_embeddings = self.embedding(input_ids.t())
         contextualized_embeddings = [static_embeddings]
         attention_probs = []
-
-
+        self.dwa_modules.init_accumulator(static_embeddings)
+        for i, (attention_layer, mlp_layer) in enumerate(zip(self.attention_layers, self.mlp_layers)):
+            attention, layer_attention_probs = attention_layer(contextualized_embeddings[-1], attention_mask, relative_embeddings)
+            layer_embeddings = contextualized_embeddings[-1] + attention
+            layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2)
+            layer_embeddings += mlp_layer(layer_embeddings)
+            layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2 + 1)
             contextualized_embeddings.append(layer_embeddings)
             attention_probs.append(layer_attention_probs)
         contextualized_embeddings = [emb.transpose(0, 1) for emb in contextualized_embeddings]
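The patched forward pass seeds the accumulator with the static embeddings and then runs the attention and MLP sub-blocks explicitly, replacing the hidden state after each residual addition with a learned average over everything written to the accumulator so far (block indices i * 2 and i * 2 + 1). Because _init_weights sets every alphas[k] to [0, ..., 0, 1], each DWA step starts out as the identity and simply returns the newest hidden state; only training moves the coefficients toward actually mixing earlier layers back in. A quick check of that initialization property, with made-up shapes:

import torch

alphas = torch.zeros(4)
alphas[-1] = 1.0                          # the DWAModules._init_weights pattern
states = torch.randn(4, 5, 2, 16)         # (entries written so far, seq, batch, hidden)

mixed = torch.tensordot(alphas, states, dims=1)
print(torch.allclose(mixed, states[-1]))  # True: the newest state passes through unchanged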