Add main & ema weights for srp

Browse files

Files changed (13) hide show

README.md +73 -0
config.json +33 -0
configuration_gpt_bert.py +30 -0
model.safetensors +3 -0
model_ema.safetensors +3 -0
modeling_gpt_bert.py +630 -0
original_project_config.json +16 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
srp-2gpu-100steps.bin +3 -0
srp-2gpu-100steps_ema.bin +3 -0
tokenizer.json +0 -0
tokenizer_config.json +141 -0

README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+---
+library_name: transformers
+pipeline_tag: fill-mask
+tags: [gpt-bert, babylm, remote-code]
+license: other
+---
+# jumelet/gptbert-srp-100steps-small
+GPT-BERT-style BabyBabelLM model for Serbian (ISO 639-3 language code **srp**).
+This repository includes both the *main* weights and their *EMA* (exponential moving average) variant.
+**Default variant exposed to generic loaders:** `ema`
+## Variants Available
+ema, main
+## Files
+- model.safetensors (alias of default variant)
+- model_ema.safetensors
+- pytorch_model.bin (legacy PyTorch format)
+- srp-2gpu-100steps.bin (raw training checkpoint)
+- srp-2gpu-100steps_ema.bin (raw training checkpoint)
+## Configuration
+```json
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "intermediate_size": 1280,
+  "max_position_embeddings": 512,
+  "position_bucket_size": 32,
+  "num_attention_heads": 6,
+  "num_hidden_layers": 12,
+  "vocab_size": 8192,
+  "layer_norm_eps": 1e-05,
+  "force_causal_mask": true,
+  "classifier_dropout": 0.1,
+  "classifier_layer_norm_eps": 1e-05,
+  "num_labels": 2
+}
+```
+Tokenizer file: `tokenizer_srp_vs8192.json`
+## Quick Usage
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+model_id = 'jumelet/gptbert-srp-100steps-small'
+tok = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForMaskedLM.from_pretrained(model_id, trust_remote_code=True)
+out = model(**tok('Hello world', return_tensors='pt'))
+```
+### Forced Causal Attention
+Causal attention is enforced during inference by applying a triangular future mask inside the remote code.
+This prevents the hybrid GPT-BERT layers from attending to future tokens even when a bidirectional mask is provided.
+### Sequence Classification
+`GPTBertForSequenceClassification` mirrors the original GLUE classifier head for downstream fine-tuning.
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+model_id = 'jumelet/gptbert-srp-100steps-small'
+tok = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForSequenceClassification.from_pretrained(model_id, trust_remote_code=True)
+outputs = model(**tok('This movie was great!', return_tensors='pt'))
+print(outputs.logits)
+```
+## Notes
+- Converted on 2025-10-04T22:22:24.212092+00:00
+- Weights are the exact trained parameters; no new layers were initialized.
+- Requires `trust_remote_code=True` due to custom architecture.

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "architectures": [
+    "GPTBertForMaskedLM",
+    "GPTBertForCausalLM",
+    "GPTBertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_gpt_bert.GPTBertConfig",
+    "AutoModel": "modeling_gpt_bert.GPTBertForMaskedLM",
+    "AutoModelForCausalLM": "modeling_gpt_bert.GPTBertForCausalLM",
+    "AutoModelForMaskedLM": "modeling_gpt_bert.GPTBertForMaskedLM",
+    "AutoModelForSequenceClassification": "modeling_gpt_bert.GPTBertForSequenceClassification"
+  },
+  "bos_token_id": 1,
+  "classifier_dropout": 0.1,
+  "classifier_layer_norm_eps": 1e-05,
+  "eos_token_id": 2,
+  "force_causal_mask": true,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "intermediate_size": 1280,
+  "layer_norm_eps": 1e-05,
+  "mask_token_id": 4,
+  "max_position_embeddings": 512,
+  "model_type": "gpt_bert",
+  "num_attention_heads": 6,
+  "num_hidden_layers": 12,
+  "num_labels": 2,
+  "pad_token_id": 3,
+  "position_bucket_size": 32,
+  "vocab_size": 8192
+}

configuration_gpt_bert.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from transformers import PretrainedConfig
+class GPTBertConfig(PretrainedConfig):
+    model_type = 'gpt_bert'
+    def __init__(self, **kwargs):
+        self.attention_probs_dropout_prob = kwargs.pop('attention_probs_dropout_prob', 0.1)
+        self.hidden_dropout_prob = kwargs.pop('hidden_dropout_prob', 0.1)
+        self.hidden_size = kwargs.pop('hidden_size', 768)
+        self.intermediate_size = kwargs.pop('intermediate_size', 2560)
+        self.max_position_embeddings = kwargs.pop('max_position_embeddings', 512)
+        self.position_bucket_size = kwargs.pop('position_bucket_size', 32)
+        self.num_attention_heads = kwargs.pop('num_attention_heads', 12)
+        self.num_hidden_layers = kwargs.pop('num_hidden_layers', 12)
+        self.vocab_size = kwargs.pop('vocab_size', 16384)
+        self.layer_norm_eps = kwargs.pop('layer_norm_eps', 1e-5)
+        self.force_causal_mask = kwargs.pop('force_causal_mask', True)
+        self.classifier_dropout = kwargs.pop('classifier_dropout', 0.1)
+        self.classifier_layer_norm_eps = kwargs.pop('classifier_layer_norm_eps', 1e-05)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.problem_type = kwargs.pop('problem_type', None)
+        self.auto_map = {
+    'AutoConfig': 'configuration_gpt_bert.GPTBertConfig',
+    'AutoModel': 'modeling_gpt_bert.GPTBertForMaskedLM',
+    'AutoModelForCausalLM': 'modeling_gpt_bert.GPTBertForCausalLM',
+    'AutoModelForMaskedLM': 'modeling_gpt_bert.GPTBertForMaskedLM',
+    'AutoModelForSequenceClassification': 'modeling_gpt_bert.GPTBertForSequenceClassification',
+        }
+        super().__init__(**kwargs)

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:907c55a08c0d03f8fa0292d7323f69e35539490bbf63bcc5d09e52c30f882bfd
+size 157333928

model_ema.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:907c55a08c0d03f8fa0292d7323f69e35539490bbf63bcc5d09e52c30f882bfd
+size 157333928

modeling_gpt_bert.py ADDED Viewed

	@@ -0,0 +1,630 @@

+# Original training architecture (verbatim)
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import _softmax_backward_data as _softmax_backward_data
+class Bert(nn.Module):
+    """Training-time model: embedding, encoder stack and a masked-token classifier.
+
+    The classifier's output projection reuses the word-embedding matrix that is
+    passed to ``MaskClassifier``.
+    """
+    def __init__(self, config, activation_checkpointing=False):
+        super().__init__()
+        self.embedding = Embedding(config)
+        self.transformer = Encoder(config, activation_checkpointing)
+        self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight)
+    def get_contextualized(self, input_ids, attention_mask):
+        """Embed ``input_ids`` and run the encoder; returns the encoder output."""
+        static_embeddings, relative_embedding = self.embedding(input_ids)
+        # A singleton dimension is inserted at position 1 of the mask before
+        # it is handed to the attention layers.
+        contextualized_embeddings = self.transformer(static_embeddings, attention_mask.unsqueeze(1), relative_embedding)
+        return contextualized_embeddings
+    def forward(self, input_ids, attention_mask, masked_lm_labels, num_masked=None, ratio=None):
+        """Compute the training losses.
+
+        Positions whose label is ``-100`` are ignored.
+
+        When ``num_masked`` is ``None`` a single objective is computed and the
+        return value is ``(loss, accuracy, z_loss, num_tokens)``.
+
+        Otherwise the inputs are split at index ``num_masked`` along dim 1 into
+        a "masked" part and a "causal" part, and the return value is
+        ``(loss, masked_loss, causal_loss, accuracy, masked_accuracy,
+        causal_accuracy, z_loss, num_tokens)`` where the combined values are
+        ``ratio * masked + (1 - ratio) * causal``.
+        """
+        contextualized_embeddings = self.get_contextualized(input_ids, attention_mask)
+        if num_masked is None:
+            subword_prediction = self.classifier(contextualized_embeddings, masked_lm_labels, num_masked)
+            gold_labels = masked_lm_labels.flatten()
+            gold_labels = gold_labels[gold_labels != -100]
+            loss = F.cross_entropy(subword_prediction, gold_labels, reduction="none").mean()
+            # z-loss: mean squared log-partition (logsumexp) of the logits.
+            z_loss = torch.logsumexp(subword_prediction, dim=-1).pow(2).mean()
+            with torch.no_grad():
+                accuracy = (subword_prediction.argmax(-1) == gold_labels).float().mean()
+            num_tokens = gold_labels.size(0)
+            return loss, accuracy, z_loss, num_tokens
+        else:
+            masked_subword_prediction, causal_subword_prediction = self.classifier(contextualized_embeddings, masked_lm_labels, num_masked)
+            # The classifier returns None for a part that is empty along dim 1;
+            # that part then contributes zeros to every statistic.
+            if masked_subword_prediction is not None:
+                masked_gold_labels = masked_lm_labels[:, :num_masked].flatten()
+                masked_gold_labels = masked_gold_labels[masked_gold_labels != -100]
+                masked_loss = F.cross_entropy(masked_subword_prediction, masked_gold_labels)
+                masked_z_loss = torch.logsumexp(masked_subword_prediction, dim=-1).pow(2).mean()
+                with torch.no_grad():
+                    masked_accuracy = (masked_subword_prediction.argmax(-1) == masked_gold_labels).float().mean()
+                num_masked_tokens = masked_gold_labels.size(0)
+            else:
+                masked_loss = 0.0
+                masked_z_loss = 0.0
+                masked_accuracy = 0.0
+                num_masked_tokens = 0
+            if causal_subword_prediction is not None:
+                causal_gold_labels = masked_lm_labels[:, num_masked:].flatten()
+                causal_gold_labels = causal_gold_labels[causal_gold_labels != -100]
+                causal_loss = F.cross_entropy(causal_subword_prediction, causal_gold_labels)
+                causal_z_loss = torch.logsumexp(causal_subword_prediction, dim=-1).pow(2).mean()
+                with torch.no_grad():
+                    causal_accuracy = (causal_subword_prediction.argmax(-1) == causal_gold_labels).float().mean()
+                num_causal_tokens = causal_gold_labels.size(0)
+            else:
+                causal_loss = 0.0
+                causal_z_loss = 0.0
+                causal_accuracy = 0.0
+                num_causal_tokens = 0
+            # NOTE(review): `ratio` defaults to None but is required on this
+            # path; callers are expected to pass it together with `num_masked`.
+            loss = ratio * masked_loss + (1 - ratio) * causal_loss
+            z_loss = ratio * masked_z_loss + (1 - ratio) * causal_z_loss
+            with torch.no_grad():
+                accuracy = ratio * masked_accuracy + (1 - ratio) * causal_accuracy
+            num_tokens = num_masked_tokens + num_causal_tokens
+            return loss, masked_loss, causal_loss, accuracy, masked_accuracy, causal_accuracy, z_loss, num_tokens
+# From https://github.com/epfml/DenseFormer
+class InPlaceSetSlice(torch.autograd.Function):
+    """Write one slice into a preallocated buffer and return a view of the filled prefix.
+
+    ``forward(full_tensor, last_slice, x_idx, x_val)`` stores ``x_val`` at
+    ``full_tensor[x_idx]`` in place and returns a tensor that shares storage
+    with ``full_tensor[:x_idx + 1]``.
+    """
+    @staticmethod
+    def forward(ctx, full_tensor, last_slice, x_idx, x_val):
+        full_tensor[x_idx] = x_val
+        ctx.x_idx = x_idx
+        # Build an empty tensor on the right device and point it at the
+        # storage of the filled prefix via `set_` (no copy is made here).
+        ret = torch.Tensor().to(full_tensor.device)
+        ret.set_(full_tensor[:x_idx + 1])
+        return ret
+    @staticmethod
+    def backward(ctx, grad_out):
+        # Gradients are returned in the order of forward's inputs:
+        # (full_tensor, last_slice, x_idx, x_val). The buffer and the index
+        # never receive a gradient; the previous prefix receives the leading
+        # rows of grad_out and the newly written value receives row x_idx.
+        if ctx.x_idx == 0:
+            return None, None, None, grad_out[ctx.x_idx]
+        else:
+            return None, grad_out[:ctx.x_idx], None, grad_out[ctx.x_idx]
+def apply_inplace_set(x_acc, x_idx, x_val):
+    full_tensor, last_slice = x_acc
+    new_slice = InPlaceSetSlice.apply(full_tensor, last_slice, x_idx, x_val)
+    return full_tensor, new_slice
+class DWAModules(torch.nn.Module):
+    def __init__(self, hidden_size, n_blocks):
+        super().__init__()
+        self.n_blocks = n_blocks
+        self.alphas = nn.ParameterList([nn.Parameter(torch.zeros(i + 2)) for i in range(n_blocks)])
+        self.accumulator = None
+        self._init_weights()
+    def _init_weights(self):
+        for module in self.alphas:
+            module.data.zero_()
+            module.data[-1] = 1.0
+    def init_accumulator(self, x):
+        self.accumulator = (torch.zeros((self.n_blocks + 1, *x.shape), device=x.device, dtype=x.dtype), None)
+        self.accumulator = apply_inplace_set(self.accumulator, 0, x)
+    def forward(self, x, block_idx):
+        assert self.accumulator is not None, "`init_accumulator(x)` needs to be called first"
+        self.accumulator = apply_inplace_set(
+            self.accumulator,
+            block_idx + 1,
+            x
+        )
+        x = torch.tensordot(self.alphas[block_idx], self.accumulator[1], dims=1)
+        return x
+class Encoder(nn.Module):
+    def __init__(self, config, activation_checkpointing=False):
+        super().__init__()
+        self.attention_layers = nn.ModuleList([Attention(config) for _ in range(config.num_hidden_layers)])
+        self.mlp_layers = nn.ModuleList([FeedForward(config) for _ in range(config.num_hidden_layers)])
+        self.dwa_modules = DWAModules(config.hidden_size, config.num_hidden_layers * 2)
+        for i, layer in enumerate(self.mlp_layers):
+            layer.mlp[1].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
+            layer.mlp[-2].weight.data *= math.sqrt(1.0 / (2.0 * (1 + i)))
+        self.activation_checkpointing = activation_checkpointing
+    def forward(self, x, attention_mask, relative_embedding):
+        self.dwa_modules.init_accumulator(x)
+        for i, (attention_layer, mlp_layer) in enumerate(zip(self.attention_layers, self.mlp_layers)):
+            x = x + attention_layer(x, attention_mask, relative_embedding)
+            x = self.dwa_modules(x, block_idx=i * 2)
+            x = x + mlp_layer(x)
+            x = self.dwa_modules(x, block_idx=i * 2 + 1)
+        return x
+class MaskClassifier(nn.Module):
+    def __init__(self, config, subword_embedding):
+        super().__init__()
+        self.nonlinearity = nn.Sequential(
+            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.GELU(),
+            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
+            nn.Dropout(config.hidden_dropout_prob),
+            nn.Linear(subword_embedding.size(1), subword_embedding.size(0))
+        )
+        self.initialize(config.hidden_size, subword_embedding)
+    def initialize(self, hidden_size, embedding):
+        std = math.sqrt(2.0 / (5.0 * hidden_size))
+        nn.init.trunc_normal_(self.nonlinearity[1].weight, mean=0.0, std=std, a=-2*std, b=2*std)
+        self.nonlinearity[-1].weight = embedding
+        self.nonlinearity[1].bias.data.zero_()
+        self.nonlinearity[-1].bias.data.zero_()
+    def forward(self, x, masked_lm_labels, num_masked=None):
+        if num_masked is None:
+            x = torch.index_select(x.flatten(0, 1), 0, torch.nonzero(masked_lm_labels.flatten() != -100).squeeze())
+            x = self.nonlinearity(x)
+            return x
+        else:
+            masked_x, causal_x = torch.tensor_split(x, (num_masked,), dim=1)
+            mntp_masked_lm_labels, causal_masked_lm_labels = torch.tensor_split(masked_lm_labels, (num_masked,), dim=1)
+            if masked_x.size(1) != 0:
+                masked_x = torch.index_select(masked_x.flatten(0, 1), 0, torch.nonzero(mntp_masked_lm_labels.flatten() != -100).squeeze())
+                masked_x = self.nonlinearity(masked_x)
+            else:
+                masked_x = None
+            if causal_x.size(1) != 0:
+                causal_x = torch.index_select(causal_x.flatten(0, 1), 0, torch.nonzero(causal_masked_lm_labels.flatten() != -100).squeeze())
+                causal_x = self.nonlinearity(causal_x)
+            else:
+                causal_x = None
+            return masked_x, causal_x
+class GeGLU(nn.Module):
+    def forward(self, x):
+        x, gate = x.chunk(2, dim=-1)
+        x = x * F.gelu(gate, approximate='tanh')
+        return x
+class FeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False),
+            nn.Linear(config.hidden_size, 2*config.intermediate_size, bias=False),
+            GeGLU(),
+            nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps, elementwise_affine=False),
+            nn.Linear(config.intermediate_size, config.hidden_size, bias=False),
+            nn.Dropout(config.hidden_dropout_prob)
+        )
+        self.initialize(config.hidden_size)
+    def initialize(self, hidden_size):
+        std = math.sqrt(2.0 / (5.0 * hidden_size))
+        nn.init.trunc_normal_(self.mlp[1].weight, mean=0.0, std=std, a=-2*std, b=2*std)
+        nn.init.trunc_normal_(self.mlp[-2].weight, mean=0.0, std=std, a=-2*std, b=2*std)
+    def forward(self, x):
+        return self.mlp(x)
+class MaskedSoftmax(torch.autograd.Function):
+    """Softmax over ``dim`` in which positions where ``mask`` is True get probability 0.
+
+    Note that the first argument of both static methods is the autograd
+    context object, even though it is named ``self`` here.
+    """
+    @staticmethod
+    def forward(self, x, mask, dim):
+        self.dim = dim
+        # Both fills are in place: the caller's score tensor is overwritten.
+        x.masked_fill_(mask, float('-inf'))
+        x = torch.softmax(x, self.dim)
+        # Reset masked positions to exactly 0.0 after the softmax.
+        x.masked_fill_(mask, 0.0)
+        self.save_for_backward(x)
+        return x
+    @staticmethod
+    def backward(self, grad_output):
+        output, = self.saved_tensors
+        # Uses the private torch helper imported at the top of the file.
+        inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype)
+        # No gradient for `mask` or `dim`.
+        return inputGrad, None, None
+class Attention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}")
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_size = config.hidden_size // config.num_attention_heads
+        self.in_proj_qk = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
+        self.in_proj_vg = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=True)
+        self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
+        self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
+        position_indices = torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(1) \
+            - torch.arange(config.max_position_embeddings, dtype=torch.long).unsqueeze(0)
+        position_indices = self.make_log_bucket_position(position_indices, config.position_bucket_size, config.max_position_embeddings)
+        position_indices = config.position_bucket_size - 1 + position_indices
+        self.register_buffer("position_indices", position_indices, persistent=True)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.scale = 1.0 / math.sqrt(3 * self.head_size)
+        self.initialize()
+    def make_log_bucket_position(self, relative_pos, bucket_size, max_position):
+        sign = torch.sign(relative_pos)
+        mid = bucket_size // 2
+        abs_pos = torch.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, torch.abs(relative_pos).clamp(max=max_position - 1))
+        log_pos = torch.ceil(torch.log(abs_pos / mid) / math.log((max_position-1) / mid) * (mid - 1)).int() + mid
+        bucket_pos = torch.where(abs_pos <= mid, relative_pos, log_pos * sign).long()
+        return bucket_pos
+    def initialize(self):
+        std = math.sqrt(2.0 / (5.0 * self.hidden_size))
+        nn.init.trunc_normal_(self.in_proj_qk.weight, mean=0.0, std=std, a=-2*std, b=2*std)
+        nn.init.trunc_normal_(self.in_proj_vg.weight, mean=0.0, std=std, a=-2*std, b=2*std)
+        nn.init.trunc_normal_(self.out_proj.weight, mean=0.0, std=std, a=-2*std, b=2*std)
+        self.in_proj_qk.bias.data.zero_()
+        self.in_proj_vg.bias.data.zero_()
+        self.out_proj.bias.data.zero_()
+    def forward(self, hidden_states, attention_mask, relative_embedding):
+        key_len, batch_size, _ = hidden_states.size()
+        query_len = key_len
+        if self.position_indices.size(0) < query_len:
+            position_indices = torch.arange(query_len, dtype=torch.long).unsqueeze(1) \
+                - torch.arange(query_len, dtype=torch.long).unsqueeze(0)
+            position_indices = self.make_log_bucket_position(position_indices, self.config.position_bucket_size, 512)
+            position_indices = self.config.position_bucket_size - 1 + position_indices
+            self.register_buffer("position_indices", position_indices.to(hidden_states.device), persistent=True)
+        hidden_states = self.pre_layer_norm(hidden_states)
+        query, key = self.in_proj_qk(hidden_states).chunk(2, dim=2)  # shape: [T, B, D]
+        value, gate = self.in_proj_vg(hidden_states).chunk(2, dim=2)  # shape: [T, B, D]
+        gate = F.gelu(gate)
+        pos = self.in_proj_qk(self.dropout(relative_embedding))  # shape: [2T-1, 2D]
+        pos = F.embedding(self.position_indices[:query_len, :key_len], pos)  # shape: [T, T, 2D]
+        query_pos, key_pos = pos.chunk(2, dim=-1)
+        query_pos = query_pos.view(query_len, key_len, self.num_heads, self.head_size)
+        key_pos = key_pos.view(query_len, key_len, self.num_heads, self.head_size)
+        query = query.reshape(query_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
+        key = key.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
+        value = value.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
+        attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)
+        query = query.view(batch_size, self.num_heads, query_len, self.head_size)
+        key = key.view(batch_size, self.num_heads, query_len, self.head_size)
+        attention_scores = attention_scores.view(batch_size, self.num_heads, query_len, key_len)
+        attention_scores.add_(torch.einsum("bhqd,qkhd->bhqk", query, key_pos * self.scale))
+        attention_scores.add_(torch.einsum("bhkd,qkhd->bhqk", key * self.scale, query_pos))
+        attention_probs = MaskedSoftmax.apply(attention_scores, attention_mask, -1)
+        attention_probs = self.dropout(attention_probs)
+        context = torch.bmm(attention_probs.flatten(0, 1), value)  # shape: [B*H, Q, D]
+        context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size)  # shape: [Q, B, H*D]
+        context = context * gate
+        context = self.post_layer_norm(context)
+        context = self.out_proj(context)
+        context = self.dropout(context)
+        return context
+class Embedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.word_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.relative_embedding = nn.Parameter(torch.empty(2 * config.position_bucket_size - 1, config.hidden_size))
+        self.relative_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.initialize()
+    def initialize(self):
+        std = math.sqrt(2.0 / (5.0 * self.hidden_size))
+        nn.init.trunc_normal_(self.relative_embedding, mean=0.0, std=std, a=-2*std, b=2*std)
+        nn.init.trunc_normal_(self.word_embedding.weight, mean=0.0, std=std, a=-2*std, b=2*std)
+    def forward(self, input_ids):
+        word_embedding = self.dropout(self.word_layer_norm(self.word_embedding(input_ids)))
+        relative_embeddings = self.relative_layer_norm(self.relative_embedding)
+        return word_embedding, relative_embeddings
+# HF wrappers that preserve state dict keys and behavior
+        from transformers import PreTrainedModel
+        from transformers.modeling_outputs import MaskedLMOutput, CausalLMOutputWithCrossAttentions, SequenceClassifierOutput
+        from .configuration_gpt_bert import GPTBertConfig
+        import torch
+        import torch.nn as nn
+        DEFAULT_FORCE_CAUSAL_MASK = True
+        EMIT_HIDDEN_STATES_DEFAULT = True
+        def _normalize_mask_tensor(mask):
+            if mask.dtype == torch.bool:
+                if mask.numel() == 0:
+                    return mask
+                true_fraction = mask.float().mean().item()
+                if true_fraction > 0.5:
+                    mask = ~mask
+            else:
+                mask = mask <= 0
+            return mask.to(torch.bool)
+        def _ensure_valid_rows(mask):
+            row_masked = mask.all(dim=-1)
+            if row_masked.any():
+                idx = row_masked.nonzero(as_tuple=False)
+                mask[idx[:, 0], idx[:, 1], idx[:, 1]] = False
+            return mask
+        def _build_future_causal_mask(batch_size, seq_len, device):
+            base = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device), diagonal=1)
+            return base.unsqueeze(0).expand(batch_size, -1, -1)
+        def _build_babylm_attention_mask(input_ids, attention_mask, force_causal=False):
+            batch_size, seq_len = input_ids.shape[:2]
+            device = input_ids.device
+            if attention_mask is None:
+                mask = torch.zeros(batch_size, seq_len, seq_len, dtype=torch.bool, device=device)
+            else:
+                mask = attention_mask
+                if mask.dim() == 0:
+                    mask = mask.unsqueeze(0)
+                if mask.dim() == 1:
+                    mask = mask.unsqueeze(0)
+                if mask.dim() == 2:
+                    mask = _normalize_mask_tensor(mask)
+                    mask = mask.unsqueeze(1) | mask.unsqueeze(2)
+                elif mask.dim() == 3:
+                    if mask.size(1) == 1 and mask.size(2) == seq_len:
+                        mask = _normalize_mask_tensor(mask.squeeze(1))
+                        mask = mask.unsqueeze(1) | mask.unsqueeze(2)
+                    elif mask.size(1) == seq_len and mask.size(2) == 1:
+                        mask = _normalize_mask_tensor(mask.squeeze(2))
+                        mask = mask.unsqueeze(1) | mask.unsqueeze(2)
+                    else:
+                        mask = _normalize_mask_tensor(mask)
+                elif mask.dim() == 4:
+                    if mask.size(1) == 1:
+                        mask = mask[:, 0]
+                    else:
+                        mask = mask.any(dim=1)
+                    mask = _normalize_mask_tensor(mask)
+                else:
+                    raise ValueError("Unsupported attention_mask dimensions: {}".format(mask.dim()))
+                mask = mask.to(device=device, dtype=torch.bool)
+                if mask.dim() == 2:
+                    mask = mask.unsqueeze(1) | mask.unsqueeze(2)
+                if mask.dim() != 3:
+                    raise ValueError("attention_mask must broadcast to a square matrix")
+                if mask.size(0) == 1 and batch_size > 1:
+                    mask = mask.expand(batch_size, -1, -1).clone()
+                elif mask.size(0) != batch_size:
+                    raise ValueError("attention_mask batch dimension {} does not match inputs {}".format(mask.size(0), batch_size))
+                rows = min(mask.size(1), seq_len)
+                cols = min(mask.size(2), seq_len)
+                if mask.size(1) != seq_len or mask.size(2) != seq_len:
+                    new_mask = torch.ones(batch_size, seq_len, seq_len, dtype=torch.bool, device=device)
+                    new_mask[:, :rows, :cols] = mask[:, :rows, :cols]
+                    mask = new_mask
+            if force_causal:
+                future_mask = _build_future_causal_mask(mask.size(0), seq_len, device)
+                mask = mask | future_mask
+            mask = _ensure_valid_rows(mask)
+            return mask.unsqueeze(1)
+        class GPTBertForMaskedLM(PreTrainedModel):
+            config_class = GPTBertConfig
+            base_model_prefix = 'gpt_bert'
+            def __init__(self, config: GPTBertConfig):
+                super().__init__(config)
+                self.model = Bert(config)
+                self.force_causal_mask = getattr(config, "force_causal_mask", DEFAULT_FORCE_CAUSAL_MASK)
+            def tie_weights(self):
+                try:
+                    self.model.classifier.nonlinearity[-1].weight = self.model.embedding.word_embedding.weight
+                except Exception:
+                    pass
+                return super().tie_weights()
+            def forward(self, input_ids, attention_mask=None, labels=None, output_hidden_states=None, return_dict=None):
+                output_hidden_states = output_hidden_states if output_hidden_states is not None else (self.config.output_hidden_states or EMIT_HIDDEN_STATES_DEFAULT)
+                return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+                mask_4d = _build_babylm_attention_mask(input_ids, attention_mask, force_causal=self.force_causal_mask)
+                static_embeddings, relative_embedding = self.model.embedding(input_ids)
+                if static_embeddings.dim() == 3 and static_embeddings.shape[0] == input_ids.shape[0]:
+                    static_embeddings = static_embeddings.transpose(0, 1)
+                contextualized = self.model.transformer(static_embeddings, mask_4d, relative_embedding)
+                hs = contextualized.transpose(0, 1)
+                B, S, H = hs.shape
+                flat = hs.reshape(B * S, H)
+                logits_flat = self.model.classifier.nonlinearity(flat)
+                vocab = logits_flat.size(-1)
+                logits = logits_flat.view(B, S, vocab)
+                loss = None
+                if labels is not None:
+                    loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+                    loss = loss_fct(logits.view(-1, vocab), labels.view(-1))
+                hidden_states = (hs,) if output_hidden_states else None
+                if not return_dict:
+                    outputs = (logits,)
+                    if hidden_states is not None:
+                        outputs = outputs + (hidden_states,)
+                    return ((loss,) + outputs) if loss is not None else outputs
+                return MaskedLMOutput(loss=loss, logits=logits, hidden_states=hidden_states)
+        class GPTBertForCausalLM(PreTrainedModel):
+            config_class = GPTBertConfig
+            base_model_prefix = 'gpt_bert'
+            def __init__(self, config: GPTBertConfig):
+                super().__init__(config)
+                self.model = Bert(config)
+                self.force_causal_mask = getattr(config, "force_causal_mask", DEFAULT_FORCE_CAUSAL_MASK)
+            def prepare_inputs_for_generation(self, input_ids, **kwargs):
+                return {'input_ids': input_ids, 'attention_mask': kwargs.get('attention_mask', None)}
+            def forward(self, input_ids, attention_mask=None, labels=None, output_hidden_states=None, return_dict=None):
+                output_hidden_states = output_hidden_states if output_hidden_states is not None else (self.config.output_hidden_states or EMIT_HIDDEN_STATES_DEFAULT)
+                return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+                mask_4d = _build_babylm_attention_mask(input_ids, attention_mask, force_causal=self.force_causal_mask)
+                static_embeddings, relative_embedding = self.model.embedding(input_ids)
+                if static_embeddings.dim() == 3 and static_embeddings.shape[0] == input_ids.shape[0]:
+                    static_embeddings = static_embeddings.transpose(0, 1)
+                contextualized = self.model.transformer(static_embeddings, mask_4d, relative_embedding)
+                hs = contextualized.transpose(0, 1)
+                B, S, H = hs.shape
+                flat = hs.reshape(B * S, H)
+                logits_flat = self.model.classifier.nonlinearity(flat)
+                vocab = logits_flat.size(-1)
+                logits = logits_flat.view(B, S, vocab)
+                loss = None
+                if labels is not None:
+                    shift_logits = logits[..., :-1, :].contiguous()
+                    shift_labels = labels[..., 1:].contiguous()
+                    loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+                    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+                hidden_states = (hs,) if output_hidden_states else None
+                if not return_dict:
+                    outputs = (logits,)
+                    if hidden_states is not None:
+                        outputs = outputs + (hidden_states,)
+                    return ((loss,) + outputs) if loss is not None else outputs
+                return CausalLMOutputWithCrossAttentions(loss=loss, logits=logits, hidden_states=hidden_states)
+class ClassifierHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.nonlinearity = nn.Sequential(
+            nn.LayerNorm(config.hidden_size, config.classifier_layer_norm_eps, elementwise_affine=False),
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.GELU(),
+            nn.LayerNorm(config.hidden_size, config.classifier_layer_norm_eps, elementwise_affine=False),
+            nn.Dropout(config.classifier_dropout),
+            nn.Linear(config.hidden_size, config.num_labels)
+        )
+    def forward(self, embeddings):
+        return self.nonlinearity(embeddings)
+class GPTBertForSequenceClassification(PreTrainedModel):
+    config_class = GPTBertConfig
+    base_model_prefix = 'gpt_bert'
+    def __init__(self, config: GPTBertConfig):
+        super().__init__(config)
+        self.model = Bert(config)
+        self.force_causal_mask = getattr(config, "force_causal_mask", DEFAULT_FORCE_CAUSAL_MASK)
+        self.sequence_classifier = ClassifierHead(config)
+    def forward(self, input_ids, attention_mask=None, labels=None, output_hidden_states=None, return_dict=None):
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else (self.config.output_hidden_states or EMIT_HIDDEN_STATES_DEFAULT)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        mask_4d = _build_babylm_attention_mask(input_ids, attention_mask, force_causal=self.force_causal_mask)
+        static_embeddings, relative_embedding = self.model.embedding(input_ids)
+        if static_embeddings.dim() == 3 and static_embeddings.shape[0] == input_ids.shape[0]:
+            static_embeddings = static_embeddings.transpose(0, 1)
+        contextualized = self.model.transformer(static_embeddings, mask_4d, relative_embedding)
+        hs = contextualized.transpose(0, 1)
+        pooled_output = hs[:, 0, :]
+        logits = self.sequence_classifier(pooled_output)
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            problem_type = self.config.problem_type
+            if problem_type is None:
+                if self.config.num_labels == 1:
+                    problem_type = "regression"
+                elif labels.dtype in (torch.long, torch.int):
+                    problem_type = "single_label_classification"
+                else:
+                    problem_type = "multilabel_classification"
+            if problem_type == "regression":
+                logits = logits.squeeze(-1)
+                loss_fct = nn.MSELoss()
+                loss = loss_fct(logits, labels.float())
+            elif problem_type == "single_label_classification":
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+            else:
+                loss_fct = nn.BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels.float())
+        hidden_states = (hs,) if output_hidden_states else None
+        if not return_dict:
+            outputs = (logits,)
+            if hidden_states is not None:
+                outputs = outputs + (hidden_states,)
+            return ((loss,) + outputs) if loss is not None else outputs
+        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states)

original_project_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "intermediate_size": 1280,
+  "max_position_embeddings": 512,
+  "position_bucket_size": 32,
+  "num_attention_heads": 6,
+  "num_hidden_layers": 12,
+  "vocab_size": 8192,
+  "layer_norm_eps": 1e-05,
+  "force_causal_mask": true,
+  "classifier_dropout": 0.1,
+  "classifier_layer_norm_eps": 1e-05,
+  "num_labels": 2
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:456bd4098409dacd6f23319f8498ac6a31314a0c9aec4c79d33a5564a28a9620
+size 144780150

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

srp-2gpu-100steps.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ab48a9deb18bacf94e1863768f0ec6cbdf82f74684dfd5521174ae7e0fcaf39
+size 144793266

srp-2gpu-100steps_ema.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4950ee251bc951bf752fcc532bf4e59dc74ea80994b86f6abb5bb8cf556896a7
+size 144793966

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,141 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<special_0>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<special_1>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<special_2>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<special_3>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<special_4>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<special_5>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<special_6>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<special_7>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<special_8>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<special_9>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<special_10>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}