lgcharpe committed (verified)
Commit b2a4d72 · 1 Parent(s): b9c1a62

Uploading patch

Files changed (1): modeling_gpt_bert.py (+3 -2)
modeling_gpt_bert.py CHANGED
@@ -310,6 +310,7 @@ class Embedding(nn.Module):
 class GPTBERTPreTrainedModel(PreTrainedModel):
     config_class = ModelConfig
     supports_gradient_checkpointing = False
+    base_model_prefix = "model"
 
     def _set_gradient_checkpointing(self, module, value=False):
         raise NotImplementedError("Gradient checkpointing is not supported by this model")
@@ -330,7 +331,7 @@ class GPTBERTPreTrainedModel(PreTrainedModel):
 
 class GPTBERT(GPTBERTPreTrainedModel):
 
-    def __init__(self, config: ModelConfig, is_causal: bool, **kwargs):
+    def __init__(self, config: ModelConfig, is_causal: bool = False, **kwargs):
         super().__init__(config, **kwargs)
         self.config = config
         self.hidden_size = config.hidden_size
@@ -380,7 +381,7 @@ class GPTBERT(GPTBERTPreTrainedModel):
             attention, layer_attention_probs = attention_layer(contextualized_embeddings[-1], attention_mask, relative_embeddings)
             layer_embeddings = contextualized_embeddings[-1] + attention
             layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2)
-            layer_embeddings += mlp_layer(layer_embeddings)
+            layer_embeddings = layer_embeddings + mlp_layer(layer_embeddings)
             layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2 + 1)
             contextualized_embeddings.append(layer_embeddings)
             attention_probs.append(layer_attention_probs)
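
Context for the third change (a sketch, not part of the commit): `+=` mutates the tensor object in place, so any other component that still holds a reference to it, such as the `dwa_modules` output produced on the previous line or a value autograd saved for the backward pass, would see the modified values, and in the autograd case this can raise a RuntimeError. The out-of-place `layer_embeddings = layer_embeddings + mlp_layer(layer_embeddings)` creates a new tensor and leaves shared references untouched. A minimal illustration with plain torch tensors (names here are illustrative, not from the repository):

import torch

x = torch.zeros(3)
stash = [x]              # some other component keeps a reference to the tensor

y = x
y = y + torch.ones(3)    # out-of-place add: allocates a new tensor
print(stash[0])          # tensor([0., 0., 0.]) -- the stashed reference is untouched

y = x
y += torch.ones(3)       # in-place add: mutates the shared tensor object
print(stash[0])          # tensor([1., 1., 1.]) -- the stashed reference changed too

The other two lines are loading plumbing for the Hugging Face auto classes: `base_model_prefix` tells `PreTrainedModel` which attribute/key prefix the backbone weights live under, and giving `is_causal` a default of `False` presumably lets `from_pretrained` instantiate `GPTBERT` without having to pass that argument explicitly.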