Uploading patch
modeling_gpt_bert.py  CHANGED  (+3 -2)
@@ -310,6 +310,7 @@ class Embedding(nn.Module):
 class GPTBERTPreTrainedModel(PreTrainedModel):
     config_class = ModelConfig
     supports_gradient_checkpointing = False
+    base_model_prefix = "model"
 
     def _set_gradient_checkpointing(self, module, value=False):
         raise NotImplementedError("Gradient checkpointing is not supported by this model")
@@ -330,7 +331,7 @@ class GPTBERTPreTrainedModel(PreTrainedModel):
 
 class GPTBERT(GPTBERTPreTrainedModel):
 
-    def __init__(self, config: ModelConfig, is_causal: bool, **kwargs):
+    def __init__(self, config: ModelConfig, is_causal: bool = False, **kwargs):
         super().__init__(config, **kwargs)
         self.config = config
         self.hidden_size = config.hidden_size
@@ -380,7 +381,7 @@ class GPTBERT(GPTBERTPreTrainedModel):
         attention, layer_attention_probs = attention_layer(contextualized_embeddings[-1], attention_mask, relative_embeddings)
         layer_embeddings = contextualized_embeddings[-1] + attention
         layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2)
-        layer_embeddings
+        layer_embeddings = layer_embeddings + mlp_layer(layer_embeddings)
         layer_embeddings = self.dwa_modules(layer_embeddings, block_idx=i * 2 + 1)
         contextualized_embeddings.append(layer_embeddings)
         attention_probs.append(layer_attention_probs)