# minBERT / bert.py
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from base_bert import BertPreTrainedModel
from utils import *
class BertSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = config.hidden_size // config.num_attention_heads
self.all_head_size = self.num_attention_heads * self.attention_head_size
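        # Note: all_head_size equals hidden_size whenever hidden_size is divisible by num_attention_heads.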
# Initialize the linear transformation layers for key, value, query.
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
        # This dropout is applied to the normalized attention scores, following the original
        # Transformer implementation. Although it is a bit unusual, we empirically
        # observe that it yields better performance.
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transform(self, x, linear_layer):
        # The corresponding linear layer (key, query, or value) is used to project the hidden states (x).
bs, seq_len = x.shape[:2]
proj = linear_layer(x)
        # Next, we need to produce multiple heads for proj. This is done by splitting the
        # hidden state into self.num_attention_heads heads, each of size self.attention_head_size.
proj = proj.view(bs, seq_len, self.num_attention_heads, self.attention_head_size)
# By proper transpose, we have proj of size [bs, num_attention_heads, seq_len, attention_head_size].
proj = proj.transpose(1, 2)
return proj
def attention(self, key, query, value, attention_mask):
"""
key, query, value: [batch_size, num_attention_heads, seq_len, attention_head_size]
attention_mask: [batch_size, 1, 1, seq_len], masks padding tokens in the input.
"""
d_k = query.size(-1) # attention_head_size
attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)
# attention_scores shape: [batch_size, num_attention_heads, seq_len, seq_len]
# Apply attention mask
attention_scores = attention_scores + attention_mask
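        # The mask holds 0 for real tokens and a large negative value for padding positions,
        # so padded positions receive near-zero weight after the softmax below.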
# Normalize scores with softmax and apply dropout.
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
context = torch.matmul(attention_probs, value)
# context shape: [batch_size, num_attention_heads, seq_len, attention_head_size]
# Concatenate all attention heads to recover original shape: [batch_size, seq_len, hidden_size]
context = context.transpose(1, 2).contiguous()
context = context.view(context.size(0), context.size(1), -1)
return context
def forward(self, hidden_states, attention_mask):
"""
hidden_states: [bs, seq_len, hidden_state]
attention_mask: [bs, 1, 1, seq_len]
output: [bs, seq_len, hidden_state]
"""
        # First, generate the key, value, and query for each token for multi-head attention
        # using self.transform (more details inside that function).
        # Each *_layer has size [bs, num_attention_heads, seq_len, attention_head_size].
key_layer = self.transform(hidden_states, self.key)
value_layer = self.transform(hidden_states, self.value)
query_layer = self.transform(hidden_states, self.query)
# Calculate the multi-head attention.
attn_value = self.attention(key_layer, query_layer, value_layer, attention_mask)
return attn_value
class BertLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Multi-head attention.
self.self_attention = BertSelfAttention(config)
# Add-norm for multi-head attention.
self.attention_dense = nn.Linear(config.hidden_size, config.hidden_size)
self.attention_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.attention_dropout = nn.Dropout(config.hidden_dropout_prob)
# Feed forward.
self.interm_dense = nn.Linear(config.hidden_size, config.intermediate_size)
self.interm_af = F.gelu
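        # GELU is the activation used in the original BERT feed-forward sub-layer.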
# Add-norm for feed forward.
self.out_dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.out_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.out_dropout = nn.Dropout(config.hidden_dropout_prob)
def add_norm(self, input, output, dense_layer, dropout, ln_layer):
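        """
        Post-layer-norm residual connection applied after each sub-layer:
        LayerNorm(input + dropout(dense(output))).
        input: the tensor that was fed into the sub-layer (attention or feed-forward).
        output: the sub-layer's raw output, before the residual connection.
        """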
        transformed_output = dense_layer(output)          # Project the sub-layer output with the dense layer.
        transformed_output = dropout(transformed_output)  # Apply dropout.
        added_output = input + transformed_output         # Residual connection: add the sub-layer input.
        normalized_output = ln_layer(added_output)        # Apply layer normalization.
return normalized_output
def forward(self, hidden_states, attention_mask):
# 1. Multi-head attention
attention_output = self.self_attention(hidden_states, attention_mask)
# 2. Add-norm after attention
attention_output = self.add_norm(
hidden_states,
attention_output,
self.attention_dense,
self.attention_dropout,
self.attention_layer_norm
)
# 3. Feed-forward network
intermediate_output = self.interm_af(self.interm_dense(attention_output))
# 4. Add-norm after feed-forward
layer_output = self.add_norm(
attention_output,
intermediate_output,
self.out_dense,
self.out_dropout,
self.out_layer_norm
)
return layer_output
class BertModel(BertPreTrainedModel):
"""
The BERT model returns the final embeddings for each token in a sentence.
The model consists of:
1. Embedding layers (used in self.embed).
2. A stack of n BERT layers (used in self.encode).
3. A linear transformation layer for the [CLS] token (used in self.forward, as given).
"""
def __init__(self, config):
super().__init__(config)
self.config = config
# Embedding layers.
self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.pos_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.tk_type_embedding = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.embed_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)
        # Register position_ids (shape [1, max_position_embeddings]) as a buffer because it is a constant.
position_ids = torch.arange(config.max_position_embeddings).unsqueeze(0)
self.register_buffer('position_ids', position_ids)
# BERT encoder.
self.bert_layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
# [CLS] token transformations.
self.pooler_dense = nn.Linear(config.hidden_size, config.hidden_size)
self.pooler_af = nn.Tanh()
self.init_weights()
def embed(self, input_ids):
input_shape = input_ids.size()
seq_length = input_shape[1]
inputs_embeds = self.word_embedding(input_ids)
pos_ids = self.position_ids[:, :seq_length]
pos_embeds = self.pos_embedding(pos_ids)
# Since we are not considering token type, this embedding is just a placeholder.
tk_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)
tk_type_embeds = self.tk_type_embedding(tk_type_ids)
embeddings = inputs_embeds + pos_embeds + tk_type_embeds
embeddings = self.embed_layer_norm(embeddings)
embeddings = self.embed_dropout(embeddings)
return embeddings
def encode(self, hidden_states, attention_mask):
"""
hidden_states: the output from the embedding layer [batch_size, seq_len, hidden_size]
attention_mask: [batch_size, seq_len]
"""
# Get the extended attention mask for self-attention.
# Returns extended_attention_mask of size [batch_size, 1, 1, seq_len].
# Distinguishes between non-padding tokens (with a value of 0) and padding tokens
# (with a value of a large negative number).
extended_attention_mask: torch.Tensor = get_extended_attention_mask(attention_mask, self.dtype)
# Pass the hidden states through the encoder layers.
for i, layer_module in enumerate(self.bert_layers):
# Feed the encoding from the last bert_layer to the next.
hidden_states = layer_module(hidden_states, extended_attention_mask)
return hidden_states
def forward(self, input_ids, attention_mask):
"""
input_ids: [batch_size, seq_len], seq_len is the max length of the batch
attention_mask: same size as input_ids, 1 represents non-padding tokens, 0 represents padding tokens
"""
# Get the embedding for each input token.
embedding_output = self.embed(input_ids=input_ids)
# Feed to a transformer (a stack of BertLayers).
sequence_output = self.encode(embedding_output, attention_mask=attention_mask)
# Get cls token hidden state.
first_tk = sequence_output[:, 0]
first_tk = self.pooler_dense(first_tk)
first_tk = self.pooler_af(first_tk)
return {'last_hidden_state': sequence_output, 'pooler_output': first_tk}
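

# --- Illustrative usage sketch (not part of the original file) ---
# A minimal forward pass, assuming base_bert's BertPreTrainedModel works with any
# object exposing the config fields referenced above; the base class may require
# additional fields in practice. Values mirror bert-base and are illustrative only.
if __name__ == "__main__":
    from types import SimpleNamespace

    config = SimpleNamespace(
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        layer_norm_eps=1e-12,
        pad_token_id=0,
    )
    model = BertModel(config)
    input_ids = torch.randint(0, config.vocab_size, (2, 8))  # [batch_size=2, seq_len=8]
    attention_mask = torch.ones_like(input_ids)              # 1 = real token, 0 = padding
    out = model(input_ids, attention_mask)
    print(out['last_hidden_state'].shape)  # expected: torch.Size([2, 8, 768])
    print(out['pooler_output'].shape)      # expected: torch.Size([2, 768])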