In [1]:
from transformers import AutoTokenizer
from sentence_transformers import util
import os
import numpy as np
import torch.nn.functional as F
from transformers import T5EncoderModel
import sentence_transformers

In [2]:

# Mean pooling: average token embeddings while ignoring masked-out positions
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result; element 0 holds the token embeddings.
        attention_mask: Mask tensor. It gets a trailing axis added and is expanded
            to the size of the token embeddings before being cast to float.

    Returns:
        The mask-weighted sum of the embeddings along dimension 1 divided by the
        summed mask along the same dimension (clamped to at least 1e-9 so an
        all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


            

In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text 

# Size suffix of the TF-Hub GTR checkpoint and of the Hugging Face T5 checkpoint
model_size_tf = model_size_hf = "base"

# Load the original GTR encoder from TF-Hub
hub_url = f"https://tfhub.dev/google/gtr/gtr-{model_size_tf}/1"
encoder = hub.load(hub_url)

# Variables behind the 'serving_default' signature; these are the weights to convert
serving_signature = encoder.signatures['serving_default']
v = serving_signature.variables

INFO:absl:Using /tmp/tfhub_modules to cache modules.
2022-02-01 20:04:53.747606: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-02-01 20:04:53.747647: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1835] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-02-01 20:04:53.747987: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable

In [4]:
# Index the TF variables by name: first the variable objects, then their shapes
tf_name_weight = dict((var.name, var) for var in v)
tf_name_shape = {tf_name: tf_var.shape for tf_name, tf_var in tf_name_weight.items()}
tf_name_shape

{'encoder__encoder_norm__scale:0': TensorShape([768]),
 'encoder__layers_0__attention__key__kernel:0': TensorShape([768, 768]),
 'encoder__layers_0__attention__out__kernel:0': TensorShape([768, 768]),
 'encoder__layers_0__attention__query__kernel:0': TensorShape([768, 768]),
 'encoder__layers_0__attention__value__kernel:0': TensorShape([768, 768]),
 'encoder__layers_0__mlp__wi__kernel:0': TensorShape([768, 3072]),
 'encoder__layers_0__mlp__wo__kernel:0': TensorShape([3072, 768]),
 'encoder__layers_0__pre_attention_layer_norm__scale:0': TensorShape([768]),
 'encoder__layers_0__pre_mlp_layer_norm__scale:0': TensorShape([768]),
 'encoder__layers_1__attention__key__kernel:0': TensorShape([768, 768]),
 'encoder__layers_1__attention__out__kernel:0': TensorShape([768, 768]),
 'encoder__layers_1__attention__query__kernel:0': TensorShape([768, 768]),
 'encoder__layers_1__attention__value__kernel:0': TensorShape([768, 768]),
 'encoder__layers_1__mlp__wi__kernel:0': TensorShape([768, 3072]),
 'en

In [7]:
# Load the Hugging Face tokenizer and encoder-only T5 that will receive the GTR weights
hf_checkpoint = f"t5-{model_size_hf}"
tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
t5 = T5EncoderModel.from_pretrained(hf_checkpoint)

# Parameter name -> shape, for comparison against the TF variable shapes
pt_name_shape = dict((name, weight.shape) for name, weight in t5.state_dict().items())
pt_name_shape

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=45229452544.0), HTML(value='')))




Some weights of the model checkpoint at t5-11b were not used when initializing T5EncoderModel: ['decoder.block.13.layer.1.EncDecAttention.o.weight', 'decoder.block.14.layer.2.DenseReluDense.wo.weight', 'decoder.block.4.layer.0.layer_norm.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.1.layer_norm.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.11.layer.2.DenseReluDense.wo.weight', 'decoder.block.3.layer.0.SelfAttention.o.weight', 'decoder.block.12.layer.2.DenseReluDense.wo.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight', 'decoder.block.18.layer.1.layer_norm.weight', 'decoder.block.9.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.0.SelfAttention.q.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.14.layer.0.SelfAttention.q.we

Some weights of T5EncoderModel were not initialized from the model checkpoint at t5-11b and are newly initialized: ['encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'shared.weight': torch.Size([32128, 1024]),
 'encoder.embed_tokens.weight': torch.Size([32128, 1024]),
 'encoder.block.0.layer.0.SelfAttention.q.weight': torch.Size([16384, 1024]),
 'encoder.block.0.layer.0.SelfAttention.k.weight': torch.Size([16384, 1024]),
 'encoder.block.0.layer.0.SelfAttention.v.weight': torch.Size([16384, 1024]),
 'encoder.block.0.layer.0.SelfAttention.o.weight': torch.Size([1024, 16384]),
 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight': torch.Size([32, 128]),
 'encoder.block.0.layer.0.layer_norm.weight': torch.Size([1024]),
 'encoder.block.0.layer.1.DenseReluDense.wi.weight': torch.Size([65536, 1024]),
 'encoder.block.0.layer.1.DenseReluDense.wo.weight': torch.Size([1024, 65536]),
 'encoder.block.0.layer.1.layer_norm.weight': torch.Size([1024]),
 'encoder.block.1.layer.0.SelfAttention.q.weight': torch.Size([16384, 1024]),
 'encoder.block.1.layer.0.SelfAttention.k.weight': torch.Size([16384, 1024]),
 'encoder.block.1.layer.0.SelfAttention.

In [8]:
def convert_name(name):
    """Translate a TF-Hub GTR variable name into the matching T5 state-dict key.

    Args:
        name: TF variable name, e.g.
            ``"encoder__layers_0__attention__key__kernel:0"``.

    Returns:
        The PyTorch parameter name, e.g.
        ``"encoder.block.0.layer.0.SelfAttention.k.weight"``. An empty string is
        returned when ``name`` is neither one of the fixed names nor of the form
        ``encoder__layers_<N>__...``, so callers can report it as "not found"
        instead of hitting an ``IndexError``.
    """
    # TF module name -> PyTorch module name
    fct_map = {
        "attention": "SelfAttention",
        "mlp": "DenseReluDense",
        "pre_attention_layer_norm": "layer_norm",
        "pre_mlp_layer_norm": "layer_norm",
    }
    # TF projection name -> PyTorch projection name (others, e.g. wi/wo, pass through)
    name_map = {
        'key': 'k',
        'out': 'o',
        'query': 'q',
        'value': 'v'
    }

    # Variables that live outside the per-layer naming scheme
    fixed_names = {
        "token_embedder__embedding:0": "shared.weight",
        "encoder__encoder_norm__scale:0": "encoder.final_layer_norm.weight",
        "encoder__relpos_bias__rel_embedding:0": "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
    }

    if name in fixed_names:
        return fixed_names[name]

    splits = name.split("__")
    # Everything else must be a per-layer encoder variable: encoder__layers_<N>__<module>...
    if len(splits) < 3 or splits[0] != "encoder" or not splits[1].startswith("layers_"):
        return ""

    layer = splits[1][len("layers_"):]
    if not layer.isdigit():
        return ""

    fct = fct_map.get(splits[2], splits[2])

    if 'layer_norm' in name:
        # In the T5 state dict, sub-layer 0 of a block holds SelfAttention and
        # sub-layer 1 holds DenseReluDense, each with its own layer_norm. The
        # pre-MLP norm therefore belongs to sub-layer 1, the pre-attention norm to 0.
        sublayer = "1" if "pre_mlp_layer_norm" in name else "0"
        return f"encoder.block.{layer}.layer.{sublayer}.{fct}.weight"

    if len(splits) < 4:
        # No projection/parameter component to translate
        return ""

    sublayer = "0" if fct == "SelfAttention" else "1"
    param = name_map.get(splits[3], splits[3])
    return f"encoder.block.{layer}.layer.{sublayer}.{fct}.{param}.weight"

In [9]:
def equal_shapes(shape1, shape2):
    """Return True when both shapes have the same length and equal entries.

    Args:
        shape1: First shape (any sized sequence of dimensions).
        shape2: Second shape (any sized sequence of dimensions).

    Returns:
        bool: False on a length mismatch or on the first differing dimension,
        True otherwise.
    """
    if len(shape1) != len(shape2):
        return False

    return all(dim1 == dim2 for dim1, dim2 in zip(shape1, shape2))

In [10]:
def need_transpose(name):
    """Tell whether the weight stored under the PyTorch name ``name`` is transposed on conversion.

    Only ``"shared.weight"`` is copied as-is; every other name yields True.
    """
    #HF function: https://github.com/huggingface/transformers/blob/c962c2adbff678ae6d2e98378bed5b8d1a9831d9/src/transformers/models/t5/modeling_t5.py#L161
    if name == "shared.weight":
        return False
    return True

       

# The projection kernel is the additional dense layer on top of the encoder;
# it is skipped in this name/shape check.
names_to_ignore = {"projection_layer__kernel:0"}

# Start with every PyTorch parameter name and tick them off as TF variables map onto them
pt_all_names = set(t5.state_dict())

for var in v:
    name = var.name
    if name in names_to_ignore:
        continue

    pt_name = convert_name(name)
    if pt_name not in pt_all_names:
        print("Name not found:", name, "=>", pt_name)
        continue

    pt_all_names.remove(pt_name)
    tf_shape = tf_name_shape[name].as_list()
    pt_shape = list(pt_name_shape[pt_name])

    # Transposed weights are compared against the reversed PyTorch shape
    if need_transpose(pt_name):
        pt_shape.reverse()

    if not equal_shapes(tf_shape, pt_shape):
        print("Different shape:", name, tf_shape, pt_name, pt_shape )

# Names left over here have no TF counterpart
print("Remaining weights:", pt_all_names)

Remaining weights: {'encoder.embed_tokens.weight'}


In [11]:
import torch

# Reload tokenizer and encoder; decoder.* checkpoint keys are declared as ignorable on load
hf_checkpoint = f"t5-{model_size_hf}"
tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
t5 = T5EncoderModel.from_pretrained(hf_checkpoint)
t5_state = t5.state_dict()

# PyTorch names that still have to be overwritten with a converted weight
state_all_names = set(t5_state)

for var in v:
    tf_name = var.name
    if tf_name in names_to_ignore:
        continue

    pt_name = convert_name(tf_name)
    weights = np.float32(var.numpy())

    # Raises KeyError if the name is unknown or was already handled
    state_all_names.remove(pt_name)

    if need_transpose(pt_name):
        arrow, weights = "=transpose=>", weights.transpose()
    else:
        arrow = "=>"

    original_shape = t5_state[pt_name].shape
    print(tf_name, f"({var.shape})", arrow, pt_name, original_shape)

    # Swap the converted weight into the state dict and check the shape is unchanged
    t5_state[pt_name] = torch.nn.Parameter(torch.tensor(weights))
    new_shape = t5_state[pt_name].shape

    if not equal_shapes(original_shape, new_shape):
        print("Different shape:", tf_name, original_shape, pt_name, new_shape)
        break

# The encoder's word embeddings reuse the shared embedding matrix
t5_state['encoder.embed_tokens.weight'] = t5_state['shared.weight']
state_all_names.remove('encoder.embed_tokens.weight')

# Push the converted weights back into the model
t5.load_state_dict(t5_state)

# Rebuild the projection layer (skipped in the loop above) as a bias-free Linear
tf_linear_weight = tf_name_weight["projection_layer__kernel:0"]
in_features, out_features = tf_linear_weight.shape[0], tf_linear_weight.shape[1]
linear = torch.nn.Linear(in_features, out_features, bias=False)
original_shape = linear.weight.shape
linear_kernel = np.float32(tf_linear_weight.numpy()).transpose()
linear.weight = torch.nn.Parameter(torch.tensor(linear_kernel))
new_shape = linear.weight.shape
if not equal_shapes(original_shape, new_shape):
    print("Different shape at linear layer")

print(linear)
print("Remaining weights:", state_all_names)
# Every PyTorch parameter must have been overwritten
assert len(state_all_names) == 0


Some weights of T5EncoderModel were not initialized from the model checkpoint at t5-11b and are newly initialized: ['encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


encoder__encoder_norm__scale:0 ((1024,)) =transpose=> encoder.final_layer_norm.weight torch.Size([1024])
encoder__layers_0__attention__key__kernel:0 ((1024, 16384)) =transpose=> encoder.block.0.layer.0.SelfAttention.k.weight torch.Size([16384, 1024])
encoder__layers_0__attention__out__kernel:0 ((16384, 1024)) =transpose=> encoder.block.0.layer.0.SelfAttention.o.weight torch.Size([1024, 16384])
encoder__layers_0__attention__query__kernel:0 ((1024, 16384)) =transpose=> encoder.block.0.layer.0.SelfAttention.q.weight torch.Size([16384, 1024])
encoder__layers_0__attention__value__kernel:0 ((1024, 16384)) =transpose=> encoder.block.0.layer.0.SelfAttention.v.weight torch.Size([16384, 1024])
encoder__layers_0__mlp__wi__kernel:0 ((1024, 65536)) =transpose=> encoder.block.0.layer.1.DenseReluDense.wi.weight torch.Size([65536, 1024])
encoder__layers_0__mlp__wo__kernel:0 ((65536, 1024)) =transpose=> encoder.block.0.layer.1.DenseReluDense.wo.weight torch.Size([1024, 65536])
encoder__layers_0__pre_at

encoder__layers_15__attention__out__kernel:0 ((16384, 1024)) =transpose=> encoder.block.15.layer.0.SelfAttention.o.weight torch.Size([1024, 16384])
encoder__layers_15__attention__query__kernel:0 ((1024, 16384)) =transpose=> encoder.block.15.layer.0.SelfAttention.q.weight torch.Size([16384, 1024])
encoder__layers_15__attention__value__kernel:0 ((1024, 16384)) =transpose=> encoder.block.15.layer.0.SelfAttention.v.weight torch.Size([16384, 1024])
encoder__layers_15__mlp__wi__kernel:0 ((1024, 65536)) =transpose=> encoder.block.15.layer.1.DenseReluDense.wi.weight torch.Size([65536, 1024])
encoder__layers_15__mlp__wo__kernel:0 ((65536, 1024)) =transpose=> encoder.block.15.layer.1.DenseReluDense.wo.weight torch.Size([1024, 65536])
encoder__layers_15__pre_attention_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.15.layer.0.layer_norm.weight torch.Size([1024])
encoder__layers_15__pre_mlp_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.15.layer.1.layer_norm.weight torch.Size([1

encoder__layers_21__attention__query__kernel:0 ((1024, 16384)) =transpose=> encoder.block.21.layer.0.SelfAttention.q.weight torch.Size([16384, 1024])
encoder__layers_21__attention__value__kernel:0 ((1024, 16384)) =transpose=> encoder.block.21.layer.0.SelfAttention.v.weight torch.Size([16384, 1024])
encoder__layers_21__mlp__wi__kernel:0 ((1024, 65536)) =transpose=> encoder.block.21.layer.1.DenseReluDense.wi.weight torch.Size([65536, 1024])
encoder__layers_21__mlp__wo__kernel:0 ((65536, 1024)) =transpose=> encoder.block.21.layer.1.DenseReluDense.wo.weight torch.Size([1024, 65536])
encoder__layers_21__pre_attention_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.21.layer.0.layer_norm.weight torch.Size([1024])
encoder__layers_21__pre_mlp_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.21.layer.1.layer_norm.weight torch.Size([1024])
encoder__layers_22__attention__key__kernel:0 ((1024, 16384)) =transpose=> encoder.block.22.layer.0.SelfAttention.k.weight torch.Size([16384, 1

encoder__layers_7__mlp__wi__kernel:0 ((1024, 65536)) =transpose=> encoder.block.7.layer.1.DenseReluDense.wi.weight torch.Size([65536, 1024])
encoder__layers_7__mlp__wo__kernel:0 ((65536, 1024)) =transpose=> encoder.block.7.layer.1.DenseReluDense.wo.weight torch.Size([1024, 65536])
encoder__layers_7__pre_attention_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.7.layer.0.layer_norm.weight torch.Size([1024])
encoder__layers_7__pre_mlp_layer_norm__scale:0 ((1024,)) =transpose=> encoder.block.7.layer.1.layer_norm.weight torch.Size([1024])
encoder__layers_8__attention__key__kernel:0 ((1024, 16384)) =transpose=> encoder.block.8.layer.0.SelfAttention.k.weight torch.Size([16384, 1024])
encoder__layers_8__attention__out__kernel:0 ((16384, 1024)) =transpose=> encoder.block.8.layer.0.SelfAttention.o.weight torch.Size([1024, 16384])
encoder__layers_8__attention__query__kernel:0 ((1024, 16384)) =transpose=> encoder.block.8.layer.0.SelfAttention.q.weight torch.Size([16384, 1024])
encoder__l

In [12]:
# Sentence pairs used to compare the converted model against the original
english_sentences = [
    "Berlin is the capital of Germany",
    "Berlin is a large city in Germany",
    "Tensorflow can be used for deep learning",
    "Pytorch, developed by Facebook AI, is a deep learning framework",
    "Is Scipy or numpy better?",
    "Which is faster: scipy or pandas?",
    "Cats can live for quite a long time",
    "Cats are humans best friend",
]

encoded_input = tokenizer(english_sentences, return_tensors="pt", padding=True)

with torch.no_grad():
    model_output = t5(**encoded_input)

    # Pool the token embeddings, then apply the converted linear layer
    pooled = mean_pooling(model_output, encoded_input['attention_mask'])
    projected = linear(pooled)
    print(projected.shape)

    # L2-normalise along dim 1, so the dot products below are cosine similarities
    hf_embeddings = F.normalize(projected, p=2, dim=1)

# Pairwise similarity matrix of the converted model
util.dot_score(hf_embeddings, hf_embeddings)

torch.Size([8, 768])


tensor([[1.0000, 0.8303, 0.2995, 0.3906, 0.2986, 0.3062, 0.3430, 0.3734],
        [0.8303, 1.0000, 0.3455, 0.4187, 0.3043, 0.3464, 0.4388, 0.3959],
        [0.2995, 0.3455, 1.0000, 0.6648, 0.4726, 0.4597, 0.3798, 0.3454],
        [0.3906, 0.4187, 0.6648, 1.0000, 0.5167, 0.5195, 0.3746, 0.4006],
        [0.2986, 0.3043, 0.4726, 0.5167, 1.0000, 0.7602, 0.3923, 0.3550],
        [0.3062, 0.3464, 0.4597, 0.5195, 0.7602, 1.0000, 0.4338, 0.3432],
        [0.3430, 0.4388, 0.3798, 0.3746, 0.3923, 0.4338, 1.0000, 0.6090],
        [0.3734, 0.3959, 0.3454, 0.4006, 0.3550, 0.3432, 0.6090, 1.0000]])

In [13]:
# Reference: embed the same sentences with the original TF-Hub encoder
tf_outputs = encoder(english_sentences)
english_embeds = tf_outputs[0].numpy()
print(english_embeds.shape)

# Pairwise similarity matrix, to compare with the converted model's matrix
util.dot_score(english_embeds, english_embeds)

2022-01-31 23:13:39.702310: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-31 23:13:41.448337: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0x7f41641cf460 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-01-31 23:13:41.448385: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): Host, Default Version
2022-01-31 23:13:44.375222: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:210] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-01-31 23:14:17.816928: I tensorflow/compiler/jit/xla_compilation_cache.cc:363] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2022-01-31 23:14:17.866550: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3089104896 exceeds 10% of free system memory.


(8, 768)


tensor([[1.0000, 0.8303, 0.2996, 0.3908, 0.2984, 0.3062, 0.3428, 0.3735],
        [0.8303, 1.0000, 0.3453, 0.4187, 0.3044, 0.3462, 0.4387, 0.3961],
        [0.2996, 0.3453, 1.0000, 0.6643, 0.4724, 0.4596, 0.3803, 0.3454],
        [0.3908, 0.4187, 0.6643, 1.0000, 0.5169, 0.5196, 0.3744, 0.4003],
        [0.2984, 0.3044, 0.4724, 0.5169, 1.0000, 0.7603, 0.3920, 0.3550],
        [0.3062, 0.3462, 0.4596, 0.5196, 0.7603, 1.0000, 0.4333, 0.3427],
        [0.3428, 0.4387, 0.3803, 0.3744, 0.3920, 0.4333, 1.0000, 0.6087],
        [0.3735, 0.3961, 0.3454, 0.4003, 0.3550, 0.3427, 0.6087, 1.0000]])

In [14]:
# Output layout: encoder + tokenizer in `folder`, projection layer in `folder/2_Dense`
folder = f'models/gtr-t5-{model_size_hf}'
dense_dir = os.path.join(folder, '2_Dense')

t5.save_pretrained(folder)
tokenizer.save_pretrained(folder)
os.makedirs(dense_dir, exist_ok=True)

# Wrap the converted linear layer in a sentence-transformers Dense module
# (no bias, identity activation) and save it
dense = sentence_transformers.models.Dense(
    linear.in_features,
    linear.out_features,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear = linear
dense.save(dense_dir)


# FP16 experiment

In [None]:
#FP16 experiment
#t5 = T5EncoderModel.from_pretrained('models/gtr-t5-base')
#t5.half()
#t5.save_pretrained('models/gtr-t5-base-fp16')