use_wandb = 1
seed = 1
style = "dit"
d_adapter = 1024
normalize_embeddings = 1
depth = 4
embs = ["gte", "gtr", "stella", "sentence-t5", "e5", "sbert", "clip", "stella-big", "snowflake", "ember"]
n_embs_per_batch = 2
max_seq_length = 512
depth_transform = 12
lr = 5e-5
bs = 64
save_every = 500
epochs = 100.0
val_bs = 64
dataset = "bge"
max_grad_norm = 10.0
gradient_accumulation_steps = 1
loss_coefficient_contrastive = 1
loss_coefficient_vsp = 0
loss_coefficient_trans = 1
loss_coefficient_cc = 0
eval_steps = 999999
cluster_size = 512
cluster_strategy = "cluster_within_domain"
warmup_steps = 100
wandb_project = "edx-2"
wandb_name = "dit-pretrain-p2-4-long"
save_dir = "checkpoints/{}/"
state_dict_dir = "checkpoints/dit-pretrain-32/model.pt"
num_params = 2610419544
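A minimal sketch of how a TOML config like the one above could be loaded into a typed training-arguments object. The file name ("dit-pretrain.toml"), the `TrainConfig` dataclass, and the `load_config` helper are illustrative assumptions, not part of the original training setup; only a few of the keys are spelled out, the rest can be added the same way.

```python
# Hypothetical loader for a config file like the one above (assumption, not the
# original training code). Requires Python 3.11+ for the stdlib tomllib module.
import tomllib
from dataclasses import dataclass, field


@dataclass
class TrainConfig:
    # Defaults mirror a subset of the values listed above.
    lr: float = 5e-5
    bs: int = 64
    epochs: float = 100.0
    max_seq_length: int = 512
    max_grad_norm: float = 10.0
    warmup_steps: int = 100
    embs: list[str] = field(default_factory=list)
    # ...remaining keys (d_adapter, depth, loss coefficients, paths, etc.)
    # would be declared the same way.


def load_config(path: str) -> TrainConfig:
    # tomllib expects a binary file handle.
    with open(path, "rb") as f:
        raw = tomllib.load(f)
    # Keep only the keys the dataclass declares, ignoring any extras.
    known = {k: v for k, v in raw.items() if k in TrainConfig.__dataclass_fields__}
    return TrainConfig(**known)


if __name__ == "__main__":
    cfg = load_config("dit-pretrain.toml")
    print(cfg.lr, cfg.bs, cfg.embs)
```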