# AutoGluon MultiModal configuration (source snapshot: 3,034 bytes, commit e60d8d2)
---
# Model composition: a late-fusion MLP combining a numerical MLP, a
# HuggingFace text backbone, and a timm image backbone.
# NOTE(review): source indentation was flattened; nesting reconstructed
# from the AutoGluon MultiModal default-config schema — verify against loader.
model:
  names:
    - fusion_mlp
    - hf_text
    - numerical_mlp
    - timm_image
  numerical_mlp:
    hidden_size: 128
    activation: leaky_relu
    num_layers: 1
    drop_rate: 0.1
    normalization: layer_norm
    d_token: 8
    embedding_arch: null
    data_types:
      - numerical
    merge: concat
  hf_text:
    # NOTE(review): "local://" presumably resolves to a local checkpoint dir — confirm with the checkpoint loader.
    checkpoint_name: local://hf_text
    gradient_checkpointing: false
    pooling_mode: cls
    data_types:
      - text
    tokenizer_name: hf_auto
    max_text_len: 512
    insert_sep: true
    low_cpu_mem_usage: false
    text_segment_num: 2
    stochastic_chunk: false
    text_aug_detect_length: 10
    text_trivial_aug_maxscale: 0.0
    text_train_augment_types: null
  timm_image:
    checkpoint_name: swin_base_patch4_window7_224
    mix_choice: all_logits
    data_types:
      - image
    train_transforms:
      - resize_shorter_side
      - center_crop
      - trivial_augment
    val_transforms:
      - resize_shorter_side
      - center_crop
    image_norm: imagenet
    image_size: null
    max_img_num_per_col: 2
  fusion_mlp:
    weight: 0.1
    adapt_in_features: max
    hidden_sizes:
      - 128
    activation: leaky_relu
    drop_rate: 0.1
    normalization: layer_norm
    data_types: null
# Per-modality preprocessing plus mixup/template augmentation settings.
# NOTE(review): source indentation was flattened; `pos_label` placed at the
# `data` level (not under `label`) per AutoGluon defaults — verify.
data:
  image:
    missing_value_strategy: zero
  text:
    normalize_text: false
  categorical:
    minimum_cat_count: 100
    maximum_num_cat: 20
    convert_to_text: true
  numerical:
    convert_to_text: false
    scaler_with_mean: true
    scaler_with_std: true
  document:
    missing_value_strategy: zero
  label:
    numerical_label_preprocessing: standardscaler
  pos_label: null
  mixup:
    turn_on: false
    mixup_alpha: 0.8
    cutmix_alpha: 1.0
    cutmix_minmax: null
    prob: 1.0
    switch_prob: 0.5
    mode: batch
    turn_off_epoch: 5
    label_smoothing: 0.1
  templates:
    turn_on: false
    num_templates: 30
    template_length: 2048
    preset_templates:
      - super_glue
      - rte
    custom_templates: null
# Optimizer, LR schedule, validation cadence, checkpoint averaging, and
# optional parameter-efficient finetuning (LoRA) settings.
# NOTE(review): source indentation was flattened; nesting reconstructed —
# `lora` and `focal_loss` are sub-mappings per the AutoGluon schema.
optimization:
  optim_type: adamw
  learning_rate: 0.0001
  weight_decay: 0.001
  lr_choice: layerwise_decay
  lr_decay: 0.9
  lr_schedule: cosine_decay
  max_epochs: 10
  max_steps: -1
  warmup_steps: 0.1
  end_lr: 0
  lr_mult: 1
  patience: 10
  val_check_interval: 0.5
  check_val_every_n_epoch: 1
  skip_final_val: false
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  track_grad_norm: -1
  log_every_n_steps: 10
  top_k: 3
  top_k_average_method: greedy_soup
  efficient_finetune: null
  lora:
    module_filter: null
    # Regex patterns for target module names; quoted so they stay plain strings.
    filter:
      - query
      - value
      - "^q$"
      - "^v$"
      - "^k$"
      - "^o$"
    r: 8
    alpha: 8
  loss_function: auto
  focal_loss:
    alpha: null
    gamma: 2.0
    reduction: mean
# Runtime environment: device counts, batch sizing, workers, and
# distributed-training (DeepSpeed) buffer sizes.
# NOTE(review): source indentation was flattened; keys re-nested under `env`.
env:
  num_gpus: 1
  num_nodes: 1
  batch_size: 128
  per_gpu_batch_size: 8
  eval_batch_size_ratio: 4
  per_gpu_batch_size_evaluation: null
  precision: 16
  num_workers: 2
  num_workers_evaluation: 2
  fast_dev_run: false
  deterministic: false
  auto_select_gpus: true
  strategy: null
  deepspeed_allgather_size: 1000000000.0
  deepspeed_allreduce_size: 1000000000.0