# merit-roberta-large-v2 / training_config.yaml
train_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.train.0.pkl
dev_file: /home/share/jiaofangkai/wiki_erica_path/v7/union/train_distant.path_v7.dev.pkl
test_file: null
model:
  _target_: models.roberta_baseline.RobertaForMultipleChoiceForPreTrain.from_pretrained
  mlp_hidden_size: 2048
read_tensor:
  _target_: dataset.wiki_entity_path_v8_2.convert_examples_into_features
  max_neg_num: 3
  aug_num: 1
  max_seq_length: 384
  shuffle_context: true
  min_rep_num: 5
  geo_p: 0.4
  deduct_ratio: 1.0
  context_ratio: 1.0
  remove_context: false
  num_workers: 64
extended_vocab: null
collator:
  _target_: dataset.wiki_entity_path_v8.WikiPathDatasetCollatorWithContext
  max_seq_length: 320
  tokenizer: pretrained-models/roberta-large
  mlm_probability: 0.15
  max_option_num: 4
  swap: true
num_workers: 4
prefetch_factor: 2
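# The _target_ entries above follow Hydra's instantiate/call convention: the node's
# remaining keys are passed as keyword arguments to the named callable. A minimal
# sketch of how such a node is typically resolved (assuming Hydra is the config
# framework here and `cfg` is this file loaded as a DictConfig; the positional
# argument shown is illustrative, not taken from the training script):
#
#   from hydra.utils import instantiate
#   # calls RobertaForMultipleChoiceForPreTrain.from_pretrained(
#   #     "pretrained-models/roberta-large", mlp_hidden_size=2048)
#   model = instantiate(cfg.model, cfg.model_name_or_path)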
model_name_or_path: pretrained-models/roberta-large
pretrain: null
output_dir: experiments/roberta.large.wiki_erica_path_v7_v8.2.2.1aug.fix.seq320.w2.2080Ti
do_train: true
evaluate_during_training: true
do_eval: false
eval_sub_path: null
do_preprocess: false
per_gpu_train_batch_size: 1
per_gpu_eval_batch_size: 1
learning_rate: 0.0001
gradient_accumulation_steps: 2048
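# With per_gpu_train_batch_size: 1, n_gpu: 1 and gradient_accumulation_steps: 2048,
# the effective batch size per optimizer step works out to 1 * 1 * 2048 = 2048
# examples (assuming the usual accumulation scheme where the optimizer steps once
# every gradient_accumulation_steps forward/backward passes).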
weight_decay: 0.01
adam_epsilon: 1.0e-06
adam_betas: (0.9, 0.98)
max_grad_norm: 5.0
num_train_epochs: 1
max_steps: 500
warmup_proportion: 0.1
warmup_steps: 0
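# Presumably max_steps: 500 caps training at 500 optimizer steps regardless of
# num_train_epochs, and with warmup_steps: 0 the scheduler falls back to
# warmup_proportion, i.e. roughly 0.1 * 500 = 50 warmup steps (assumption based on
# common trainer semantics; the exact precedence is defined by the training script).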
optimizer: lamb
use_nvlamb: true
bit_training: null
logging_steps: 1
save_steps: 100
eval_steps: 100
no_cuda: false
seed: 42
local_rank: 0
fp16: true
fp16_opt_level: O1
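# fp16: true with fp16_opt_level: O1 matches NVIDIA Apex AMP mixed precision
# (O1 casts most ops to fp16 while keeping fp32 master weights), assuming the
# trainer uses Apex-style AMP rather than native torch.cuda.amp.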
reshard_after_forward: false
cpu_offload: false
move_grads_to_cpu: false
move_params_to_cpu: false
n_gpu: 1
device: cuda:0
train_batch_size: 1
eval_batch_size: 1
note: null