---
# NOTE(review): this file was recovered from a whitespace-mangled single line.
# The nesting below is reconstructed from key semantics (model/data stanzas,
# prompt sub-configs) — confirm against the config loader before relying on it.

model:
  pretrain: ckpt/ek100mir.pt  # checkpoint to initialize from
  freeze_vis_backbone: true
  freeze_txt_backbone: true
  # false for cascade models; true for single-stage models (default: true)
  inflat_posemb: true
  num_frames: 16

  # Text-side prompt tuning.
  text_prompt:
    n_ctx: 8  # number of learnable context tokens
    use_bank: true

  # Visual-side prompt tuning.
  visual_prompt:
    num_layers: 12
    prompt_dim: 512
    num_tokens: 128
    deep: true
    deep_shared: false
    split_st: false
    pt_spt: true
    pt_tmp: false
    style: VoP_c_pool
    n_seg: 16  # number of segments per video (n_seg=clip_length -> 1 frame/seg)
    K_s: 8  # boundary of intra-frame/inter-frame attention (VoP_f+c)
    pool:
      size: 10

data:
  dataset: ek100_mir
  # Original absolute paths, kept commented for reference:
  # root: /data/EK100/video_ht256px
  # metadata: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_train.csv
  # metadata_val: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/EPIC_100_retrieval_test.csv
  # relevancy_path: /data/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/caption_relevancy_EPIC_100_retrieval_test.pkl
  root: data/ek100_mir/video
  # {} is presumably filled with a split name at load time — verify against caller
  metadata_val: data/ek100_mir/csv/{}.csv
  relevancy_path: meta/ek100_mir/relevancy_sel.npy
  narrations: meta/ek100_mir/EPIC_100_retrieval_test_sentence.csv
  clip_length: 16