| data: |
| train_json_files: |
| - data/MC/MusicCaps_train.json |
| - data/MC/LP_MusicCaps_MC_train.json |
| - data/MC/MusicCapsBinary.json |
| - data/MC/MusicCapsMCQ.json |
| - data/MC/MusicCapsDetailQA.json |
| - data/MC/MusicInstruct_train.json |
| - data/MC_NEW/MusicCapsMCQ.json |
| - data/MTT/LP_MusicCaps_MTT_train.json |
| - data/MTT/MTT_Binary.json |
| - data/MTT/MTT_MCQ.json |
| - data/MTT/OpenMU_MTT_train.json |
| - data/MTT_NEW/MTT_MCQ.json |
| - data/ASM_NEW/AudioSetMusicBinary.json |
| - data/ASM_NEW/AudioSetMusicMCQ.json |
| - data/ASM_NEW/AudioSetMusicQA.json |
| - data/ASM_NEW/Captioning.json |
| - data/FMA/FMA_Caption_Train.json |
| - data/FMA/FMA_QA_MIX.json |
| - data/FMA/FMA_QA_MIX_Part2.json |
| train_audio_dirs: |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MC/Audio |
| - ../../data/LP_MusicCaps_MTT/Audio |
| - ../../data/LP_MusicCaps_MTT/Audio |
| - ../../data/LP_MusicCaps_MTT/Audio |
| - ../../data/LP_MusicCaps_MTT/Audio |
| - ../../data/LP_MusicCaps_MTT/Audio |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio |
| - /apdcephfs_gy4/share_302507476/xiquanli/data/AudioSet/data/audio |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s |
| - /apdcephfs_gy2/share_302507476/0_public_datasets/fma_dataset/fma_split_15s |
| val_json_files: |
| - data/captioning/MusicCaps_test.json |
| val_audio_dirs: |
| - ../../data/LP_MusicCaps_MC/Audio |
| sample_rate: 32000 |
| max_length: 10 |
| tokenizer_type: HuggingFaceTB/SmolLM2-135M |
| max_text_token_len: 129 |
| batch_size: 16 |
| num_workers: 8 |
| wav_aug: false |
| model: |
| encoder: |
| audioenc_name: MATPAC |
| matpac_ckpt_path: ./weights/matpac_plus_as_48_1_map_enconly.pt |
| freeze: true |
| ds_rate: 1 |
| c2l_first: true |
| wo_repeat: true |
| projector: |
| proj_name: linearprojector |
| d_in: 3840 |
| d_h: 2048 |
| d_out: 576 |
| decoder: |
| textdec_name: HuggingFaceTB/SmolLM2-135M |
| max_text_token_len: 129 |
| freeze: false |
| use_lora: false |
| lora_config: |
| r: 32 |
| lora_alpha: 128 |
| lora_dropout: 0.1 |
| bias: none |
| target_modules: |
| - q_proj |
| - v_proj |
| model_type: m2t |
| training: |
| epochs: 3 |
| clip_grad: 2 |
| dropout: 0.2 |
| seed: 42 |
| device: cuda |
| validation_step: 3000 |
| optim_args: |
| lr: 0.0001 |
| warmup_steps: 0 |
| scheduler: cosine |
| optimizer_name: adam |
| betas: |
| - 0.9 |
| - 0.999 |
| eps: 1.0e-08 |
| momentum: 0.9 |
| warmup_ratio: 0.05 |
|
|