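# Training configuration. Hydra/OmegaConf-style interpolations are used throughout
# (${oc.env:...} reads environment variables, ${.train} is a relative reference).
# The `defaults` list pulls in the `small` model preset, which the keys below extend.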
defaults:
  - /model: small
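# Backbone and tokenizer settings. With a MAGVIT-style VQ image tokenizer (8192-entry
# codebook, 16x spatial downsampling), a 256px image maps to a 16x16 = 256-token grid
# (img_length); text is capped at 128 tokens (txt_length).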
model:
  downscale_ratio: 16
  image_vocab_size: 8192
  vae_type: magvit
  use_custom_vae_ckpt: null
  custom_vae_name: null
  img_length: 256
  txt_length: 128
  image_model: true
  text_model: true
  unified_model: true
  image_model_fid_eval: false
  force_argmax_valid_indices: true
  use_pretrained_img_emb: false
  codebook_embed_dim: 256
  qk_norm: true
  norm_type: rms
  sandwich_normalization: true
  zero_linear_init: false
  modality_embed: true
  rope_2d: false
  use_spda_attn: true
  force_optimized_native_attn: true
  freeze_txt_emb: false
  add_labels: null
  txt_dropout: null
  text_vocab_size: 32001
  use_flex_attention: true
  flex_attention_txt_masking_prob: 0.1
  flex_attention_img_masking_prob: 0.1
  linear_factor: 1
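# Data pipeline: pre-tokenized datasets served from TensorDict files and mixed with a
# weighted sampler. A weight of -1 presumably means the weight is inferred (e.g. in
# proportion to dataset size) rather than fixed; the validation sets use explicit
# weights of 1. ${oc.env:DIFFUSION_DATA_DIR} must be set for these paths to resolve.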
data:
  train: combined_tokens
  valid: ${.train}
  n_duplicate_train: null
  wrap: true
  streaming: false
  precache: false
  tokenizer_name_or_path: NousResearch/Llama-2-7b-hf
  resolution: 256
  block_size: 128
  n_val_samples: null
  unpaired: false
  n_duplicate_val: null
  save_train_dataloader: true
  save_validation_dataloader: true
  iterable: false
  webdataset_iterable: false
  webdataset_indexed: false
  dataset_type: null
  tokens_flip_collate: false
  n_train_samples: null
  raw_data_dir: null
  tokenizers_parallelism: false
  token_data_dir: null
  force_disable_shuffle: false
  keep_tensordict_on_disk: true
  use_custom_tensordict_collate: true
  force_mp_spawn: false
  enable_cuda_in_tensordict_collate: false
  use_weighted_tensordict_sampler: true
  fraction_txt_data: 0.0
  data_dir_train:
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/scratch_ssd_tokens/datacomp1b_8_magvit
      weight: -1
      name: datacomp1b_8_magvit_train
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/scratch_ssd_tokens/cc12m_tokens_train_256
      weight: -1
      name: cc12m_tokens_train_256
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/grogu/HPDv2_image_reward_v1_v2_v3_magvit
      weight: -1
      name: HPDv2_image_reward_v1_v2_v3_magvit
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/grogu/pick_score_sac_prompts_v1_v2_v3_magvit
      weight: -1
      name: pick_score_sac_prompts_v1_v2_v3_magvit
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/grogu/datacomp1b_0_1_6_magvit
      weight: -1
      name: datacomp1b_0_1_6_magvit
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/grogu/laion400m_magvit_part_0
      weight: -1
      name: laion400m_magvit_part_0
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/grogu/laion400m_magvit_part_1
      weight: -1
      name: laion400m_magvit_part_1
  data_dir_val:
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/scratch_ssd_tokens/datacomp1b_8_magvit_val
      weight: 1
      name: datacomp1b_8_magvit_val
    - dir: ${oc.env:DIFFUSION_DATA_DIR}/tokens/scratch_ssd_tokens/cc12m_tokens_val_256
      weight: 1
      name: cc12m_tokens_val_256
  tokenize_vqvae_in_dataloader: false
  val: ${.train}
  use_token_dataset: true
  image_dataset: tglcourse/lsun_church_train
  image_data_train: null
  image_data_val: null
  keep_hf_dataset_in_memory: true
  allow_label: false
  disable_text_modality: true
  force_raw_train_images: false
  aggressive_aug: true
  allow_aug_vqvae_dataloader: true
  move_tensordict_to_shm: false
  force_full_attention_mask: false
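# Evaluation: FID uses clean-fid against precomputed LSUN-Church statistics at 256px.
# max_num_fid_batches_per_device relies on an arithmetic resolver (presumably registered
# via OmegaConf.register_new_resolver("eval", ...)) to cap FID at roughly 8192 samples in
# total, e.g. 8192 // (8 devices * 64 eval batch) = 16 batches per device; trainer.devices
# is supplied elsewhere in the config.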
eval:
  generate_samples: false
  compute_generative_perplexity: false
  log_every_n_evals: 10
  log_every_n_fid: 20
  limit_val_batches_manual: 16
  perplexity_batch_size: ${loader.eval_batch_size}
  num_masking_viz_batches: -1
  max_num_fid_batches_per_device: ${eval:'8192 // (${trainer.devices} * ${loader.eval_batch_size})'}
  cfg: null
  class_conditional_fid: false
  force_cfg_value: true
  split_cfg_batches: true
  fid_mode: clean
  clean_fid_precomputed_name: lsun_church
  clean_fid_precomputed_split: trainfull
  clean_fid_precomputed_res: 256
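# Trainer: custom bf16 DDP path, validation every 1000 steps, checkpoints every 10k steps,
# EMA disabled (ema: 0.0). softmin_snr: 5 presumably applies a soft min-SNR-style loss
# weighting; mask_entire_modality: 0.1 and txt_dropout: 0.1 presumably drop a whole
# modality / the text stream 10% of the time to support classifier-free-guidance-style
# conditioning.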
trainer:
  log_every_n_steps: 10
  val_check_interval: 1000
  custom_ddp_bf16: true
  scale_lr_by_batch_size: false
  limit_val_batches: 16
  use_gradient_checkpointing: false
  log_seperate_modal_losses: true
  softmin_snr: 5
  text_loss_weight: 1.0
  img_loss_weight: null
  low_precision_loss: false
  compile: false
  multimodal_batches: true
  compile_fullgraph: false
  log_grad_norm_every_n_steps: 10
  mask_entire_modality: 0.1
  force_shift_image_batches: false
  ckpt_steps: 10000
  ckpt_every_n_minutes: -1
  ignore_text_in_unified: false
  disable_all_eval_generation: false
  eval_on_start: false
  ckpt_model_only: false
  ema: 0.0
  use_custom_ema: false
  log_flops: false
  disable_distributed_torchmetrics: true
  restart_on_failure: true
  force_null_sigma: true
  allow_null_sigma: true
  compile_flag_pos_emb: true
  add_label: false
  first_token_dropout: null
  force_shift_raw_image_batches: true
  txt_dropout: 0.1
  disable_ddp_optimizer: true
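# Optimizer: peak learning rate 3e-4 with 0.05 weight decay (warmup is set under lr_scheduler).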
optim:
  lr: 0.0003
  weight_decay: 0.05
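# Loaders: per-device batch size 64. desired_global_batch_size: 512 presumably drives
# gradient accumulation / device scaling, e.g. 8 devices x 64 = 512 with no accumulation.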
loader:
  batch_size: 64
  eval_batch_size: ${loader.batch_size}
  num_workers: 1
  desired_global_batch_size: 512
  persistent_workers: true
  pin_memory: true
  num_eval_workers: 1
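# Sampling: the number of sampling steps is tied to the sequence length (model.length),
# which is not set in this file and presumably comes from the /model: small preset.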
sampling:
  steps: ${model.length}
  num_sample_batches: 2
  max_sampling_steps: ${model.length}
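# Logging goes to Weights & Biases (online mode); the LR warms up over 5000 steps;
# at most 4 checkpoints are retained.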
wandb:
  mode: online
lr_scheduler:
  num_warmup_steps: 5000
checkpointing:
  checkpoints_total_limit: 4