# Hydra defaults list: the config groups composed together with this file.
# `_self_` is listed first, so the groups named after it are merged later and
# can override values defined below in this file.
defaults:
  - _self_
  - /model: small
  - /noise: loglinear
  - /lr_scheduler: constant_warmup
  # Empty list: no experiment configs are selected by default.
  - /experiments: []

# Top-level run switches. `slurm` and `debug` are read by the `wandb.resume`
# and `wandb.project` expressions in this file; the consumers of the remaining
# keys are not visible here.
slurm: False
debug: False
mode: train
diffusion: absorbing_state
backbone: dit
parameterization: subs
time_conditioning: False
T: 0
subs_masking: False
seed: 42
profile: False

# Hardware / scheduler selection kept at the top level; `trainer.devices` and
# `trainer.num_nodes` interpolate `devices` and `nodes`, and `timeout_min`
# interpolates `partition`.
# `device_count`, `find_partition` and `find_constraint` are custom resolvers
# that must be registered by the application before this config is resolved.
devices: ${device_count:}
nodes: 1
partition: ${find_partition:}
constraint: ${find_constraint:}
ckpt: null

# Data-loader settings.
loader:
  # Target batch size across all devices and nodes; also the first argument of
  # `trainer.accumulate_grad_batches`.
  desired_global_batch_size: 512
  global_batch_size: null
  # Relative interpolation: follows the sibling `global_batch_size` (null by default).
  eval_global_batch_size: ${.global_batch_size}
  # `batch_size` and `eval_batch_size` use the same expression:
  # div_up(desired_global_batch_size, trainer.devices * trainer.num_nodes).
  # `div_up` and `eval` are custom resolvers defined outside this file
  # (presumably ceiling division and Python expression evaluation -- TODO confirm).
  batch_size: ${div_up:${.desired_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  eval_batch_size: ${div_up:${.desired_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  # max(<CPUs in this process's affinity mask> // 16, 4), i.e. never fewer than 4.
  # `os.sched_getaffinity` is not available on every platform.
  num_workers: ${eval:"max(len(__import__('os').sched_getaffinity(0)) // 16, 4)"}
  pin_memory: True
  persistent_workers: True

# Sampling (generation) settings.
sampling:
  predictor: ddpm_cache
  steps: 1000
  max_sampling_steps: 500
  noise_removal: True
  num_sample_log: 2
  # Semi-autoregressive options (inferred from the key names; consumers not visible here).
  semi_ar: False
  stride_length: 1
  num_strides: 1

# Evaluation settings. Note that this section shares its name with the `eval`
# resolver used in `${eval:...}` expressions; the two are unrelated.
eval:
  checkpoint_path: ''
  disable_ema: False
  compute_generative_perplexity: False
  perplexity_batch_size: 8
  gen_ppl_eval_model_name_or_path: gpt2-large
  generate_samples: True
  cfg: null
  num_masking_viz_batches: 1
  num_sample_batches: 2
  test_eval_speed: False
  standalone_fid: False
  visualize_data_only: false
  val_with_train_data: false
  max_num_fid_batches_per_device: null
  class_conditional_fid: false
  compute_entropy: false
  compute_standalone_mauve: false
  compute_standalone_entropy: false
  compute_img_to_txt_mauve_clip: false
  compute_img_to_txt_mauve_during_unconditional_fid: false
  mauve_num_samples: 5000
  mauve_divergence_curve_discretization_size: 25
  mauve_average_over_seeds: 3
  mauve_scaling_factor: 5
  txt_conditional_fid: false
  unconditional_fid: false
  fid_mode: inline
  calculate_clip_score: false
  clean_fid_use_precomputed_stats: false
  clean_fid_precomputed_name: null
  clean_fid_precomputed_split: null
  clean_fid_precomputed_res: null
  attention_caching: false
  set_random_gen_seed: false
  compute_val_metrics_standalone: false
  # max(num_val_metrics_standalone_samples // (trainer.devices * loader.eval_batch_size), 1).
  # With the default of -1 samples below, the floor division is negative and the
  # result is 1 (assuming a positive divisor and that the `eval` resolver
  # evaluates a Python expression).
  # NOTE(review): the divisor omits trainer.num_nodes, unlike loader.batch_size --
  # confirm this is intended for multi-node runs.
  num_val_metrics_standalone_batches_per_device: ${eval:'max(${eval.num_val_metrics_standalone_samples} // (${trainer.devices} * ${loader.eval_batch_size}), 1)'}
  num_val_metrics_standalone_samples: -1
  return_unweighed_sim: false
  compute_chameleon_perplexity: false
  global_disable_mauve: false
  bypass_normal_validation: false
  auto_enhance: false
  num_auto_enhance_iter: 2
  ar_inpainting_min_val: 0.5
  ar_inpainting_max_val: 1.0
  ar_inpainting_force_val: null

# Optimizer hyper-parameters. The optimizer class itself is selected by
# `trainer.optimizer_cls`.
optim:
  weight_decay: 0
  lr: 3e-4
  beta1: 0.9
  beta2: 0.999
  eps: 1e-8
  fused: true

# Model defaults defined in this file. The `/model: small` group from the
# defaults list is composed after `_self_`, so it can add to or override these.
model:
  # Image tokenizer (VAE) options.
  use_custom_vae_config: false
  use_custom_vae_ckpt: null
  downscale_ratio: null
  image_vocab_size: null
  vae_type: null
  use_attention_mask: false

  # `cond_*` counterparts of the VAE options above (presumably for a
  # conditioning image -- TODO confirm against the model code).
  cond_use_custom_vae_config: false
  cond_use_custom_vae_ckpt: null
  cond_downscale_ratio: null
  cond_image_vocab_size: null
  cond_vae_type: null
  text_model: true

  # Attention / architecture switches.
  attn_type: flash
  force_varlen_attn: false
  force_cast_bf16: false
  norm_type: layernorm
  mup: false
  qk_norm: false
  distillation: false
  force_argmax_valid_indices: false
  use_flash_attn_3: false
  use_spda_attn: false
  rope_2d: false
  modality_embed: false
  zero_linear_init: true
  full_attention: true
  use_lora: false
  use_kv_cache: false
  force_optimized_native_attn: false
  use_pretrained_img_emb: true
  use_flex_attention: false
  add_labels: null
  flex_attention_txt_masking_prob: null
  flex_attention_img_masking_prob: null

# Trainer settings. `_target_` names `lightning.Trainer`, but most keys below are
# project-specific options rather than Trainer arguments, so this node is
# presumably not passed to `hydra.utils.instantiate` unchanged -- TODO confirm.
# NOTE(review): every key up to `wandb:` is kept under `trainer` because no other
# section header appears in between; verify against the code that reads
# `config.trainer.*`.
trainer:
  _target_: lightning.Trainer
  accelerator: cuda
  # Mirror the top-level `nodes` / `devices` values.
  num_nodes: ${nodes}
  devices: ${devices}

  # find_grad_accum(desired global batch, devices * per-device batch * nodes).
  # `find_grad_accum` is a custom resolver defined outside this file; presumably
  # it returns the accumulation steps needed to reach the desired global batch.
  accumulate_grad_batches: ${find_grad_accum:${loader.desired_global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
  gradient_clip_val: 1.0
  precision: 'bf16'
  max_steps: 1_000_000_000

  num_epochs: 1_000_000_000
  optimizer_cls: adamw
  set_grads_to_none: true
  eval_on_start: true
  eval_decay_steps: false
  eval_epochs: null
  ckpt_steps: 100000
  fsdp: false
  force_enable_checkpointing: false
  limit_val_batches: null
  ckpt_every_n_minutes: 60
  ckpt_recent_timeout_minutes: 10
  checkpoint_all_ranks: true
  force_null_sigma: false

  log_every_n_steps: 10
  limit_train_batches: 1.0
  val_check_interval: 100

  ema: 0.9999
  antithetic_sampling: True
  importance_sampling: False
  sampling_eps: 1e-3
  change_of_variables: False
  benchmark: true
  backward_pass: true
  forward_pass: true
  profile_memory: false
  pytorch_profile: false
  nvtx_profile: false
  custom_ddp_bf16: true
  # The key is spelled `seperate` in this config; keep the spelling unless every
  # reader of the key is renamed with it.
  log_seperate_modal_losses: true
  use_gradient_checkpointing: false
  text_loss_weight: null
  img_loss_weight: null
  disable_strict_load: false
  attach_oom_observer_eval: false
  find_unused_parameters: false
  restart_on_failure: false
  skip_early_checkpointing: true
  log_flops: true
  sync_timing: false
  use_custom_ema: false
  scale_lr_by_batch_size: false
  tpu_eager: false
  allow_dynamic_nodes: false
  force_disable_signal_handler: false
  tpu_profile: false
  tpu_cache: false
  enable_jax_smi: false
  tpu_compile_debug: false
  xla_spmd: false
  log_grad_norm: true
  tpu_profile_markers: true
  compile: false
  disable_all_checkpointing: false
  tpu_force_mark_step: false
  ar_shift: false
  ar_llm_loss: false
  ar_print_loss: false
  chameleon_z_loss: null
  image_mode: discrete
  chameleon_use_ce_loss: false
  low_precision_loss: false
  low_precision_params: false
  scratch: false
  use_spmd_distributed_checkpointing: null
  use_simple_spmd_distributed_checkpointing: false
  load_from_state_dict: null
  load_from_optimizer_state_dict: null
  multimodal_batches: false
  sync_dataloader_timing: false
  compile_flag_pos_emb: false
  compile_fullgraph: false
  compile_mode: max-autotune-no-cudagraphs
  joint_ar_nar_prob: null
  joint_ar_nar_prob_warmup_steps: null
  joint_ar_nar_timestep_warmup_steps: null
  spmd_mesh: null
  detect_anomaly: false
  freeze_chameleon_embeddings: false
  ckpt_model_only: false
  use_orig_params: null
  disable_adjust_num_warmup_steps: false
  mask_entire_modality: null
  iterate_dataloader_only: false
  force_bf16_eval: false
  disable_all_eval_generation: false
  debug_xla_sept: false
  ignore_text_in_unified: false
  allow_null_sigma: false
  disable_forward_autocast_during_eval: false
  viz_images_only: false
  add_label: false
  first_token_dropout: null
  disable_ddp_optimizer: false
  rand_flip_ar_prob: null
  rand_ar_modality_dropout: null
  use_linear_warmup_cosine_annealing: false
  no_ce_weighting: false
  interleaved: false
  interleaved_training_flex_attention: false
  awr: false
  ar_inpainting: false

# Weights & Biases logging settings.
wandb:
  entity: grads
  # "unidisc-debug" when the top-level `debug` flag is set, otherwise "unidisc".
  project: ${eval:'"unidisc-debug" if ${debug} else "unidisc"'}
  # "allow" when the top-level `slurm` flag is set, otherwise None.
  resume: ${eval:'"allow" if ${slurm} else None'}
  id: null
  group: null
  job_type: null
  name: null
  # Runs are tagged with the training dataset name (`data.train`).
  tags:
    - ${data.train}

# Output locations, read from the environment with fallbacks:
# `checkpointing_root_dir` falls back to null, `root_output_dir` to "outputs".
checkpointing_root_dir: ${oc.env:UNIDISC_CHECKPOINTING_ROOT_DIR,null}
root_output_dir: ${oc.env:UNIDISC_ROOT_OUTPUT_DIR,outputs}
# Launch command prefix as a literal block scalar. The final line ends with a
# shell line-continuation, so the consumer presumably appends the script and its
# arguments -- TODO confirm. The shell variables ($SLURM_NNODES, $NUM_PROCESSES,
# $MASTER_ADDR, $MASTER_PORT, $SLURM_PROCID) are expanded by the shell at launch
# time, not by OmegaConf.
python_orig: |
  accelerate launch \
  --num_machines $SLURM_NNODES \
  --num_processes $NUM_PROCESSES \
  --rdzv_backend c10d \
  --main_process_ip $MASTER_ADDR \
  --main_process_port $MASTER_PORT \
  --machine_rank $SLURM_PROCID \
  --mixed_precision bf16 \
  --dynamo_backend no \
  --enable_cpu_affinity \
  --max_restarts 0 \

# Per-job resource requests (units are not stated in this file; `mem_per_gpu`
# is presumably in GB -- TODO confirm).
mem_per_gpu: 40
cpus_per_gpu: 8
slurm_name: null
# Time limit derived from the selected partition through the custom
# `partition_limit` resolver (defined outside this file).
timeout_min: ${partition_limit:${partition}}
# Hydra runtime settings. `run.dir` and `sweep.dir` use the same expression:
# the HYDRA_RUN_DIR environment variable if set, otherwise
#   <root_output_dir>/outputs/<get_dir_name>/<HYDRA_RUN_DIR_NAME or YYYY_MM_DD/HH_MM_SS>
# `get_dir_name` is a custom resolver defined outside this file.
hydra:
  run:
    dir: ${oc.env:HYDRA_RUN_DIR,${root_output_dir}/outputs/${get_dir_name:}/${oc.env:HYDRA_RUN_DIR_NAME,${now:%Y_%m_%d}/${now:%H_%M_%S}}}
  sweep:
    dir: ${oc.env:HYDRA_RUN_DIR,${root_output_dir}/outputs/${get_dir_name:}/${oc.env:HYDRA_RUN_DIR_NAME,${now:%Y_%m_%d}/${now:%H_%M_%S}}}
    # Each job of a multirun gets a sub-directory named after its job id.
    subdir: ${hydra.job.id}
  job:
    # Change the working directory to the run directory before the job starts.
    chdir: true

# Checkpoint saving / resuming. `cwd` is a custom resolver defined outside this
# file (presumably the current working directory, which is the Hydra run
# directory because `hydra.job.chdir` is true -- TODO confirm).
checkpointing:

  save_dir: ${cwd:}/checkpoints

  resume_from_ckpt: true
  # Same location as `save_dir`, so a restarted run looks for checkpoints where it
  # wrote them.
  resume_ckpt_path: ${cwd:}/checkpoints
  initial_resume_ckpt_path: null
  resume_wandb: true
  checkpoints_total_limit: 2
  use_automatic_naming: false

# Dataset and tokenization settings.
data:
  # Uses HF_DATASETS_CACHE when set.
  # NOTE(review): the fallback is a hard-coded, user-specific absolute path; set
  # the environment variable (or override `data.cache_dir`) on other machines.
  cache_dir: ${oc.env:HF_DATASETS_CACHE,/grogu/user/mprabhud/aswerdlo/huggingface/datasets}
  # max(<CPUs in this process's affinity mask> // 4, 16), i.e. never fewer than 16.
  num_proc: ${eval:"max(len(__import__('os').sched_getaffinity(0)) // 4, 16)"}
  cond_resolution: null
  iterable: false
  force_disable_shuffle: false
  pin_dataset_to_gpu: false
  webdataset_iterable: false
  webdataset_train_data: null
  webdataset_val_data: null
  webdataset_train_num_samples: null
  webdataset_val_num_samples: null
  webdataset_indexed: false
  dataset_type: null
  keep_tensordict_on_disk: false
  use_token_dataset: false
  use_custom_tensordict_collate: false
  use_weighted_tensordict_sampler: false
  enable_cuda_in_tensordict_collate: true
  data_dir_train: null
  data_dir_val: null
  token_output_dir: null
  wrap_dataloaders: true
  force_shuffle_train: false
  move_tensordict_to_shm: false
  keep_hf_dataset_in_memory: false
  use_chameleon: false
  tokenize_vqvae_in_dataloader: false
  force_mp_spawn: false
  force_raw_images_in_multiple_tensordict: false
  disable_text_modality: false
  txt_only: false
  disable_mask_after_eos: false
  allow_label: false
  split_dataset: false
  # `model.text_vocab_size` is not defined in this file; it is presumably supplied
  # by the selected `/model` group -- verify, otherwise resolution fails.
  img_token_shift: ${model.text_vocab_size}
  zero_shot_eval_dataset: null
  require_sample_ids: false
  use_packing_collate: false
  dynamic_packing_lengths: false
  remove_txt_img_padding: false
  add_image_gen_tokens: false
  use_slow_tokenizer: false
  add_image_token: false
  # Placeholder dataset names, expected to be overridden; `data.train` is also
  # used as the W&B tag.
  train: "unset_dataset"
  val: "unset_dataset"
  tokenizer_name_or_path: "NousResearch/Llama-2-7b-hf"

# NOTE(review): placed at the top level (not under `data`); its purpose is not
# visible in this file -- confirm against the code or CLI overrides that use it.
dummyarg: null