---
# LHM-1B experiment configuration.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose line breaks and indentation had been collapsed. Verify the structure
# against the config schema the consuming code expects.

experiment:
  type: lrm
  seed: 42
  parent: video_human_benchmark
  child: human-lrm-1B

model:
  # image encoder
  model_name: SapDinoLRMBHSD3_5
  encoder_type: dinov2_fusion
  encoder_model_name: "dinov2_vitl14_reg"
  encoder_feat_dim: 1024  # dinov2 embedding size
  encoder_freeze: false

  fine_encoder_type: sapiens
  # sapiens pretrained model path
  fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2"
  fine_encoder_feat_dim: 1536  # sapiens embedding size
  fine_encoder_freeze: true

  use_face_id: true

  # points embeddings
  # num_pcl: 10240
  latent_query_points_type: "e2e_smplx_sub1"
  pcl_dim: 1024

  facesr: true

  # transformer
  # # camera_embed_dim: 1024
  # transformer_dim: 512
  # transformer_layers: 12
  # transformer_heads: 8
  transformer_type: "sd3_mm_bh_cond"  # multi-modal attention.
  transformer_heads: 16  # 30
  transformer_dim: 1024  # 30 * 64=1920
  transformer_layers: 15  # 30
  tf_grad_ckpt: true
  encoder_grad_ckpt: true

  # for gs renderer
  human_model_path: "./pretrained_models/human_model_files"
  smplx_subdivide_num: 1
  smplx_type: "smplx_2"
  gs_query_dim: 1024
  gs_use_rgb: true
  gs_sh: 3
  dense_sample_pts: 40000  # 40,000
  gs_mlp_network_config:
    n_neurons: 512
    n_hidden_layers: 2
    activation: silu
  # Previous settings, kept for reference:
  # gs_xyz_offset_max_step: 0.05625  # 1.8 / 32
  # gs_clip_scaling: 0.2  # avoid too large Sphere
  gs_xyz_offset_max_step: 1.0  # replaces the 1.8 / 32 value commented out above
  gs_clip_scaling: [100, 0.01, 0.05, 3000]  # [start, start_v, end_v, end]
  expr_param_dim: 100
  shape_param_dim: 10

  fix_opacity: false
  fix_rotation: false
  cano_pose_type: 1  # 0: exavatar pose, 1: REC-MV pose

dataset:
  subsets:
    - name: video_human_flame
      root_dirs: "./train_data/ClothVideo"
      meta_path:
        train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
        val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
      sample_rate: 1.0
      use_flame: true
      src_head_size: 112
    - name: video_human_flame_v2
      root_dirs: "./train_data/ClothVideo"
      meta_path:
        train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
        val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
      sample_rate: 1.0
      use_flame: true
      src_head_size: 112
  # Dataset-wide settings (these follow the subset list; src_head_size is
  # repeated here as well as inside each subset).
  sample_side_views: 5
  source_image_res: 1024
  src_head_size: 112
  render_image:
    low: 512
    high: 512
    region: null
  num_train_workers: 4
  multiply: 16  # dino features
  num_val_workers: 2
  pin_mem: true
  repeat_num: 1

train:
  mixed_precision: bf16  # REPLACE THIS BASED ON GPU TYPE
  find_unused_parameters: false
  loss_func:
    pixel_loss: l1  # L1 or MSE
    ball_loss:
      type: heuristic  # heuristic ball_loss
      group:
        head: 1.0
        lower_body: 100.0
        upper_body: 1000.0
        hands: 10000.0
    offset_loss:
      # NOTE(review): the original inline comment here read
      # "heuristic ball_loss", which looks copied from ball_loss above.
      type: classical
      group:
        head: 1.0
        lower_body: 1.0
        upper_body: 100.0
        hands: 1000.0
  loss:
    pixel_weight: 0.0
    masked_pixel_weight: 1.0
    masked_head_weight: 0.0
    perceptual_weight: 1.0
    # tv_weight: 5e-4
    tv_weight: -1
    mask_weight: 1.0
    face_id_weight: 0.05
    asap_weight: 10.0  # ball loss
    acap_weight: 1000.0  # offset loss
  optim:
    # Written with an explicit decimal point so that YAML 1.1 loaders
    # (e.g. PyYAML), which read a bare "4e-5" as a string, also yield a
    # float. The value itself is unchanged.
    lr: 4.0e-5
    weight_decay: 0.05
    beta1: 0.9
    beta2: 0.95
    clip_grad_norm: 0.1  # diffusion model
  scheduler:
    type: cosine
    warmup_real_iters: 0
  batch_size: 2  # REPLACE THIS (PER GPU)
  accum_steps: 1  # REPLACE THIS
  epochs: 60  # REPLACE THIS
  debug_global_steps: null

val:
  batch_size: 2
  global_step_period: 1000
  debug_batches: 10

saver:
  auto_resume: true
  # NOTE(review): YAML reads a bare None as the string "None", not as null.
  # Left unchanged; confirm which of the two the consuming code expects.
  checkpoint_root: None
  checkpoint_global_steps: 1000
  checkpoint_keep_level: 60

logger:
  stream_level: WARNING
  log_level: INFO
  log_root: ./exps/logs
  tracker_root: ./exps/trackers
  enable_profiler: false
  trackers:
    - tensorboard
  image_monitor:
    train_global_steps: 100
    samples_per_log: 4

compile:
  suppress_errors: true
  print_specializations: true
  disable: true