{
  "hardware_name": "4xL4",
  "specs": {
    "gpu_count": 4,
    "gpu_type": "L4",
    "vram_per_gpu": 24,
    "total_vram": 96,
    "vcpu_count": 48,
    "ram": 186
  },
  "training_optimizations": {
    "per_device_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "effective_batch_size": 192,
    "memory_optimizations": {
      "use_gradient_checkpointing": true,
      "pin_memory": true,
      "num_workers": 4,
      "use_flash_attention": true
    },
    "distributed_settings": {
      "device_map": null,
      "ddp_find_unused_parameters": false,
      "use_fsdp": true,
      "fsdp_config": {
        "sharding_strategy": "FULL_SHARD",
        "mixed_precision": "BF16",
        "activation_checkpointing": true
      }
    }
  },
  "memory_breakdown": {
    "model_size": "~3.5GB (pre-quantized 4-bit)",
    "optimizer_states": "~1GB",
    "batch_memory_per_gpu": "~3GB",
    "peak_memory_estimate": "~18GB",
    "safe_headroom": "~6GB"
  },
  "compute_environment": "L4_CLOUD",
  "distributed_type": "FSDP",
  "mixed_precision": "bf16",
  "num_gpus": 4,
  "training_parameters": {
    "per_device_train_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "dataloader_num_workers": 4,
    "dataloader_pin_memory": true,
    "gradient_checkpointing": true,
    "max_grad_norm": 1.0
  },
  "memory_optimization": {
    "offload_to_cpu": false,
    "use_flash_attention": true,
    "use_gradient_checkpointing": true
  }
}