{
  "hardware_name": "4xL4",
  "specs": {
    "gpu_count": 4,
    "gpu_type": "L4",
    "vram_per_gpu": 24,
    "total_vram": 96,
    "vcpu_count": 48,
    "ram": 186
  },
  "training_optimizations": {
    "per_device_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "effective_batch_size": 192,
    "memory_optimizations": {
      "use_gradient_checkpointing": true,
      "pin_memory": true,
      "num_workers": 4,
      "use_flash_attention": true
    },
    "distributed_settings": {
      "device_map": "auto",
      "ddp_find_unused_parameters": false,
      "use_fsdp": true,
      "fsdp_config": {
        "sharding_strategy": "FULL_SHARD",
        "mixed_precision": "BF16",
        "activation_checkpointing": true
      }
    }
  },
  "memory_breakdown": {
    "model_size": "~3.5GB (pre-quantized 4-bit)",
    "optimizer_states": "~1GB",
    "batch_memory_per_gpu": "~3GB",
    "peak_memory_estimate": "~18GB",
    "safe_headroom": "~6GB"
  },
  "compute_environment": "L4_CLOUD",
  "distributed_type": "FSDP",
  "mixed_precision": "bf16",
  "num_gpus": 4,
  "training_parameters": {
    "per_device_train_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "dataloader_num_workers": 4,
    "dataloader_pin_memory": true,
    "gradient_checkpointing": true,
    "max_grad_norm": 1.0
  },
  "memory_optimization": {
    "offload_to_cpu": false,
    "use_flash_attention": true,
    "use_gradient_checkpointing": true
  }
}