[general]
name = "quantization"
universal = false

[torch]
include = ["."]
src = [
  "core/scalar_type.hpp",
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]

[kernel.gptq_marlin]
backend = "cuda"
cuda-capabilities = [
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "core/scalar_type.hpp",
  "gptq_marlin/awq_marlin_repack.cu",
  "gptq_marlin/dequant.h",
  "gptq_marlin/gptq_marlin.cu",
  "gptq_marlin/gptq_marlin_repack.cu",
  "gptq_marlin/kernel.h",
  "gptq_marlin/kernel_bf16_kfe2m1f.cu",
  "gptq_marlin/kernel_bf16_kfe4m3fn.cu",
  "gptq_marlin/kernel_bf16_ku4.cu",
  "gptq_marlin/kernel_bf16_ku4b8.cu",
  "gptq_marlin/kernel_bf16_ku8b128.cu",
  "gptq_marlin/kernel_fp16_kfe2m1f.cu",
  "gptq_marlin/kernel_fp16_kfe4m3fn.cu",
  "gptq_marlin/kernel_fp16_ku4.cu",
  "gptq_marlin/kernel_fp16_ku4b8.cu",
  "gptq_marlin/kernel_fp16_ku8b128.cu",
  "gptq_marlin/marlin.cuh",
  "gptq_marlin/marlin_dtypes.cuh",
  "gptq_marlin/marlin_template.h",
]

#[kernel.fp8_common_rocm]
#backend = "rocm"
#depends = ["torch"]
#rocm-archs = [
#  "gfx906",
#  "gfx908",
#  "gfx90a",
#  "gfx940",
#  "gfx941",
#  "gfx942",
#  "gfx1030",
#  "gfx1100",
#  "gfx1101",
#]
#include = ["."]
#src = [
#  "attention/attention_dtypes.h",
#  "attention/attention_generic.cuh",
#  "attention/dtype_bfloat16.cuh",
#  "attention/dtype_float16.cuh",
#  "attention/dtype_float32.cuh",
#  "attention/dtype_fp8.cuh",
#  "fp8/amd/quant_utils.cuh",
#  "fp8/common.cu",
#  "fp8/common.cuh",
#  "dispatch_utils.h",
#  "utils.cuh",
#  "vectorization.cuh",
#]

[kernel.int8_common]
backend = "cuda"
cuda-capabilities = [
  "7.0",
  "7.2",
  "7.5",
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "compressed_tensors/int8_quant_kernels.cu",
  "dispatch_utils.h",
  "vectorization_utils.cuh",
]

[kernel.fp8_common]
backend = "cuda"
cuda-capabilities = [
  "7.0",
  "7.2",
  "7.5",
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "fp8/common.cu",
  "fp8/common.cuh",
  "dispatch_utils.h",
  "utils.cuh",
  "vectorization.cuh",
]

[kernel.cutlass_w8a8_hopper]
backend = "cuda"
cuda-capabilities = ["9.0a"]
cuda-minver = "12.0"
depends = [
  "cutlass_3_9",
  "torch",
]
include = ["."]
src = [
  "cuda_utils.h",
  "core/math.hpp",
  "cutlass_w8a8/c3x/cutlass_gemm_caller.cuh",
  "cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm.cuh",
  "cutlass_w8a8/c3x/scaled_mm_kernels.hpp",
  "cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu",
  "cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm_helper.hpp",
  "cutlass_w8a8/scaled_mm_c3x_sm90.cu",
  "cutlass_extensions/common.hpp",
  "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp",
  "cutlass_extensions/gemm/dispatch_policy.hpp",
  "cutlass_extensions/gemm/collective/collective_builder.hpp",
  "cutlass_extensions/gemm/collective/fp8_accumulation.hpp",
  "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp",
]
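
# Blackwell FP8 scaled_mm build. The "a" suffix on the capabilities below
# requests architecture-specific codegen (analogous to nvcc's sm_100a-style
# targets, which are not forward-compatible), and cuda-minver gates the
# build to toolkits that can target these architectures.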
"cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu", "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh", "cutlass_w8a8/c3x/scaled_mm_helper.hpp", "cutlass_w8a8/c3x/scaled_mm_kernels.hpp", "cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu", "cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh", ] [kernel.cutlass_w8a8] backend = "cuda" cuda-capabilities = [ "7.5", "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0", ] depends = [ "cutlass_3_9", "torch", ] include = ["."] src = [ "core/math.hpp", "cutlass_w8a8/scaled_mm_c2x.cu", "cutlass_w8a8/scaled_mm_c2x.cuh", "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh", "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh", "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh", "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh", "cutlass_w8a8/scaled_mm_entry.cu", "cutlass_extensions/common.cpp", "cutlass_extensions/common.hpp", "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp", "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp", ] [kernel.marlin] backend = "cuda" cuda-capabilities = [ "8.0", "8.6", "8.7", "8.9", "9.0", "10.0", "10.1", "12.0", ] depends = ["torch"] include = ["."] src = [ "core/scalar_type.hpp", "marlin/dense/common/base.h", "marlin/dense/common/mem.h", "marlin/dense/marlin_cuda_kernel.cu", "marlin/qqq/marlin_qqq_gemm_kernel.cu", "marlin/sparse/common/base.h", "marlin/sparse/common/mem.h", "marlin/sparse/common/mma.h", "marlin/sparse/marlin_24_cuda_kernel.cu", ] #[kernel.int8_common_rocm] #backend = "rocm" #depends = ["torch"] #rocm-archs = [ # "gfx906", # "gfx908", # "gfx90a", # "gfx940", # "gfx941", # "gfx942", # "gfx1030", # "gfx1100", # "gfx1101", #] #include = ["."] #src = [ # "compressed_tensors/int8_quant_kernels.cu", # "dispatch_utils.h", #]