# Build manifest for the "quantization" kernel package.
#
# Layout:
#   [general]        package-level metadata
#   [torch]          sources of the Torch binding layer
#   [kernel.<name>]  one table per group of CUDA sources, each with its own
#                    backend, target compute capabilities and dependencies
#
# Every table that lists sources also sets include = ["."].
# NOTE(review): source paths are presumably relative to the directory that
# holds this manifest - confirm against the build tool's documentation.

[general]
name = "quantization"
# NOTE(review): presumably marks the package as containing compiled,
# platform-specific kernels rather than portable code only - confirm.
universal = false

# Torch binding layer: the binding translation unit, its header, and the
# shared scalar-type header.
[torch]
include = ["."]
src = [
  "core/scalar_type.hpp",
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]

# GPTQ/AWQ Marlin kernels, listed for compute capability 8.0 and newer.
# The kernel_<activation>_<weight>.cu files look like one translation unit
# per activation/weight type pair (bf16 or fp16 against five weight types).
[kernel.gptq_marlin]
backend = "cuda"
cuda-capabilities = [
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "core/scalar_type.hpp",
  "gptq_marlin/awq_marlin_repack.cu",
  "gptq_marlin/dequant.h",
  "gptq_marlin/gptq_marlin.cu",
  "gptq_marlin/gptq_marlin_repack.cu",
  "gptq_marlin/kernel.h",
  "gptq_marlin/kernel_bf16_kfe2m1f.cu",
  "gptq_marlin/kernel_bf16_kfe4m3fn.cu",
  "gptq_marlin/kernel_bf16_ku4.cu",
  "gptq_marlin/kernel_bf16_ku4b8.cu",
  "gptq_marlin/kernel_bf16_ku8b128.cu",
  "gptq_marlin/kernel_fp16_kfe2m1f.cu",
  "gptq_marlin/kernel_fp16_kfe4m3fn.cu",
  "gptq_marlin/kernel_fp16_ku4.cu",
  "gptq_marlin/kernel_fp16_ku4b8.cu",
  "gptq_marlin/kernel_fp16_ku8b128.cu",
  "gptq_marlin/marlin.cuh",
  "gptq_marlin/marlin_dtypes.cuh",
  "gptq_marlin/marlin_template.h",
]

# INT8 quantization kernels, listed for compute capability 7.0 and newer.
[kernel.int8_common]
backend = "cuda"
cuda-capabilities = [
  "7.0",
  "7.2",
  "7.5",
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "compressed_tensors/int8_quant_kernels.cu",
  "dispatch_utils.h",
  "vectorization_utils.cuh",
]

# FP8 common kernels; same capability list as int8_common.
[kernel.fp8_common]
backend = "cuda"
cuda-capabilities = [
  "7.0",
  "7.2",
  "7.5",
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "fp8/common.cu",
  "fp8/common.cuh",
  "dispatch_utils.h",
  "utils.cuh",
  "vectorization.cuh",
]

# CUTLASS 3.x (c3x) W8A8 scaled-mm sources for sm90. Restricted to the single
# capability "9.0a" and built against the cutlass_3_9 dependency.
# NOTE(review): the "a" suffix presumably selects the architecture-specific
# target, and cuda-minver presumably sets the minimum CUDA toolkit version -
# confirm both against the build tool's documentation.
[kernel.cutlass_w8a8_hopper]
backend = "cuda"
cuda-capabilities = ["9.0a"]
cuda-minver = "12.0"
depends = [
  "cutlass_3_9",
  "torch",
]
include = ["."]
src = [
  "cuda_utils.h",
  "core/math.hpp",
  "cutlass_w8a8/c3x/cutlass_gemm_caller.cuh",
  "cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm.cuh",
  "cutlass_w8a8/c3x/scaled_mm_kernels.hpp",
  "cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu",
  "cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm_helper.hpp",
  "cutlass_w8a8/scaled_mm_c3x_sm90.cu",
  "cutlass_extensions/common.hpp",
  "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp",
  "cutlass_extensions/gemm/dispatch_policy.hpp",
  "cutlass_extensions/gemm/collective/collective_builder.hpp",
  "cutlass_extensions/gemm/collective/fp8_accumulation.hpp",
  "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp",
]

# CUTLASS 3.x (c3x) W8A8 scaled-mm sources for sm100. Listed for "10.0a",
# "10.1a" and "12.0a", with a higher cuda-minver ("12.9") than the sm90 table.
[kernel.cutlass_w8a8_blackwell]
backend = "cuda"
cuda-capabilities = [
  "10.0a",
  "10.1a",
  "12.0a",
]
cuda-minver = "12.9"
depends = [
  "cutlass_3_9",
  "torch",
]
include = ["."]
src = [
  "cuda_utils.h",
  "cutlass_w8a8/scaled_mm_c3x_sm100.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh",
  "cutlass_w8a8/c3x/scaled_mm_helper.hpp",
  "cutlass_w8a8/c3x/scaled_mm_kernels.hpp",
  "cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu",
  "cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh",
]

# CUTLASS 2.x (c2x) W8A8 scaled-mm sources plus the scaled_mm entry point,
# listed for compute capability 7.5 and newer (sm75/sm80/sm89 dispatch
# headers).
[kernel.cutlass_w8a8]
backend = "cuda"
cuda-capabilities = [
  "7.5",
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = [
  "cutlass_3_9",
  "torch",
]
include = ["."]
src = [
  "core/math.hpp",
  "cutlass_w8a8/scaled_mm_c2x.cu",
  "cutlass_w8a8/scaled_mm_c2x.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh",
  "cutlass_w8a8/scaled_mm_entry.cu",
  "cutlass_extensions/common.cpp",
  "cutlass_extensions/common.hpp",
  "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp",
  "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp",
]

# Marlin kernels in three source groups (dense, qqq, sparse), listed for
# compute capability 8.0 and newer.
[kernel.marlin]
backend = "cuda"
cuda-capabilities = [
  "8.0",
  "8.6",
  "8.7",
  "8.9",
  "9.0",
  "10.0",
  "10.1",
  "12.0",
]
depends = ["torch"]
include = ["."]
src = [
  "core/scalar_type.hpp",
  "marlin/dense/common/base.h",
  "marlin/dense/common/mem.h",
  "marlin/dense/marlin_cuda_kernel.cu",
  "marlin/qqq/marlin_qqq_gemm_kernel.cu",
  "marlin/sparse/common/base.h",
  "marlin/sparse/common/mem.h",
  "marlin/sparse/common/mma.h",
  "marlin/sparse/marlin_24_cuda_kernel.cu",
]