# kernel
# flash-attn3 / build.toml
# danieldk's picture
# danieldk HF Staff
# Convert FA3 to Kernel Hub format
# eb8ddce
# Package-wide build settings.
[general]
name = "flash_attn3"
# NOTE(review): presumably `false` marks this as a compiled (non-universal)
# kernel — confirm against the build tool's documentation.
universal = false
# The minimum and maximum CUDA versions are both "12.4", so the range covers
# exactly one version (assuming the bounds are inclusive — TODO confirm).
cuda-minver = "12.4"
cuda-maxver = "12.4"
# Torch extension sources: the binding translation unit, its header, and a
# PyTorch shim header (all under torch-ext/).
[torch]
src = [
    "torch-ext/pytorch_shim.h",
    "torch-ext/torch_binding.cpp",
    "torch-ext/torch_binding.h",
]
# Kernel component whose sources (listed in `src` below) are the API file,
# common headers, and the forward-combine / scheduler-preparation .cu files.
[kernel.flash_attn]
backend = "cuda"
cuda-capabilities = ["8.0", "9.0a"]
# Flags for the CUDA compiler. Each flag is listed exactly once.
cuda-flags = [
    "-O3",
    "-std=c++17",
    "--ftemplate-backtrace-limit=0", # To debug template code
    "--use_fast_math",
    "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
    "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
    "-DCUTLASS_ENABLE_GDC_FOR_SM90",
    "--expt-relaxed-constexpr",
    "--expt-extended-lambda",
    "-DNDEBUG",
]
# API source, common headers, and the combine / scheduler-preparation CUDA
# sources (grouping inferred from the file names).
src = [
    "flash-attn/cuda_check.h",
    "flash-attn/flash_api.cpp",
    "flash-attn/flash_fwd_combine.cu",
    "flash-attn/flash_fwd_combine_kernel.h",
    "flash-attn/flash_fwd_combine_launch_template.h",
    "flash-attn/flash.h",
    "flash-attn/flash_prepare_scheduler.cu",
    "flash-attn/heuristics.h",
    "flash-attn/seqlen.h",
    "flash-attn/static_switch.h",
    "flash-attn/tile_size.h",
    "flash-attn/utils.h",
]
# Same dependency list as the other kernel components in this file.
depends = ["torch", "cutlass_3_9"]
# Kernel component that builds the `*_sm80.cu` instantiation files listed in
# `src` below.
[kernel.flash_attn_sm80]
backend = "cuda"
cuda-capabilities = ["8.0", "9.0a"]
# Flags for the CUDA compiler. Each flag is listed exactly once.
cuda-flags = [
    "-O3",
    "-std=c++17",
    "--ftemplate-backtrace-limit=0", # To debug template code
    "--use_fast_math",
    "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
    "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
    "-DCUTLASS_ENABLE_GDC_FOR_SM90",
    "--expt-relaxed-constexpr",
    "--expt-extended-lambda",
    "-DNDEBUG",
]
src = [
    # Headers (kernel, mainloop, epilogue and helper templates).
    "flash-attn/block.h",
    "flash-attn/copy_sm90_bulk_reduce.hpp",
    "flash-attn/epilogue_bwd.hpp",
    "flash-attn/epilogue_fwd.hpp",
    "flash-attn/flash.h",
    "flash-attn/flash_bwd_kernel_sm80.h",
    "flash-attn/flash_bwd_kernel_sm90.h",
    "flash-attn/flash_bwd_launch_template.h",
    "flash-attn/flash_bwd_postprocess_kernel.h",
    "flash-attn/flash_bwd_preprocess_kernel.h",
    "flash-attn/flash_fwd_launch_template.h",
    "flash-attn/flash_fwd_kernel_sm80.h",
    "flash-attn/flash_fwd_kernel_sm90.h",
    "flash-attn/heuristics.h",
    "flash-attn/mainloop_bwd_sm80.hpp",
    "flash-attn/mainloop_fwd_sm80.hpp",
    "flash-attn/mainloop_bwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mainloop_fwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mask.h",
    "flash-attn/named_barrier.hpp",
    "flash-attn/pack_gqa.h",
    "flash-attn/paged_kv.h",
    "flash-attn/rotary.h",
    "flash-attn/sm90_pipeline_no_cluster.hpp",
    "flash-attn/softmax.h",
    "flash-attn/tile_size.h",
    "flash-attn/tile_scheduler.hpp",
    # Backward instantiations: flash_bwd_hdim<N>_<dtype>[_softcap]_sm80.cu
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_sm80.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_softcap_sm80.cu",
    # Forward instantiations:
    # flash_fwd_hdim<N>_<dtype>[_paged][_split][_softcap]_sm80.cu
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_softcap_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_sm80.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_softcap_sm80.cu",
]
# The directory that holds the headers listed above.
include = ["flash-attn"]
depends = ["torch", "cutlass_3_9"]
# Kernel component that builds the `*_sm90.cu` instantiation files listed in
# `src` below.
[kernel.flash_attn_sm90]
backend = "cuda"
cuda-capabilities = ["8.0", "9.0a"]
# Flags for the CUDA compiler. Each flag is listed exactly once.
cuda-flags = [
    "-O3",
    "-std=c++17",
    "--ftemplate-backtrace-limit=0", # To debug template code
    "--use_fast_math",
    "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
    "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
    "-DCUTLASS_ENABLE_GDC_FOR_SM90",
    "--expt-relaxed-constexpr",
    "--expt-extended-lambda",
    "-DNDEBUG",
]
src = [
    # Headers (kernel, mainloop, epilogue and helper templates).
    "flash-attn/block.h",
    "flash-attn/copy_sm90_bulk_reduce.hpp",
    "flash-attn/epilogue_bwd.hpp",
    "flash-attn/epilogue_fwd.hpp",
    "flash-attn/flash.h",
    "flash-attn/flash_bwd_kernel_sm80.h",
    "flash-attn/flash_bwd_kernel_sm90.h",
    "flash-attn/flash_bwd_launch_template.h",
    "flash-attn/flash_bwd_postprocess_kernel.h",
    "flash-attn/flash_bwd_preprocess_kernel.h",
    "flash-attn/flash_fwd_launch_template.h",
    "flash-attn/flash_fwd_kernel_sm80.h",
    "flash-attn/flash_fwd_kernel_sm90.h",
    "flash-attn/heuristics.h",
    "flash-attn/mainloop_bwd_sm80.hpp",
    "flash-attn/mainloop_fwd_sm80.hpp",
    "flash-attn/mainloop_bwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mainloop_fwd_sm90_tma_gmma_ws.hpp",
    "flash-attn/mask.h",
    "flash-attn/named_barrier.hpp",
    "flash-attn/pack_gqa.h",
    "flash-attn/paged_kv.h",
    "flash-attn/rotary.h",
    "flash-attn/sm90_pipeline_no_cluster.hpp",
    "flash-attn/softmax.h",
    "flash-attn/tile_size.h",
    "flash-attn/tile_scheduler.hpp",
    # Backward instantiations: flash_bwd_hdim<N>_<dtype>[_softcap]_sm90.cu
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim128_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim192_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim256_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim64_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_sm90.cu",
    "flash-attn/instantiations/flash_bwd_hdim96_fp16_softcap_sm90.cu",
    # Forward instantiations:
    # flash_fwd_hdim<variant>_<dtype>[_paged][_split][_softcap][_packgqa]_sm90.cu
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim128_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_128_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim192_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim256_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_256_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_512_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim64_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdim96_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimall_fp16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_bf16_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_e4m3_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_paged_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_paged_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_paged_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_paged_split_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_softcap_packgqa_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_softcap_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_split_sm90.cu",
    "flash-attn/instantiations/flash_fwd_hdimdiff_fp16_split_softcap_sm90.cu",
]
# The directory that holds the headers listed above.
include = ["flash-attn"]
depends = ["torch", "cutlass_3_9"]
# [kernel.flash_attn_sm100]
# backend = "cuda"
# cuda-capabilities = ["8.0", "9.0a", "10.0"]
# cuda-flags = [
# "-O3",
# "-std=c++17",
# "--ftemplate-backtrace-limit=0", # To debug template code
# "--use_fast_math",
# "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",
# "-DCUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1",
# "-DCUTLASS_ENABLE_GDC_FOR_SM90",
# "--expt-relaxed-constexpr",
# "--expt-extended-lambda",
# "--use_fast_math",
# "-DNDEBUG",
# ]
# src = [
# "flash-attn/block.h",
# "flash-attn/copy_sm90_bulk_reduce.hpp",
# "flash-attn/epilogue_bwd.hpp",
# "flash-attn/epilogue_fwd.hpp",
# "flash-attn/flash.h",
# "flash-attn/flash_bwd_kernel_sm80.h",
# "flash-attn/flash_bwd_kernel_sm90.h",
# "flash-attn/flash_bwd_launch_template.h",
# "flash-attn/flash_bwd_postprocess_kernel.h",
# "flash-attn/flash_bwd_preprocess_kernel.h",
# "flash-attn/flash_fwd_launch_template.h",
# "flash-attn/flash_fwd_kernel_sm80.h",
# "flash-attn/flash_fwd_kernel_sm90.h",
# "flash-attn/heuristics.h",
# "flash-attn/mainloop_bwd_sm80.hpp",
# "flash-attn/mainloop_fwd_sm80.hpp",
# "flash-attn/mainloop_bwd_sm90_tma_gmma_ws.hpp",
# "flash-attn/mainloop_fwd_sm90_tma_gmma_ws.hpp",
# "flash-attn/mask.h",
# "flash-attn/named_barrier.hpp",
# "flash-attn/pack_gqa.h",
# "flash-attn/paged_kv.h",
# "flash-attn/rotary.h",
# "flash-attn/sm90_pipeline_no_cluster.hpp",
# "flash-attn/softmax.h",
# "flash-attn/tile_size.h",
# "flash-attn/tile_scheduler.hpp",
#
# "flash-attn/instantiations/flash_fwd_hdim128_bf16_sm100.cu",
# ]
# include = ["flash-attn"]
# depends = ["torch", "cutlass_3_9"]