YangKai0616 committed
Commit c1e53ae · 1 Parent(s): acd39ac

Add support for XPU to run gpt-oss
torch-ext/triton_kernels/matmul_ogs.py CHANGED
@@ -602,6 +602,7 @@ def matmul_ogs_torch(x, w, bias,
                     betas = None,
                     gammas = None,
                     round_x = None, round_y = None,
+                    device: str = "cuda",
                     ):
    is_input_batched = x.ndim == 3
    assert x.dtype.itemsize > 1
@@ -641,7 +642,7 @@ def matmul_ogs_torch(x, w, bias,
        else:
            idx = gather_indx.src_indx[lo:hi] // n_expts_act
        batch = i if is_input_batched else 0
-       out = torch.matmul(round_x(x[batch, idx, :], torch.arange(lo, hi, device="cuda")).float(),
+       out = torch.matmul(round_x(x[batch, idx, :], torch.arange(lo, hi, device=device)).float(),
                           w[i].float())
        if bias is not None:
            out += bias[i, :] if betas is None else bias[i, :] * betas[lo:hi, None]
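The hunk above threads a `device` argument through the reference path so index tensors are no longer pinned to CUDA. A minimal sketch of the pattern, assuming an XPU-enabled PyTorch build; `reference_row_matmul` and the identity `round_x` stub are illustrative only, not functions from this repo:

import torch

def round_x(t, indices):
    # hypothetical stand-in for the real rounding hook: no-op rounding
    return t

def reference_row_matmul(x, w, lo, hi, device: str = "cuda"):
    # the index tensor must live on the same device as `x`;
    # a hardcoded device="cuda" here breaks XPU-resident inputs
    idx = torch.arange(lo, hi, device=device)
    return torch.matmul(round_x(x[idx, :], idx).float(), w.float())

# usage: pick XPU when available, otherwise fall back to CPU
dev = "xpu" if getattr(torch, "xpu", None) and torch.xpu.is_available() else "cpu"
x = torch.randn(8, 16, device=dev)
w = torch.randn(16, 4, device=dev)
print(reference_row_matmul(x, w, 0, 8, device=dev).shape)  # torch.Size([8, 4])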
torch-ext/triton_kernels/matmul_ogs_details/_common.py CHANGED
@@ -7,9 +7,21 @@ from triton.tools.tensor_descriptor import TensorDescriptor
# -----------------------------------------------------------------------------
# Utilities
# -----------------------------------------------------------------------------
+ try:
+     _ver_str = getattr(triton, "__version__", "0.0.0").split("+")[0]
+     _parts = _ver_str.split(".")
+     _ver_tuple = tuple(int(p) for p in (_parts + ["0", "0", "0"])[:3])
+ except Exception:
+     _ver_tuple = (0, 0, 0)

+ if _ver_tuple > (3, 4, 0) and hasattr(triton, "constexpr_function"):
+     _constexpr_function = triton.constexpr_function
+ else:
+     _constexpr_function = tl.constexpr_function

- @tl.constexpr_function
+
+ @_constexpr_function
def get_scaled_dot_format_string(dtype: tl.dtype):
    mapping = {
        tl.float16: "fp16",
torch-ext/triton_kernels/matmul_ogs_details/_finalize_matmul.py CHANGED
@@ -4,25 +4,26 @@ from ..numerics_details.flexpoint import float_to_flex, load_scale, update_scale
from ..numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
from ..target_info import cuda_capability_geq as _cuda_capability_geq
from ..target_info import is_hip as _is_hip
+ from ._common import _constexpr_function


# fmt: off
- @tl.constexpr_function
+ @_constexpr_function
def is_hip():
    return _is_hip()


- @tl.constexpr_function
+ @_constexpr_function
def cuda_capability_geq(x, y):
    return _cuda_capability_geq(x, y)


- @tl.constexpr_function
+ @_constexpr_function
def log2(n):
    return len(bin(n)) - 3


- @tl.constexpr_function
+ @_constexpr_function
def _permute_to_end_order(n: int, axis: int):
    """
    Returns the order of the axes of a tensor to permute `axis` to the end.
@@ -105,7 +106,7 @@ def _finalize_matmul_launch_metadata(grid, kernel, args):
    return ret


- @tl.constexpr_function
+ @_constexpr_function
def _accumulate_f16_into_f32_and_track_absmax_ptx(n_inputs: int, src_type: str, absmax_reg_name: str | None):
    """
    Generate PTX code to take fp16 inputs and sum them into an f32 accumulator using mixed-precision
torch-ext/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py CHANGED
@@ -12,14 +12,14 @@ from ..numerics_details.flexpoint import (
    compute_scale,
)
from ..numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
- from ._common import make_matmul_repr, matmul_launch_metadata, swizzle2d, xcd_swizzle, get_scaled_dot_format_string
+ from ._common import make_matmul_repr, matmul_launch_metadata, swizzle2d, xcd_swizzle, get_scaled_dot_format_string, _constexpr_function


- @tl.constexpr_function
+ @_constexpr_function
def cuda_capability_geq(major, minor):
    return target_info.cuda_capability_geq(major, minor)

- @tl.constexpr_function
+ @_constexpr_function
def get_dtype(tensor_or_desc: tl.tensor | tl.tensor_descriptor) -> tl.dtype:
    if isinstance(tensor_or_desc, tl.tensor):
        return tensor_or_desc.dtype.element_ty
torch-ext/triton_kernels/matmul_ogs_details/opt_flags.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
import triton
from ..target_info import get_cdna_version
import torch
- from .opt_flags_details import opt_flags_amd, opt_flags_nvidia
+ from .opt_flags_details import opt_flags_amd, opt_flags_nvidia, opt_flags_intel


@dataclass
@@ -30,6 +30,83 @@ class OptFlags:
        raise ValueError("Not supported")


+ def make_default_opt_flags_intel(
+     out_dtype,
+     lhs_dtype,
+     rhs_dtype,
+     precision_config,
+     m,
+     n,
+     k,
+     routing_data,
+     can_use_persistent_tma,
+     can_use_fused_scatter,
+     enforce_bitwise_invariance,
+     epilogue_effective_itemsize,
+     constraints,
+ ):
+     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages"]
+     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
+     # tokens per expert
+     if routing_data is None:
+         tokens_per_expt = m
+     elif routing_data.expected_tokens_per_expt is None:
+         tokens_per_expt = max(1, m // routing_data.n_expts_tot)
+     else:
+         tokens_per_expt = routing_data.expected_tokens_per_expt
+     # pid swizzling
+     group_m = 8
+     xcd_swizzle = 1
+     # block_m
+     if constraints.get("block_m", None):
+         block_m = constraints["block_m"]
+     elif enforce_bitwise_invariance:
+         block_m = 128
+     else:
+         block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
+     # block n
+     block_n = opt_flags_intel.compute_block_n(n)
+     # is_persistent
+     is_persistent = constraints.get("is_persistent", False)
+     # block k
+     if constraints.get("block_k", None) is not None:
+         block_k = constraints["block_k"]
+     else:
+         block_k = opt_flags_intel.compute_block_k(k, is_persistent, precision_config)
+     # split_k
+     if constraints.get("split_k", None) is not None:
+         split_k = constraints["split_k"]
+     elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
+         split_k = 1
+     else:
+         estimated_actual_grid_size = opt_flags_intel.compute_grid_size(None, m, n, block_m, block_n)
+         split_k = opt_flags_intel.compute_split_k(block_k, k, estimated_actual_grid_size)
+
+     epilogue_subtile = constraints.get('epilogue_subtile', None)
+     if epilogue_subtile is None:
+         epilogue_subtile = 1
+
+     ret = OptFlags(
+         block_m=block_m,
+         block_n=block_n,
+         block_k=block_k,
+         num_warps=opt_flags_intel.compute_num_warps(block_m, block_n),
+         num_stages=constraints.get("num_stages", 2),
+         fused_scatter=constraints.get('fused_scatter', False),
+         group_m=group_m,
+         xcd_swizzle=xcd_swizzle,
+         w_cache_modifier=None,
+         split_k=split_k,
+         is_persistent=is_persistent,
+         epilogue_subtile=epilogue_subtile,
+         arch=None,
+         target_kernel_kwargs=dict(),
+         idle_sms=0,
+     )
+     # check constraints
+     assert all(getattr(ret, ck) == cv for ck, cv in constraints.items() if cv is not None), f"{ret} != {constraints}"
+     return ret
+

def make_default_opt_flags_amd(
    out_dtype,
@@ -292,6 +369,8 @@ def make_opt_flags(
        enforce_bitwise_invariance, epilogue_effective_itemsize,
        _opt_flags_constraints]
    backend = triton.runtime.driver.active.get_current_target().backend
+     if backend == "xpu":
+         return make_default_opt_flags_intel(*args)
    if backend == "hip":
        return make_default_opt_flags_amd(*args)
    if backend == "cuda":
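The new Intel builder is dispatched on the active Triton target's backend string ("xpu"), and its block sizes come from simple power-of-two heuristics on the problem shape. A small worked sketch of the `block_m` rule used above, assuming no user constraints; `pick_block_m` is an illustrative wrapper, not a function in this file:

import triton

def pick_block_m(tokens_per_expt: int, enforce_bitwise_invariance: bool = False) -> int:
    # same rule as make_default_opt_flags_intel: clamp the next power of two
    # of the per-expert token count into [16, 128]
    if enforce_bitwise_invariance:
        return 128
    return max(16, min(triton.next_power_of_2(tokens_per_expt), 128))

# 100 tokens/expert -> next_power_of_2(100) = 128 -> block_m = 128
#  24 tokens/expert -> next_power_of_2(24)  =  32 -> block_m = 32
#   3 tokens/expert -> next_power_of_2(3)   =   4 -> block_m = 16
print(pick_block_m(100), pick_block_m(24), pick_block_m(3))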
torch-ext/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_intel.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+ import triton
+
+
+ def compute_grid_size(routing_data, m, n, block_m, block_n):
+     if routing_data is not None:
+         grid_m = routing_data.n_blocks(m, block_m)
+     else:
+         grid_m = triton.cdiv(m, block_m)
+     grid_n = (n + block_n - 1) // block_n
+     return grid_m * grid_n
+
+
+ def compute_block_n(n: int):
+     # block_n:
+     return max(16, min(128, triton.next_power_of_2(n)))
+
+
+ def compute_block_k(k: int | None, is_persistent: bool, precision_config):
+     if k is not None:
+         block_k = max(32, min(128, triton.next_power_of_2(k)))
+     has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None
+     if is_persistent and has_mx_weight_scale:
+         block_k = min(block_k, 128)
+     return block_k
+
+
+ def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:
+     device_props = torch.xpu.get_device_properties(0)
+     n_sms = device_props.gpu_subslice_count
+     split_k = n_sms // grid_size
+     if k is not None:
+         # avoid split_k for small k
+         num_block_k = triton.cdiv(k, block_k)
+         split_k = min(split_k, num_block_k // 4)
+     split_k = max(split_k, 1)
+     return split_k
+
+
+ def compute_num_warps(block_m, block_n):
+     return max(block_m * block_n // 4096, 4)
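A quick sketch of how these helpers combine for one GEMM shape. The shape and the `n_sms` fallback are illustrative assumptions; the subslice query mirrors `compute_split_k` above and needs a visible XPU device to run:

import torch
import triton

# hypothetical shape: m=256 tokens, n=512, k=4096, no routing data, not persistent
m, n, k = 256, 512, 4096
block_m = 64                                                # caller-chosen, see opt_flags.py
block_n = max(16, min(128, triton.next_power_of_2(n)))      # compute_block_n -> 128
block_k = max(32, min(128, triton.next_power_of_2(k)))      # compute_block_k -> 128 (no mx scale)
grid = triton.cdiv(m, block_m) * triton.cdiv(n, block_n)    # compute_grid_size -> 16

if getattr(torch, "xpu", None) and torch.xpu.is_available():
    n_sms = torch.xpu.get_device_properties(0).gpu_subslice_count
else:
    n_sms = 64                                              # stand-in value for illustration

# same arithmetic as compute_split_k when k is known
split_k = max(1, min(n_sms // grid, triton.cdiv(k, block_k) // 4))
num_warps = max(block_m * block_n // 4096, 4)               # compute_num_warps -> 4
print(block_n, block_k, grid, split_k, num_warps)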
torch-ext/triton_kernels/numerics_details/flexpoint.py CHANGED
@@ -1,5 +1,6 @@
from ..numerics import MAX_FINITE_FLOAT8E4B8, MAX_FINITE_FLOAT8E4NV, MAX_FINITE_FLOAT8E5
from .. import target_info
+ from ..matmul_ogs_details._common import _constexpr_function
import triton
import triton.language as tl

@@ -52,7 +53,7 @@ def rcp_max_finite(dtype):
    tl.static_assert(tl.constexpr(False), f"{dtype} not supported in flexpoint")


- @tl.constexpr_function
+ @_constexpr_function
def cuda_capability_geq(major, minor):
    return target_info.cuda_capability_geq(major, minor)

torch-ext/triton_kernels/target_info.py CHANGED
@@ -1,54 +1,70 @@
import torch
import triton

- cached_capabilities = {}
+ from .matmul_ogs_details._common import _constexpr_function
+ from triton.runtime import driver

+ def current_target():
+     try:
+         active_driver = driver.active
+     except RuntimeError:
+         # If there is no active driver, return None
+         return None
+     return active_driver.get_current_target()

+ current_target.__triton_builtin__ = True
+
+
+ @_constexpr_function
def is_cuda():
-     if "is_cuda" not in cached_capabilities:
-         target = triton.runtime.driver.active.get_current_target()
-         cached_capabilities["is_cuda"] = False if target is None else target.backend == "cuda"
-     return cached_capabilities["is_cuda"]
+     target = current_target()
+     return target is not None and target.backend == "cuda"


+ @_constexpr_function
def is_hip():
-     if "is_hip" not in cached_capabilities:
-         cached_capabilities["is_hip"] = torch.cuda.is_available() and bool(torch.version.hip)
-     return cached_capabilities["is_hip"]
+     target = current_target()
+     return target is not None and target.backend == "hip"


+ @_constexpr_function
+ def is_xpu():
+     target = current_target()
+     return target is not None and target.backend == "xpu"
+
+
+ @_constexpr_function
def is_hip_cdna3():
-     if "is_hip_cdna3" not in cached_capabilities:
-         target = triton.runtime.driver.active.get_current_target()
-         cached_capabilities["is_hip_cdna3"] = (target is not None and target.backend == 'hip'
-                                                and target.arch == 'gfx942')
-     return cached_capabilities["is_hip_cdna3"]
+     target = current_target()
+     return target is not None and target.arch == "gfx942"


+ @_constexpr_function
def is_hip_cdna4():
-     if "is_hip_cdna4" not in cached_capabilities:
-         target = triton.runtime.driver.active.get_current_target()
-         cached_capabilities["is_hip_cdna4"] = (target is not None and target.backend == 'hip'
-                                                and target.arch == 'gfx950')
-     return cached_capabilities["is_hip_cdna4"]
+     target = current_target()
+     return target is not None and target.arch == "gfx950"


+ @_constexpr_function
def cuda_capability_geq(major, minor=0):
    """
    Determines whether we have compute capability >= (major, minor) and
    returns this as a constexpr boolean. This can be used for guarding
    inline asm implementations that require a certain compute capability.
    """
-     if is_hip():
+     target = current_target()
+     if target is None or target.backend != "cuda":
        return False
-     if "cuda" not in cached_capabilities:
-         if torch.cuda.is_available():
-             cached_capabilities["cuda"] = torch.cuda.get_device_capability()
-         else:
-             cached_capabilities["cuda"] = (0, 0)
-     return cached_capabilities["cuda"] >= (major, minor)
+     assert isinstance(target.arch, int)
+     return target.arch >= major * 10 + minor


+ @_constexpr_function
def get_cdna_version():
    """
    Gets the AMD architecture version, i.e. CDNA3 or CDNA4, currently
@@ -65,13 +81,18 @@ def get_cdna_version():
    return -1


+ @_constexpr_function
def has_tma_gather():
    return cuda_capability_geq(10, 0)


+ @_constexpr_function
def has_native_mxfp():
    return cuda_capability_geq(10, 0)


def num_sms():
-     return torch.cuda.get_device_properties(0).multi_processor_count
+     if is_cuda():
+         return torch.cuda.get_device_properties(0).multi_processor_count
+     if is_xpu():
+         return torch.xpu.get_device_properties(0).max_compute_units
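The reworked `target_info` derives everything from the active Triton target instead of cached `torch.cuda` probes, which is what lets `num_sms()` report the XPU compute-unit count. A small usage sketch of that idea; `active_backend` and `device_parallel_units` are illustrative wrappers, not part of the module:

import torch
import triton

def active_backend() -> str | None:
    # mirrors current_target(): None when no Triton driver/target is active
    try:
        target = triton.runtime.driver.active.get_current_target()
    except RuntimeError:
        return None
    return None if target is None else target.backend

def device_parallel_units() -> int:
    # CUDA exposes multi_processor_count, XPU exposes max_compute_units;
    # both play the same role in the split_k heuristic
    backend = active_backend()
    if backend == "cuda":
        return torch.cuda.get_device_properties(0).multi_processor_count
    if backend == "xpu":
        return torch.xpu.get_device_properties(0).max_compute_units
    raise NotImplementedError(f"unsupported backend: {backend}")

print(active_backend(), device_parallel_units())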