medmekk (HF Staff) committed
Commit a210373 · verified · 1 Parent(s): 295a3b4

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,6 @@
+ # Unsloth Kernels
+
+ Unsloth Kernels is a collection of kernels for the Unsloth project.
+
+ ## Installation
+
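Below is a minimal usage sketch (an editor's addition, not part of the commit). It assumes the wheel built from this repo is importable as `unsloth_kernels` (the name in build.toml), that the functions listed in `torch-ext/unsloth_kernels/__init__.py` are exported, and that a CUDA device is available for the Triton kernels:

```python
# Hypothetical usage sketch -- the import path and device assumptions may differ.
import torch
from unsloth_kernels import fast_cross_entropy_loss

batch, seq_len, vocab = 2, 8, 32000
logits = torch.randn(batch, seq_len, vocab, device="cuda", requires_grad=True)
labels = torch.randint(0, vocab, (batch, seq_len), device="cuda")
labels[:, 0] = -100  # -100 marks ignored (padding) positions

loss = fast_cross_entropy_loss(logits, labels)  # mean loss over non-ignored tokens
loss.backward()                                 # gradient is written back into `logits`
```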
build.toml ADDED
@@ -0,0 +1,5 @@
+ [general]
+ name = "unsloth_kernels"
+
+ [torch]
+ universal = true
flake.nix ADDED
@@ -0,0 +1,17 @@
+ {
+   description = "Flake for unsloth_kernels";
+
+   inputs = {
+     kernel-builder.url = "path:../..";
+   };
+
+   outputs =
+     {
+       self,
+       kernel-builder,
+     }:
+     kernel-builder.lib.genFlakeOutputs {
+       path = ./.;
+       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+     };
+ }
torch-ext/unsloth_kernels/__init__.py ADDED
@@ -0,0 +1,23 @@
+ from .cross_entropy_loss import fast_cross_entropy_loss
+ from .fast_lora import fast_lora_forward
+ from .flex_attention import slow_inference_attention_softcapping
+ from .layernorm import fast_layernorm
+ from .rope_embedding import inplace_rope_embedding, fast_rope_embedding
+ from .rms_layernorm import fast_rms_layernorm
+ from .swiglu import swiglu_fg_kernel
+ from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel, geglu_exact_forward_kernel, geglu_exact_backward_kernel
+
+ __all__ = [
+     "fast_cross_entropy_loss",
+     "fast_lora_forward",
+     "slow_inference_attention_softcapping",
+     "fast_layernorm",
+     "inplace_rope_embedding",
+     "fast_rms_layernorm",
+     "swiglu_fg_kernel",
+     "geglu_approx_forward_kernel",
+     "geglu_approx_backward_kernel",
+     "geglu_exact_forward_kernel",
+     "geglu_exact_backward_kernel",
+     "fast_rope_embedding",
+ ]
torch-ext/unsloth_kernels/cross_entropy_loss.py ADDED
@@ -0,0 +1,420 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ import triton.language as tl
17
+ import torch
18
+ from .utils import (
19
+ calculate_settings,
20
+ MAX_FUSED_SIZE,
21
+ triton_tanh,
22
+ triton_cast,
23
+ torch_cuda_device,
24
+ )
25
+ from transformers.models.llama.modeling_llama import logger
26
+ from packaging.version import Version
27
+
28
+ from unsloth_zoo.loss_utils import (
29
+ patch_loss_functions as _patch_loss_functions,
30
+ post_patch_loss_function,
31
+ )
32
+
33
+
34
+ def _cross_entropy_forward(
35
+ logits_ptr ,
36
+ logits_row_stride ,
37
+ loss_ptr ,
38
+ logsumexp_ptr ,
39
+ labels_ptr ,
40
+ VOCAB_SIZE : tl.constexpr,
41
+ BLOCK_SIZE : tl.constexpr,
42
+ DO_SOFTCAPPING : tl.constexpr,
43
+ SOFTCAP : tl.constexpr,
44
+ DO_LOGIT_SCALING : tl.constexpr,
45
+ LOGIT_SCALE : tl.constexpr,
46
+ ):
47
+ """
48
+ Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
49
+ Pi = exp(xi) / sum(exp(xi))
50
+ CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
51
+ = -y [ x - log[sum(exp(x))] ]
52
+ = y * (log[sum(exp(x))] - x)
53
+ If y == 0: CE_i = 0
54
+ If y == 1: CE_i = logsumexp - x
55
+
56
+ logsumexp is also stable
57
+ Take y = log[sum(exp(x))]
58
+ exp(y) = sum(exp(x))
59
+ exp(y) = sum(exp(x - c)*exp(c)) Since e^(x-c)*e^c = e^x
60
+ exp(y) = exp(c)*sum(exp(x - c))
61
+ y = log(exp(c)*sum(exp(x - c)))
62
+ y = c + log[sum(exp(x - c))]
63
+ This means we can set c = max(x) to make sure
64
+ exp(x - c) always is exp(x - max(x)).
65
+ This ensures exp(x - max(x))'s maximum is 1 as exp(0) = 1.
66
+ """
67
+ row_idx = tl.program_id(0)
68
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
69
+ loss_ptr += row_idx
70
+ logsumexp_ptr += row_idx
71
+ labels_ptr += row_idx
72
+
73
+ col_offsets = tl.arange(0, BLOCK_SIZE)
74
+ mask = col_offsets < VOCAB_SIZE
75
+
76
+ label_idx = tl.load(labels_ptr).to(tl.int32)
77
+ logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
78
+
79
+ # Go logit scaling for Cohere: t * x
80
+ if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
81
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
82
+ if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
83
+
84
+ c = tl.max(logits, 0)
85
+ logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
86
+
87
+ if label_idx != -100:
88
+ x = tl.load(logits_ptr + label_idx).to(tl.float32)
89
+ # Go logit scaling for Cohere: t * x
90
+ if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
91
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
92
+ if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
93
+ loss = logsumexp - x
94
+ else:
95
+ loss = 0.0
96
+ tl.store(logsumexp_ptr, logsumexp)
97
+ tl.store(loss_ptr, loss)
98
+ pass
99
+ _cross_entropy_forward = triton.jit(_cross_entropy_forward)
100
+ _cross_entropy_forward = triton.heuristics(
101
+ {
102
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
103
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
104
+ }
105
+ )(_cross_entropy_forward)
106
+
107
+
108
+ def _chunked_cross_entropy_forward(
109
+ logits_ptr ,
110
+ logits_row_stride ,
111
+ loss_ptr ,
112
+ logsumexp_ptr ,
113
+ labels_ptr ,
114
+ VOCAB_SIZE : tl.constexpr,
115
+ N_CHUNKS : tl.constexpr,
116
+ BLOCK_SIZE : tl.constexpr,
117
+ DO_SOFTCAPPING : tl.constexpr,
118
+ SOFTCAP : tl.constexpr,
119
+ DO_LOGIT_SCALING : tl.constexpr,
120
+ LOGIT_SCALE : tl.constexpr,
121
+ ):
122
+ """
123
+ 256K vocab divided in 4 chunks
124
+
125
+ |-65536-| |-65536-| |-65536-| |-65536-|
126
+ |-------| |-------| |-------| |-------|
127
+ |-------| |-------| |-------| |-------|
128
+
129
+ If y == 0: CE_i = 0
130
+ If y == 1: CE_i = logsumexp - x
131
+
132
+ Notice we can do logsumexp for each chunk and then
133
+ logsumexp[chunk_sum(logsumexp)] == logsumexp
134
+
135
+ chunk_sum = log[chunk_sum(logsumexp)]
136
+ = log[exp(logsumexp(a)) + ... + exp(logsumexp(z))]
137
+ = log[exp(log[sum(exp(a))]) + ... + exp(log[sum(exp(z))])]
138
+ = log[sum(exp(a)) + ... + sum(exp(z))]
139
+ = logsumexp(x)
140
+
141
+ This means we can perform a logsumexp for each chunk, then do a
142
+ final logsumexp reduction!
143
+
144
+ Ie do: logsumexp(chunked_logsumexp) - x
145
+ """
146
+ row_idx = tl.program_id(0)
147
+ chunk_idx = tl.program_id(1)
148
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
149
+ loss_ptr += row_idx
150
+ logsumexp_ptr += row_idx * N_CHUNKS + chunk_idx
151
+ labels_ptr += row_idx
152
+
153
+ col_offsets = chunk_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
154
+ mask = col_offsets < VOCAB_SIZE
155
+
156
+ label_idx = tl.load(labels_ptr).to(tl.int32)
157
+ logits = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
158
+
159
+ # Go logit scaling for Cohere: t * x
160
+ if DO_LOGIT_SCALING: logits = LOGIT_SCALE * logits
161
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
162
+ if DO_SOFTCAPPING: logits = SOFTCAP * triton_tanh(logits / SOFTCAP)
163
+
164
+ c = tl.max(logits, 0)
165
+ logsumexp = c + tl.log(tl.sum(tl.exp(logits - c), 0))
166
+
167
+ if chunk_idx == 0:
168
+ # logsumexp(chunked_logsumexp) - x
169
+ # Do the -x separately
170
+ if label_idx != -100:
171
+ x = tl.load(logits_ptr + label_idx).to(tl.float32)
172
+ # Go logit scaling for Cohere: t * x
173
+ if DO_LOGIT_SCALING: x = LOGIT_SCALE * x
174
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
175
+ if DO_SOFTCAPPING: x = SOFTCAP * triton_tanh(x / SOFTCAP)
176
+ loss = -1.0 * x
177
+ else:
178
+ loss = 0.0
179
+ tl.store(loss_ptr, loss)
180
+ pass
181
+ tl.store(logsumexp_ptr, logsumexp)
182
+ pass
183
+ _chunked_cross_entropy_forward = triton.jit(_chunked_cross_entropy_forward)
184
+ _chunked_cross_entropy_forward = triton.heuristics(
185
+ {
186
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
187
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
188
+ }
189
+ )(_chunked_cross_entropy_forward)
190
+
191
+
192
+ def _cross_entropy_backward(
193
+ logits_ptr ,
194
+ logits_row_stride ,
195
+ dloss_ptr ,
196
+ dloss_row_stride ,
197
+ logsumexp_ptr ,
198
+ labels_ptr ,
199
+ VOCAB_SIZE : tl.constexpr,
200
+ BLOCK_SIZE : tl.constexpr,
201
+ DO_SOFTCAPPING : tl.constexpr,
202
+ SOFTCAP : tl.constexpr,
203
+ DO_LOGIT_SCALING : tl.constexpr,
204
+ LOGIT_SCALE : tl.constexpr,
205
+ ):
206
+ """
207
+ CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
208
+ dC/dx = d/dx (y * log[sum(exp(x))] - x * y)
209
+
210
+ From https://en.wikipedia.org/wiki/LogSumExp
211
+ d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)
212
+
213
+ dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
214
+ dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
215
+ dC/dx = y * exp[x - logsumexp] - d/dx (x * y)
216
+
217
+ If y == 0: dC/dx = 0
218
+ If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
219
+ If y == 1 and x != label: dC/dx = exp[x - logsumexp]
220
+ """
221
+ row_idx = tl.program_id(0)
222
+ block_idx = tl.program_id(1)
223
+
224
+ logits_ptr += row_idx * triton_cast(logits_row_stride, tl.int64)
225
+ dloss_ptr += row_idx * dloss_row_stride
226
+ col_offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
227
+ mask = col_offsets < VOCAB_SIZE
228
+ label_idx = tl.load(labels_ptr + row_idx).to(tl.int32)
229
+
230
+ if label_idx != -100:
231
+ dloss = tl.load(dloss_ptr)
232
+ else:
233
+ dloss = 0.0
234
+
235
+ x = tl.load(logits_ptr + col_offsets, mask = mask, other = -float("inf")).to(tl.float32)
236
+
237
+ # Do logit scaling for Cohere
238
+ if DO_LOGIT_SCALING:
239
+ # d/dx [s * x] = s
240
+ x = x * LOGIT_SCALE
241
+ pass
242
+
243
+ # Do logit softcapping for Gemma 2: t * tanh(1/t * x)
244
+ partial = x
245
+ if DO_SOFTCAPPING:
246
+ # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
247
+ partial = triton_tanh(x / SOFTCAP)
248
+ x = SOFTCAP * partial
249
+ pass
250
+
251
+ logsumexp = tl.load(logsumexp_ptr + row_idx)
252
+ y = tl.exp(x - logsumexp)
253
+ y = tl.where(
254
+ col_offsets == label_idx,
255
+ y - 1.0, # exp(x - logsumexp) - 1
256
+ y, # exp(x - logsumexp)
257
+ )
258
+
259
+ if DO_LOGIT_SCALING:
260
+ # d/dx [s * x] = s
261
+ y = y * LOGIT_SCALE
262
+ pass
263
+
264
+ if DO_SOFTCAPPING:
265
+ # d/dx [t * tanh(1/t * x)] = 1 - tanh^2(1/t * x)
266
+ y = y * (1.0 - partial*partial)
267
+ pass
268
+
269
+ # If y == 0: dC/dx = 0 ==> we already masked it to be = 0, so dloss = 0.
270
+ tl.store(logits_ptr + col_offsets, dloss * y, mask = mask)
271
+ pass
272
+ _cross_entropy_backward = triton.jit(_cross_entropy_backward)
273
+ _cross_entropy_backward = triton.heuristics(
274
+ {
275
+ "DO_SOFTCAPPING": lambda args: bool(args["DO_SOFTCAPPING" ]),
276
+ "DO_LOGIT_SCALING": lambda args: bool(args["DO_LOGIT_SCALING"]),
277
+ }
278
+ )(_cross_entropy_backward)
279
+
280
+
281
+ MAX_FUSED_SIZE = 65536 # 2**16
282
+ class Fast_CrossEntropyLoss(torch.autograd.Function):
283
+ @staticmethod
284
+ def forward(ctx, logits, labels, logit_softcapping : float = 0, logit_scaling : float = 0):
285
+ n_rows : int
286
+ vocab_size : int
287
+ n_rows, vocab_size = logits.shape
288
+ device = logits.device
289
+
290
+ div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
291
+ n_chunks : int = div + (mod != 0)
292
+ losses = torch.empty(n_rows, dtype = torch.float32, device = device)
293
+
294
+ DO_SOFTCAPPING : bool = bool(logit_softcapping != 0)
295
+ DO_LOGIT_SCALING : bool = bool(logit_scaling != 0)
296
+
297
+ BLOCK_SIZE : int
298
+ num_warps : int
299
+ if n_chunks == 1:
300
+ # For small vocabs <= 65536 like Llama, Mistral
301
+ BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
302
+ logsumexp = torch.empty(n_rows, dtype = torch.float32, device = device)
303
+
304
+ with torch_cuda_device(device):
305
+ _cross_entropy_forward[(n_rows,)](
306
+ logits, logits.stride(0),
307
+ losses,
308
+ logsumexp,
309
+ labels,
310
+ VOCAB_SIZE = vocab_size,
311
+ BLOCK_SIZE = BLOCK_SIZE,
312
+ DO_SOFTCAPPING = DO_SOFTCAPPING,
313
+ SOFTCAP = logit_softcapping,
314
+ DO_LOGIT_SCALING = DO_LOGIT_SCALING,
315
+ LOGIT_SCALE = logit_scaling,
316
+ num_warps = num_warps,
317
+ )
318
+ else:
319
+ # For large vocabs > 65536 like Gemma 256K
320
+ logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = device)
321
+
322
+ with torch_cuda_device(device):
323
+ _chunked_cross_entropy_forward[(n_rows, n_chunks,)](
324
+ logits, logits.stride(0),
325
+ losses,
326
+ logsumexp,
327
+ labels,
328
+ VOCAB_SIZE = vocab_size,
329
+ N_CHUNKS = n_chunks,
330
+ BLOCK_SIZE = MAX_FUSED_SIZE,
331
+ DO_SOFTCAPPING = DO_SOFTCAPPING,
332
+ SOFTCAP = logit_softcapping,
333
+ DO_LOGIT_SCALING = DO_LOGIT_SCALING,
334
+ LOGIT_SCALE = logit_scaling,
335
+ num_warps = 32,
336
+ )
337
+ # logsumexp(chunked_logsumexp) - x
338
+ # Do the -x separately
339
+ logsumexp = torch.logsumexp(logsumexp, dim = 1) # Row sum
340
+ losses += logsumexp
341
+ losses.masked_fill_(labels == -100, 0) # Don't forget to mask padding out!
342
+ pass
343
+
344
+ ctx.save_for_backward(logits, logsumexp, labels)
345
+ ctx.DO_SOFTCAPPING = DO_SOFTCAPPING
346
+ ctx.logit_softcapping = logit_softcapping
347
+ ctx.DO_LOGIT_SCALING = DO_LOGIT_SCALING
348
+ ctx.logit_scaling = logit_scaling
349
+ return losses
350
+ pass
351
+
352
+
353
+ @staticmethod
354
+ def backward(ctx, dlosses):
355
+ logits, logsumexp, labels = ctx.saved_tensors
356
+ n_rows : int
357
+ vocab_size : int
358
+ n_rows, vocab_size = logits.shape
359
+
360
+ BLOCK_SIZE : int = 4096
361
+ div : int
362
+ mod : int
363
+ div, mod = divmod(vocab_size, BLOCK_SIZE)
364
+ n_blocks : int = div + (mod != 0)
365
+
366
+ with torch_cuda_device(dlosses.device):
367
+ _cross_entropy_backward[(n_rows, n_blocks,)](
368
+ logits, logits.stride(0),
369
+ dlosses, dlosses.stride(0),
370
+ logsumexp,
371
+ labels,
372
+ VOCAB_SIZE = vocab_size,
373
+ BLOCK_SIZE = BLOCK_SIZE,
374
+ DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
375
+ SOFTCAP = ctx.logit_softcapping,
376
+ DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
377
+ LOGIT_SCALE = ctx.logit_scaling,
378
+ num_warps = 8,
379
+ )
380
+ return logits, None, None, None,
381
+ pass
382
+ pass
383
+
384
+
385
+ def fast_cross_entropy_loss(
386
+ logits,
387
+ labels,
388
+ logit_softcapping = 0,
389
+ logit_scaling = 0,
390
+ n_items = None,
391
+ ):
392
+ """
393
+ Arguments:
394
+ logits: (batch, seq_len, vocab_size)
395
+ labels: (batch, seq_len,)
396
+ Returns:
397
+ losses: float
398
+ """
399
+ batch, seq_len, d = logits.shape
400
+ assert(labels.shape == (batch, seq_len))
401
+
402
+ loss = Fast_CrossEntropyLoss.apply(
403
+ logits.view(batch*seq_len, d),
404
+ labels.view(-1),
405
+ logit_softcapping,
406
+ logit_scaling,
407
+ )
408
+ if n_items is None:
409
+ n_items = torch.count_nonzero(labels != -100)
410
+ return loss.sum() / n_items
411
+ pass
412
+ if (Version(torch.__version__) < Version("2.4.0")) and \
413
+ not hasattr(fast_cross_entropy_loss, "__wrapped__"):
414
+ fast_cross_entropy_loss = torch._disable_dynamo(fast_cross_entropy_loss)
415
+ pass
416
+
417
+ # Patch CE Losses in transformers
418
+ def patch_loss_functions(torch_compile = True):
419
+ _patch_loss_functions(fast_cross_entropy_loss, torch_compile = torch_compile)
420
+ pass
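The chunked kernel above leans on the fact that a logsumexp of per-chunk logsumexps equals the logsumexp of the whole row, so each 65536-wide chunk can be reduced independently and combined with `torch.logsumexp(logsumexp, dim = 1)`. A quick CPU check of that identity (an editor's sketch, not part of the committed file):

```python
import torch

x = torch.randn(200_000)                  # one logit row, larger than MAX_FUSED_SIZE
chunks = x.split(65536)                   # chunk like the kernel does; last chunk is ragged
chunked = torch.stack([torch.logsumexp(c, dim=0) for c in chunks])

# logsumexp of the chunk-wise logsumexps == logsumexp of the full row
assert torch.allclose(torch.logsumexp(chunked, dim=0), torch.logsumexp(x, dim=0), atol=1e-5)
```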
torch-ext/unsloth_kernels/fast_lora.py ADDED
@@ -0,0 +1,537 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from .utils import (
17
+ fast_dequantize,
18
+ QUANT_STATE,
19
+ get_lora_parameters,
20
+ get_lora_parameters_bias,
21
+ matmul_lora,
22
+ torch_amp_custom_fwd,
23
+ torch_amp_custom_bwd,
24
+ )
25
+
26
+
27
+ class LoRA_MLP(torch.autograd.Function):
28
+ """
29
+ ### LoRA weights
30
+ G = G + Ag @ Bg
31
+ U = U + Au @ Bu
32
+ W = W + Aw @ Bw
33
+
34
+ ### SwiGLU(X)
35
+ e = X @ G
36
+ f = e * sigmoid(e)
37
+ g = X @ U
38
+ h = f * g
39
+ i = h @ W
40
+
41
+ ### Backpropagation chain rule
42
+ See our blog post for more details
43
+
44
+ df = sigmoid(e) * (1 - f) + f
45
+ dC/dW = h.T @ dY
46
+ dC/dU = X.T @ (D @ W.T * f)
47
+ dC/dG = X.T @ (D @ W.T * df * g)
48
+
49
+ ### Down projection LoRA weights
50
+ dC/dAw = dC/dW @ B.T
51
+ dC/dBw = A.T @ dC/dW
52
+ dC/dAw = h.T @ dY @ B.T
53
+ dC/dBw = A.T @ h.T @ dY
54
+
55
+ ### Up projection LoRA weights
56
+ dC/dAu = X.T @ (D @ W.T * f) @ B.T
57
+ dC/dBu = A.T @ X.T @ (D @ W.T * f)
58
+
59
+ ### Gate projection LoRA weights
60
+ dC/dAg = X.T @ (D @ W.T * df * g) @ B.T
61
+ dC/dBg = A.T @ X.T @ (D @ W.T * df * g)
62
+
63
+ Don't forget to see our blog post for more details!
64
+ """
65
+ @staticmethod
66
+ @torch_amp_custom_fwd
67
+ def forward(ctx, X : torch.Tensor,
68
+ gateW, gateW_quant, gateA, gateB, gateS,
69
+ upW, upW_quant, upA, upB, upS,
70
+ downW, downW_quant, downA, downB, downS,
71
+ _forward_function, _backward_function,
72
+ inplace = True,):
73
+ dtype = X.dtype
74
+
75
+ e = matmul_lora(X, gateW, gateW_quant, gateA, gateB, gateS)
76
+ g = matmul_lora(X, upW, upW_quant, upA, upB, upS)
77
+ h = _forward_function(e, g)
78
+ i = matmul_lora(h, downW, downW_quant, downA, downB, downS)
79
+
80
+ ctx.custom_saved_tensors = (
81
+ gateW, gateW_quant, gateS,
82
+ upW, upW_quant, upS,
83
+ downW, downW_quant, downS,
84
+ _backward_function,
85
+ )
86
+ ctx.save_for_backward(gateA, gateB, upA, upB, downA, downB,
87
+ X, e, g)
88
+ ctx.inplace = inplace
89
+ return i
90
+ pass
91
+
92
+
93
+ @staticmethod
94
+ @torch_amp_custom_bwd
95
+ def backward(ctx, dY : torch.Tensor):
96
+ gateW, gateW_quant, gateS, upW, upW_quant, upS, downW, downW_quant, downS, \
97
+ _backward_function = ctx.custom_saved_tensors
98
+ gateA, gateB, upA, upB, downA, downB, \
99
+ X, e, g = ctx.saved_tensors
100
+
101
+ batch, seq_len, hd = X.shape
102
+ dY = dY.view(-1, dY.shape[-1])
103
+ X = X .view(-1, X .shape[-1])
104
+ e = e .view(-1, e .shape[-1])
105
+ g = g .view(-1, g .shape[-1])
106
+ dtype = X.dtype
107
+
108
+ gateA, gateB, upA, upB, downA, downB = \
109
+ gateA.to(dtype), gateB.to(dtype), upA.to(dtype), upB.to(dtype), downA.to(dtype), downB.to(dtype)
110
+
111
+ gateA, gateB, upA, upB, downA, downB = \
112
+ gateA.t(), gateB.t(), upA.t(), upB.t(), downA.t(), downB.t()
113
+
114
+ DW = matmul_lora(dY, downW.t(), downW_quant, downB, downA, downS)
115
+ DW, e, g = _backward_function(DW, e, g)
116
+ h, df, de = DW, e, g
117
+
118
+ d_downA = torch.empty_like(downA)
119
+ d_downB = torch.empty_like(downB)
120
+ d_gateA = torch.empty_like(gateA)
121
+ d_gateB = torch.empty_like(gateB)
122
+ d_upA = torch.empty_like(upA)
123
+ d_upB = torch.empty_like(upB)
124
+
125
+ # Down projection LoRA weights
126
+ # d_downA = h.t() @ (dY @ downB.t())
127
+ # d_downB = (downA.t() @ h.t()) @ dY
128
+ # d_downA *= downS
129
+ # d_downB *= downS
130
+ d_downA.addmm_(h.t(), dY @ downB.t(), alpha = downS, beta = 0)
131
+ d_downB.addmm_(downA.t() @ h.t(), dY, alpha = downS, beta = 0)
132
+
133
+ # Up projection LoRA weights
134
+ # d_upA = X.t() @ (df @ upB.t())
135
+ # d_upB = (upA.t() @ X.t()) @ df
136
+ # d_upA *= upS
137
+ # d_upB *= upS
138
+ d_upA.addmm_(X.t(), df @ upB.t(), alpha = upS, beta = 0)
139
+ d_upB.addmm_(upA.t() @ X.t(), df, alpha = upS, beta = 0)
140
+
141
+ # Gate projection LoRA weights
142
+ # d_gateA = X.t() @ (de @ gateB.t())
143
+ # d_gateB = (gateA.t() @ X.t()) @ de
144
+ # d_gateA *= gateS
145
+ # d_gateB *= gateS
146
+ d_gateA.addmm_(X.t(), de @ gateB.t(), alpha = gateS, beta = 0)
147
+ d_gateB.addmm_(gateA.t() @ X.t(), de, alpha = gateS, beta = 0)
148
+
149
+ # dX = matmul_lora(df, upW.t(), upW_quant, upB, upA, upS)
150
+ # dX += matmul_lora(de, gateW.t(), gateW_quant, gateB, gateA, gateS)
151
+ upW = fast_dequantize(upW.t(), upW_quant)
152
+ dX = torch.matmul(df, upW.t(), out = X if ctx.inplace else None)
153
+ del upW
154
+ # dX += df @ upB.to(dtype).t() @ (upS * upA.to(dtype).t())
155
+ dX.addmm_(df @ upB.t(), upA.t(), alpha = upS)
156
+
157
+ gateW = fast_dequantize(gateW.t(), gateW_quant)
158
+ # dX += de @ gateW.t()
159
+ dX.addmm_(de, gateW.t())
160
+ del gateW
161
+ # dX += de @ gateB.to(dtype).t() @ (gateS * gateA.to(dtype).t())
162
+ dX.addmm_(de @ gateB.t(), gateA.t(), alpha = gateS)
163
+
164
+ # gateW, gateW_quant, gateA, gateB, gateS,
165
+ # upW, upW_quant, upA, upB, upS,
166
+ # downW, downW_quant, downA, downB, downS,
167
+ return dX.view(batch, seq_len, hd), \
168
+ None, None, d_gateA.t(), d_gateB.t(), None, \
169
+ None, None, d_upA.t(), d_upB.t(), None, \
170
+ None, None, d_downA.t(), d_downB.t(), None, \
171
+ None, None, None, # _backward and _forward and inplace
172
+ pass
173
+ pass
174
+
175
+
176
+ from .swiglu import swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel
177
+ def apply_lora_mlp_swiglu(self, X, inplace = True):
178
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
179
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
180
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
181
+ out = LoRA_MLP.apply(X,
182
+ gateW, gateW_quant, gateA, gateB, gateS,
183
+ upW, upW_quant, upA, upB, upS,
184
+ downW, downW_quant, downA, downB, downS,
185
+ swiglu_fg_kernel, swiglu_DWf_DW_dfg_kernel,
186
+ inplace,)
187
+ return out
188
+ pass
189
+
190
+
191
+ from .geglu import geglu_exact_forward_kernel, geglu_exact_backward_kernel
192
+ def apply_lora_mlp_geglu_exact(self, X, inplace = True):
193
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
194
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
195
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
196
+ out = LoRA_MLP.apply(X,
197
+ gateW, gateW_quant, gateA, gateB, gateS,
198
+ upW, upW_quant, upA, upB, upS,
199
+ downW, downW_quant, downA, downB, downS,
200
+ geglu_exact_forward_kernel, geglu_exact_backward_kernel,
201
+ inplace,)
202
+ return out
203
+ pass
204
+
205
+
206
+ from .geglu import geglu_approx_forward_kernel, geglu_approx_backward_kernel
207
+ def apply_lora_mlp_geglu_approx(self, X):
208
+ gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
209
+ upW, upW_quant, upA, upB, upS = get_lora_parameters(self. up_proj)
210
+ downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
211
+ out = LoRA_MLP.apply(X,
212
+ gateW, gateW_quant, gateA, gateB, gateS,
213
+ upW, upW_quant, upA, upB, upS,
214
+ downW, downW_quant, downA, downB, downS,
215
+ geglu_approx_forward_kernel, geglu_approx_backward_kernel,)
216
+ return out
217
+ pass
218
+
219
+
220
+ class LoRA_QKV(torch.autograd.Function):
221
+ """
222
+ ### LoRA weights
223
+ Wq = Wq + Aq @ Bq
224
+ Wk = Wk + Ak @ Bk
225
+ Wv = Wv + Av @ Bv
226
+ Q = X @ Wq = X @ Wq + X @ Aq @ Bq
227
+ K = X @ Wk = X @ Wk + X @ Ak @ Bk
228
+ V = X @ Wv = X @ Wv + X @ Av @ Bv
229
+
230
+ ### Backpropagation chain rule
231
+ See our blogpost for more details.
232
+
233
+ dC/dWq = X.T @ D(Wq)
234
+ dC/dWk = X.T @ D(Wk)
235
+ dC/dWv = X.T @ D(Wv)
236
+ We then sum them all to find dC/dX
237
+
238
+ ### Q projection LoRA weights
239
+ dC/dAq = X.T @ D(Wq) @ B.T
240
+ dC/dBq = A.T @ X.T @ D(Wq)
241
+
242
+ ### K projection LoRA weights
243
+ dC/dAk = X.T @ D(Wk) @ B.T
244
+ dC/dBk = A.T @ X.T @ D(Wk)
245
+
246
+ ### V projection LoRA weights
247
+ dC/dAv = X.T @ D(Wv) @ B.T
248
+ dC/dBv = A.T @ X.T @ D(Wv)
249
+ """
250
+ @staticmethod
251
+ @torch_amp_custom_fwd
252
+ def forward(ctx, X : torch.Tensor,
253
+ QW, QW_quant, QA, QB, QS,
254
+ KW, KW_quant, KA, KB, KS,
255
+ VW, VW_quant, VA, VB, VS,
256
+ inplace = True):
257
+ dtype = X.dtype
258
+
259
+ Q = matmul_lora(X, QW, QW_quant, QA, QB, QS)
260
+ K = matmul_lora(X, KW, KW_quant, KA, KB, KS)
261
+ V = matmul_lora(X, VW, VW_quant, VA, VB, VS)
262
+
263
+ ctx.custom_saved_tensors = (
264
+ QW, QW_quant, QS,
265
+ KW, KW_quant, KS,
266
+ VW, VW_quant, VS,
267
+ )
268
+ ctx.save_for_backward(X, QA, QB, KA, KB, VA, VB,)
269
+ ctx.inplace = inplace
270
+ return Q, K, V
271
+ pass
272
+
273
+ @staticmethod
274
+ @torch_amp_custom_bwd
275
+ def backward(ctx, dQ, dK, dV):
276
+ QW, QW_quant, QS, KW, KW_quant, KS, VW, VW_quant, VS = \
277
+ ctx.custom_saved_tensors
278
+ X, QA, QB, KA, KB, VA, VB, = ctx.saved_tensors
279
+
280
+ batch, seq_len, hd = X.shape
281
+ dQ = dQ.view(-1, dQ.shape[-1])
282
+ dK = dK.reshape(-1, dK.shape[-1]) # view doesn't work on K.T
283
+ dV = dV.view(-1, dV.shape[-1])
284
+ X = X .view(-1, X .shape[-1])
285
+ dtype = X.dtype
286
+
287
+ QA, QB, KA, KB, VA, VB = \
288
+ QA.to(dtype), QB.to(dtype), KA.to(dtype), KB.to(dtype), VA.to(dtype), VB.to(dtype)
289
+
290
+ QA, QB, KA, KB, VA, VB = \
291
+ QA.t(), QB.t(), KA.t(), KB.t(), VA.t(), VB.t()
292
+
293
+ ### Weight projection LoRA weights
294
+ # See our blogpost for more details.
295
+ d_QA = torch.empty_like(QA)
296
+ d_QB = torch.empty_like(QB)
297
+ d_KA = torch.empty_like(KA)
298
+ d_KB = torch.empty_like(KB)
299
+ d_VA = torch.empty_like(VA)
300
+ d_VB = torch.empty_like(VB)
301
+
302
+ # Q Projection
303
+ # d_QA = X.t() @ (dQ @ QB.t())
304
+ # d_QB = (QA.t() @ X.t()) @ dQ
305
+ # d_QA *= QS
306
+ # d_QB *= QS
307
+ d_QA.addmm_(X.t(), dQ @ QB.t(), alpha = QS, beta = 0)
308
+ d_QB.addmm_(QA.t() @ X.t(), dQ, alpha = QS, beta = 0)
309
+
310
+ # K Projection
311
+ # d_KA = X.t() @ (dK @ KB.t())
312
+ # d_KB = (KA.t() @ X.t()) @ dK
313
+ # d_KA *= KS
314
+ # d_KB *= KS
315
+ d_KA.addmm_(X.t(), dK @ KB.t(), alpha = KS, beta = 0)
316
+ d_KB.addmm_(KA.t() @ X.t(), dK, alpha = KS, beta = 0)
317
+
318
+ # V Projection
319
+ # d_VA = X.t() @ (dV @ VB.t())
320
+ # d_VB = (VA.t() @ X.t()) @ dV
321
+ # d_VA *= VS
322
+ # d_VB *= VS
323
+ d_VA.addmm_(X.t(), dV @ VB.t(), alpha = VS, beta = 0)
324
+ d_VB.addmm_(VA.t() @ X.t(), dV, alpha = VS, beta = 0)
325
+
326
+ # Combine derivatives to find dX
327
+ # dQ
328
+ QW = fast_dequantize(QW.t(), QW_quant)
329
+ dX = torch.matmul(dQ, QW.t(), out = X if ctx.inplace else None)
330
+ del QW
331
+ # dX += (dQ @ QB.to(dtype).t() @ (QS * QA.to(dtype).t()))
332
+ dX.addmm_(dQ @ QB.t(), QA.t(), alpha = QS)
333
+
334
+ # dK
335
+ KW = fast_dequantize(KW.t(), KW_quant)
336
+ # dX += dK @ KW.t()
337
+ dX.addmm_(dK, KW.t())
338
+ del KW
339
+ # dX += dK @ KB.to(dtype).t() @ (KS * KA.to(dtype).t())
340
+ dX.addmm_(dK @ KB.t(), KA.t(), alpha = KS)
341
+
342
+ # dV
343
+ VW = fast_dequantize(VW.t(), VW_quant)
344
+ # dX += dV @ VW.t()
345
+ dX.addmm_(dV, VW.t())
346
+ del VW
347
+ # dX += dV @ VB.to(dtype).t() @ (VS * VA.to(dtype).t())
348
+ dX.addmm_(dV @ VB.t(), VA.t(), alpha = VS)
349
+
350
+ # QW, QW_quant, QA, QB, QS,
351
+ # KW, KW_quant, KA, KB, KS,
352
+ # VW, VW_quant, VA, VB, VS,
353
+ return dX.view(batch, seq_len, hd), \
354
+ None, None, d_QA.t(), d_QB.t(), None, \
355
+ None, None, d_KA.t(), d_KB.t(), None, \
356
+ None, None, d_VA.t(), d_VB.t(), None, \
357
+ None,
358
+ pass
359
+ pass
360
+
361
+
362
+ def apply_lora_qkv(self, X, inplace = True):
363
+ QW, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj)
364
+ KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
365
+ VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
366
+ Q, K, V = LoRA_QKV.apply(X,
367
+ QW, QW_quant, QA, QB, QS,
368
+ KW, KW_quant, KA, KB, KS,
369
+ VW, VW_quant, VA, VB, VS,
370
+ inplace,
371
+ )
372
+ return Q, K, V
373
+ pass
374
+
375
+
376
+ class LoRA_W(torch.autograd.Function):
377
+ """
378
+ ### LoRA weights
379
+ Wq = Wq + Aq @ Bq
380
+ Wk = Wk + Ak @ Bk
381
+ Wv = Wv + Av @ Bv
382
+ Q = X @ Wq = X @ Wq + X @ Aq @ Bq
383
+ K = X @ Wk = X @ Wk + X @ Ak @ Bk
384
+ V = X @ Wv = X @ Wv + X @ Av @ Bv
385
+
386
+ ### Backpropagation chain rule
387
+ dC/dWq = X.T @ D(Wq)
388
+ dC/dWk = X.T @ D(Wk)
389
+ dC/dWv = X.T @ D(Wv)
390
+
391
+ ### Q projection LoRA weights
392
+ dC/dAq = X.T @ D(Wq) @ B.T
393
+ dC/dBq = A.T @ X.T @ D(Wq)
394
+
395
+ ### K projection LoRA weights
396
+ dC/dAk = X.T @ D(Wk) @ B.T
397
+ dC/dBk = A.T @ X.T @ D(Wk)
398
+
399
+ ### V projection LoRA weights
400
+ dC/dAv = X.T @ D(Wv) @ B.T
401
+ dC/dBv = A.T @ X.T @ D(Wv)
402
+ """
403
+ @staticmethod
404
+ @torch_amp_custom_fwd
405
+ def forward(ctx, X : torch.Tensor,
406
+ W, W_quant, A, B, S):
407
+ dtype = X.dtype
408
+ XW = matmul_lora(X, W, W_quant, A, B, S)
409
+ ctx.custom_saved_tensors = (W, W_quant, S,)
410
+ ctx.save_for_backward(A, B, X)
411
+ return XW
412
+ pass
413
+
414
+ @staticmethod
415
+ @torch_amp_custom_bwd
416
+ def backward(ctx, dY : torch.Tensor):
417
+ W, W_quant, S = ctx.custom_saved_tensors
418
+ A, B, X = ctx.saved_tensors
419
+
420
+ batch, seq_len, hd = X.shape
421
+ dY = dY.reshape(-1, dY.shape[-1]) # Must be reshape
422
+ X = X .reshape(-1, X .shape[-1]) # Must be reshape
423
+ dtype = X.dtype
424
+
425
+ A, B = A.to(dtype), B.to(dtype)
426
+
427
+ A, B = A.t(), B.t()
428
+
429
+ d_A = torch.empty_like(A)
430
+ d_B = torch.empty_like(B)
431
+
432
+ ### Weight projection LoRA weights
433
+ # Weight projection
434
+ # d_A = X.t() @ (dY @ B.t())
435
+ # d_B = (A.t() @ X.t()) @ dY
436
+ # d_A *= S
437
+ # d_B *= S
438
+ d_A.addmm_(X.t(), dY @ B.t(), alpha = S, beta = 0)
439
+ d_B.addmm_(A.t() @ X.t(), dY, alpha = S, beta = 0)
440
+
441
+ # Get derivative for dX
442
+ W = fast_dequantize(W.t(), W_quant)
443
+ dX = dY @ W.t()
444
+ del W
445
+ # dX += dY @ B.to(dtype).t() @ (S * A.to(dtype).t())
446
+ dX.addmm_(dY @ B.t(), A.t(), alpha = S)
447
+
448
+ # W, W_quant, A, B, S
449
+ return dX.view(batch, seq_len, hd), \
450
+ None, None, d_A.t(), d_B.t(), None
451
+ pass
452
+ pass
453
+
454
+
455
+ def apply_lora_o(self, X):
456
+ OW, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj)
457
+ O = LoRA_W.apply(X, OW, OW_quant, OA, OB, OS)
458
+ return O
459
+ pass
460
+
461
+
462
+ IDENTITY_DROPOUT = torch.nn.Identity
463
+ @torch._disable_dynamo
464
+ def fast_lora_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
465
+ raise NotImplementedError(
466
+ "Unsloth: Currently not supported yet - reshaping done incorrectly"
467
+ )
468
+ self._check_forward_args(x, *args, **kwargs)
469
+ adapter_names = kwargs.pop("adapter_names", None)
470
+
471
+ if self.disable_adapters:
472
+ if self.merged:
473
+ self.unmerge()
474
+ result = self.base_layer(x, *args, **kwargs)
475
+ elif adapter_names is not None:
476
+ result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
477
+ elif self.merged:
478
+ result = self.base_layer(x, *args, **kwargs)
479
+ else:
480
+ # Fastpath
481
+ if len(self.active_adapters) == 1:
482
+ active_adapter = self.active_adapters[0]
483
+ if active_adapter not in self.lora_A.keys(): return self.base_layer(x, *args, **kwargs)
484
+
485
+ dropout = self.lora_dropout[active_adapter]
486
+ if isinstance(dropout, IDENTITY_DROPOUT) and not self.use_dora[active_adapter]:
487
+ lora_A = self.lora_A[active_adapter].weight
488
+ lora_B = self.lora_B[active_adapter].weight
489
+ scaling = self.scaling[active_adapter]
490
+ W = self.base_layer.weight
491
+ return LoRA_W.apply(x, W, QUANT_STATE(W), lora_A, lora_B, scaling)
492
+ pass
493
+ pass
494
+
495
+ result = self.base_layer(x, *args, **kwargs)
496
+ # As per Tim Dettmers, for 4bit, we need to defensively clone here.
497
+ # The reason is that in some cases, an error can occur that backprop
498
+ # does not work on a manipulated view. This issue may be solved with
499
+ # newer PyTorch versions but this would need extensive testing to be
500
+ # sure.
501
+ result = result.clone()
502
+
503
+ for active_adapter in self.active_adapters:
504
+ if active_adapter not in self.lora_A.keys():
505
+ continue
506
+ lora_A = self.lora_A[active_adapter]
507
+ lora_B = self.lora_B[active_adapter]
508
+ dropout = self.lora_dropout[active_adapter]
509
+ scaling = self.scaling[active_adapter]
510
+
511
+ requires_conversion = not torch.is_autocast_enabled()
512
+ if requires_conversion:
513
+ expected_dtype = result.dtype
514
+ x = x.to(lora_A.weight.dtype)
515
+
516
+ if not self.use_dora[active_adapter]:
517
+ result = result + lora_B(lora_A(dropout(x))) * scaling
518
+ else:
519
+ if isinstance(dropout, torch.nn.Identity) or not self.training:
520
+ base_result = result
521
+ else:
522
+ x = dropout(x)
523
+ base_result = None
524
+
525
+ result = result + self.lora_magnitude_vector[active_adapter](
526
+ x,
527
+ lora_A=lora_A,
528
+ lora_B=lora_B,
529
+ scaling=scaling,
530
+ base_layer=self.get_base_layer(),
531
+ base_result=base_result,
532
+ )
533
+ if requires_conversion:
534
+ result = result.to(expected_dtype)
535
+
536
+ return result
537
+ pass
torch-ext/unsloth_kernels/flex_attention.py ADDED
@@ -0,0 +1,181 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from functools import lru_cache
17
+ from transformers.models.llama.modeling_llama import logger
18
+ import os
19
+
20
+ torch_compile_options = {
21
+ "epilogue_fusion" : True,
22
+ "max_autotune" : True,
23
+ "shape_padding" : True,
24
+ "trace.enabled" : os.environ.get("UNSLOTH_COMPILE_DEBUG", "0") == "1",
25
+ "triton.cudagraphs" : False,
26
+ }
27
+
28
+ # Flex Attention supported from torch 2.5 onwards only
29
+ try:
30
+ from torch.nn.attention.flex_attention import (
31
+ flex_attention as _flex_attention,
32
+ create_block_mask as _create_block_mask,
33
+ )
34
+ _flex_attention = torch.compile(_flex_attention, dynamic = True, options = torch_compile_options)
35
+ HAS_FLEX_ATTENTION = False
36
+ except:
37
+ HAS_FLEX_ATTENTION = False
38
+ pass
39
+
40
+
41
+ if not HAS_FLEX_ATTENTION:
42
+
43
+ # Logit softcapping
44
+ @torch.compile(fullgraph = True, dynamic = True, options = torch_compile_options)
45
+ def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
46
+ n_heads = self.config.num_attention_heads
47
+ head_dim = self.head_dim
48
+ n_kv_heads = self.config.num_key_value_heads
49
+ n_groups = self.num_key_value_groups
50
+
51
+ # Grouped query attention
52
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
53
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
54
+ K = K.reshape(bsz, n_heads, q_len, head_dim)
55
+ V = V.reshape(bsz, n_heads, q_len, head_dim)
56
+
57
+ # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
58
+ # Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
59
+ # We default to using the config file itself
60
+ # s = self.config.hidden_size // self.config.num_attention_heads
61
+ s = self.config.query_pre_attn_scalar
62
+ t = self.config.attn_logit_softcapping
63
+
64
+ Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
65
+ A = torch.matmul(Q, K.transpose(2, 3))
66
+ A = t * torch.tanh(A / t) # Logit softcapping
67
+ A += causal_mask[:q_len, :q_len]
68
+ # Much slower in torch compile!
69
+ # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
70
+ A = torch.nn.functional.softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
71
+ A = torch.matmul(A, V)
72
+ A = A.transpose(1, 2).contiguous()
73
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
74
+ return A
75
+ pass
76
+
77
+ create_flex_attention_causal_mask = None
78
+ create_flex_attention_sliding_window_mask = None
79
+ else:
80
+ # See https://github.com/pytorch-labs/attention-gym/blob/main/examples/flex_attn.ipynb
81
+ # for more examples
82
+ # BSD 3-Clause License Copyright (c) 2023, Driss Guessous, Horace He et al
83
+ import functools, math
84
+
85
+ def generate_tanh_softcap(t):
86
+ def tanh_softcap(x, b, h, q_idx, kv_idx):
87
+ return t * torch.tanh(x / t)
88
+ return tanh_softcap
89
+ pass
90
+ def causal_masker(b, h, q_idx, kv_idx):
91
+ return q_idx >= kv_idx
92
+ pass
93
+
94
+ @functools.lru_cache
95
+ def sliding_window_masker(size = 4096):
96
+ def sliding_window(b, h, q_idx, kv_idx):
97
+ causal_mask = q_idx >= kv_idx
98
+ window_mask = q_idx - kv_idx <= size
99
+ return causal_mask & window_mask
100
+ return sliding_window
101
+ pass
102
+
103
+ @functools.lru_cache
104
+ def create_block_mask(mask, n = 128):
105
+ return _create_block_mask(
106
+ mask, 1, 1, n, n,
107
+ BLOCK_SIZE = 128,
108
+ _compile = True,
109
+ )
110
+ pass
111
+
112
+ def create_flex_attention_causal_mask(max_seq_length = 8192):
113
+ causal_mask = create_block_mask(causal_masker, max_seq_length)
114
+ return causal_mask
115
+ pass
116
+
117
+ def create_flex_attention_sliding_window_mask(max_seq_length = 8192, sliding_window = 4096):
118
+ sliding_masker = sliding_window_masker(sliding_window)
119
+ causal_mask = create_block_mask(sliding_masker, max_seq_length)
120
+ return causal_mask
121
+ pass
122
+
123
+ @functools.lru_cache
124
+ def flex_attention(s, t):
125
+ scale = 1.0 / math.sqrt(s)
126
+ score_mod = generate_tanh_softcap(t)
127
+ return functools.partial(
128
+ _flex_attention, score_mod = score_mod, scale = scale, enable_gqa = True,
129
+ )
130
+ pass
131
+
132
+ def slow_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
133
+ n_heads = self.config.num_attention_heads
134
+ head_dim = self.head_dim
135
+ s = self.config.query_pre_attn_scalar
136
+ t = self.config.attn_logit_softcapping
137
+ fx = flex_attention(s, t)
138
+ A = fx(query = Q, key = K, value = V, block_mask = causal_mask)
139
+ A = A.transpose(1, 2).contiguous()
140
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
141
+ return A
142
+ pass
143
+ pass
144
+
145
+
146
+ torch_matmul = torch.matmul
147
+ torch_tanh = torch.tanh
148
+ torch_nn_functional_softmax = torch.nn.functional.softmax
149
+ def slow_inference_attention_softcapping(Q, K, V, causal_mask, self, bsz, q_len):
150
+ n_heads = self.config.num_attention_heads
151
+ head_dim = self.head_dim
152
+ n_kv_heads = self.config.num_key_value_heads
153
+ n_groups = self.num_key_value_groups
154
+
155
+ # Grouped query attention
156
+ K = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
157
+ V = V[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
158
+ K = K.reshape(bsz, n_heads, q_len, head_dim)
159
+ V = V.reshape(bsz, n_heads, q_len, head_dim)
160
+
161
+ # See https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
162
+ # Gemma 9b should use 256 and not 224 (hs / nah). 27b uses the below
163
+ # We default to using the config file itself
164
+ # s = self.config.hidden_size // self.config.num_attention_heads
165
+ s = self.config.query_pre_attn_scalar
166
+ t = self.config.attn_logit_softcapping
167
+
168
+ Q = Q * torch.tensor(s**-0.5, dtype = Q.dtype) # Follow Keras exactly
169
+ A = torch_matmul(Q, K.transpose(2, 3))
170
+
171
+ # Logit softcapping
172
+ A /= t; torch_tanh(A, out = A); A *= t;
173
+ A += causal_mask[:q_len, :q_len]
174
+ # Much slower in torch compile!
175
+ # A.masked_fill_(causal_mask[:q_len, :q_len], -float("inf"))
176
+ A = torch_nn_functional_softmax(A, dim = -1, dtype = torch.float32).to(Q.dtype)
177
+ A = torch_matmul(A, V)
178
+ A = A.transpose(1, 2).contiguous()
179
+ A = A.reshape(bsz, q_len, n_heads*head_dim)
180
+ return A
181
+ pass
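The expand/reshape at the top of `slow_inference_attention_softcapping` (and its compiled twin) is the standard grouped-query-attention trick: each KV head is repeated `n_groups` times along the head dimension. A small equivalence check (an editor's sketch, not part of the committed file):

```python
import torch

bsz, n_kv_heads, n_groups, q_len, head_dim = 1, 2, 4, 3, 8
K = torch.randn(bsz, n_kv_heads, q_len, head_dim)

# Same expansion the kernel file performs
K_rep = K[:, :, None, :, :].expand(bsz, n_kv_heads, n_groups, q_len, head_dim)
K_rep = K_rep.reshape(bsz, n_kv_heads * n_groups, q_len, head_dim)

# Equivalent to repeating each KV head n_groups times along dim 1
assert torch.equal(K_rep, K.repeat_interleave(n_groups, dim=1))
```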
torch-ext/unsloth_kernels/geglu.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ import triton.language as tl
17
+ import torch
18
+ from .utils import (
19
+ calculate_settings,
20
+ triton_tanh,
21
+ torch_cuda_device,
22
+ )
23
+
24
+
25
+ @triton.jit
26
+ def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
27
+ block_idx = tl.program_id(0)
28
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
29
+ mask = offsets < n_elements
30
+
31
+ # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
32
+ # h = f * up
33
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
34
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
35
+
36
+ f_row = 0.5 * e_row * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
37
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
38
+ h_row = f_row * g_row
39
+
40
+ # Store h
41
+ tl.store(h + offsets, h_row, mask = mask)
42
+ pass
43
+
44
+
45
+ def geglu_exact_forward_kernel(gate, up):
46
+ batch, seq_len, hd = gate.shape
47
+ n_elements = gate.numel()
48
+ device = gate.device
49
+ out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = device)
50
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
51
+ with torch_cuda_device(device):
52
+ _exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
53
+ return out
54
+ pass
55
+
56
+
57
+ @triton.jit
58
+ def _exact_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
59
+ """
60
+ f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
61
+ h = f * up
62
+
63
+ df/de (with help of Wolfram :)
64
+ df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
65
+
66
+ Reuse via
67
+ f = 1/2 * (1 + erf(1/sqrt(2) * e)) * e
68
+ """
69
+ block_idx = tl.program_id(0)
70
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
71
+ mask = offsets < n_elements
72
+
73
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
74
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
75
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
76
+
77
+ # Break e_row away for re-use
78
+ # f = 1/2 * e * (1 + erf(1/sqrt(2) * e))
79
+ f_partial_row = 0.5 * (tl.math.erf(tl.math.rsqrt(2.0) * e_row) + 1.0)
80
+ f_row = f_partial_row * e_row
81
+
82
+ f_row = f_row.to(DW_row.dtype)
83
+ # h = f * g
84
+ h_row = f_row * g_row
85
+ # df = DW * f
86
+ df_row = DW_row * f_row
87
+ # dg = DW * g
88
+ dg_row = DW_row * g_row
89
+
90
+ # df/de = 1/2 * (1 + erf(1/sqrt(2) * e)) + 1/sqrt(2*pi) * e * exp(-1/2 * e^2)
91
+ t = 0.3989422804014327 # 1/sqrt(2*pi)
92
+ df_de = f_partial_row + t * e_row * tl.exp(-0.5 * e_row * e_row)
93
+
94
+ de_row = dg_row.to(tl.float32) * df_de
95
+ de_row = de_row.to(DW_row.dtype)
96
+
97
+ # Store derivatives in buffers
98
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
99
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
100
+ tl.store(g + offsets, de_row, mask = mask) # de
101
+ pass
102
+
103
+
104
+ def geglu_exact_backward_kernel(DW, e, g):
105
+ batch_seq_len, hd = e.shape
106
+ n_elements = e.numel()
107
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
108
+ with torch_cuda_device(e.device):
109
+ _exact_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
110
+ return DW, e, g
111
+ pass
112
+
113
+
114
+ @triton.jit
115
+ def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
116
+ block_idx = tl.program_id(0)
117
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
118
+ mask = offsets < n_elements
119
+
120
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
121
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
122
+ # h = f * up
123
+ s = 0.7978845608028654 # math.sqrt(2 / math.pi)
124
+
125
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
126
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
127
+
128
+ f_row = 0.5 * e_row * (
129
+ triton_tanh(s * e_row * (1.0 + 0.044715 * e_row * e_row)) \
130
+ + 1.0
131
+ )
132
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
133
+ h_row = f_row * g_row
134
+
135
+ # Store h
136
+ tl.store(h + offsets, h_row, mask = mask)
137
+ pass
138
+
139
+
140
+ def geglu_approx_forward_kernel(gate, up):
141
+ batch, seq_len, hd = gate.shape
142
+ n_elements = gate.numel()
143
+ device = gate.device
144
+ out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = device)
145
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
146
+ with torch_cuda_device(device):
147
+ _approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
148
+ return out
149
+ pass
150
+
151
+
152
+ @triton.jit
153
+ def _approx_backward_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
154
+ """
155
+ f = 1/2 * e * (1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ))
156
+ h = f * up
157
+
158
+ df/de (with help from https://arxiv.org/pdf/2305.12073.pdf :))
159
+ df/de = 1/2 * [1 + tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )] +
160
+ 1/2 * sech^2 [ sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) ] * \
161
+ ( sqrt(2/pi) * x * (1 + 0.044715 * x^2 * 3 ) )
162
+
163
+ Notice sech^2(x) = 1 - tanh^2(x)
164
+ So reuse tanh( sqrt(2/pi) * x * (1 + 0.044715 * x^2 ) )
165
+
166
+ See https://www.desmos.com/calculator/nqprfoni6x
167
+ """
168
+ block_idx = tl.program_id(0)
169
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
170
+ mask = offsets < n_elements
171
+
172
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
173
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
174
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
175
+
176
+ # See https://www.desmos.com/calculator/nqprfoni6x
177
+ s = 0.7978845608028654 # math.sqrt(2 / math.pi)
178
+ a = s * e_row # a = sqrt(2 / pi) * x
179
+ b = a * 0.044715 * e_row * e_row # b = a * 0.044715 * x^2
180
+ T = 1.0 + triton_tanh(a + b)
181
+ T2 = 0.5 * T
182
+ # Q = 0.5 * -T * (T - 2.0) * (a + 3.0 * b)
183
+ Q2 = -T2 * (T - 2.0) * (a + 3.0 * b)
184
+ df_de = T2 + Q2 # 1/2 * (T + Q)
185
+
186
+ # f = 1/2 * e * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3 ) ))
187
+ f_row = T2 * e_row
188
+ f_row = f_row.to(DW_row.dtype)
189
+ # h = f * g
190
+ h_row = f_row * g_row
191
+ # df = DW * f
192
+ df_row = DW_row * f_row
193
+ # dg = DW * g
194
+ dg_row = DW_row * g_row
195
+
196
+ de_row = dg_row.to(tl.float32) * df_de
197
+ de_row = de_row.to(DW_row.dtype)
198
+
199
+ # Store derivatives in buffers
200
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
201
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
202
+ tl.store(g + offsets, de_row, mask = mask) # de
203
+ pass
204
+
205
+
206
+ def geglu_approx_backward_kernel(DW, e, g):
207
+ batch_seq_len, hd = e.shape
208
+ n_elements = e.numel()
209
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
210
+ with torch_cuda_device(e.device):
211
+ _approx_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
212
+ return DW, e, g
213
+ pass
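The `_exact_*` kernels above implement GeGLU with the erf form of GELU, and the `_approx_*` kernels use the tanh form; both match PyTorch's reference GELU. A CPU cross-check of the forward formulas (an editor's sketch, not part of the committed file; the Triton kernels themselves need a CUDA device):

```python
import torch
import torch.nn.functional as F

gate, up = torch.randn(2, 4, 16), torch.randn(2, 4, 16)

# Exact form: f = 0.5 * e * (1 + erf(e / sqrt(2))), h = f * up
exact = 0.5 * gate * (1.0 + torch.erf(gate / 2**0.5)) * up
assert torch.allclose(exact, F.gelu(gate, approximate="none") * up, atol=1e-5)

# Tanh approximation: f = 0.5 * e * (1 + tanh(sqrt(2/pi) * e * (1 + 0.044715 * e^2)))
s = 0.7978845608028654  # sqrt(2 / pi)
approx = 0.5 * gate * (1.0 + torch.tanh(s * gate * (1.0 + 0.044715 * gate * gate))) * up
assert torch.allclose(approx, F.gelu(gate, approximate="tanh") * up, atol=1e-5)
```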
torch-ext/unsloth_kernels/layernorm.py ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ # Copyright 2024-present Andrej Karpathy & the llm.c team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import triton
17
+ import triton.language as tl
18
+ import torch
19
+ from .utils import calculate_settings, torch_cuda_device
20
+ from unsloth_zoo.patching_utils import (
21
+ patch_layernorm,
22
+ )
23
+
24
+
25
+ @triton.jit
26
+ def layernorm_forward(
27
+ Y, Y_row_stride,
28
+ X, X_row_stride,
29
+ W,
30
+ b,
31
+ r,
32
+ mu,
33
+ n_cols : tl.constexpr,
34
+ eps : tl.constexpr,
35
+ BLOCK_SIZE : tl.constexpr
36
+ ):
37
+ row_idx = tl.program_id(0)
38
+ col_offsets = tl.arange(0, BLOCK_SIZE)
39
+ mask = col_offsets < n_cols
40
+
41
+ Y += row_idx * Y_row_stride
42
+ X += row_idx * X_row_stride
43
+ r += row_idx
44
+ mu += row_idx
45
+
46
+ # According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
47
+ # are in float32!
48
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
49
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
50
+ b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
51
+
52
+ mean_X = tl.sum(X_row, axis = 0) / n_cols
53
+ # (X[0] - mean) == -mean so we need to mask it out
54
+ XX = tl.where(mask, X_row - mean_X, 0)
55
+ row_var = tl.sum(XX * XX, axis = 0) / n_cols
56
+ inv_var = tl.math.rsqrt(row_var + eps)
57
+ tl.store (r, inv_var)
58
+ tl.store (mu, mean_X)
59
+ output = (XX * inv_var) * W_row + b_row
60
+ tl.store(Y + col_offsets, output, mask = mask)
61
+ pass
62
+
63
+
64
+ @triton.jit
65
+ def layernorm_backward(
66
+ dY, dY_row_stride,
67
+ X, X_row_stride,
68
+ W,
69
+ b,
70
+ r,
71
+ mu,
72
+ n_cols : tl.constexpr,
73
+ eps : tl.constexpr,
74
+ BLOCK_SIZE : tl.constexpr
75
+ ):
76
+ # Approximately follows https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
77
+ row_idx = tl.program_id(0)
78
+ col_offsets = tl.arange(0, BLOCK_SIZE)
79
+ mask = col_offsets < n_cols
80
+
81
+ dY += row_idx * dY_row_stride
82
+ X += row_idx * X_row_stride
83
+ r += row_idx
84
+ mu += row_idx
85
+
86
+ # According to https://pytorch.org/torchtune/stable/_modules/torchtune/modules/layer_norm.html#Fp32LayerNorm, all modules
87
+ # are in float32!
88
+ dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
89
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
90
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
91
+ b_row = tl.load(b + col_offsets, mask = mask, other = 0).to(tl.float32)
92
+
93
+ inv_var = tl.load(r) .to(tl.float32)
94
+ mean = tl.load(mu).to(tl.float32)
95
+ normed = (X_row - mean) * inv_var
96
+ dY_W = dY_row * W_row
97
+ dX_row = dY_W - tl.sum(dY_W, axis = 0) / n_cols - normed * tl.sum(dY_W * normed, axis = 0) / n_cols
98
+ dX_row = dX_row * inv_var
99
+ tl.store(dY + col_offsets, dX_row, mask = mask)
100
+ pass
101
+
102
+
103
+ class Fast_Layernorm(torch.autograd.Function):
104
+ @staticmethod
105
+ def forward(ctx, X, W, b, eps):
106
+ shape = X.shape
107
+ dim = shape[-1]
108
+ X = X.view(-1, dim)
109
+ n_rows, n_cols = X.shape
110
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
111
+ device = X.device
112
+ Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = device)
113
+ r = torch.empty(n_rows, dtype = torch.float32, device = device)
114
+ mu = torch.empty(n_rows, dtype = torch.float32, device = device)
115
+
116
+ with torch_cuda_device(device):
117
+ layernorm_forward[(n_rows,)](
118
+ Y, Y.stride(0),
119
+ X, X.stride(0),
120
+ W,
121
+ b,
122
+ r,
123
+ mu,
124
+ n_cols, eps,
125
+ BLOCK_SIZE = BLOCK_SIZE,
126
+ num_warps = num_warps,
127
+ )
128
+ ctx.eps = eps
129
+ ctx.BLOCK_SIZE = BLOCK_SIZE
130
+ ctx.num_warps = num_warps
131
+ ctx.save_for_backward(X, W, b, r, mu)
132
+ return Y.view(*shape)
133
+ pass
134
+
135
+ @staticmethod
136
+ def backward(ctx, dY):
137
+ shape = dY.shape
138
+ dim = shape[-1]
139
+ dY = dY.view(-1, dim)
140
+ X, W, b, r, mu = ctx.saved_tensors
141
+ n_rows, n_cols = dY.shape
142
+
143
+ with torch_cuda_device(dY.device):
144
+ layernorm_backward[(n_rows,)](
145
+ dY, dY.stride(0),
146
+ X, X .stride(0),
147
+ W,
148
+ b,
149
+ r,
150
+ mu,
151
+ n_cols, ctx.eps,
152
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
153
+ num_warps = ctx.num_warps,
154
+ )
155
+ dX = dY.view(*shape)
156
+ return dX, None, None, None, None
157
+ pass
158
+ pass
159
+
160
+
161
+ def fast_layernorm(layernorm, X):
162
+ assert(layernorm.elementwise_affine is True)
163
+ W = layernorm.weight
164
+ bias = layernorm.bias
165
+ eps = layernorm.variance_epsilon if \
166
+ hasattr(layernorm, "variance_epsilon") \
167
+ else layernorm.eps
168
+ out = Fast_Layernorm.apply(X, W, bias, eps)
169
+ return out
170
+ pass
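
A minimal usage sketch (editorial, not part of the committed diff): `fast_layernorm` takes an affine `torch.nn.LayerNorm` module plus its input. The shapes, dtype, and import path below are illustrative assumptions.

```python
# Hypothetical example -- assumes the package is importable as `unsloth_kernels`
# and that a CUDA device is available.
import torch
from unsloth_kernels import fast_layernorm

ln = torch.nn.LayerNorm(4096).cuda().half()      # elementwise_affine must be True
X  = torch.randn(2, 128, 4096, device="cuda", dtype=torch.float16)

Y_fast = fast_layernorm(ln, X)                   # Triton kernel path
Y_ref  = ln(X)                                   # eager PyTorch reference
print((Y_fast - Y_ref).abs().max())              # expected to agree within fp16 tolerance
```
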
torch-ext/unsloth_kernels/rms_layernorm.py ADDED
@@ -0,0 +1,261 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ import triton.language as tl
17
+ import torch
18
+ from .utils import calculate_settings, torch_cuda_device
19
+
20
+ @triton.jit
21
+ def _rms_layernorm_forward(
22
+ Y, Y_row_stride,
23
+ X, X_row_stride,
24
+ W, W_row_stride,
25
+ r, r_row_stride : tl.constexpr,
26
+ n_cols : tl.constexpr,
27
+ eps : tl.constexpr,
28
+ BLOCK_SIZE : tl.constexpr,
29
+ ):
30
+ """
31
+ Fast RMS Layernorm kernel
32
+ Inspiration from a Triton tutorial:
33
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
34
+ """
35
+ row_idx = tl.program_id(0)
36
+ col_offsets = tl.arange(0, BLOCK_SIZE)
37
+ mask = col_offsets < n_cols
38
+
39
+ Y += row_idx * Y_row_stride
40
+ X += row_idx * X_row_stride
41
+ r += row_idx * r_row_stride
42
+
43
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
44
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0)#.to(tl.float32)
45
+
46
+ row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
47
+ inv_var = tl.math.rsqrt(row_var + eps)
48
+ tl.store(r, inv_var)
49
+ normed = X_row * inv_var
50
+ normed = normed.to(W_row.dtype) # Exact copy from HF
51
+ output = normed * W_row
52
+ tl.store(Y + col_offsets, output, mask = mask)
53
+ pass
54
+
55
+
56
+ def _rms_layernorm_backward(
57
+ dY, dY_row_stride,
58
+ dX, dX_row_stride,
59
+ X, X_row_stride,
60
+ W, W_row_stride,
61
+ r, r_row_stride : tl.constexpr,
62
+ # dW, dW_row_stride,
63
+ n_cols : tl.constexpr,
64
+ eps : tl.constexpr,
65
+ GEMMA : tl.constexpr,
66
+ BLOCK_SIZE : tl.constexpr,
67
+ ):
68
+ """
69
+ Fast RMS Layernorm kernel for the backward pass
70
+ Inspiration from a Triton tutorial:
71
+ https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
72
+ """
73
+ row_idx = tl.program_id(0)
74
+ col_offsets = tl.arange(0, BLOCK_SIZE)
75
+ mask = col_offsets < n_cols
76
+
77
+ dY += row_idx * dY_row_stride
78
+ X += row_idx * X_row_stride
79
+ r += row_idx * r_row_stride
80
+
81
+ if GEMMA: dX += row_idx * dY_row_stride
82
+ else: dX = dY
83
+
84
+ dY_row = tl.load(dY + col_offsets, mask = mask, other = 0).to(tl.float32)
85
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
86
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
87
+
88
+ # Get saved row variance
89
+ inv_var = tl.load(r).to(tl.float32)
90
+ normed = X_row * inv_var
91
+
92
+ if GEMMA: dY_W = dY_row * (W_row + 1.0)
93
+ else: dY_W = dY_row * W_row
94
+
95
+ rowsum_dY_normed = tl.sum(dY_W * normed, axis = 0)
96
+ output = inv_var/n_cols * (n_cols*dY_W - normed*rowsum_dY_normed)
97
+ tl.store(dX + col_offsets, output, mask = mask)
98
+ pass
99
+ _rms_layernorm_backward = triton.jit(_rms_layernorm_backward)
100
+ _rms_layernorm_backward = triton.heuristics(
101
+ {
102
+ "GEMMA": lambda args: bool(args["GEMMA"]),
103
+ }
104
+ )(_rms_layernorm_backward)
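
For reference, with $\hat{x} = x\,r$, $r = \big(\tfrac{1}{n}\sum_j x_j^2 + \epsilon\big)^{-1/2}$ and $g = \mathrm{d}y \odot w$ (or $g = \mathrm{d}y \odot (w + 1)$ when `GEMMA` is set), the gradient stored by the backward kernel is

$$\mathrm{d}x = \frac{r}{n}\Big(n\,g - \hat{x}\sum_j g_j\,\hat{x}_j\Big),$$

matching the `output` expression above.
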
105
+
106
+
107
+ @triton.jit
108
+ def _gemma_rms_layernorm_forward(
109
+ Y, Y_row_stride,
110
+ X, X_row_stride,
111
+ W, W_row_stride,
112
+ r, r_row_stride : tl.constexpr,
113
+ n_cols : tl.constexpr,
114
+ eps : tl.constexpr,
115
+ BLOCK_SIZE : tl.constexpr,
116
+ ):
117
+ # Copies https://github.com/google-deepmind/gemma/blob/main/gemma/layers.py#L31
118
+ # and https://github.com/keras-team/keras-nlp/blob/v0.8.2/keras_nlp/models/gemma/rms_normalization.py#L33
119
+ # exactly. Essentially all in float32!
120
+ row_idx = tl.program_id(0)
121
+ col_offsets = tl.arange(0, BLOCK_SIZE)
122
+ mask = col_offsets < n_cols
123
+
124
+ Y += row_idx * Y_row_stride
125
+ X += row_idx * X_row_stride
126
+ r += row_idx * r_row_stride
127
+
128
+ X_row = tl.load(X + col_offsets, mask = mask, other = 0).to(tl.float32)
129
+ W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
130
+
131
+ row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
132
+ inv_var = tl.math.rsqrt(row_var + eps)
133
+ tl.store(r, inv_var)
134
+ normed = X_row * inv_var
135
+ output = normed * (W_row + 1.0)
136
+
137
+ tl.store(Y + col_offsets, output, mask = mask)
138
+ pass
139
+
140
+
141
+ class Fast_RMS_Layernorm(torch.autograd.Function):
142
+ @staticmethod
143
+ def forward(ctx, X : torch.Tensor, W : torch.Tensor, eps : float, gemma : bool = False):
144
+ shape = X.shape
145
+ dim : int = shape[-1]
146
+ X = X.view(-1, dim)
147
+ n_rows : int
148
+ n_cols : int
149
+ n_rows, n_cols = X.shape
150
+ BLOCK_SIZE : int
151
+ num_warps : int
152
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
153
+ device = X.device
154
+
155
+ Y = torch.empty((n_rows, n_cols), dtype = X.dtype, device = device)
156
+ r = torch.empty(n_rows, dtype = torch.float32, device = device)
157
+
158
+ fx = _gemma_rms_layernorm_forward if gemma else _rms_layernorm_forward
159
+ with torch_cuda_device(device):
160
+ fx[(n_rows,)](
161
+ Y, Y.stride(0),
162
+ X, X.stride(0),
163
+ W, W.stride(0),
164
+ r, r.stride(0),
165
+ n_cols, eps,
166
+ BLOCK_SIZE = BLOCK_SIZE,
167
+ num_warps = num_warps,
168
+ )
169
+ ctx.eps = eps
170
+ ctx.BLOCK_SIZE = BLOCK_SIZE
171
+ ctx.num_warps = num_warps
172
+ ctx.GEMMA = gemma
173
+ ctx.save_for_backward(X, W, r)
174
+ return Y.view(*shape)
175
+ pass
176
+
177
+ @staticmethod
178
+ def backward(ctx, dY : torch.Tensor):
179
+ shape = dY.shape
180
+ dim : int = shape[-1]
181
+ dY = dY.view(-1, dim)
182
+ X, W, r = ctx.saved_tensors
183
+ n_rows : int
184
+ n_cols : int
185
+ n_rows, n_cols = dY.shape
186
+ # dW = X
187
+ dX = torch.empty_like(dY) if ctx.GEMMA else dY
188
+
189
+ with torch_cuda_device(dY.device):
190
+ _rms_layernorm_backward[(n_rows,)](
191
+ dY, dY.stride(0),
192
+ dX, dX.stride(0),
193
+ X, X .stride(0),
194
+ W, W .stride(0),
195
+ r, r .stride(0),
196
+ # dW, dW.stride(0),
197
+ n_cols, ctx.eps,
198
+ GEMMA = ctx.GEMMA,
199
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
200
+ num_warps = ctx.num_warps,
201
+ )
202
+ dX = dX.view(*shape)
203
+ return dX, None, None, None
204
+ pass
205
+ pass
206
+
207
+
208
+ # [TODO] Unsure why RMS Layernorm is not torch.compiling properly
209
+ @torch.compiler.disable
210
+ def fast_rms_layernorm(layernorm, X : torch.Tensor, gemma : bool = False):
211
+ W : torch.Tensor = layernorm.weight
212
+ eps : float = layernorm.variance_epsilon if \
213
+ hasattr(layernorm, "variance_epsilon") \
214
+ else layernorm.eps
215
+ out = Fast_RMS_Layernorm.apply(X, W, eps, gemma)
216
+ return out
217
+ pass
218
+
219
+
220
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
221
+ class Unsloth_LlamaRMSNorm(LlamaRMSNorm):
222
+ def forward(self, X):
223
+ return fast_rms_layernorm(self, X, gemma = False)
224
+ pass
225
+ pass
226
+
227
+ try:
228
+ from transformers.models.mllama.modeling_mllama import MllamaTextRMSNorm
229
+ class Unsloth_MllamaTextRMSNorm(MllamaTextRMSNorm):
230
+ def forward(self, X):
231
+ return fast_rms_layernorm(self, X, gemma = False)
232
+ pass
233
+ pass
234
+ except:
235
+ pass
236
+ pass
237
+
238
+ def patch_rms_layernorm():
239
+ import transformers.models.llama.modeling_llama
240
+ transformers.models.llama.modeling_llama.LlamaRMSNorm = Unsloth_LlamaRMSNorm
241
+ try:
242
+ import transformers.models.mllama.modeling_mllama
243
+ transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = Unsloth_MllamaTextRMSNorm
244
+ except:
245
+ pass
246
+ return
247
+ pass
248
+
249
+
250
+ def unpatch_rms_layernorm():
251
+ import transformers.models.llama.modeling_llama
252
+ transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
253
+ try:
254
+ import transformers.models.mllama.modeling_mllama
255
+ transformers.models.mllama.modeling_mllama.MllamaTextRMSNorm = MllamaTextRMSNorm
256
+ except:
257
+ pass
258
+ return
259
+ pass
260
+
261
+
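
A rough usage sketch (editorial, not part of the diff): the wrapper takes an HF RMSNorm module directly, and `patch_rms_layernorm` swaps the Llama/Mllama norm classes so newly constructed models pick up the Triton path. Shapes and the import path are assumptions.

```python
# Hypothetical example -- assumes `unsloth_kernels` is importable and CUDA is available.
import torch
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from unsloth_kernels import fast_rms_layernorm

norm = LlamaRMSNorm(hidden_size=4096, eps=1e-5).cuda()
X = torch.randn(2, 128, 4096, device="cuda", dtype=torch.bfloat16)

Y = fast_rms_layernorm(norm, X)                  # Llama-style: out = normed * weight
# Gemma stores its scale as (w - 1) and normalises in float32, so pass gemma=True there:
# Y = fast_rms_layernorm(gemma_norm, X, gemma=True)
```
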
torch-ext/unsloth_kernels/rope_embedding.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ import triton.language as tl
17
+ import torch
18
+ from .utils import calculate_settings, torch_cuda_device
19
+ ROPE_GROUP_SIZE : int = 4
20
+
21
+ def _rope_embedding(
22
+ Q, Q_row_stride,
23
+ cos, cos_row_stride,
24
+ sin, sin_row_stride,
25
+ seqlen,
26
+ head_dim : tl.constexpr,
27
+ n_heads : tl.constexpr,
28
+ BACKWARD_PASS : tl.constexpr,
29
+ BLOCK_SIZE : tl.constexpr,
30
+ ):
31
+ """
32
+ Calculates the RoPE Embedding quickly
33
+ RoPE is Q * cos + rotate_half(Q) * sin
34
+ See our blog post for more info
35
+ """
36
+ ROPE_GROUP_SIZE = 4
37
+ row_position = tl.program_id(0)
38
+ group_head_position = tl.program_id(1)
39
+ col_offsets = tl.arange(0, BLOCK_SIZE)
40
+ half_head_dim = head_dim // 2
41
+ mask = col_offsets < half_head_dim
42
+
43
+ sin1 = tl.load(sin + (row_position % seqlen)*sin_row_stride + \
44
+ half_head_dim*0 + col_offsets, mask = mask, other = 0)
45
+ cos1 = tl.load(cos + (row_position % seqlen)*cos_row_stride + \
46
+ half_head_dim*0 + col_offsets, mask = mask, other = 0)
47
+
48
+ if BACKWARD_PASS:
49
+ # See our blog post for more info.
50
+ sin1 = -sin1
51
+ pass
52
+
53
+ # [TODO] Autotune ROPE_GROUP_SIZE to be 1, 2, 4, 8
54
+ head_start = group_head_position * ROPE_GROUP_SIZE
55
+ head_end = min((head_start + ROPE_GROUP_SIZE), n_heads)
56
+
57
+ # 10% Faster kernel from [HuyNguyen-hust](https://github.com/unslothai/unsloth/pull/238)
58
+ for k in range(head_start, head_end):
59
+ offs_q1 = row_position * Q_row_stride + k * head_dim + col_offsets
60
+ offs_q2 = row_position * Q_row_stride + k * head_dim + col_offsets + half_head_dim
61
+
62
+ # For Gemma - sometimes RoPE must be done in float32 and not bfloat16
63
+ Q1 = tl.load(Q + offs_q1, mask = mask, other = 0).to(sin1.dtype)
64
+ Q2 = tl.load(Q + offs_q2, mask = mask, other = 0).to(sin1.dtype)
65
+
66
+ tl.store(Q + offs_q1, Q1*cos1 - Q2*sin1, mask = mask)
67
+ tl.store(Q + offs_q2, Q2*cos1 + Q1*sin1, mask = mask)
68
+ pass
69
+ pass
70
+ _rope_embedding = triton.jit(_rope_embedding)
71
+ _rope_embedding = triton.heuristics(
72
+ {
73
+ "BACKWARD_PASS": lambda args: bool(args["BACKWARD_PASS"]),
74
+ }
75
+ )(_rope_embedding)
76
+
77
+
78
+ class Fast_RoPE_Embedding(torch.autograd.Function):
79
+ @staticmethod
80
+ def forward(ctx, Q, cos, sin):
81
+ cos, sin = cos.squeeze(), sin.squeeze()
82
+ batch : int
83
+ seq_len : int
84
+ n_heads : int
85
+ head_dim : int
86
+ batch, seq_len, n_heads, head_dim = Q.shape
87
+ Q = Q.view(batch*seq_len, n_heads*head_dim)
88
+ n_rows : int
89
+ n_cols : int
90
+ n_rows, n_cols = Q.shape
91
+ assert(seq_len <= cos.shape[0])
92
+
93
+ # [TODO] Changing blocksize to head_dim//2 seems to have
94
+ # some concurrency / un-deterministic issues.
95
+ BLOCK_SIZE, num_warps = calculate_settings(head_dim//2) # (head_dim//2)
96
+
97
+ # group_size = 4 # 4 or 8, too large group_size can hurt performance.
98
+ div : int
99
+ mod : int
100
+ div, mod = divmod(n_heads, ROPE_GROUP_SIZE)
101
+ n_groups : int = div + (mod != 0)
102
+
103
+ with torch_cuda_device(Q.device):
104
+ _rope_embedding[(n_rows, n_groups, )](
105
+ Q, Q.stride(0),
106
+ cos, cos.stride(0),
107
+ sin, sin.stride(0),
108
+ seq_len,
109
+ head_dim, n_heads,
110
+ BACKWARD_PASS = False,
111
+ BLOCK_SIZE = BLOCK_SIZE,
112
+ num_warps = num_warps,
113
+ )
114
+ ctx.BLOCK_SIZE = BLOCK_SIZE
115
+ ctx.num_warps = num_warps
116
+ ctx.n_groups = n_groups
117
+ ctx.cos = cos
118
+ ctx.sin = sin
119
+ return Q.view(batch, seq_len, n_heads, head_dim)
120
+ pass
121
+
122
+ @staticmethod
123
+ def backward(ctx, dY):
124
+ batch : int
125
+ seq_len : int
126
+ n_heads : int
127
+ head_dim : int
128
+ batch, seq_len, n_heads, head_dim = dY.shape
129
+ dY = dY.reshape(batch*seq_len, n_heads*head_dim)
130
+ # Must be reshape not view
131
+ n_rows : int
132
+ n_cols : int
133
+ n_rows, n_cols = dY.shape
134
+
135
+ cos = ctx.cos
136
+ sin = ctx.sin
137
+
138
+ with torch_cuda_device(dY.device):
139
+ _rope_embedding[(n_rows, ctx.n_groups, )](
140
+ dY, dY .stride(0),
141
+ cos, cos.stride(0),
142
+ sin, sin.stride(0),
143
+ seq_len, head_dim, n_heads,
144
+ BACKWARD_PASS = True,
145
+ BLOCK_SIZE = ctx.BLOCK_SIZE,
146
+ num_warps = ctx.num_warps,
147
+ )
148
+ dY = dY.view(batch, seq_len, n_heads, head_dim)
149
+ return dY, None, None,
150
+ pass
151
+ pass
152
+
153
+ # [TODO] Unsure why RoPE Embedding is not torch.compiling properly
154
+ @torch.compiler.disable
155
+ def fast_rope_embedding(Q, K, cos, sin):
156
+ Q = Fast_RoPE_Embedding.apply(Q.transpose(1, 2), cos, sin).transpose(1, 2)
157
+ K = Fast_RoPE_Embedding.apply(K.transpose(1, 2), cos, sin).transpose(1, 2)
158
+ return Q, K
159
+ pass
160
+
161
+
162
+ class Slow_RoPE_Embedding(torch.autograd.Function):
163
+ @staticmethod
164
+ def forward(ctx, Q, cos, sin, position_ids):
165
+ if position_ids is not None:
166
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
167
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
168
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
169
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
170
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
171
+
172
+ # Q * cos + rotate_half(Q) * sin
173
+ half = Q.shape[-1]//2
174
+ RH_Q = torch.cat((-Q[..., half:], Q[..., :half]), dim = -1)
175
+ Q *= cos
176
+ Q.addcmul_(RH_Q, sin)
177
+ # RH_Q *= sin
178
+ # Q += RH_Q
179
+ ctx.save_for_backward(cos, sin)
180
+ return Q
181
+ pass
182
+
183
+ @staticmethod
184
+ def backward(ctx, dY):
185
+ cos, sin = ctx.saved_tensors
186
+ # Q * cos + rotate_half.T(Q) * sin
187
+ half = dY.shape[-1]//2
188
+ RH_dY = torch.cat((dY[..., half:], -dY[..., :half]), dim = -1)
189
+ dY *= cos
190
+ dY.addcmul_(RH_dY, sin)
191
+ # RH_dY *= sin
192
+ # dY += RH_dY
193
+ return dY, None, None, None
194
+ pass
195
+ pass
196
+
197
+
198
+ def inplace_rope_embedding(Q, K, cos, sin, position_ids):
199
+ Q = Slow_RoPE_Embedding.apply(Q, cos, sin, position_ids)
200
+ K = Slow_RoPE_Embedding.apply(K, cos, sin, position_ids)
201
+ return Q, K
202
+ pass
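
A rough usage sketch (editorial, not part of the diff): `fast_rope_embedding` expects `Q`/`K` in the usual `(batch, n_heads, seq_len, head_dim)` layout and HF-style `cos`/`sin` tables of shape `(seq_len, head_dim)`. Everything below is an illustrative assumption.

```python
# Hypothetical example -- builds the rotary tables the same way HF does
# (half the frequencies, duplicated across the head dimension).
import torch
from unsloth_kernels import fast_rope_embedding

bsz, n_heads, seq_len, head_dim = 2, 32, 128, 128
Q = torch.randn(bsz, n_heads, seq_len, head_dim, device="cuda", dtype=torch.float16)
K = torch.randn_like(Q)

inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2, device="cuda").float() / head_dim))
t = torch.arange(seq_len, device="cuda").float()
freqs = torch.outer(t, inv_freq)                 # (seq_len, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)          # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()

Q, K = fast_rope_embedding(Q, K, cos, sin)       # kernel writes the rotated values in place
```
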
torch-ext/unsloth_kernels/swiglu.py ADDED
@@ -0,0 +1,101 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ import triton.language as tl
17
+ import torch
18
+ from .utils import calculate_settings, torch_cuda_device
19
+
20
+
21
+ @triton.jit
22
+ def _fg_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
23
+ block_idx = tl.program_id(0)
24
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
25
+ mask = offsets < n_elements
26
+
27
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
28
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
29
+
30
+ # f = e * sigmoid(e)
31
+ f_row = e_row * tl.sigmoid(e_row) # e_row / (1 + tl.exp(-e_row))
32
+ f_row = f_row.to(g_row.dtype) # Exact copy from HF
33
+ # h = f * g
34
+ h_row = f_row * g_row
35
+
36
+ # Store h
37
+ tl.store(h + offsets, h_row, mask = mask)
38
+ pass
39
+
40
+
41
+ def swiglu_fg_kernel(e, g):
42
+ batch, seq_len, hd = e.shape
43
+ n_elements = e.numel()
44
+ h = torch.empty((batch, seq_len, hd), dtype = e.dtype, device = e.device)
45
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
46
+ with torch_cuda_device(e.device):
47
+ _fg_kernel[grid](e, g, h, n_elements, BLOCK_SIZE = 1024,)
48
+ return h
49
+ pass
50
+
51
+
52
+ @triton.jit
53
+ def _DWf_DW_dfg_kernel(DW, e, g, n_elements, BLOCK_SIZE : tl.constexpr,):
54
+ """
55
+ e = e.float()
56
+ se = 1.0 / (1.0 + torch.exp(-e))
57
+ f = (se * e).to(dtype)
58
+ h = f * g
59
+ df = DW * f
60
+ dg = DW * g
61
+ de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype)
62
+ """
63
+ block_idx = tl.program_id(0)
64
+ offsets = block_idx*BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
65
+ mask = offsets < n_elements
66
+
67
+ DW_row = tl.load(DW + offsets, mask = mask, other = 0)#.to(tl.float32)
68
+ e_row = tl.load(e + offsets, mask = mask, other = 0).to(tl.float32)
69
+ g_row = tl.load(g + offsets, mask = mask, other = 0)#.to(tl.float32)
70
+
71
+ # e = e.float()
72
+ # se = 1.0 / (1.0 + torch.exp(-e))
73
+ se_row = tl.sigmoid(e_row) # 1.0 / (1.0 + tl.exp(-e_row))
74
+ # f = (se * e).to(dtype)
75
+ f_row = se_row * e_row
76
+ f_row = f_row.to(DW_row.dtype)
77
+ # h = f * g
78
+ h_row = f_row * g_row
79
+ # df = DW * f
80
+ df_row = DW_row * f_row
81
+ # dg = DW * g
82
+ dg_row = DW_row * g_row
83
+ # de = (dg.float() * se * (1.0 + e * (1.0 - se))).to(dtype)
84
+ de_row = dg_row.to(tl.float32) * se_row * (1.0 + e_row * (1.0 - se_row))
85
+ de_row = de_row.to(DW_row.dtype)
86
+
87
+ # Store derivatives in buffers
88
+ tl.store(DW + offsets, h_row, mask = mask) # h = f * g
89
+ tl.store(e + offsets, df_row, mask = mask) # df = DW * f
90
+ tl.store(g + offsets, de_row, mask = mask) # de
91
+ pass
92
+
93
+
94
+ def swiglu_DWf_DW_dfg_kernel(DW, e, g):
95
+ batch_seq_len, hd = e.shape
96
+ n_elements = e.numel()
97
+ grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
98
+ with torch_cuda_device(e.device):
99
+ _DWf_DW_dfg_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
100
+ return DW, e, g
101
+ pass
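
A rough usage sketch (editorial, not part of the diff): `swiglu_fg_kernel` fuses `silu(gate) * up` for a SwiGLU MLP block; the reference line mirrors the float32-sigmoid-then-cast order used inside the kernel. Shapes and the import path are assumptions.

```python
# Hypothetical example -- e is gate_proj(X), g is up_proj(X) in a SwiGLU MLP.
import torch
from unsloth_kernels import swiglu_fg_kernel

e = torch.randn(2, 128, 11008, device="cuda", dtype=torch.float16)
g = torch.randn_like(e)

h_fast = swiglu_fg_kernel(e, g)                                  # h = silu(e) * g
h_ref  = torch.nn.functional.silu(e.float()).to(e.dtype) * g     # same fp32 -> fp16 rounding
print((h_fast - h_ref).abs().max())
```
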
torch-ext/unsloth_kernels/utils.py ADDED
@@ -0,0 +1,497 @@
1
+ # Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import triton
16
+ MAX_FUSED_SIZE : int = 65536
17
+ next_power_of_2 = triton.next_power_of_2
18
+ import functools
19
+
20
+ # torch.cuda.amp.custom_fwd is deprecated >= 2.4
21
+ import torch
22
+ torch_Tensor = torch.Tensor
23
+ from packaging.version import Version
24
+ if Version(torch.__version__) < Version("2.4.0"):
25
+ torch_amp_custom_fwd = torch.cuda.amp.custom_fwd
26
+ torch_amp_custom_bwd = torch.cuda.amp.custom_bwd
27
+ else:
28
+ torch_amp_custom_fwd = torch.amp.custom_fwd(device_type = "cuda")
29
+ torch_amp_custom_bwd = torch.amp.custom_bwd(device_type = "cuda")
30
+ pass
31
+
32
+
33
+ # tl.math.tanh now is libdevice.tanh
34
+ from packaging.version import Version
35
+ import triton
36
+ import triton.language as tl
37
+ if Version(triton.__version__) >= Version("3.0.0"):
38
+ from triton.language.extra import libdevice
39
+ triton_tanh = libdevice.tanh
40
+ triton_cast = tl.cast
41
+ else:
42
+ triton_tanh = tl.math.tanh
43
+ # No casting in old Triton versions
44
+ @triton.jit
45
+ def triton_cast(x, dtype):
46
+ return x.to(dtype)
47
+ pass
48
+ pass
49
+
50
+
51
+ def calculate_settings(n : int) -> (int, int,):
52
+ BLOCK_SIZE : int = next_power_of_2(n)
53
+ if BLOCK_SIZE > MAX_FUSED_SIZE:
54
+ raise RuntimeError(f"Cannot launch Triton kernel since n = {n} exceeds "\
55
+ f"the maximum CUDA blocksize = {MAX_FUSED_SIZE}.")
56
+ num_warps : int = 4
57
+ if BLOCK_SIZE >= 32768: num_warps = 32
58
+ elif BLOCK_SIZE >= 8192: num_warps = 16
59
+ elif BLOCK_SIZE >= 2048: num_warps = 8
60
+ return BLOCK_SIZE, num_warps
61
+ pass
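
As a concrete illustration, the values below follow directly from the branches above (the import path is an assumption):

```python
from unsloth_kernels.utils import calculate_settings  # assumed import path

# BLOCK_SIZE is n rounded up to a power of two; num_warps scales with it.
calculate_settings(128)    # -> (128, 4)
calculate_settings(4096)   # -> (4096, 8)
calculate_settings(11008)  # -> (16384, 16)
calculate_settings(40000)  # -> (65536, 32)
```
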
62
+
63
+
64
+ import bitsandbytes as bnb
65
+ import ctypes
66
+
67
+ # https://github.com/bitsandbytes-foundation/bitsandbytes/pull/1330/files
68
+ HAS_CUDA_STREAM = Version(bnb.__version__) > Version("0.43.3")
69
+ get_ptr = bnb.functional.get_ptr
70
+
71
+ if torch.cuda.device_count() > 1:
72
+ torch_cuda_device = torch.cuda.device
73
+ else:
74
+ from contextlib import nullcontext
75
+ def torch_cuda_device(device): return nullcontext()
76
+ pass
77
+ _cuda_getCurrentRawStream = torch._C._cuda_getCurrentRawStream
78
+ c_void_p = ctypes.c_void_p
79
+ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
80
+ return c_void_p(_cuda_getCurrentRawStream(tensor.device.index))
81
+ pass
82
+
83
+ # Get array of CUDA streams and other buffers
84
+ global CUDA_STREAMS
85
+ global WEIGHT_BUFFERS
86
+ global ABSMAX_BUFFERS
87
+
88
+ _CUDA_STREAMS = {
89
+ (index := torch.cuda.device(i).idx) : ctypes.c_void_p(torch._C._cuda_getCurrentRawStream(index))
90
+ for i in range(torch.cuda.device_count())
91
+ }
92
+ CUDA_STREAMS = [None] * (max(_CUDA_STREAMS.keys()) + 1)
93
+ WEIGHT_BUFFERS = [None] * (max(_CUDA_STREAMS.keys()) + 1)
94
+ ABSMAX_BUFFERS = [None] * (max(_CUDA_STREAMS.keys()) + 1)
95
+ for k, v in _CUDA_STREAMS.items(): CUDA_STREAMS[k] = v
96
+ CUDA_STREAMS = tuple(CUDA_STREAMS)
97
+ del _CUDA_STREAMS
98
+
99
+ # Bitsandbytes operations
100
+ ctypes_c_int = ctypes.c_int
101
+ ctypes_c_int32 = ctypes.c_int32
102
+ cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
103
+ cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
104
+ cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
105
+ cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
106
+ cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
107
+ torch_mm = torch.mm
108
+ torch_mv = torch.mv
109
+ torch_matmul = torch.matmul
110
+ torch_addmm = torch.addmm
111
+ torch_empty = torch.empty
112
+
113
+ def QUANT_STATE(W): return getattr(W, "quant_state", None)
114
+
115
+ def get_lora_parameters(proj):
116
+ # For DPO or disabled adapters
117
+ base_layer = getattr(proj, "base_layer", proj) # (proj.base_layer if hasattr(proj, "base_layer") else proj)
118
+ W = base_layer.weight
119
+
120
+ # if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
121
+ if getattr(proj, "disable_adapters", True) or proj.merged:
122
+ return W, getattr(W, "quant_state", None), None, None, None
123
+ pass
124
+
125
+ adapter = getattr(proj, "active_adapters", None)
126
+ if adapter is None: adapter = getattr(proj, "active_adapter", ("default"))
127
+ adapter = adapter[0]
128
+
129
+ return (
130
+ W,
131
+ getattr(W, "quant_state", None),
132
+ proj.lora_A [adapter].weight,
133
+ proj.lora_B [adapter].weight,
134
+ proj.scaling[adapter],
135
+ )
136
+ pass
137
+
138
+
139
+ def get_lora_parameters_bias(proj):
140
+ # For DPO or disabled adapters
141
+ base_layer = getattr(proj, "base_layer", proj) # (proj.base_layer if hasattr(proj, "base_layer") else proj)
142
+ W = base_layer.weight
143
+
144
+ # if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
145
+ if getattr(proj, "disable_adapters", True) or proj.merged:
146
+ return W, getattr(W, "quant_state", None), None, None, None, base_layer.bias
147
+ pass
148
+
149
+ adapter = getattr(proj, "active_adapters", None)
150
+ if adapter is None: adapter = getattr(proj, "active_adapter", ("default"))
151
+ adapter = adapter[0]
152
+
153
+ return (
154
+ W,
155
+ getattr(W, "quant_state", None),
156
+ proj.lora_A [adapter].weight,
157
+ proj.lora_B [adapter].weight,
158
+ proj.scaling[adapter],
159
+ base_layer.bias,
160
+ )
161
+ pass
162
+
163
+ if HAS_CUDA_STREAM:
164
+ @torch.inference_mode
165
+ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False):
166
+ if quant_state is None: return W
167
+ if type(quant_state) is not list:
168
+ # New quant_state as a class
169
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
170
+ absmax = quant_state.absmax
171
+ shape = quant_state.shape
172
+ dtype = quant_state.dtype
173
+ blocksize = quant_state.blocksize
174
+ offset = quant_state.offset
175
+ state2 = quant_state.state2
176
+ absmax2 = state2.absmax
177
+ code2 = state2.code
178
+ blocksize2 = state2.blocksize
179
+ else:
180
+ # Old quant_state as a list of lists
181
+ absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
182
+ offset, state2 = compressed_stats
183
+ absmax2, code2, blocksize2, _, _, _, _ = state2
184
+ pass
185
+ global CUDA_STREAMS
186
+ device = W.device
187
+ device_index = device.index
188
+ CUDA_STREAM = CUDA_STREAMS[device_index]
189
+
190
+ n_elements_absmax = absmax.numel()
191
+
192
+ # Create weight matrix
193
+ if use_global_buffer:
194
+
195
+ # Use same buffers for faster inference
196
+ size = shape[0]*shape[1]
197
+ global WEIGHT_BUFFERS
198
+ global ABSMAX_BUFFERS
199
+ WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
200
+ ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
201
+ if WEIGHT_BUFFER is None:
202
+ WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(size, dtype = dtype, device = device, requires_grad = False)
203
+ ABSMAX_BUFFERS[device_index] = ABSMAX_BUFFER = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
204
+
205
+ if size > WEIGHT_BUFFER.numel(): WEIGHT_BUFFER.resize_(size)
206
+ if n_elements_absmax > ABSMAX_BUFFER.numel(): ABSMAX_BUFFER.resize_(n_elements_absmax)
207
+
208
+ out = WEIGHT_BUFFER[:size].view(shape)
209
+ out_absmax = ABSMAX_BUFFER[:n_elements_absmax]
210
+ else:
211
+ if out is None:
212
+ out = torch_empty(shape, dtype = dtype, device = device, requires_grad = False)
213
+ else:
214
+ assert(out.shape == shape)
215
+ assert(out.dtype == dtype)
216
+ out_absmax = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
217
+ pass
218
+
219
+ # NF4 dequantization of statistics
220
+ ptr_out_absmax = get_ptr(out_absmax)
221
+ with torch_cuda_device(device):
222
+ cdequantize_blockwise_fp32(
223
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), ptr_out_absmax,
224
+ ctypes_c_int(blocksize2), ctypes_c_int(n_elements_absmax), CUDA_STREAM
225
+ )
226
+ out_absmax += offset
227
+
228
+ # Dequantize W
229
+ fx = cdequantize_blockwise_fp16_nf4 if dtype == torch.float16 else \
230
+ cdequantize_blockwise_bf16_nf4
231
+ fx(get_ptr(None), get_ptr(W), ptr_out_absmax, get_ptr(out),
232
+ ctypes_c_int(blocksize), ctypes_c_int(out.numel()), CUDA_STREAM,)
233
+ pass
234
+ # Careful returning transposed data
235
+ is_transposed = (True if W.shape[0] == 1 else False)
236
+ return out.t() if is_transposed else out
237
+ pass
238
+ else:
239
+ @torch.inference_mode
240
+ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False):
241
+ if quant_state is None: return W
242
+ if type(quant_state) is not list:
243
+ # New quant_state as a class
244
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
245
+ absmax = quant_state.absmax
246
+ shape = quant_state.shape
247
+ dtype = quant_state.dtype
248
+ blocksize = quant_state.blocksize
249
+ offset = quant_state.offset
250
+ state2 = quant_state.state2
251
+ absmax2 = state2.absmax
252
+ code2 = state2.code
253
+ blocksize2 = state2.blocksize
254
+ else:
255
+ # Old quant_state as a list of lists
256
+ absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
257
+ offset, state2 = compressed_stats
258
+ absmax2, code2, blocksize2, _, _, _, _ = state2
259
+ pass
260
+
261
+ n_elements_absmax = absmax.numel()
262
+ device = W.device
263
+
264
+ # Create weight matrix
265
+ if out is None:
266
+ out = torch_empty(shape, dtype = dtype, device = device, requires_grad = False)
267
+ else:
268
+ assert(out.shape == shape)
269
+ assert(out.dtype == dtype)
270
+ out_absmax = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
271
+
272
+ # Do dequantization
273
+ ptr_out_absmax = get_ptr(out_absmax)
274
+ cdequantize_blockwise_fp32(
275
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), ptr_out_absmax,
276
+ ctypes_c_int(blocksize2), ctypes_c_int(n_elements_absmax),
277
+ )
278
+ out_absmax += offset
279
+
280
+ fx = cdequantize_blockwise_fp16_nf4 if dtype == torch.float16 else \
281
+ cdequantize_blockwise_bf16_nf4
282
+ fx(get_ptr(None), get_ptr(W), ptr_out_absmax, get_ptr(out),
283
+ ctypes_c_int(blocksize), ctypes_c_int(out.numel()),)
284
+
285
+ # Careful returning transposed data
286
+ is_transposed = (True if W.shape[0] == 1 else False)
287
+ return out.t() if is_transposed else out
288
+ pass
289
+ pass
290
+
291
+
292
+ if HAS_CUDA_STREAM:
293
+ def fast_gemv(X, W, quant_state, out = None):
294
+ if quant_state is None: return torch_matmul(X, W, out = out)
295
+ # For fast X @ W where seq_len == 1
296
+ # From https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L1469
297
+ _, q_len, hd = X.shape
298
+ # assert(q_len == 1)
299
+
300
+ if type(quant_state) is not list:
301
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
302
+ absmax = quant_state.absmax
303
+ shape = quant_state.shape
304
+ dtype = quant_state.dtype
305
+ blocksize = quant_state.blocksize
306
+ stats = quant_state.code
307
+ offset = quant_state.offset
308
+ state2 = quant_state.state2
309
+ absmax2 = state2.absmax
310
+ code2 = state2.code
311
+ blocksize2 = state2.blocksize
312
+ else:
313
+ absmax, shape, dtype, blocksize, compressed_stats, quant_type, stats = quant_state
314
+ offset, state2 = compressed_stats
315
+ absmax2, code2, blocksize2, _, _, _, _ = state2
316
+ pass
317
+ global CUDA_STREAMS
318
+ device = W.device
319
+ device_index = device.index
320
+ CUDA_STREAM = CUDA_STREAMS[device_index]
321
+
322
+ # assert(dtype == X.dtype)
323
+ bout = shape[0]
324
+
325
+ if out is None:
326
+ out = torch_empty((1, 1, bout,), dtype = dtype, device = device)
327
+ # else:
328
+ # assert(out.shape == (1, 1, bout,))
329
+ # pass
330
+
331
+ n = 1
332
+ m = shape[0]
333
+ k = shape[1]
334
+ lda = shape[0]
335
+ ldc = shape[0]
336
+ ldb = (hd+1)//2
337
+ m = ctypes_c_int32(m)
338
+ n = ctypes_c_int32(n)
339
+ k = ctypes_c_int32(k)
340
+ lda = ctypes_c_int32(lda)
341
+ ldb = ctypes_c_int32(ldb)
342
+ ldc = ctypes_c_int32(ldc)
343
+
344
+ df = torch_empty(absmax.shape, dtype = torch.float32, device = device)
345
+ with torch_cuda_device(device):
346
+ cdequantize_blockwise_fp32(
347
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
348
+ ctypes_c_int(blocksize2), ctypes_c_int(df.numel()), CUDA_STREAM,
349
+ )
350
+ df += offset
351
+ absmax = df
352
+
353
+ fx = cgemm_4bit_inference_naive_fp16 if dtype == torch.float16 else \
354
+ cgemm_4bit_inference_naive_bf16
355
+
356
+ blocksize = ctypes_c_int32(blocksize)
357
+ fx(m, n, k, get_ptr(X), get_ptr(W), get_ptr(absmax), get_ptr(stats), get_ptr(out),
358
+ lda, ldb, ldc, blocksize, CUDA_STREAM,)
359
+ pass
360
+
361
+ return out
362
+ pass
363
+ else:
364
+ def fast_gemv(X, W, quant_state, out = None):
365
+ if quant_state is None: return torch.matmul(X, W, out = out)
366
+ # For fast X @ W where seq_len == 1
367
+ # From https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L1469
368
+ _, q_len, hd = X.shape
369
+ # assert(q_len == 1)
370
+
371
+ if type(quant_state) is not list:
372
+ # https://github.com/TimDettmers/bitsandbytes/pull/763/files
373
+ absmax = quant_state.absmax
374
+ shape = quant_state.shape
375
+ dtype = quant_state.dtype
376
+ blocksize = quant_state.blocksize
377
+ stats = quant_state.code
378
+ offset = quant_state.offset
379
+ state2 = quant_state.state2
380
+ absmax2 = state2.absmax
381
+ code2 = state2.code
382
+ blocksize2 = state2.blocksize
383
+ else:
384
+ absmax, shape, dtype, blocksize, compressed_stats, quant_type, stats = quant_state
385
+ offset, state2 = compressed_stats
386
+ absmax2, code2, blocksize2, _, _, _, _ = state2
387
+ pass
388
+ # assert(dtype == X.dtype)
389
+ bout = shape[0]
390
+ device = W.device
391
+
392
+ if out is None:
393
+ out = torch_empty((1, 1, bout,), dtype = dtype, device = device)
394
+ # else:
395
+ # assert(out.shape == (1, 1, bout,))
396
+ # pass
397
+
398
+ n = 1
399
+ m = shape[0]
400
+ k = shape[1]
401
+ lda = shape[0]
402
+ ldc = shape[0]
403
+ ldb = (hd+1)//2
404
+ m = ctypes_c_int32(m)
405
+ n = ctypes_c_int32(n)
406
+ k = ctypes_c_int32(k)
407
+ lda = ctypes_c_int32(lda)
408
+ ldb = ctypes_c_int32(ldb)
409
+ ldc = ctypes_c_int32(ldc)
410
+
411
+ df = torch_empty(absmax.shape, dtype = torch.float32, device = device)
412
+ cdequantize_blockwise_fp32(
413
+ get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
414
+ ctypes_c_int(blocksize2), ctypes_c_int(df.numel()),
415
+ )
416
+ df += offset
417
+ absmax = df
418
+
419
+ fx = cgemm_4bit_inference_naive_fp16 if dtype == torch.float16 else \
420
+ cgemm_4bit_inference_naive_bf16
421
+
422
+ blocksize = ctypes_c_int32(blocksize)
423
+ fx(m, n, k, get_ptr(X), get_ptr(W), get_ptr(absmax), get_ptr(stats), get_ptr(out),
424
+ lda, ldb, ldc, blocksize,)
425
+
426
+ return out
427
+ pass
428
+ pass
429
+
430
+
431
+ def fast_linear_forward(proj, X, temp_lora = None, out = None):
432
+
433
+ W, W_quant, lora_A, lora_B, lora_S, bias = get_lora_parameters_bias(proj)
434
+ bsz, q_len, in_dim = X.shape
435
+ if q_len != 1: return matmul_lora(X, W, W_quant, lora_A, lora_B, lora_S)
436
+
437
+ if W_quant is None:
438
+ out = torch_matmul(X, W.t(), out = out)
439
+ elif bsz == 1 and q_len == 1:
440
+ out = fast_gemv(X, W, W_quant, out = out)
441
+ else:
442
+ W = fast_dequantize(W.t(), W_quant, use_global_buffer = True)
443
+ out = torch_matmul(X, W, out = out)
444
+ pass
445
+
446
+ # Add in LoRA weights
447
+ if lora_A is not None:
448
+ out_dim = out.shape[2]
449
+ dtype = X.dtype
450
+
451
+ if not hasattr(lora_A, "_fast_lora"):
452
+ lora_A._fast_lora = lora_A.to(dtype)
453
+ lora_B._fast_lora = lora_B.to(dtype)
454
+ pass
455
+
456
+ if bsz == 1:
457
+ out = out.view(out_dim)
458
+ temp_lora = torch_mv(lora_A._fast_lora, X.ravel(), out = temp_lora)
459
+ out.addmv_(lora_B._fast_lora, temp_lora, alpha = lora_S)
460
+ else:
461
+ out = out.view(bsz, out_dim)
462
+ temp_lora = torch_mm(X.view(bsz, in_dim), lora_A._fast_lora.t(), out = temp_lora)
463
+ out.addmm_(temp_lora, lora_B._fast_lora.t(), alpha = lora_S)
464
+ pass
465
+ out = out.view(bsz, 1, out_dim)
466
+ pass
467
+
468
+ if bias is not None: out += bias
469
+
470
+ return out
471
+ pass
472
+
473
+
474
+ def matmul_lora(X, W, W_quant, A, B, s, out = None):
475
+ dtype = X.dtype
476
+ W = fast_dequantize(W.t(), W_quant, use_global_buffer = True)
477
+
478
+ if X.dim() == 3:
479
+ batch, seq_len, d = X.shape
480
+ X = X.view(-1, X.shape[-1])
481
+ reshape = True
482
+ else:
483
+ reshape = False
484
+ pass
485
+ out = torch_matmul(X, W, out = out)
486
+ if W_quant is not None: del W
487
+
488
+ if A is not None:
489
+ # LoRA is enabled
490
+ A, B = A.t(), B.t()
491
+ XA = torch_matmul(X, A.to(dtype))
492
+ out.addmm_(XA, B.to(dtype), alpha = s)
493
+ # out += (X @ A.to(dtype)) @ (s * B.to(dtype))
494
+ pass
495
+
496
+ return out.view(batch, seq_len, -1) if reshape else out
497
+ pass
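
Finally, a rough sketch (editorial, not part of the diff) of how `matmul_lora` composes with `get_lora_parameters` for a PEFT-wrapped 4-bit projection; the function name and import path are assumptions.

```python
# Hypothetical example -- proj is assumed to be a peft LoRA layer wrapping a 4-bit base linear.
from unsloth_kernels.utils import get_lora_parameters, matmul_lora  # assumed import path

def lora_forward(proj, X):
    W, W_quant, A, B, s = get_lora_parameters(proj)
    # out = X @ dequant(W).T + s * (X @ A.T) @ B.T   (LoRA term skipped when A is None)
    return matmul_lora(X, W, W_quant, A, B, s)
```
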