Minor changes, big gains -- Huggingface MoE modeling enhancement
#13 by xiaowei4ai
https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py#L282
When using the Hugging Face Transformers library, execute the following code after loading the model to significantly boost the inference performance of the Qwen3 MoE model.
```python
import torch
from torch import nn
import torch.nn.functional as F


class MyMoeSparseMoeBlock(nn.Module):
    """Drop-in replacement for Qwen3MoeSparseMoeBlock that runs the
    activated experts on separate CUDA streams instead of sequentially."""

    def __init__(self, base_moe):
        super().__init__()
        self.base_moe = base_moe
        self.num_experts = base_moe.num_experts
        self.top_k = base_moe.top_k
        self.norm_topk_prob = base_moe.norm_topk_prob
        # gating
        self.gate = base_moe.gate
        self.experts = base_moe.experts

    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # One-hot encode the selected experts to create an expert mask;
        # this is used to easily index which experts will be solicited
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)

        # Only loop over the experts that were actually selected, each on its own CUDA stream
        activated_experts = torch.unique(selected_experts)
        cuda_streams = [torch.cuda.Stream() for _ in activated_experts]
        for expert_idx, cuda_stream in zip(activated_experts, cuda_streams):
            # Make the side stream wait for the router computation queued on the
            # current stream before it reads hidden_states
            cuda_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(cuda_stream):
                expert_layer = self.experts[expert_idx]
                idx, top_x = torch.where(expert_mask[expert_idx])
                # Index the correct hidden states and compute the expert output,
                # scaled by the routing weights of the corresponding tokens
                current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
                current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
                # `index_add_` only supports torch tensors for indexing, so we
                # use the `top_x` tensor here
                final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        # Block until all expert streams have finished before using the result
        torch.cuda.synchronize()
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits
```
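The speedup comes from overlapping the per-expert computations: the stock Hugging Face implementation loops over the activated experts sequentially on the default stream, while this wrapper launches each independent expert GEMM on its own CUDA stream so the GPU can execute them concurrently when occupancy allows. One further refinement, left as-is here, would be to create the streams once in `__init__` rather than on every forward pass, to avoid repeated stream-creation overhead.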
```python
# apply: swap every MoE block for the stream-parallel wrapper
for layer in model.model.layers:
    if type(layer.mlp).__name__ == "Qwen3MoeSparseMoeBlock":
        layer.mlp = MyMoeSparseMoeBlock(layer.mlp)

# recovery: restore the original blocks
# for layer in model.model.layers:
#     if type(layer.mlp).__name__ == "MyMoeSparseMoeBlock":
#         layer.mlp = layer.mlp.base_moe
```
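For context, here is a minimal end-to-end sketch of applying the patch. The checkpoint name `Qwen/Qwen3-30B-A3B` is an assumption for illustration; substitute whichever Qwen3 MoE checkpoint you are using.

```python
# Minimal usage sketch (the checkpoint name is an assumed example)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-30B-A3B"  # assumption: any Qwen3 MoE checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)

# Swap every Qwen3MoeSparseMoeBlock for the stream-parallel wrapper
for layer in model.model.layers:
    if type(layer.mlp).__name__ == "Qwen3MoeSparseMoeBlock":
        layer.mlp = MyMoeSparseMoeBlock(layer.mlp)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```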
I use two H20 GPUs; the performance results are as follows.

[Benchmark screenshots from the original post: "Before change" vs. "After applying the changes"]
Have you tested this with vLLM?
The modeling code for qwen3-moe in vLLM is already parallelized.
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_moe.py#L128
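For reference, serving the same model with vLLM needs no modeling patch, since its fused MoE kernels already run the experts in parallel. A minimal sketch, again assuming the `Qwen/Qwen3-30B-A3B` checkpoint and the two-GPU setup mentioned above:

```python
# Minimal vLLM sketch -- no modeling patch needed (checkpoint name is an assumption)
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen3-30B-A3B", tensor_parallel_size=2)
params = SamplingParams(max_tokens=32)
print(llm.generate(["Hello"], params)[0].outputs[0].text)
```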