Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions vllm/model_executor/layers/fused_moe/hybrid_w4a16_moe_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Shared setup for the HIP HybridW4A16 MoE path.

Used by both `CompressedTensorsWNA16MoEMethod` and `INCHybridW4A16MoEMethod`
to convert GPTQ-packed `[E, K//8, N]` int32 weights into the ExLlama-shuffled
`[E, N, K//8]` int32 layout consumed by `fused_moe_wvSplitK_int4_gemm`
(`csrc/rocm/skinny_gemms_int4.cu`), and to install the matching
`HybridW4A16MoEExperts` modular kernel on the method.
"""

import torch

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.utils import replace_parameter


def setup_hybrid_w4a16_moe(method, layer: torch.nn.Module) -> None:
    """Prepare `layer` and `method` for the HybridW4A16 MoE kernel.

    The function mutates both arguments in place and returns nothing:

    * `layer.w13_weight_packed` / `layer.w2_weight_packed` are re-packed
      expert by expert (unpack along dim 0, transpose, ExLlama shuffle) and
      swapped in through `replace_parameter`. Per the caller contract they
      arrive as int32 GPTQ `[E, K//8, N]` tensors.
    * `layer.w13_weight_scale` / `layer.w2_weight_scale` are replaced by
      contiguous copies with dims 1 and 2 swapped (`[E, K//G, N]` on entry,
      per the caller contract).
    * `layer.use_hybrid_w4a16_moe` is set to `True`, and `layer.w13_weight` /
      `layer.w2_weight` are pointed at the re-packed parameters.
    * `method.moe_quant_config` and `method.moe_kernel` are (re)built.

    Args:
        method: Quantization method object. It must expose `.moe` and
            `.get_fused_moe_quant_config(layer)`; `.moe_quant_config` and
            `.moe_kernel` are assigned here.
        layer: The MoE layer holding the packed weights and scales listed
            above as parameters.

    Raises:
        AssertionError: If no quant config or no prepare/finalize object
            could be obtained.
    """
    # Imports stay function-local, exactly as in the shared call sites.
    from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import (
        pack_int4_exllama_shuffle,
    )
    from vllm.model_executor.layers.fused_moe.all2all_utils import (
        maybe_make_prepare_finalize,
    )
    from vllm.model_executor.layers.fused_moe.hybrid_w4a16_moe import (
        HybridW4A16MoEExperts,
    )
    from vllm.model_executor.layers.quantization.utils.quant_utils import (
        unpack_quantized_values_into_int32,
    )
    from vllm.scalar_type import scalar_types

    uint4_type = scalar_types.uint4

    def repack_expert(expert_packed: torch.Tensor) -> torch.Tensor:
        # One expert: unpack the int4 values along dim 0, transpose, and
        # pack again with the ExLlama shuffle.
        unpacked = unpack_quantized_values_into_int32(
            expert_packed, uint4_type, packed_dim=0
        )
        return pack_int4_exllama_shuffle(unpacked.t().contiguous())

    def convert_weights(w_packed: torch.Tensor) -> torch.Tensor:
        # Experts are converted one at a time and stacked back on dim 0.
        return torch.stack(
            [repack_expert(w_packed[idx]) for idx in range(w_packed.size(0))]
        )

    # Packed weights: w13 first, then w2.
    for weight_name in ("w13_weight_packed", "w2_weight_packed"):
        converted = convert_weights(getattr(layer, weight_name))
        replace_parameter(
            layer,
            weight_name,
            torch.nn.Parameter(converted, requires_grad=False),
        )

    # Scales: swap dims 1 and 2 and re-register as frozen parameters.
    for scale_name in ("w13_weight_scale", "w2_weight_scale"):
        transposed = getattr(layer, scale_name).transpose(1, 2).contiguous()
        setattr(
            layer,
            scale_name,
            torch.nn.Parameter(transposed, requires_grad=False),
        )

    layer.use_hybrid_w4a16_moe = True

    # The quant config is derived only after the layer tensors were rewritten.
    quant_config = method.get_fused_moe_quant_config(layer)
    method.moe_quant_config = quant_config
    assert quant_config is not None

    # Expose the packed tensors under the generic weight attribute names.
    layer.w13_weight = layer.w13_weight_packed
    layer.w2_weight = layer.w2_weight_packed

    # Build the modular kernel directly on the method, pairing the
    # prepare/finalize stage with the hybrid experts implementation.
    moe_config = method.moe
    prepare_finalize = maybe_make_prepare_finalize(
        moe=moe_config,
        quant_config=quant_config,
        routing_tables=layer._maybe_init_expert_routing_tables(),
        allow_new_interface=True,
        use_monolithic=False,
    )
    assert prepare_finalize is not None

    hybrid_experts = HybridW4A16MoEExperts(
        moe_config=moe_config, quant_config=quant_config
    )
    method.moe_kernel = mk.FusedMoEKernel(
        prepare_finalize,
        hybrid_experts,
        shared_experts=None,
        inplace=not moe_config.disable_inplace,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa E501
CompressedTensorsMoEMethod,
)
from vllm.model_executor.utils import replace_parameter, set_weight_attrs
from vllm.model_executor.utils import set_weight_attrs

logger = init_logger(__name__)

Expand Down Expand Up @@ -202,94 +202,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
)

def _process_weights_hybrid_w4a16(self, layer: torch.nn.Module) -> None:
"""Hybrid W4A16 MoE path: convert GPTQ [E, K/8, N] -> skinny
[E, N, K//8] int32 (ExLlama shuffle) and transpose scales to
[E, N, K//G].

For symmetric quantization (bias=8), zero_points are not needed;
the HIP skinny kernel uses HAS_ZERO_POINTS=false with hardcoded
bias=8, and the Triton kernel uses ZP_BIAS=8.
"""
from vllm.model_executor.kernels.linear.mixed_precision.hybrid_w4a16 import (
pack_int4_exllama_shuffle,
from vllm.model_executor.layers.fused_moe.hybrid_w4a16_moe_helper import (
setup_hybrid_w4a16_moe,
)
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.hybrid_w4a16_moe import (
HybridW4A16MoEExperts,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
unpack_quantized_values_into_int32,
)
from vllm.scalar_type import scalar_types

wtype = scalar_types.uint4

def convert_weights(w_packed: torch.Tensor) -> torch.Tensor:
"""Convert [E, K/8, N] GPTQ -> [E, N, K//8] skinny
(ExLlama shuffle)."""
E_dim = w_packed.size(0)
experts = []
for e in range(E_dim):
unpacked = unpack_quantized_values_into_int32(
w_packed[e], wtype, packed_dim=0
)
unpacked_t = unpacked.t().contiguous()
repacked = pack_int4_exllama_shuffle(unpacked_t)
experts.append(repacked)
return torch.stack(experts)

replace_parameter(
layer,
"w13_weight_packed",
torch.nn.Parameter(
convert_weights(layer.w13_weight_packed), requires_grad=False
),
)
replace_parameter(
layer,
"w2_weight_packed",
torch.nn.Parameter(
convert_weights(layer.w2_weight_packed), requires_grad=False
),
)

layer.w13_weight_scale = torch.nn.Parameter(
layer.w13_weight_scale.transpose(1, 2).contiguous(),
requires_grad=False,
)
layer.w2_weight_scale = torch.nn.Parameter(
layer.w2_weight_scale.transpose(1, 2).contiguous(),
requires_grad=False,
)

layer.use_hybrid_w4a16_moe = True

self.moe_quant_config = self.get_fused_moe_quant_config(layer)
assert self.moe_quant_config is not None
layer.w13_weight = layer.w13_weight_packed
layer.w2_weight = layer.w2_weight_packed

# Build the modular kernel directly so the runner uses the hybrid
# experts even on single-GPU deployments (no DP/EP), where the
# legacy select_gemm_impl path is not invoked.
prepare_finalize = maybe_make_prepare_finalize(
moe=self.moe,
quant_config=self.moe_quant_config,
routing_tables=layer._maybe_init_expert_routing_tables(),
allow_new_interface=True,
use_monolithic=False,
)
assert prepare_finalize is not None
self.moe_kernel = mk.FusedMoEKernel(
prepare_finalize,
HybridW4A16MoEExperts(
moe_config=self.moe, quant_config=self.moe_quant_config
),
shared_experts=None,
inplace=not self.moe.disable_inplace,
)
setup_hybrid_w4a16_moe(self, layer)

def get_fused_moe_quant_config(
self, layer: torch.nn.Module
Expand Down
42 changes: 28 additions & 14 deletions vllm/model_executor/layers/quantization/inc.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,14 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
if isinstance(layer, FusedMoE):
if use_marlin:
return AWQMarlinMoEMethod(quant_args_marlin, layer.moe_config)
from vllm.model_executor.layers.quantization.inc_moe import (
INCHybridW4A16MoEMethod,
can_use_hybrid_w4a16_moe,
)

if can_use_hybrid_w4a16_moe(weight_bits, group_size, sym):
return INCHybridW4A16MoEMethod(layer.moe_config, group_size)

from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config

config = {
Expand Down Expand Up @@ -388,21 +396,27 @@ def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
if isinstance(layer, FusedMoE):
if use_marlin:
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config)
else:
from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config,
)

config = {
"quant_method": "gptq",
"bits": weight_bits,
"group_size": group_size,
"sym": sym,
"lm_head": False,
}
return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix
)
from vllm.model_executor.layers.quantization.inc_moe import (
INCHybridW4A16MoEMethod,
can_use_hybrid_w4a16_moe,
)

if can_use_hybrid_w4a16_moe(weight_bits, group_size, sym):
return INCHybridW4A16MoEMethod(layer.moe_config, group_size)

from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config,
)

config = {
"quant_method": "gptq",
"bits": weight_bits,
"group_size": group_size,
"sym": sym,
"lm_head": False,
}
return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix)

if isinstance(layer, (LinearBase, ParallelLMHead)):
if use_marlin:
Expand Down
Loading
Loading