Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,18 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
// to the runner.
const bool int_weights_consumed_by_prepack =
is_int && !weights_prepacked_ && packed_fc1_weights_ != nullptr && packed_fc2_weights_ != nullptr;
// When ``weights_prepacked == 0`` the raw ``[E, N, K/pack]`` int weights must be
// converted to the CUTLASS fpA_intB layout by PrePack before the runner can consume
// them. If PrePack never ran (e.g. ``session.disable_prepacking`` is set), the prepack
// buffers stay null and falling through to the raw initializer pointers would feed
// non-CUTLASS bytes to the runner, producing silently wrong output. Fail loudly instead.
if (is_int && !weights_prepacked_ &&
Comment thread
tianleiwu marked this conversation as resolved.
(packed_fc1_weights_ == nullptr || packed_fc2_weights_ == nullptr)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"QMoE weights_prepacked=0 requires PrePack to run, but the int weight "
"buffers were not produced (is session.disable_prepacking set?). Provide "
"CUTLASS-prepacked weights with weights_prepacked=1, or enable prepacking.");
Comment thread
justinchuby marked this conversation as resolved.
}
const Tensor* fc1_experts_weights = int_weights_consumed_by_prepack ? nullptr : context->Input<Tensor>(2);
const Tensor* fc1_scales = (is_int && !packed_fc1_scales_) ? context->Input<Tensor>(3) : nullptr;
const Tensor* fc1_experts_bias_optional = context->Input<Tensor>(4);
Expand Down Expand Up @@ -854,8 +866,19 @@ Status QMoE::ComputeInternal(OpKernelContext* context) const {
const void* fc1_weight_data = fc1_experts_weights ? fc1_experts_weights->DataRaw() : nullptr;
const void* fc2_weight_data = fc2_experts_weights ? fc2_experts_weights->DataRaw() : nullptr;
if (is_wfp4afp8 && !use_wfp4afp8_dequant_fallback_) {
fc1_weight_data = packed_fp4_fc1_weights_ ? packed_fp4_fc1_weights_.get() : fc1_weight_data;
fc2_weight_data = packed_fp4_fc2_weights_ ? packed_fp4_fc2_weights_.get() : fc2_weight_data;
// The native CUTLASS WFP4AFP8 path consumes weights in the repacked FP4
// layout produced by PrePack. If PrePack never ran (e.g.
// ``session.disable_prepacking`` is set) the repacked buffers stay null and
// falling through to the raw initializer bytes would feed a non-CUTLASS
// layout to the runner, producing silently wrong output. Fail loudly.
if (packed_fp4_fc1_weights_ == nullptr || packed_fp4_fc2_weights_ == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"QMoE wfp4afp8 requires PrePack to run, but the repacked FP4 weight "
"buffers were not produced (is session.disable_prepacking set?). "
"Enable prepacking to use the native WFP4AFP8 path.");
}
fc1_weight_data = packed_fp4_fc1_weights_.get();
fc2_weight_data = packed_fp4_fc2_weights_.get();
} else if (int_weights_consumed_by_prepack) {
// PrePack converted the raw int4/int8 weights to the CUTLASS fpA_intB
// layout that the runner consumes and freed the source initializer
Expand Down
69 changes: 69 additions & 0 deletions onnxruntime/test/contrib_ops/moe_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include "gtest/gtest.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "test/common/tensor_op_test_utils.h"
#include "test/common/cuda_op_test_utils.h"
#include "test/providers/provider_test_utils.h"
Expand Down Expand Up @@ -778,6 +779,74 @@ TEST(MoETest, MoETest_Mixtral) {
2 /*top_k*/);
}

// Runs an int4 QMoE node declared with weights_prepacked=0 in a session whose
// prepacking is disabled, and expects Run to fail with the
// "QMoE weights_prepacked=0 requires PrePack to run" error message.
TEST(MoETest, QMoETest_CUDA_Int4_DisablePrepackingFailsLoudly) {
  constexpr int kMinCudaArch = 700;
  if (!HasCudaEnvironment(kMinCudaArch)) {
    GTEST_SKIP() << "CUDA execution provider not available";
  }

  auto cuda_provider = DefaultCudaExecutionProvider();
  if (!cuda_provider) {
    GTEST_SKIP() << "CUDA execution provider not available";
  }

  // Problem sizes: one token routed to one expert.
  constexpr int64_t kRows = 1;
  constexpr int64_t kExperts = 1;
  constexpr int64_t kHidden = 128;
  constexpr int64_t kInter = 128;
  constexpr int64_t kWeightBits = 4;
  // Number of quantized values stored in each uint8 weight element.
  constexpr int64_t kPack = 8 / kWeightBits;

  // Tensor shapes, in the order the inputs are registered below.
  const std::vector<int64_t> input_shape = {kRows, kHidden};
  const std::vector<int64_t> router_shape = {kRows, kExperts};
  const std::vector<int64_t> fc1_weight_shape = {kExperts, kInter, kHidden / kPack};
  const std::vector<int64_t> fc1_scale_shape = {kExperts, kInter};
  const std::vector<int64_t> fc2_weight_shape = {kExperts, kHidden, kInter / kPack};
  const std::vector<int64_t> fc2_scale_shape = {kExperts, kHidden};
  const std::vector<int64_t> output_shape = {kRows, kHidden};

  // Constant-filled tensor contents matching the shapes above.
  const std::vector<float> input_values(kRows * kHidden, 0.0f);
  const std::vector<float> router_values(kRows * kExperts, 1.0f);
  const std::vector<uint8_t> fc1_weight_values(kExperts * kInter * (kHidden / kPack), 0);
  const std::vector<float> fc1_scale_values(kExperts * kInter, 1.0f);
  const std::vector<uint8_t> fc2_weight_values(kExperts * kHidden * (kInter / kPack), 0);
  const std::vector<float> fc2_scale_values(kExperts * kHidden, 1.0f);
  // Output values are a placeholder; the run is expected to fail.
  const std::vector<float> placeholder_output(kRows * kHidden, 0.0f);

  OpTester cuda_tester("QMoE", 1, onnxruntime::kMSDomain);

  // Node attributes.
  cuda_tester.AddAttribute<int64_t>("k", 1);
  cuda_tester.AddAttribute<std::string>("activation_type", "identity");
  cuda_tester.AddAttribute<int64_t>("normalize_routing_weights", 1);
  cuda_tester.AddAttribute<int64_t>("expert_weight_bits", kWeightBits);
  cuda_tester.AddAttribute<std::string>("quant_type", "int");
  cuda_tester.AddAttribute<int64_t>("weights_prepacked", 0);

  // Inputs are positional, so the registration order (including the omitted
  // optional edges) must stay exactly as listed here.
  cuda_tester.AddInput<MLFloat16>("input", input_shape, ToFloat16(input_values));
  cuda_tester.AddInput<MLFloat16>("router_probs", router_shape, ToFloat16(router_values));
  cuda_tester.AddInput<uint8_t>("fc1_experts_weights", fc1_weight_shape, fc1_weight_values);
  cuda_tester.AddInput<MLFloat16>("fc1_scales", fc1_scale_shape, ToFloat16(fc1_scale_values));
  cuda_tester.AddOptionalInputEdge<MLFloat16>();
  cuda_tester.AddInput<uint8_t>("fc2_experts_weights", fc2_weight_shape, fc2_weight_values);
  cuda_tester.AddInput<MLFloat16>("fc2_scales", fc2_scale_shape, ToFloat16(fc2_scale_values));
  cuda_tester.AddOptionalInputEdge<MLFloat16>();
  cuda_tester.AddOptionalInputEdge<uint8_t>();
  cuda_tester.AddOptionalInputEdge<MLFloat16>();
  cuda_tester.AddOptionalInputEdge<MLFloat16>();
  cuda_tester.AddOutput<MLFloat16>("output", output_shape, ToFloat16(placeholder_output));

  // Disable prepacking for this session via the session config entry.
  SessionOptions session_options;
  session_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "1";

  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(cuda_provider));

  cuda_tester.Run(session_options,
                  OpTester::ExpectResult::kExpectFailure,
                  "QMoE weights_prepacked=0 requires PrePack to run",
                  {},
                  nullptr,
                  &providers);
}

TEST(MoETest, QMoETest_Mixtral_Int4) {
// This test uses FC3 (gated SiLU / Mixtral pattern) with dimensions too small for the
// CUTLASS kernel (needs hidden_size >= 128, inter_size >= 128). CPU QMoE does not
Expand Down
Loading