Skip to content

Commit 1ac3e98

Browse files
authored
test: cover QMoE prepack-disabled failure
1 parent 10a5199 commit 1ac3e98

1 file changed

Lines changed: 69 additions & 0 deletions

File tree

onnxruntime/test/contrib_ops/moe_test.cc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Licensed under the MIT License.
33

44
#include "gtest/gtest.h"
5+
#include "core/session/onnxruntime_session_options_config_keys.h"
56
#include "test/common/tensor_op_test_utils.h"
67
#include "test/common/cuda_op_test_utils.h"
78
#include "test/providers/provider_test_utils.h"
@@ -778,6 +779,74 @@ TEST(MoETest, MoETest_Mixtral) {
778779
2 /*top_k*/);
779780
}
780781

782+
// Builds a minimal int4 QMoE node (one row, one expert, 128x128) with
// weights_prepacked=0, runs it on the CUDA execution provider with the
// disable-prepacking session option set to "1", and expects the run to fail
// with the message "QMoE weights_prepacked=0 requires PrePack to run".
TEST(MoETest, QMoETest_CUDA_Int4_DisablePrepackingFailsLoudly) {
  // Skip when HasCudaEnvironment rejects the minimum architecture value used here.
  constexpr int min_cuda_arch = 700;
  if (!HasCudaEnvironment(min_cuda_arch)) {
    GTEST_SKIP() << "CUDA execution provider not available";
  }

  // Skip as well when no CUDA provider instance can be created.
  auto cuda_provider = DefaultCudaExecutionProvider();
  if (!cuda_provider) {
    GTEST_SKIP() << "CUDA execution provider not available";
  }

  // Problem geometry.
  constexpr int64_t rows = 1;
  constexpr int64_t experts = 1;
  constexpr int64_t hidden = 128;
  constexpr int64_t inter = 128;
  constexpr int64_t weight_bits = 4;
  // Number of quantized values stored in each uint8 element (two for 4-bit weights).
  constexpr int64_t values_per_byte = 8 / weight_bits;
  constexpr int64_t fc1_packed_cols = hidden / values_per_byte;
  constexpr int64_t fc2_packed_cols = inter / values_per_byte;

  // Tensor shapes.
  const std::vector<int64_t> input_shape = {rows, hidden};
  const std::vector<int64_t> router_shape = {rows, experts};
  const std::vector<int64_t> fc1_weight_shape = {experts, inter, fc1_packed_cols};
  const std::vector<int64_t> fc2_weight_shape = {experts, hidden, fc2_packed_cols};
  const std::vector<int64_t> fc1_scale_shape = {experts, inter};
  const std::vector<int64_t> fc2_scale_shape = {experts, hidden};
  const std::vector<int64_t> output_shape = {rows, hidden};

  // Tensor contents: zero activations and weights, unit router probabilities and scales.
  const std::vector<float> input_values(rows * hidden, 0.0f);
  const std::vector<float> router_values(rows * experts, 1.0f);
  const std::vector<uint8_t> fc1_weight_values(experts * inter * fc1_packed_cols, 0);
  const std::vector<uint8_t> fc2_weight_values(experts * hidden * fc2_packed_cols, 0);
  const std::vector<float> fc1_scale_values(experts * inter, 1.0f);
  const std::vector<float> fc2_scale_values(experts * hidden, 1.0f);
  // Placeholder expected output; the run below is expected to fail.
  const std::vector<float> placeholder_output(rows * hidden, 0.0f);

  // Request a session with prepacking disabled.
  SessionOptions options;
  options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "1";

  OpTester tester("QMoE", 1, onnxruntime::kMSDomain);

  tester.AddAttribute<int64_t>("k", 1);
  tester.AddAttribute<std::string>("activation_type", "identity");
  tester.AddAttribute<int64_t>("normalize_routing_weights", 1);
  tester.AddAttribute<int64_t>("expert_weight_bits", weight_bits);
  tester.AddAttribute<std::string>("quant_type", "int");
  // The node declares its weights as not prepacked.
  tester.AddAttribute<int64_t>("weights_prepacked", 0);

  // Inputs are positional, so the order below (including omitted optional edges) matters.
  // NOTE(review): the omitted edges are presumably the bias and fc3 slots - confirm against the QMoE schema.
  tester.AddInput<MLFloat16>("input", input_shape, ToFloat16(input_values));
  tester.AddInput<MLFloat16>("router_probs", router_shape, ToFloat16(router_values));
  tester.AddInput<uint8_t>("fc1_experts_weights", fc1_weight_shape, fc1_weight_values);
  tester.AddInput<MLFloat16>("fc1_scales", fc1_scale_shape, ToFloat16(fc1_scale_values));
  tester.AddOptionalInputEdge<MLFloat16>();
  tester.AddInput<uint8_t>("fc2_experts_weights", fc2_weight_shape, fc2_weight_values);
  tester.AddInput<MLFloat16>("fc2_scales", fc2_scale_shape, ToFloat16(fc2_scale_values));
  tester.AddOptionalInputEdge<MLFloat16>();
  tester.AddOptionalInputEdge<uint8_t>();
  tester.AddOptionalInputEdge<MLFloat16>();
  tester.AddOptionalInputEdge<MLFloat16>();

  tester.AddOutput<MLFloat16>("output", output_shape, ToFloat16(placeholder_output));

  // Run only on the CUDA provider and require the explicit failure message.
  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(cuda_provider));
  tester.Run(options,
             OpTester::ExpectResult::kExpectFailure,
             "QMoE weights_prepacked=0 requires PrePack to run",
             {},
             nullptr,
             &providers);
}
849+
781850
TEST(MoETest, QMoETest_Mixtral_Int4) {
782851
// This test uses FC3 (gated SiLU / Mixtral pattern) with dimensions too small for the
783852
// CUTLASS kernel (needs hidden_size >= 128, inter_size >= 128). CPU QMoE does not

0 commit comments

Comments
 (0)