|
2 | 2 | // Licensed under the MIT License. |
3 | 3 |
|
4 | 4 | #include "gtest/gtest.h" |
| 5 | +#include "core/session/onnxruntime_session_options_config_keys.h" |
5 | 6 | #include "test/common/tensor_op_test_utils.h" |
6 | 7 | #include "test/common/cuda_op_test_utils.h" |
7 | 8 | #include "test/providers/provider_test_utils.h" |
@@ -778,6 +779,74 @@ TEST(MoETest, MoETest_Mixtral) { |
778 | 779 | 2 /*top_k*/); |
779 | 780 | } |
780 | 781 |
|
| 782 | +TEST(MoETest, QMoETest_CUDA_Int4_DisablePrepackingFailsLoudly) { |
| 783 | + constexpr int min_cuda_arch = 700; |
| 784 | + if (!HasCudaEnvironment(min_cuda_arch)) { |
| 785 | + GTEST_SKIP() << "CUDA execution provider not available"; |
| 786 | + } |
| 787 | + |
| 788 | + auto cuda_ep = DefaultCudaExecutionProvider(); |
| 789 | + if (!cuda_ep) { |
| 790 | + GTEST_SKIP() << "CUDA execution provider not available"; |
| 791 | + } |
| 792 | + |
| 793 | + constexpr int64_t num_rows = 1; |
| 794 | + constexpr int64_t num_experts = 1; |
| 795 | + constexpr int64_t hidden_size = 128; |
| 796 | + constexpr int64_t inter_size = 128; |
| 797 | + constexpr int64_t expert_weight_bits = 4; |
| 798 | + constexpr int64_t pack_size = 8 / expert_weight_bits; |
| 799 | + |
| 800 | + const std::vector<float> input(num_rows * hidden_size, 0.0f); |
| 801 | + const std::vector<float> router_probs(num_rows * num_experts, 1.0f); |
| 802 | + const std::vector<uint8_t> fc1_experts_weights(num_experts * inter_size * (hidden_size / pack_size), 0); |
| 803 | + const std::vector<uint8_t> fc2_experts_weights(num_experts * hidden_size * (inter_size / pack_size), 0); |
| 804 | + const std::vector<float> fc1_scales(num_experts * inter_size, 1.0f); |
| 805 | + const std::vector<float> fc2_scales(num_experts * hidden_size, 1.0f); |
| 806 | + const std::vector<float> dummy_output(num_rows * hidden_size, 0.0f); |
| 807 | + |
| 808 | + OpTester cuda_tester("QMoE", 1, onnxruntime::kMSDomain); |
| 809 | + cuda_tester.AddAttribute<int64_t>("k", 1); |
| 810 | + cuda_tester.AddAttribute<std::string>("activation_type", "identity"); |
| 811 | + cuda_tester.AddAttribute<int64_t>("normalize_routing_weights", 1); |
| 812 | + cuda_tester.AddAttribute<int64_t>("expert_weight_bits", expert_weight_bits); |
| 813 | + cuda_tester.AddAttribute<std::string>("quant_type", "int"); |
| 814 | + cuda_tester.AddAttribute<int64_t>("weights_prepacked", 0); |
| 815 | + |
| 816 | + const std::vector<int64_t> input_dims = {num_rows, hidden_size}; |
| 817 | + const std::vector<int64_t> router_probs_dims = {num_rows, num_experts}; |
| 818 | + const std::vector<int64_t> fc1_experts_weights_dims = {num_experts, inter_size, hidden_size / pack_size}; |
| 819 | + const std::vector<int64_t> fc2_experts_weights_dims = {num_experts, hidden_size, inter_size / pack_size}; |
| 820 | + const std::vector<int64_t> fc1_scales_dims = {num_experts, inter_size}; |
| 821 | + const std::vector<int64_t> fc2_scales_dims = {num_experts, hidden_size}; |
| 822 | + const std::vector<int64_t> output_dims = {num_rows, hidden_size}; |
| 823 | + |
| 824 | + cuda_tester.AddInput<MLFloat16>("input", input_dims, ToFloat16(input)); |
| 825 | + cuda_tester.AddInput<MLFloat16>("router_probs", router_probs_dims, ToFloat16(router_probs)); |
| 826 | + cuda_tester.AddInput<uint8_t>("fc1_experts_weights", fc1_experts_weights_dims, fc1_experts_weights); |
| 827 | + cuda_tester.AddInput<MLFloat16>("fc1_scales", fc1_scales_dims, ToFloat16(fc1_scales)); |
| 828 | + cuda_tester.AddOptionalInputEdge<MLFloat16>(); |
| 829 | + cuda_tester.AddInput<uint8_t>("fc2_experts_weights", fc2_experts_weights_dims, fc2_experts_weights); |
| 830 | + cuda_tester.AddInput<MLFloat16>("fc2_scales", fc2_scales_dims, ToFloat16(fc2_scales)); |
| 831 | + cuda_tester.AddOptionalInputEdge<MLFloat16>(); |
| 832 | + cuda_tester.AddOptionalInputEdge<uint8_t>(); |
| 833 | + cuda_tester.AddOptionalInputEdge<MLFloat16>(); |
| 834 | + cuda_tester.AddOptionalInputEdge<MLFloat16>(); |
| 835 | + cuda_tester.AddOutput<MLFloat16>("output", output_dims, ToFloat16(dummy_output)); |
| 836 | + |
| 837 | + SessionOptions session_options; |
| 838 | + session_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "1"; |
| 839 | + |
| 840 | + std::vector<std::unique_ptr<IExecutionProvider>> cuda_execution_providers; |
| 841 | + cuda_execution_providers.push_back(std::move(cuda_ep)); |
| 842 | + cuda_tester.Run(session_options, |
| 843 | + OpTester::ExpectResult::kExpectFailure, |
| 844 | + "QMoE weights_prepacked=0 requires PrePack to run", |
| 845 | + {}, |
| 846 | + nullptr, |
| 847 | + &cuda_execution_providers); |
| 848 | +} |
| 849 | + |
781 | 850 | TEST(MoETest, QMoETest_Mixtral_Int4) { |
782 | 851 | // This test uses FC3 (gated SiLU / Mixtral pattern) with dimensions too small for the |
783 | 852 | // CUTLASS kernel (needs hidden_size >= 128, inter_size >= 128). CPU QMoE does not |
|
0 commit comments