microsoft · tianleiwu · Jun 19, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
@@ -76,6 +76,9 @@ def check_extra_options(kv_pairs, execution_provider):
         "hf_remote",
         "disable_qkv_fusion",
         "prune_lm_head",
+        "last_matmul_weight_int8",
+        "int8_mixed_layers",
+        "int8_linear_attn",
     ]
     for key in bools:
         if key in kv_pairs:
@@ -425,15 +428,22 @@ def get_args():
                 int4_nodes_to_exclude = Specify nodes to exclude from int4 quantization.
                     Use this option when you want to exclude certain nodes from being quantized.
                     Separate the node names with a ',' when passing them here (e.g. int4_nodes_to_exclude=/lm_head/MatMul,/model/embed_tokens/Gather)
-                int4_algo_config = Method for int4 quantization. Default is 'default'.
-                    Currently supported options are: 'default', 'rtn', 'rtn_last', 'k_quant', 'k_quant_mixed', 'k_quant_last', 'k_quant_linear'.
+                int4_algo_config = Base method for int4 quantization. Default is 'default'.
+                    Currently supported base methods are: 'default', 'rtn', 'k_quant'.
                     default = algo_config passed to MatMulNBitsQuantizer is None. Quantizer uses default RTN algorithm. All MatMuls are quantized as int4 (node naming conventions differ from `rtn`).
                     rtn = RTN algorithm for int4 quantization.
-                    rtn_last = RTN algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
                     k_quant = k_quant algorithm for int4 quantization.
-                    k_quant_mixed = k_quant algorithm with mixed precision (int4 + int8).
-                    k_quant_last = k_quant algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
-                    k_quant_linear = k_quant algorithm with linear attention layer projections and MLPs promoted to int8 (for hybrid attention models like Qwen3.5).
+                    The following legacy compound values are still accepted as aliases (base method + int8 placement flags):
+                    rtn_last = rtn + last_matmul_weight_int8=true.
+                    k_quant_last = k_quant + last_matmul_weight_int8=true.
+                    k_quant_mixed = k_quant + last_matmul_weight_int8=true + int8_mixed_layers=true.
+                    k_quant_linear = k_quant + last_matmul_weight_int8=true + int8_linear_attn=true.
+                last_matmul_weight_int8 = Quantize the last MatMul (e.g. /lm_head/MatMul) as int8 instead of int4. Default is false.
+                    Orthogonal to int4_algo_config; can be combined with any base method ('default', 'rtn', 'k_quant').
+                int8_mixed_layers = Promote the most quantization-sensitive MatMuls (llama.cpp mixed strategy: first/last eighth of layers plus every third layer's qkv_proj/v_proj/down_proj) to int8. Default is false.
+                    Orthogonal to int4_algo_config; can be combined with any base method.
+                int8_linear_attn = Promote linear-attention projections and their MLPs to int8 (for hybrid attention models like Qwen3.5). Default is false.
+                    Orthogonal to int4_algo_config; can be combined with any base method.
                 shared_embeddings = Enable weight sharing between embedding and LM head layers. Default is false.
                     Use this option to share weights and reduce model size by eliminating duplicate weights.
                     For quantized models (INT4/UINT4): Shares quantized weights using GatherBlockQuantized. Only works with rtn and k_quant algorithms, and cannot be used if LM head is excluded.