sm12x: per-token early-loop-exit on sparse MLA accumulate inner candidate loop

jasl · jasl · commit ad68cb35e7fb · 2026-05-15T19:35:00.000+08:00
Redesigned suggestion #3 from PR vllm-project#41834 comment 4450901180. The first attempt (e34daef, reverted; later 72a5ff2, also reverted) tried to truncate ``topk_indices.shape[1]`` in Python so that the captured launches iterated over a narrower combined slice. That approach broke under cudagraph replay (the shape is baked in at capture time) and was *also* mis-bounded: the combine kernel writes each token's combined buffer as ``[topk_len_t | swa_len_t | -1 padding]``, with the SWA entries *immediately* following the per-token topk entries, so a fixed ``effective_topk`` cap cuts off the SWA portion (GSM8K dropped 25 pp on the prior attempt).

The kernel already loads the per-token combined length (``valid_len = tl.load(lens_ptr + token_idx)`` for the four ``lens``-gated kernels, ``gather_len`` for the paged kernels). The existing ``is_valid`` guard only short-circuits the *heavy* work past that length; the outer ``for candidate_idx in range(0, num_candidates)`` loop still pays one ``tl.load`` + branch per iteration on the dead tail. Capping the loop at ``min(num_candidates, valid_len - candidate_offset)`` (clamped to 0) removes those wasted iterations while preserving the existing ``is_valid`` semantics: the iterations we now skip are exactly those the existing guard already discarded.
Applied to six accumulate kernels in ``sparse_mla_kernels.py``:

- ``_accumulate_gathered_attention_chunk_kernel``
- ``_accumulate_indexed_attention_chunk_kernel`` [autotuned in #1]
- ``_accumulate_fp8ds_global_slots_attention_chunk_kernel`` [autotuned in #1]
- ``_accumulate_fp8ds_global_slots_attention_chunk_multihead_kernel`` [decode]
- ``_accumulate_fp8ds_paged_attention_chunk_kernel`` [autotuned in #1]
- ``_accumulate_fp8ds_paged_attention_chunk_multihead_kernel`` [decode]

The diff below additionally applies the same ``gather_len``-based loop cap to ``_fp8ds_paged_attention_with_sink_multihead_kernel``, for seven modified kernels in total.

CUDA-graph safety: ``lens_ptr`` / ``gather_lens_ptr`` are stable addresses; their values are refreshed per call by the metadata builder (outside the captured forward) and by ``combine_topk_swa_indices`` (inside the forward, but writing only into the persistent buffers that the accumulate kernels read from). The kernel inner-loop bound is a runtime-loaded scalar, so Triton compiles a dynamic loop and the captured launch picks up the current value on each replay.

Savings scale with ``combined_topk_buffer_width - actual valid length`` (i.e. they are mostly visible at long ``max_model_len`` with shorter actual contexts). At our test shape (``max_model_len=131072``, ISL=2048) the saved iterations come mostly from the decode multihead path; the change is expected to be neutral (no regression) at short ``max_model_len``, where the bound equals ``num_candidates``.

Signed-off-by: jasl <jasl9187@hotmail.com>
diff --git a/vllm/v1/attention/backends/mla/sparse_mla_kernels.py b/vllm/v1/attention/backends/mla/sparse_mla_kernels.py
@@ -1128,8 +1128,13 @@ def _accumulate_gathered_attention_chunk_kernel(
     running_denom = tl.load(denom_ptr + state_offset)
     running_acc = tl.load(acc_ptr + acc_offset, mask=dim_mask, other=0.0).to(tl.float32)
     valid_len = tl.load(lens_ptr + token_idx)
+    # Per-token early-loop-exit (see indexed kernel comment).
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(valid_len - candidate_offset, 0),
+    )
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         is_valid = (candidate_offset + candidate_idx) < valid_len
         if HAS_SLOT_IDS:
             slot_id = tl.load(
@@ -1289,8 +1294,21 @@ def _accumulate_indexed_attention_chunk_kernel(
     running_denom = tl.load(denom_ptr + state_offset)
     running_acc = tl.load(acc_ptr + acc_offset, mask=dim_mask, other=0.0).to(tl.float32)
     valid_len = tl.load(lens_ptr + token_idx)
+    # Per-token early-loop-exit: the combine_topk_swa_indices kernel writes
+    # ``[topk_len_t | swa_len_t | -1 padding]`` and stores
+    # ``lens[t] = topk_len_t + swa_len_t``. The existing ``is_valid`` guard
+    # already gates the heavy work past ``valid_len``, but the outer loop
+    # still iterates the full ``num_candidates`` (= chunk width). Capping
+    # the loop at ``min(num_candidates, valid_len - candidate_offset)``
+    # saves the per-iteration index load + compare overhead on the dead
+    # tail. CUDA-graph-safe because ``lens_ptr`` is a stable address and
+    # the loaded value updates per call from the metadata builder.
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(valid_len - candidate_offset, 0),
+    )
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         kv_index = tl.load(
             indices_ptr
             + token_idx * stride_indices_t
@@ -1445,12 +1463,17 @@ def _accumulate_fp8ds_global_slots_attention_chunk_kernel(
     running_denom = tl.load(denom_ptr + state_offset)
     running_acc = tl.load(acc_ptr + acc_offset, mask=dim_mask, other=0.0).to(tl.float32)
     valid_len = tl.load(lens_ptr + token_idx)
+    # Per-token early-loop-exit (see indexed kernel comment).
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(valid_len - candidate_offset, 0),
+    )
 
     fp8_mask = offsets < fp8_dim
     rope_mask = (offsets >= fp8_dim) & dim_mask
     rope_offsets = tl.maximum(offsets - fp8_dim, 0)
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         slot_id = tl.load(
             slot_ids_ptr + token_idx * stride_slot_t + candidate_idx * stride_slot_c
         )
@@ -1645,12 +1668,21 @@ def _accumulate_fp8ds_global_slots_attention_chunk_multihead_kernel(
         tl.float32
     )
     valid_len = tl.load(lens_ptr + token_idx)
+    # Per-token early-loop-exit: ``lens[t] = topk_len_t + swa_len_t`` (set
+    # by combine_topk_swa_indices). Iterating past ``valid_len`` only
+    # incurs the per-iter index-load + compare cost on padding-tail; cap
+    # the outer loop at ``valid_len - candidate_offset`` to skip the dead
+    # tail. CUDA-graph-safe because ``lens_ptr`` is a stable address.
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(valid_len - candidate_offset, 0),
+    )
 
     fp8_mask = dim_offsets < fp8_dim
     rope_mask = (dim_offsets >= fp8_dim) & dim_mask
     rope_offsets = tl.maximum(dim_offsets - fp8_dim, 0)
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         slot_id = tl.load(
             slot_ids_ptr + token_idx * stride_slot_t + candidate_idx * stride_slot_c
         )
@@ -1851,8 +1883,13 @@ def _accumulate_fp8ds_paged_attention_chunk_kernel(
     fp8_mask = offsets < fp8_dim
     rope_mask = (offsets >= fp8_dim) & dim_mask
     rope_offsets = tl.maximum(offsets - fp8_dim, 0)
+    # Per-token early-loop-exit (see indexed kernel comment).
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(gather_len - candidate_offset, 0),
+    )
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         gather_idx = candidate_offset + candidate_idx
         is_valid = gather_idx < gather_len
 
@@ -2054,8 +2091,17 @@ def _accumulate_fp8ds_paged_attention_chunk_multihead_kernel(
     fp8_mask = dim_offsets < fp8_dim
     rope_mask = (dim_offsets >= fp8_dim) & dim_mask
     rope_offsets = tl.maximum(dim_offsets - fp8_dim, 0)
+    # Per-token early-loop-exit: ``gather_len`` is the per-token count of
+    # cached entries available for this paged read; the existing
+    # ``is_valid`` guard skips heavy work past that, but we can also skip
+    # the per-iter index load + branch by capping the loop. CUDA-graph-
+    # safe because ``gather_lens_ptr`` is a stable address.
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(gather_len - candidate_offset, 0),
+    )
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         gather_idx = candidate_offset + candidate_idx
         is_valid = gather_idx < gather_len
 
@@ -2247,8 +2293,17 @@ def _fp8ds_paged_attention_with_sink_multihead_kernel(
     fp8_mask = dim_offsets < fp8_dim
     rope_mask = (dim_offsets >= fp8_dim) & dim_mask
     rope_offsets = tl.maximum(dim_offsets - fp8_dim, 0)
+    # Per-token early-loop-exit: ``gather_len`` is the per-token count of
+    # cached entries available for this paged read; the existing
+    # ``is_valid`` guard skips heavy work past that, but we can also skip
+    # the per-iter index load + branch by capping the loop. CUDA-graph-
+    # safe because ``gather_lens_ptr`` is a stable address.
+    local_eff = tl.minimum(
+        num_candidates,
+        tl.maximum(gather_len - candidate_offset, 0),
+    )
 
-    for candidate_idx in range(0, num_candidates):
+    for candidate_idx in range(0, local_eff):
         gather_idx = candidate_offset + candidate_idx
         is_valid = gather_idx < gather_len
         if is_valid: