Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 71 additions & 24 deletions mojo_opset/backends/ttx/kernels/ilu/swa.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,16 +248,34 @@ def _swa_acc_fwd_nomask_mxn(
BLOCK_N: tl.constexpr,
BLOCK_D: tl.constexpr,
):
k = tl.load(k_block_ptr, boundary_check=(0, 1), padding_option="zero")
qk = tl.dot(q, tl.trans(k))
# When BLOCK_D == HEAD_DIM (head_dim a power of two — the common case):
# • K block_ptr is transposed (BLOCK_D, BLOCK_N) by the caller, so
# tl.dot(q, k) = Q @ K^T directly — no TransOp, no smeLayoutTrans
# warp-shuffle.
# • boundary_check is dropped on both K and V (the nomask region
# guarantees full in-bounds tiles + PAGE_SIZE % BLOCK_N == 0 keeps
# each tile within one page), letting MMALoad / MatmulLoad activate.
# When BLOCK_D != HEAD_DIM (non-power-of-two head_dim):
# • K is (BLOCK_N, BLOCK_D); we keep boundary_check + tl.trans(k)
# because the D-dim may be padded (BLOCK_D > HEAD_DIM).
KT_NOTRANS: tl.constexpr = BLOCK_D == HEAD_DIM
if KT_NOTRANS:
k = tl.load(k_block_ptr)
qk = tl.dot(q, k)
else:
k = tl.load(k_block_ptr, boundary_check=(0, 1), padding_option="zero")
qk = tl.dot(q, tl.trans(k))
qk = qk * qk_scale

m_ij = tl.maximum(m_i, tl.max(qk, 1))
qk = qk - m_ij[:, None]
p = tl.math.exp(qk)
p_cast = p.to(k.dtype)

v = tl.load(v_block_ptr, boundary_check=(0, 1), padding_option="zero")
if KT_NOTRANS:
v = tl.load(v_block_ptr)
else:
v = tl.load(v_block_ptr, boundary_check=(0, 1), padding_option="zero")
l_ij = tl.sum(p, 1)
alpha = tl.math.exp(m_i - m_ij)
l_i = l_i * alpha + l_ij
Expand Down Expand Up @@ -480,14 +498,26 @@ def _swa_infer_kernel(

for kv_block_id in range(full_local_start_block, full_local_end_block + 1):
kv_block_start = kv_block_id * BLOCK_N
k_block_ptr = tl.make_block_ptr(
base=k_ptr + kv_start * stride_kt + kv_head_id * stride_kh,
shape=(kv_seq_len, HEAD_DIM),
strides=(stride_kt, stride_kd),
offsets=(kv_block_start.to(tl.int32), 0),
block_shape=(BLOCK_N, BLOCK_D),
order=(1, 0),
)
# When BLOCK_D == HEAD_DIM, construct K block_ptr transposed (BLOCK_D, BLOCK_N)
# so _swa_acc_fwd_nomask_mxn uses tl.dot(q, k) without tl.trans.
if BLOCK_D == HEAD_DIM:
k_block_ptr = tl.make_block_ptr(
base=k_ptr + kv_start * stride_kt + kv_head_id * stride_kh,
shape=(HEAD_DIM, kv_seq_len),
strides=(stride_kd, stride_kt),
offsets=(0, kv_block_start.to(tl.int32)),
block_shape=(BLOCK_D, BLOCK_N),
order=(0, 1),
)
else:
k_block_ptr = tl.make_block_ptr(
base=k_ptr + kv_start * stride_kt + kv_head_id * stride_kh,
shape=(kv_seq_len, HEAD_DIM),
strides=(stride_kt, stride_kd),
offsets=(kv_block_start.to(tl.int32), 0),
block_shape=(BLOCK_N, BLOCK_D),
order=(1, 0),
)
v_block_ptr = tl.make_block_ptr(
base=v_ptr + kv_start * stride_vt + kv_head_id * stride_vh,
shape=(kv_seq_len, HEAD_DIM),
Expand Down Expand Up @@ -659,8 +689,8 @@ def _swa_paged_prefill_kernel(
tl.static_assert(PAGE_SIZE % BLOCK_N == 0, "BLOCK_N must divide PAGE_SIZE for paged KV tiling")
pid = tl.program_id(0)
n_programs = tl.num_programs(0)
has_global_window = GLOBAL_WINDOW is not None
has_local_window = LOCAL_WINDOW is not None
has_global_window: tl.constexpr = GLOBAL_WINDOW is not None
has_local_window: tl.constexpr = LOCAL_WINDOW is not None

cu_q_chunks = 0
q_offsets = tl.arange(0, BLOCK_M)
Expand Down Expand Up @@ -867,17 +897,34 @@ def _swa_paged_prefill_kernel(
physical_page_id = tl.load(
block_table_ptr + b_id * stride_block_table_b + logical_page_id * stride_block_table_p
)
k_block_ptr = tl.make_block_ptr(
base=k_cache_ptr
+ physical_page_id * stride_kp
+ kv_head_id * stride_kh
+ kv_block_start_in_page * stride_kt,
shape=(kv_block_len, HEAD_DIM),
strides=(stride_kt, stride_kd),
offsets=(0, 0),
block_shape=(BLOCK_N, BLOCK_D),
order=(1, 0),
)

# When BLOCK_D == HEAD_DIM, construct K block_ptr transposed (BLOCK_D, BLOCK_N) so
# _swa_acc_fwd_nomask_mxn uses tl.dot(q, k) without tl.trans. When BLOCK_D != HEAD_DIM,
# keep the original (BLOCK_N, BLOCK_D) layout with boundary_check fallback handled inside the helper.
if BLOCK_D == HEAD_DIM:
k_block_ptr = tl.make_block_ptr(
base=k_cache_ptr
+ physical_page_id * stride_kp
+ kv_head_id * stride_kh
+ kv_block_start_in_page * stride_kt,
shape=(HEAD_DIM, kv_block_len),
strides=(stride_kd, stride_kt),
offsets=(0, 0),
block_shape=(BLOCK_D, BLOCK_N),
order=(0, 1),
)
else:
k_block_ptr = tl.make_block_ptr(
base=k_cache_ptr
+ physical_page_id * stride_kp
+ kv_head_id * stride_kh
+ kv_block_start_in_page * stride_kt,
shape=(kv_block_len, HEAD_DIM),
strides=(stride_kt, stride_kd),
offsets=(0, 0),
block_shape=(BLOCK_N, BLOCK_D),
order=(1, 0),
)
Comment on lines +904 to +927

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In the nomask loop of the paged prefill kernel, the block is mathematically guaranteed to be fully in-bounds, meaning kv_block_len is always exactly equal to BLOCK_N.

By replacing the dynamic kv_block_len with the compile-time constant BLOCK_N in the shape parameter of tl.make_block_ptr, we make the block pointer shape fully static. This allows the Triton compiler to perform better static analysis, address generation, and instruction scheduling optimizations.

                    if BLOCK_D == HEAD_DIM:
                        k_block_ptr = tl.make_block_ptr(
                            base=k_cache_ptr
                            + physical_page_id * stride_kp
                            + kv_head_id * stride_kh
                            + kv_block_start_in_page * stride_kt,
                            shape=(HEAD_DIM, BLOCK_N),
                            strides=(stride_kd, stride_kt),
                            offsets=(0, 0),
                            block_shape=(BLOCK_D, BLOCK_N),
                            order=(0, 1),
                        )
                    else:
                        k_block_ptr = tl.make_block_ptr(
                            base=k_cache_ptr
                            + physical_page_id * stride_kp
                            + kv_head_id * stride_kh
                            + kv_block_start_in_page * stride_kt,
                            shape=(BLOCK_N, HEAD_DIM),
                            strides=(stride_kt, stride_kd),
                            offsets=(0, 0),
                            block_shape=(BLOCK_N, BLOCK_D),
                            order=(1, 0),
                        )

v_block_ptr = tl.make_block_ptr(
base=v_cache_ptr
+ physical_page_id * stride_vp
Expand Down
Loading