Move Q4_K env-var dispatch into emit_linear/emit_embedding so that patterns.py stays unchanged.

uddeshsingh · uddeshsingh · commit bbfe5d665904 · 2026-06-12T14:45:37.000-05:00
diff --git a/backends/mlx/custom_kernel_ops/gguf/patterns.py b/backends/mlx/custom_kernel_ops/gguf/patterns.py
@@ -114,18 +114,9 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
                 emit_linear,
             )
         else:  # q4_k
-            from executorch.backends.mlx.custom_kernel_ops.gguf.q4k import (
-                emit_direct_gguf,
+            from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.linear import (
+                emit_linear,
             )
-
-            if emit_direct_gguf():
-                from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.linear import (
-                    emit_linear,
-                )
-            else:
-                from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.linear_mlx_native import (
-                    emit_linear,
-                )
         return emit_linear(P, n, x_node, self.weight, bias_node)
 
 
@@ -177,8 +168,8 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot:
                 from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.embedding import (
                     emit_embedding,
                 )
-            else:
-                from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.embedding_mlx_native import (
-                    emit_embedding,
-                )
+            else:  # q4_k
+                from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.embedding import (
+                emit_embedding,
+            )
         return emit_embedding(P, n, self.weight, indices_node, self.output_dtype)
diff --git a/backends/mlx/custom_kernel_ops/gguf/q4k/common.py b/backends/mlx/custom_kernel_ops/gguf/q4k/common.py
@@ -51,8 +51,6 @@
     _Q4K_D_BYTES + _Q4K_DMIN_BYTES + _Q4K_SCALES_BYTES + _Q4K_QS_BYTES
 )  # 144
 
-# Q4_K mat-mat uses NL = QK_K / 32 (8 sub-blocks of 32 elements).
-Q4K_NL = QK_K // 32  # 8
 
 # ---------------------------------------------------------------------------
 # Shared Metal header
diff --git a/backends/mlx/custom_kernel_ops/gguf/q4k/embedding.py b/backends/mlx/custom_kernel_ops/gguf/q4k/embedding.py
@@ -56,7 +56,7 @@
 """
 
 
-def emit_embedding(
+def _emit_embedding_fused(
     P: MLXProgramBuilder,
     head: Node,
     weight_node: Node,
@@ -125,3 +125,28 @@ def emit_embedding(
     )
 
     return out
+
+
+
+def emit_embedding(
+    P: MLXProgramBuilder,
+    head: Node,
+    weight_node: Node,
+    indices_node: Node,
+    output_dtype: torch.dtype,
+) -> Slot:
+    """Emit a Q4_K embedding; emit_direct_gguf() picks fused Metal or MLX-native."""
+    from executorch.backends.mlx.custom_kernel_ops.gguf.q4k import emit_direct_gguf
+
+    if emit_direct_gguf():  # true -> fused emitter defined in this module
+        return _emit_embedding_fused(
+            P, head, weight_node, indices_node, output_dtype
+        )
+
+    from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.embedding_mlx_native import (
+        emit_embedding as emit_embedding_mlx_native,
+    )
+
+    return emit_embedding_mlx_native(  # fallback: same arguments, forwarded unchanged
+        P, head, weight_node, indices_node, output_dtype
+    )
diff --git a/backends/mlx/custom_kernel_ops/gguf/q4k/linear.py b/backends/mlx/custom_kernel_ops/gguf/q4k/linear.py
@@ -180,7 +180,7 @@ def _q4k_matmul_source(has_bias: bool) -> str:
     short il0 = tid % NL0;
     short il  = il0;  // current dequant sub-block index within Q4_K block
     
-    const short offset1 = il0 / NL;  // always 0 for NL=8, NL0=2
+    const short offset1 = il0 / NL;  // always 0 (il0 < NL0=2, NL=16)
 
     // Pointer to weight block for this thread's assigned row.
     device const block_q4_K * wblk = (device const block_q4_K *) weight
@@ -417,7 +417,7 @@ def _emit_q4k_matmul(
     )
 
 
-def emit_linear(
+def _emit_linear_fused(
     P: MLXProgramBuilder,
     head: Node,
     x_node: Node,
@@ -513,3 +513,22 @@ def emit_linear(
             ),
         )
     return out
+
+def emit_linear(
+    P: MLXProgramBuilder,
+    head: Node,
+    x_node: Node,
+    weight_node: Node,
+    bias_node: Optional[Node],
+) -> Slot:
+    """Emit a Q4_K linear; emit_direct_gguf() picks fused Metal or MLX-native."""
+    from executorch.backends.mlx.custom_kernel_ops.gguf.q4k import emit_direct_gguf
+
+    if emit_direct_gguf():  # true -> fused emitter defined in this module
+        return _emit_linear_fused(P, head, x_node, weight_node, bias_node)
+
+    from executorch.backends.mlx.custom_kernel_ops.gguf.q4k.linear_mlx_native import (
+        emit_linear as emit_linear_mlx_native,
+    )
+
+    # Fallback: same arguments, forwarded unchanged to the MLX-native emitter.
+    return emit_linear_mlx_native(P, head, x_node, weight_node, bias_node)