Fix MLX RoPE for proportional partial rotary (Gemma 4 full-attention layers)

mergennachin · claude · mergennachin · commit ae02b38b2569 · 2026-05-19T07:55:54.000-07:00
- custom_ops.py: support 1D freqs in the Python fake op. When freqs is
  1D, compute inv_freq = 1/freqs and build the angles as
  positions * inv_freq * scale, matching the C++ runtime behavior. The
  2D freqs path is unchanged.
- MLXInterpreter.h: pass base=nullopt when freqs is provided. MLX's
  fast::rope requires exactly one of base or freqs.
- mlx_source_transformations.py: pass dims=rotary_dim (not head_dim)
  with 1D freqs containing only the non-zero rotary frequencies. The
  old code passed 2D precomputed angles, which was incorrect at the C++
  level: the runtime treats freqs as 1D frequencies and derives the
  angles from the positions itself.
- test_ops.py: add RopeCustomFreqsTest (3 configs) verifying export and
  MLX delegation with 1D custom frequencies.

Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/backends/mlx/custom_ops.py b/backends/mlx/custom_ops.py
@@ -228,8 +228,16 @@ def rope(
         # final angles: [1, 1, T, half]
         angles = (pos_range * inv_freq) * float(scale)
     else:
-        # assume freqs is already per-position, just reshape to [1,1,T,half]
-        angles = freqs.to(torch.float32).view(1, 1, T, half)
+        if freqs.ndim == 1:
+            # 1D raw frequencies: compute angles = positions * (1/freqs)
+            inv_freq = (1.0 / freqs.to(torch.float32)).view(1, 1, 1, half)
+            pos_range = torch.arange(
+                pos, pos + T, device=x.device, dtype=torch.float32
+            ).view(1, 1, T, 1)
+            angles = (pos_range * inv_freq) * float(scale)
+        else:
+            # 2D per-position angles: reshape to [1,1,T,half]
+            angles = freqs.to(torch.float32).view(1, 1, T, half)
 
     cos = angles.cos().to(x.dtype)  # [1,1,T,half]
     sin = angles.sin().to(x.dtype)  # [1,1,T,half]
diff --git a/backends/mlx/runtime/MLXInterpreter.h b/backends/mlx/runtime/MLXInterpreter.h
@@ -242,6 +242,11 @@ inline void exec_rope(const RopeNode& n, ExecutionState& st, StreamOrDevice s) {
     freqs_arr = st.const_tensor_ref(*n.freqs);
   }
 
+  // MLX requires exactly one of base or freqs — when freqs is provided,
+  // base must be nullopt.
+  std::optional<float> base =
+      freqs_arr ? std::nullopt : std::optional<float>(n.base);
+
   // MLX has two overloads: rope(..., int offset, ...) and rope(..., const
   // array& offset, ...) Call the appropriate one based on is_vid
   if (n.offset.is_vid) {
@@ -250,14 +255,14 @@ inline void exec_rope(const RopeNode& n, ExecutionState& st, StreamOrDevice s) {
     st.set_tensor(
         n.out,
         fast::rope(
-            x, n.dims, n.traditional, n.base, n.scale, offset, freqs_arr, s));
+            x, n.dims, n.traditional, base, n.scale, offset, freqs_arr, s));
   } else {
     // Tensor offset from Tid
     const array& offset = st.const_tensor_ref(n.offset.tid);
     st.set_tensor(
         n.out,
         fast::rope(
-            x, n.dims, n.traditional, n.base, n.scale, offset, freqs_arr, s));
+            x, n.dims, n.traditional, base, n.scale, offset, freqs_arr, s));
   }
 }
 
diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py
@@ -1803,6 +1803,82 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]:
         return (q, k, pos_tensor)
 
 
+class RopeCustomFreqsModel(nn.Module):
+    """Model that applies RoPE with custom 1D frequencies (partial rotary)."""
+
+    def __init__(self, dims: int = 32, head_dim: int = 64):
+        super().__init__()
+        self.dims = dims
+        self.head_dim = head_dim
+        # Simulate proportional RoPE: compute freqs for rotary dims only
+        inv_freq = 1.0 / (
+            500000.0 ** (torch.arange(0, dims, 2, dtype=torch.float32) / head_dim)
+        )
+        self.register_buffer("freqs", 1.0 / inv_freq, persistent=False)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        pos_tensor: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        pos = pos_tensor.item()
+        q_rot = torch.ops.mlx.rope(q, self.dims, pos, False, 0.0, 1.0, self.freqs)
+        k_rot = torch.ops.mlx.rope(k, self.dims, pos, False, 0.0, 1.0, self.freqs)
+        return q_rot, k_rot
+
+
+@register_test
+class RopeCustomFreqsTest(OpTestCase):
+    """Test RoPE with custom 1D frequencies (partial rotary, like Gemma 4)."""
+
+    name = "rope_custom_freqs"
+    rtol = 1e-4
+    atol = 1e-4
+
+    def __init__(
+        self,
+        batch_size: int = 1,
+        num_heads: int = 8,
+        seq_len: int = 4,
+        head_dim: int = 64,
+        dims: int = 32,
+        pos: int = 0,
+    ):
+        self.batch_size = batch_size
+        self.num_heads = num_heads
+        self.seq_len = seq_len
+        self.head_dim = head_dim
+        self.dims = dims
+        self.pos = pos
+        self.name = "rope_custom_freqs"
+
+    @classmethod
+    def get_test_configs(cls) -> List["RopeCustomFreqsTest"]:
+        configs = [
+            cls(),
+            cls(pos=10),
+            cls(head_dim=128, dims=64),
+        ]
+        for cfg in configs:
+            parts = ["rope_custom_freqs"]
+            if cfg.pos > 0:
+                parts.append(f"pos{cfg.pos}")
+            if cfg.head_dim != 64:
+                parts.append(f"hd{cfg.head_dim}")
+            cfg.name = "_".join(parts)
+        return configs
+
+    def create_model(self) -> nn.Module:
+        return RopeCustomFreqsModel(dims=self.dims, head_dim=self.head_dim)
+
+    def create_inputs(self) -> Tuple[torch.Tensor, ...]:
+        q = torch.randn(self.batch_size, self.num_heads, self.seq_len, self.head_dim)
+        k = torch.randn(self.batch_size, self.num_heads, self.seq_len, self.head_dim)
+        pos_tensor = torch.tensor(self.pos, dtype=torch.int64)
+        return (q, k, pos_tensor)
+
+
 from executorch.backends.mlx.llm.cache import KVCache
 
 
diff --git a/examples/models/gemma4_31b/mlx_source_transformations.py b/examples/models/gemma4_31b/mlx_source_transformations.py
@@ -51,10 +51,7 @@ def _mlx_forward(
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
 
-        # RoPE via mlx::rope. For proportional partial RoPE (full-attention
-        # layers), pass precomputed frequencies since mlx.rope's built-in
-        # frequency computation uses dims as the denominator, but Gemma 4
-        # uses head_dim.
+        # RoPE via mlx::rope.
         if self.is_sliding:
             q = torch.ops.mlx.rope(
                 q, self.head_dim, start_pos, False, self.rope_theta, 1.0, None
@@ -63,9 +60,15 @@ def _mlx_forward(
                 k, self.head_dim, start_pos, False, self.rope_theta, 1.0, None
             )
         else:
-            freqs = torch.outer(input_pos.float(), self.inv_freq)
-            q = torch.ops.mlx.rope(q, self.head_dim, start_pos, False, 0.0, 0.0, freqs)
-            k = torch.ops.mlx.rope(k, self.head_dim, start_pos, False, 0.0, 0.0, freqs)
+            # Full-attention layers use proportional partial RoPE: only
+            # rotary_dim out of head_dim dimensions are rotated. Pass
+            # dims=rotary_dim and the non-zero frequencies as 1D freqs.
+            # MLX computes inv_freq = 1/freqs internally.
+            rotary_dim = int(self.head_dim * self.partial_rotary)
+            rotary_inv_freq = self.inv_freq[: rotary_dim // 2]
+            mlx_freqs = 1.0 / rotary_inv_freq
+            q = torch.ops.mlx.rope(q, rotary_dim, start_pos, False, 0.0, 1.0, mlx_freqs)
+            k = torch.ops.mlx.rope(k, rotary_dim, start_pos, False, 0.0, 1.0, mlx_freqs)
 
         k_cache, v_cache = self.kv_cache.update(start_pos, k, v)