Add MaxPool1D decomposition pass support (#17022)

Ninja91 · facebook-github-bot · commit a56c81dc2985 · 2026-02-20T17:01:05.000-08:00
Summary:

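Implement DecomposeMaxPool1dPass to enable MaxPool1D support on the Arm backend
by decomposing max_pool1d to a reshape → max_pool2d → reshape chain. The pass
itself emits the reshapes as unsqueeze_copy / squeeze_copy.dims; the view_copy
wording below refers to the form they are expected to take once squeezes are
converted to views (ConvertSqueezesToViewPass follows this pass in the TOSA
pipeline).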

## Implementation Strategy

### Decomposition Approach (Optimal for TOSA/Vela)
The pass decomposes max_pool1d into max_pool2d via view_copy operations:
1. view_copy: (N, C, L) → (N, C, 1, L) - add height dimension
2. max_pool2d: with adapted params [k]→[1,k], [s]→[1,s], [p]→[0,p]
3. view_copy: (N, C, 1, L_out) → (N, C, L_out) - remove height dimension

### Why This Approach is Optimal

1. **view_copy maps to TOSA RESHAPE** which is zero-cost in Vela:
   - Classified as memory_only_ops (Reshape, Squeeze, ExpandDims, Identity)
   - Bypassed entirely when conditions met (NPU-produced, single consumer)
   - Tensor equivalence enables memory aliasing (same address)

2. **TFA Pipeline Placement (before quantization)**:
   - view_copy is in _one_to_one_shared_input_qspec (line 407)
   - max_pool2d is in _one_to_one_shared_input_or_input_act_qspec (line 455)
   - Both get proper SharedQuantizationSpec from annotator automatically

3. **Quantization Handling**:
   - Clear qparams on intermediate view_copy ops (let annotator fill them)
   - Preserve original meta on max_pool2d for proper tracing
   - MAX_POOL2D doesn't need zero-point handling (unlike AVG_POOL2D)

### TOSA/Vela Constraints Validated
- U55: Stride ≤3 ✓, Kernel ≤256x256 ✓
- U85: Extended stride support via accumulator save/restore
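- Dilation: forwarded to max_pool2d as [1, d] and left to the separate
  DecomposeMaxPool2dPass if needed (NOTE: in _tosa_pipeline that pass is listed
  before DecomposeMaxPool1dPass — confirm the ordering covers dilated max_pool1d)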

Reviewed By: 3l1

Differential Revision: D91760459
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -66,6 +66,7 @@
 from .decompose_logit_pass import DecomposeLogitPass  # noqa
 from .decompose_masked_fill_pass import DecomposeMaskedFillPass  # noqa
 from .decompose_matmul import DecomposeMatmulPass  # noqa
+from .decompose_max_pool1d_pass import DecomposeMaxPool1dPass  # noqa
 from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -68,6 +68,7 @@
     DecomposeLogitPass,
     DecomposeMaskedFillPass,
     DecomposeMatmulPass,
+    DecomposeMaxPool1dPass,
     DecomposeMaxPool2dPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
@@ -343,6 +344,7 @@ def _tosa_pipeline(
                 DecomposeCumsumPass(exported_program),
                 DecomposeAsStridedCopyPass(),
                 DecomposeMaxPool2dPass(),
+                DecomposeMaxPool1dPass(),
                 SizeAdjustInputPass(),
                 DecomposeSelectPass(),
                 ConvertSqueezesToViewPass(),
@@ -447,6 +449,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                 DecomposeLinalgVectorNormPass(tfa_pass=True),
                 DecomposeSqrtPass(tfa_pass=True),
                 DecomposeAvgPool2dPass(tfa_pass=True),
+                DecomposeMaxPool1dPass(tfa_pass=True),
                 DecomposeSoftmaxUnstablePass(tfa_pass=True),
                 DecomposeSoftmaxPass(tfa_pass=True),
                 ConvertMinMaxPass(tfa_pass=True),
diff --git a/backends/arm/_passes/decompose_max_pool1d_pass.py b/backends/arm/_passes/decompose_max_pool1d_pass.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Set, Type, Union
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.exir.pass_base import ExportPass
+
+
+def _normalize_to_list(
+    value: Optional[Union[int, List[int], tuple]],
+    default: Optional[List[int]] = None,
+) -> List[int]:
+    """Normalize parameter to list: handle None, int, tuple, list."""
+    if value is None:
+        if default is None:
+            raise ValueError("Value cannot be None without a default")
+        return default
+    if isinstance(value, int):
+        return [value]
+    return list(value)
+
+
+class DecomposeMaxPool1dPass(ArmPass):
+    """
+    Decomposes max_pool1d into max_pool2d via unsqueeze_copy/squeeze_copy operations.
+
+    This pass runs in transform_for_annotation (TFA) pipeline before quantization,
+    ensuring proper quantization annotation for the decomposed ops.
+
+    Transformation:
+        max_pool1d(x, kernel, stride, padding, dilation, ceil_mode)
+            → unsqueeze_copy(x, dim=2)           # (N,C,L) → (N,C,1,L)
+            → max_pool2d(..., [1,k], [1,s], [0,p], [1,d], ceil_mode)
+            → squeeze_copy(..., dims=[2])        # (N,C,1,L') → (N,C,L')
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op != torch.ops.aten.max_pool1d.default or not self.allowed_to_transform(
+            meta
+        ):
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Extract and normalize arguments
+        x = args[0]
+        kernel_size = _normalize_to_list(args[1])
+        stride = _normalize_to_list(
+            args[2] if len(args) > 2 else None,
+            default=kernel_size,  # stride defaults to kernel_size
+        )
+        padding = _normalize_to_list(args[3] if len(args) > 3 else 0)
+        dilation = _normalize_to_list(args[4] if len(args) > 4 else 1)
+        ceil_mode = args[5] if len(args) > 5 else False
+
+        # Step 1: Unsqueeze input from 3D to 4D at dim=2
+        # (N, C, L) → (N, C, 1, L)
+        x_4d = super().call_operator(
+            torch.ops.aten.unsqueeze_copy.default,
+            (x, 2),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # Step 2: Call max_pool2d with 2D parameters
+        # kernel: [k] → [1, k], stride: [s] → [1, s]
+        # padding: [p] → [0, p], dilation: [d] → [1, d]
+        pooled = super().call_operator(
+            torch.ops.aten.max_pool2d.default,
+            (
+                x_4d,
+                [1] + kernel_size,
+                [1] + stride,
+                [0] + padding,
+                [1] + dilation,
+                ceil_mode,
+            ),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # Step 3: Squeeze output back to 3D at dims=[2]
+        # (N, C, 1, L') → (N, C, L')
+        output = super().call_operator(
+            torch.ops.aten.squeeze_copy.dims,
+            (pooled, [2]),
+            {},
+            meta,
+            updated=True,
+        )
+
+        return output
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -4,9 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 """Provide quantization annotation logic for Arm backends.
 
-This module computes per-node quantization properties and applies input/output
-annotations to FX graphs using TorchAO qspecs.
-
+This module computes per-node quantization properties and applies
+input/output annotations to FX graphs using TorchAO qspecs.
 """
 
 import logging
@@ -57,7 +56,6 @@ class _OpQuantProperties:
             indexed by argument positions.
         quant_output (Optional[_QuantProperty]): Quantization spec for the
             node's output when applicable.
-
     """
 
     def __init__(self):
@@ -73,7 +71,6 @@ def _as_list(x):
 
     Returns:
         list: ``x`` if already a list; otherwise ``[x]``.
-
     """
     if isinstance(x, (list, tuple)):
         return x
@@ -122,7 +119,6 @@ def _is_ok_for_quantization(
 
     Returns:
         bool: `True` if the node can be quantized, otherwise `False`.
-
     """
     # Check output
     if quant_properties.quant_output is not None:
@@ -182,7 +178,6 @@ def _get_node_target(module: torch.nn.Module | torch.fx.GraphModule, target_str:
 
     Returns:
         Any: Resolved attribute on the module.
-
     """
     targets = target_str.split(".")
     for target in targets[:-1]:
@@ -195,7 +190,6 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
 
     Large scalars are skipped because ``torch.histc`` supports values only up
     to a certain upper bound.
-
     """
     HISTC_UPPER_BOUND = 3.4028235e15
     if node.op == "get_attr" and isinstance(node.target, str):
@@ -213,7 +207,8 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
 
 
 def _is_non_float_tensor(node: Node) -> bool:
-    """Check if the output of a node has a data type other than `torch.float32`.
+    """Check if the output of a node has a data type other than
+    `torch.float32`.
 
     If the output is not `torch.float32`, quantization cannot be performed, as
     observers only work with floating-point tensors.
@@ -230,7 +225,6 @@ def _is_non_float_tensor(node: Node) -> bool:
           `torch.float32` as its data type.
         - If node.meta["val"] is missing or is not an instance of `FakeTensor`,
           the function returns True.
-
     """
     if "val" in node.meta and isinstance(node.meta["val"], Sequence):
         return any(
@@ -258,7 +252,6 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
     Raises:
         RuntimeError: If the node is already annotated.
         TypeError: If an input argument is not a ``Node`` instance.
-
     """
     if is_annotated(node):
         raise RuntimeError(
@@ -295,7 +288,6 @@ def _annotate_output(node: Node, quant_property: _QuantProperty):
         RuntimeError: If the node is already annotated.
         ValueError: If ``mark_annotated`` is True, ``optional`` is True, or
             ``index`` is not zero.
-
     """
     if is_annotated(node):
         raise RuntimeError(
@@ -322,7 +314,6 @@ def _match_pattern(
     ``pattern``. If ``filter_fn`` is provided, require all nodes in the chain
     to pass the filter. Each pattern element is a list of disjunctive node
     targets.
-
     """
     if len(pattern) < 1:
         raise ValueError("No pattern provided")
@@ -408,6 +399,7 @@ def _match_pattern(
     torch.ops.aten.squeeze_copy.default,
     torch.ops.aten.squeeze_copy.dim,
     torch.ops.aten.squeeze_.dim,
+    torch.ops.aten.squeeze_copy.dims,
     torch.ops.aten.squeeze.dim,
     torch.ops.aten.squeeze.dims,
     torch.ops.aten.unbind.int,
@@ -503,7 +495,6 @@ def get_quant_properties(  # noqa: C901
     Returns:
         _OpQuantProperties | None: Properties to apply, or ``None`` if the
             node is unsupported or not suitable for quantization.
-
     """
     if node.target == torch.ops.aten.conv_transpose2d.input:
         weight_qspec = _adjust_weight_qspec_for_conv_transpose(
@@ -820,7 +811,6 @@ def annotate_graph(  # type: ignore[return]
 
     Returns:
         Optional[List[List[Node]]]: Reserved for future use; currently None.
-
     """
     for node in gm.graph.nodes:
         if node.op != "call_function":