Add MaxPool1D decomposition pass support (#17022)

Ninja91 · facebook-github-bot · commit ad1f6267df09 · 2026-05-19T18:35:22.000-07:00
Summary:

Implement DecomposeMaxPool1dPass to enable MaxPool1D support on the Arm backend
by decomposing max_pool1d into unsqueeze_copy → max_pool2d → squeeze_copy.

## Implementation Strategy

### Decomposition Approach (Optimal for TOSA/Vela)
The pass decomposes max_pool1d into max_pool2d via unsqueeze_copy/squeeze_copy
operations:
1. unsqueeze_copy(dim=2): (N, C, L) → (N, C, 1, L) - add height dimension
2. max_pool2d: with adapted params [k]→[1,k], [s]→[1,s], [p]→[0,p], [d]→[1,d]
3. squeeze_copy(dims=[2]): (N, C, 1, L_out) → (N, C, L_out) - remove height dimension

### Why This Approach is Optimal

1. **unsqueeze_copy and squeeze_copy map to TOSA RESHAPE** which is zero-cost in Vela:
   - Classified as memory_only_ops (Reshape, Squeeze, ExpandDims, Identity)
   - Bypassed entirely when the conditions are met (NPU-produced, single consumer)
   - Tensor equivalence enables memory aliasing (same address)

2. **TFA Pipeline Placement (before quantization)**:
   - unsqueeze_copy.default is in _one_to_one_shared_input_qspec
   - squeeze_copy.dims is added to _one_to_one_shared_input_qspec
   - max_pool2d is in _one_to_one_shared_input_or_input_act_qspec
   - All get proper SharedQuantizationSpec from the annotator automatically

3. **Quantization Handling**:
   - Clear qparams on intermediate unsqueeze_copy and squeeze_copy ops (let annotator fill them)
   - Preserve original meta on max_pool2d for proper tracing
   - MAX_POOL2D doesn't need zero-point handling (unlike AVG_POOL2D)

### TOSA/Vela Constraints Validated
- U55: Stride ≤3 ✓, Kernel ≤256x256 ✓
- U85: Extended stride support via accumulator save/restore
- Dilation: handled by the separate DecomposeMaxPool2dPass, which is registered immediately after this pass in `_tosa_pipeline`

Reviewed By: 3l1

Differential Revision: D91760459
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -70,6 +70,7 @@
 from .decompose_lstm_pass import DecomposeLstmPass  # noqa
 from .decompose_masked_fill_pass import DecomposeMaskedFillPass  # noqa
 from .decompose_matmul import DecomposeMatmulPass  # noqa
+from .decompose_max_pool1d_pass import DecomposeMaxPool1dPass  # noqa
 from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -72,6 +72,7 @@
     DecomposeLstmPass,
     DecomposeMaskedFillPass,
     DecomposeMatmulPass,
+    DecomposeMaxPool1dPass,
     DecomposeMaxPool2dPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
@@ -506,6 +507,7 @@ def _tosa_pipeline(
                 UnsqueezeBeforeRepeatPass(),
                 DecomposeCumsumPass(exported_program),
                 DecomposeAsStridedCopyPass(),
+                DecomposeMaxPool1dPass(),
                 DecomposeMaxPool2dPass(),
                 SizeAdjustInputPass(),
                 RewriteAvgPool2dPass(),
@@ -638,6 +640,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                     DecomposeDivPass(tfa_pass=True),
                     DecomposeLinalgVectorNormPass(tfa_pass=True),
                     DecomposeSqrtPass(tfa_pass=True),
+                    DecomposeMaxPool1dPass(tfa_pass=True),
                     DecomposeSoftmaxPass(
                         tfa_pass=True,
                     ),
diff --git a/backends/arm/_passes/decompose_max_pool1d_pass.py b/backends/arm/_passes/decompose_max_pool1d_pass.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional, Set, Type, Union
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.exir.pass_base import ExportPass
+
+
+def _normalize_to_list(
+    value: Optional[Union[int, List[int], tuple]],
+    default: Optional[List[int]] = None,
+) -> List[int]:
+    """Normalize an int-or-sequence operator argument to a new list.
+
+    Args:
+        value: ``None``, a single int, or a list/tuple of ints.
+        default: Fallback used when ``value`` is ``None``, or when ``value``
+            is an empty sequence (the aten schema spells an unspecified
+            stride as ``[]``).
+
+    Returns:
+        A freshly allocated list; it never aliases ``value`` or ``default``,
+        so callers may mutate it freely.
+
+    Raises:
+        ValueError: If ``value`` is ``None`` and no ``default`` is given.
+    """
+    if value is None:
+        if default is None:
+            raise ValueError("Value cannot be None without a default")
+        # Copy so the result does not share storage with the caller's list.
+        return list(default)
+    if isinstance(value, int):
+        return [value]
+    normalized = list(value)
+    if not normalized and default is not None:
+        # An empty sequence means "not specified" (e.g. ``stride=[]``);
+        # without this the caller would build a one-element 2D parameter.
+        return list(default)
+    return normalized
+
+
+class DecomposeMaxPool1dPass(ArmPass):
+    """Decompose ``aten.max_pool1d`` into ``max_pool2d`` on a 4D view.
+
+    The pass is registered in the transform-for-annotation (TFA) pipeline
+    (constructed with ``tfa_pass=True``), so the decomposed ops exist before
+    quantization annotation, and also in the TOSA lowering pipeline directly
+    ahead of ``DecomposeMaxPool2dPass``.
+
+    Transformation:
+        max_pool1d(x, kernel, stride, padding, dilation, ceil_mode)
+            → unsqueeze_copy(x, dim=2)           # (N,C,L) → (N,C,1,L)
+            → max_pool2d(..., [1,k], [1,s], [0,p], [1,d], ceil_mode)
+            → squeeze_copy(..., dims=[2])        # (N,C,1,L') → (N,C,L')
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    @staticmethod
+    def _pick(args, kwargs, index, name, default=None):
+        """Return the argument at ``index``, else keyword ``name``, else
+        ``default``."""
+        if len(args) > index:
+            return args[index]
+        return kwargs.get(name, default)
+
+    @staticmethod
+    def _without_qparams(meta):
+        """Return a copy of ``meta`` with empty input/output qparams.
+
+        Used for the inserted unsqueeze/squeeze nodes so they do not inherit
+        the quantization parameters recorded for the original pooling node.
+        """
+        stripped = meta.copy()
+        stripped.data["input_qparams"] = {}
+        stripped.data["output_qparams"] = {}
+        return stripped
+
+    def call_operator(self, op, args, kwargs, meta):
+        """Rewrite ``max_pool1d`` calls; forward every other op unchanged.
+
+        Optional parameters are read positionally first and then from
+        ``kwargs`` (using the aten schema names), so keyword-style calls are
+        not silently replaced by defaults.
+        """
+        if op != torch.ops.aten.max_pool1d.default or not self.allowed_to_transform(
+            meta
+        ):
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Extract and normalize arguments
+        x = args[0]
+        kernel_size = _normalize_to_list(self._pick(args, kwargs, 1, "kernel_size"))
+        stride = _normalize_to_list(
+            self._pick(args, kwargs, 2, "stride"),
+            default=kernel_size,  # stride defaults to kernel_size
+        )
+        padding = _normalize_to_list(self._pick(args, kwargs, 3, "padding", 0))
+        dilation = _normalize_to_list(self._pick(args, kwargs, 4, "dilation", 1))
+        ceil_mode = self._pick(args, kwargs, 5, "ceil_mode", False)
+
+        # Step 1: Unsqueeze input from 3D to 4D at dim=2
+        # (N, C, L) → (N, C, 1, L)
+        x_4d = super().call_operator(
+            torch.ops.aten.unsqueeze_copy.default,
+            (x, 2),
+            {},
+            self._without_qparams(meta),
+            updated=True,
+        )
+
+        # Step 2: Call max_pool2d with 2D parameters
+        # kernel: [k] → [1, k], stride: [s] → [1, s]
+        # padding: [p] → [0, p], dilation: [d] → [1, d]
+        # The original meta is passed through unchanged for the pooling node.
+        pooled = super().call_operator(
+            torch.ops.aten.max_pool2d.default,
+            (
+                x_4d,
+                [1] + kernel_size,
+                [1] + stride,
+                [0] + padding,
+                [1] + dilation,
+                ceil_mode,
+            ),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # Step 3: Squeeze output back to 3D at dims=[2]
+        # (N, C, 1, L') → (N, C, L')
+        return super().call_operator(
+            torch.ops.aten.squeeze_copy.dims,
+            (pooled, [2]),
+            {},
+            self._without_qparams(meta),
+            updated=True,
+        )
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -4,9 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 """Provide quantization annotation logic for Arm backends.
 
-This module computes per-node quantization properties and applies input/output
-annotations to FX graphs using TorchAO qspecs.
-
+This module computes per-node quantization properties and applies
+input/output annotations to FX graphs using TorchAO qspecs.
 """
 
 import functools
@@ -72,7 +71,6 @@ class _OpQuantProperties:
             indexed by argument positions.
         quant_output (Optional[_QuantProperty]): Quantization spec for the
             node's output when applicable.
-
     """
 
     def __init__(self):
@@ -93,7 +91,6 @@ def _as_list(x):
 
     Returns:
         list: ``x`` if already a list; otherwise ``[x]``.
-
     """
     if isinstance(x, (list, tuple)):
         return x
@@ -206,7 +203,6 @@ def _is_ok_for_quantization(
 
     Returns:
         bool: `True` if the node can be quantized, otherwise `False`.
-
     """
     # Check output
     if quant_properties.quant_output is not None:
@@ -266,7 +262,6 @@ def _get_node_target(module: torch.nn.Module | torch.fx.GraphModule, target_str:
 
     Returns:
         Any: Resolved attribute on the module.
-
     """
     targets = target_str.split(".")
     for target in targets[:-1]:
@@ -279,7 +274,6 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
 
     Large scalars are skipped because ``torch.histc`` supports values only up
     to a certain upper bound.
-
     """
     HISTC_UPPER_BOUND = 3.4028235e15
     if node.op == "get_attr" and isinstance(node.target, str):
@@ -297,7 +291,8 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
 
 
 def _is_non_float_tensor(node: Node) -> bool:
-    """Check if the output of a node has a data type other than `torch.float32`.
+    """Check if the output of a node has a data type other than
+    `torch.float32`.
 
     If the output is not `torch.float32`, quantization cannot be performed, as
     observers only work with floating-point tensors.
@@ -314,7 +309,6 @@ def _is_non_float_tensor(node: Node) -> bool:
           `torch.float32` as its data type.
         - If node.meta["val"] is missing or is not an instance of `FakeTensor`,
           the function returns True.
-
     """
     if "val" in node.meta and isinstance(node.meta["val"], Sequence):
         return any(
@@ -342,7 +336,6 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
     Raises:
         RuntimeError: If the node is already annotated.
         TypeError: If an input argument is not a ``Node`` instance.
-
     """
     if is_annotated(node):
         raise RuntimeError(
@@ -379,7 +372,6 @@ def _annotate_output(node: Node, quant_property: _QuantProperty):
         RuntimeError: If the node is already annotated.
         ValueError: If ``mark_annotated`` is True, ``optional`` is True, or
             ``index`` is not zero.
-
     """
     if is_annotated(node):
         raise RuntimeError(
@@ -408,7 +400,6 @@ def _match_pattern(
     ``pattern``. If ``filter_fn`` is provided, require all nodes in the chain
     to pass the filter. Each pattern element is an iterable of disjunctive
     node targets.
-
     """
     if len(pattern) < 1:
         raise ValueError("No pattern provided")
@@ -517,6 +508,9 @@ def _match_pattern(
     torch.ops.aten.squeeze_copy.default,
     torch.ops.aten.squeeze_copy.dim,
     torch.ops.aten.squeeze_.dim,
+    # DecomposeMaxPool1dPass emits squeeze_copy.dims as a view-like intermediate;
+    # include here so it receives SharedQuantizationSpec from its input.
+    torch.ops.aten.squeeze_copy.dims,
     torch.ops.aten.squeeze.dim,
     torch.ops.aten.squeeze.dims,
     torch.ops.aten.unbind.int,
@@ -612,7 +606,6 @@ def get_quant_properties(  # noqa: C901
     Returns:
         _OpQuantProperties | None: Properties to apply, or ``None`` if the
             node is unsupported or not suitable for quantization.
-
     """
     if node.target == torch.ops.aten.conv_transpose2d.input:
         weight_qspec = _adjust_weight_qspec_for_conv_transpose(
@@ -950,7 +943,6 @@ def annotate_graph(  # type: ignore[return]
 
     Returns:
         Optional[List[List[Node]]]: Reserved for future use; currently None.
-
     """
     for node in gm.graph.nodes:
         if node.op != "call_function":
diff --git a/backends/arm/test/ops/test_max_pool1d.py b/backends/arm/test/ops/test_max_pool1d.py
@@ -88,7 +88,6 @@ def test_max_pool2d_tosa_FP_decomposed(test_data: Callable):
 
 
 @common.parametrize("test_data", test_data_suite_all)
-@pytest.mark.xfail(reason="MaxPool1D not yet supported", strict=False)
 def test_max_pool2d_tosa_INT_decomposed(test_data: Callable):
     """Test max_pool1d with TOSA INT pipeline (quantized)."""
     test_data, model_params = test_data()