Add MaxPool1D decomposition pass support

Ninja91 · facebook-github-bot · commit 3a1a61ec6c7d · 2026-01-29T16:44:42.000-08:00
Summary:
Implement DecomposeMaxPool1dPass to enable MaxPool1D support on ARM backend
by decomposing max_pool1d to view_copy → max_pool2d → view_copy.

## Implementation Strategy

### Decomposition Approach (Optimal for TOSA/Vela)
The pass decomposes max_pool1d into max_pool2d via view_copy operations:
1. view_copy: (N, C, L) → (N, C, 1, L) - add height dimension
2. max_pool2d: with adapted params kernel [k]→[1,k], stride [s]→[1,s], padding [p]→[0,p], dilation [d]→[1,d]
3. view_copy: (N, C, 1, L_out) → (N, C, L_out) - remove height dimension

### Why This Approach is Optimal

1. **view_copy maps to TOSA RESHAPE** which is zero-cost in Vela:
   - Classified as memory_only_ops (Reshape, Squeeze, ExpandDims, Identity)
   - Bypassed entirely when its conditions are met (NPU-produced, single consumer)
   - Tensor equivalence enables memory aliasing (same address)

2. **TFA (transform-for-annotation) Pipeline Placement (before quantization)**:
   - view_copy is in _one_to_one_shared_input_qspec (line 407)
   - max_pool2d is in _one_to_one_shared_input_or_input_act_qspec (line 455)
   - Both get proper SharedQuantizationSpec from annotator automatically

3. **Quantization Handling**:
   - Clear qparams on intermediate view_copy ops (let annotator fill them)
   - Preserve original meta on max_pool2d for proper tracing
   - MAX_POOL2D doesn't need zero-point handling (unlike AVG_POOL2D)

### TOSA/Vela Constraints Validated
- U55: Stride ≤3 ✓, Kernel ≤256x256 ✓
- U85: Extended stride support via accumulator save/restore
- Dilation: Handled by separate DecomposeMaxPool2dPass if needed

Differential Revision: D91760459
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -65,6 +65,7 @@
 from .decompose_log1p_pass import DecomposeLog1pPass  # noqa
 from .decompose_logit_pass import DecomposeLogitPass  # noqa
 from .decompose_masked_fill_pass import DecomposeMaskedFillPass  # noqa
+from .decompose_max_pool1d_pass import DecomposeMaxPool1dPass  # noqa
 from .decompose_maxpool2d_with_dilation_pass import DecomposeMaxPool2dPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -66,6 +66,7 @@
     DecomposeLog1pPass,
     DecomposeLogitPass,
     DecomposeMaskedFillPass,
+    DecomposeMaxPool1dPass,
     DecomposeMaxPool2dPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
@@ -437,6 +438,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                 DecomposeSqrtPass(tfa_pass=True),
                 DecomposeSiluPass(tfa_pass=True),
                 DecomposeAvgPool2dPass(tfa_pass=True),
+                DecomposeMaxPool1dPass(tfa_pass=True),
                 DecomposeSoftmaxUnstablePass(tfa_pass=True),
                 DecomposeSoftmaxPass(tfa_pass=True),
                 ConvertMinMaxPass(tfa_pass=True),
diff --git a/backends/arm/_passes/decompose_max_pool1d_pass.py b/backends/arm/_passes/decompose_max_pool1d_pass.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from typing import Set, Type
+
+import torch
+from executorch.backends.arm._passes.arm_pass import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+# Support both aten and edge dialects
+edge_max_pool1d_ops = (exir_ops.edge.aten.max_pool1d.default,)
+aten_max_pool1d_ops = (torch.ops.aten.max_pool1d.default,)
+
+
+def get_ops_for_dialect(op) -> tuple:
+    """Return the ``(view_copy, max_pool2d)`` ops for the dialect of ``op``.
+
+    Raises:
+        RuntimeError: If ``op`` is neither the edge nor the aten max_pool1d op.
+    """
+    if op in edge_max_pool1d_ops:
+        return (
+            exir_ops.edge.aten.view_copy.default,
+            exir_ops.edge.aten.max_pool2d.default,
+        )
+    if op in aten_max_pool1d_ops:
+        return (
+            torch.ops.aten.view_copy.default,
+            torch.ops.aten.max_pool2d.default,
+        )
+    raise RuntimeError(f"Can't get decomposition ops for {op}")
+
+
+def _as_int_list(value) -> list:
+    """Normalize an int or a sequence of ints to a plain list of ints."""
+    if isinstance(value, int):
+        return [value]
+    return list(value)
+
+
+class DecomposeMaxPool1dPass(ArmPass):
+    """
+    Decompose max_pool1d into view_copy -> max_pool2d -> view_copy.
+
+    This is needed to avoid issues with quantization metadata not propagating
+    correctly when max_pool1d decomposes naturally after quantization.
+
+    The transformation is:
+    1. view_copy the input from (..., L) to (..., 1, L), inserting a unit
+       height dimension before the last dimension
+    2. Call max_pool2d with kernel_size, stride, padding and dilation extended
+       with a neutral entry for the height dimension
+    3. view_copy the output from (..., 1, L_out) back to (..., L_out)
+    """
+
+    _passes_required_after: Set[Type[ExportPass]] = set()
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in (edge_max_pool1d_ops + aten_max_pool1d_ops):
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Get the appropriate ops for this dialect
+        view_copy_op, max_pool2d_op = get_ops_for_dialect(op)
+
+        x = args[0]
+        kernel_size = _as_int_list(args[1])
+        stride = _as_int_list(args[2]) if len(args) > 2 else []
+        padding = _as_int_list(args[3]) if len(args) > 3 else [0]
+        dilation = _as_int_list(args[4]) if len(args) > 4 else [1]
+        ceil_mode = args[5] if len(args) > 5 else False
+
+        # The aten schema default for stride is an empty list, meaning "use
+        # kernel_size". It may appear explicitly when a later argument such as
+        # padding is given, so an empty stride must fall back to kernel_size.
+        if not stride:
+            stride = kernel_size
+
+        # Create metadata for intermediate operations (without qparams)
+        intermediate_meta = meta.copy()
+        intermediate_meta.data["input_qparams"] = {}
+        intermediate_meta.data["output_qparams"] = {}
+
+        # Step 1: Insert a unit height dimension before the last dimension.
+        # (N, C, L) -> (N, C, 1, L); an unbatched (C, L) input becomes (C, 1, L)
+        x_shape = list(x.data.shape)
+        x_unsqueezed_shape = x_shape[:-1] + [1] + x_shape[-1:]
+        x_unsqueezed = super().call_operator(
+            view_copy_op,
+            (x, x_unsqueezed_shape),
+            {},
+            intermediate_meta,
+            updated=True,
+        )
+
+        # Step 2: Call max_pool2d with 2D parameters
+        # kernel_size: [k] -> [1, k]
+        # stride: [s] -> [1, s]
+        # padding: [p] -> [0, p]
+        # dilation: [d] -> [1, d]
+        kernel_2d = [1] + kernel_size
+        stride_2d = [1] + stride
+        padding_2d = [0] + padding
+        dilation_2d = [1] + dilation
+
+        pooled = super().call_operator(
+            max_pool2d_op,
+            (x_unsqueezed, kernel_2d, stride_2d, padding_2d, dilation_2d, ceil_mode),
+            {},
+            meta,
+            updated=True,
+        )
+
+        # Step 3: Remove the unit height dimension again.
+        # (N, C, 1, L_out) -> (N, C, L_out)
+        pooled_shape = list(pooled.data.shape)
+        output_shape = pooled_shape[:-2] + pooled_shape[-1:]
+        output = super().call_operator(
+            view_copy_op,
+            (pooled, output_shape),
+            {},
+            intermediate_meta,
+            updated=True,
+        )
+
+        return output
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
@@ -406,6 +406,7 @@ def _match_pattern(
     torch.ops.aten.squeeze.default,
     torch.ops.aten.squeeze_copy.default,
     torch.ops.aten.squeeze_copy.dim,
+    torch.ops.aten.squeeze_copy.dims,
     torch.ops.aten.squeeze.dim,
     torch.ops.aten.squeeze.dims,
     torch.ops.aten.unbind.int,
diff --git a/backends/arm/test/ops/test_max_pool1d.py b/backends/arm/test/ops/test_max_pool1d.py
@@ -8,14 +8,14 @@
 """
 Tests for max_pool1d operation.
 
-max_pool1d is decomposed by PyTorch into:
-    unsqueeze -> max_pool2d_with_indices -> getitem -> squeeze
+max_pool1d is decomposed by DecomposeMaxPool1dPass into:
+    view_copy -> max_pool2d -> view_copy
 
-This test verifies that the decomposed pattern is correctly quantized and
+This is done before quantization to ensure proper qparams propagation.
+The test verifies that the decomposed pattern is correctly quantized and
 delegated to the Arm backend (U55/U85).
 """
 
-import pytest
 from typing import Tuple
 
 import torch
@@ -50,6 +50,7 @@ def forward(self, x):
         return self.max_pool_1d(x)
 
 
+# Test data for TOSA pipelines (no stride constraints)
 test_data_suite = {
     # (test_name, test_data, [kernel_size, stride, padding])
     "simple": lambda: (torch.rand(1, 16, 50), [4, 2, 0]),
@@ -59,65 +60,82 @@ def forward(self, x):
     "multi_batch": lambda: (torch.rand(4, 16, 50), [4, 2, 0]),
 }
 
-# After PyTorch decomposition, max_pool1d becomes max_pool2d_with_indices
-# After to_edge, becomes max_pool2d in edge dialect
-aten_op = "torch.ops.aten.max_pool1d.default"
+# Test data for U55/U85 pipelines (stride must be <= 3)
+test_data_suite_u55 = {
+    # (test_name, test_data, [kernel_size, stride, padding])
+    "simple": lambda: (torch.rand(1, 16, 50), [4, 2, 0]),
+    "with_padding": lambda: (torch.rand(1, 16, 50), [3, 2, 1]),
+    "stride_1": lambda: (torch.rand(1, 8, 32), [3, 1, 0]),
+    "stride_3": lambda: (torch.rand(1, 4, 64), [8, 3, 0]),
+}
+
+# max_pool1d is decomposed before quantization by DecomposeMaxPool1dPass
+# After the pass, max_pool1d becomes view_copy -> max_pool2d -> view_copy
+# So for the INT (quantized) tests we should not expect max_pool1d
+aten_op_INT = "torch.ops.aten.view_copy.default"
+# For FP (non-quantized) tests, max_pool1d remains
+aten_op_FP = "torch.ops.aten.max_pool1d.default"
+# After decomposition and passes, becomes max_pool2d in edge dialect
 exir_op = "executorch_exir_dialects_edge__ops_aten_max_pool2d_default"
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(reason="MaxPool1D not yet supported", strict=False)
 def test_max_pool1d_tosa_FP(test_data: torch.Tensor):
     """Test max_pool1d with TOSA FP pipeline."""
     test_data, model_params = test_data()
     pipeline = TosaPipelineFP[input_t1](
         MaxPool1d(*model_params),
         (test_data,),
-        aten_op,
+        aten_op_FP,
         exir_op,
     )
     pipeline.run()
 
 
 @common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(reason="MaxPool1D not yet supported", strict=False)
 def test_max_pool1d_tosa_INT(test_data: torch.Tensor):
     """Test max_pool1d with TOSA INT pipeline (quantized)."""
     test_data, model_params = test_data()
     pipeline = TosaPipelineINT[input_t1](
         MaxPool1d(*model_params),
         (test_data,),
-        aten_op,
+        aten_op_INT,
         exir_op,
     )
     pipeline.run()
 
 
-@common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(reason="MaxPool1D not yet supported", strict=False)
+@common.parametrize("test_data", test_data_suite_u55)
 @common.XfailIfNoCorstone300
 def test_max_pool1d_u55_INT(test_data: torch.Tensor):
-    """Test max_pool1d on Ethos-U55 (quantized)."""
+    """Test max_pool1d on Ethos-U55 (quantized).
+
+    Note: U55 has stride constraint <= 3, so we use test_data_suite_u55
+    which excludes larger_kernel (stride=4).
+    """
     test_data, model_params = test_data()
     pipeline = EthosU55PipelineINT[input_t1](
         MaxPool1d(*model_params),
         (test_data,),
-        aten_op,
+        aten_op_INT,
         exir_ops=[],
     )
     pipeline.run()
 
 
-@common.parametrize("test_data", test_data_suite)
-@pytest.mark.xfail(reason="MaxPool1D not yet supported", strict=False)
+@common.parametrize("test_data", test_data_suite_u55)
 @common.XfailIfNoCorstone320
 def test_max_pool1d_u85_INT(test_data: torch.Tensor):
-    """Test max_pool1d on Ethos-U85 (quantized)."""
+    """Test max_pool1d on Ethos-U85 (quantized).
+
+    Note: reuses test_data_suite_u55 (stride <= 3), which excludes
+    larger_kernel (stride=4); no U85-specific stride limit is asserted here.
+    """
     test_data, model_params = test_data()
     pipeline = EthosU85PipelineINT[input_t1](
         MaxPool1d(*model_params),
         (test_data,),
-        aten_op,
+        aten_op_INT,
         exir_ops=[],
     )
     pipeline.run()