pytorch
diff --git a/.github/workflows/_test_cortex_m_ops.yml b/.github/workflows/_test_cortex_m_ops.yml
Lines changed: 56 additions & 0 deletions
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 9 additions & 0 deletions
diff --git a/.lintrunner.toml b/.lintrunner.toml
Lines changed: 6 additions & 1 deletion
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 92 additions & 26 deletions
@@ -0,0 +1,56 @@
+name: Test Cortex-M ops
+
+permissions:
+  id-token: write
+  contents: read
+
+on:
+  workflow_call:
+    inputs:
+      targets:
+        description: 'JSON array of cortex-m target CPUs to run the op tests against, e.g. ["cortex-m7", "cortex-m0plus"]'
+        required: true
+        type: string
+      timeout:
+        description: 'Per-matrix-entry timeout in minutes'
+        required: false
+        type: number
+        default: 120
+
+jobs:
+  run:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      matrix:
+        target: ${{ fromJSON(inputs.targets) }}
+      fail-fast: false
+    with:
+      job-name: cortex-m-ops-${{ matrix.target }}
+      runner: linux.2xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: ${{ inputs.timeout }}
+      script: |
+        # The generic Linux job uses the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        source .ci/scripts/utils.sh
+        install_executorch "--use-pt-pinned-commit"
+
+        # Install arm dependencies
+        .ci/scripts/setup-arm-baremetal-tools.sh
+        source examples/arm/arm-scratch/setup_path.sh
+
+        # Build the runner for this target. It is written to a target-suffixed
+        # dir, which the op tests locate via --cortex-m-target below.
+        backends/cortex_m/test/build_test_runner.sh --target=${{ matrix.target }}
+
+        # Run the op suite against this target: dialect tests check the lowered
+        # op set, implementation tests check FVP numerics. Both are parametrized
+        # over --cortex-m-target, so a future target-dependent lowering change is
+        # caught here. (cortex-m55 runs on pull via the full-suite job.)
+        pytest --config-file=backends/arm/test/pytest.ini \
+          backends/cortex_m/test/ops \
+          --cortex-m-target=${{ matrix.target }}
@@ -1076,3 +1076,12 @@ jobs:
     with:
       models: '["mv2", "mv3"]'
       targets: '["cortex-m55", "cortex-m7", "cortex-m0plus"]'
+
+  test-cortex-m-ops:
+    name: test-cortex-m-ops
+    permissions:
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/_test_cortex_m_ops.yml
+    with:
+      targets: '["cortex-m7", "cortex-m0plus"]'
@@ -195,7 +195,6 @@ exclude_patterns = [
     # Kernel areas to onboard separately.
     'kernels/optimized/**',
     'kernels/portable/**',
-    'kernels/quantized/**',
     'kernels/test/**',
 
     # Runtime areas to onboard incrementally.
@@ -229,6 +228,12 @@ command = [
     '--extra-arg=--suppress=unknownMacro:*kernels/prim_ops/*',
     '--extra-arg=--suppress=syntaxError:*kernels/prim_ops/*',
     '--extra-arg=--suppress=unusedFunction:*kernels/prim_ops/*',
+    # Quantized kernels have NEON-gated code and registration helpers that
+    # cppcheck cannot see in every configuration.
+    '--extra-arg=--suppress=unreadVariable:*kernels/quantized/*',
+    '--extra-arg=--suppress=unusedFunction:*kernels/quantized/*',
+    '--extra-arg=--suppress=constParameterReference:*kernels/quantized/*',
+    '--extra-arg=--suppress=suspiciousFloatingPointCast:*kernels/quantized/*',
     '--',
     '@{{PATHSFILE}}'
 ]
 
@@ -43,6 +43,9 @@
 from .decompose_cumsum_pass import DecomposeCumsumPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_div_tensor_mode import DecomposeDivTensorModePass  # noqa
+from .decompose_dynamic_adaptive_avg_pool2d_pass import (  # noqa
+    DecomposeDynamicAdaptiveAvgPool2dPass,
+)
 from .decompose_dynamic_full_pass import DecomposeDynamicFullPass  # noqa
 from .decompose_einsum_pass import DecomposeEinsumPass  # noqa
 from .decompose_elu_pass import ConvertEluFamilyToEluPass, DecomposeEluPass  # noqa
 
@@ -7,8 +7,9 @@
 
 import logging
 from collections import defaultdict
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from dataclasses import dataclass, field
+from typing import Any, cast
 
 from executorch.backends.arm._passes import (
     AccumulateIndexPutPass,
@@ -49,6 +50,7 @@
     DecomposeCumsumPass,
     DecomposeDivPass,
     DecomposeDivTensorModePass,
+    DecomposeDynamicAdaptiveAvgPool2dPass,
     DecomposeDynamicFullPass,
     DecomposeEinsumPass,
     DecomposeEluPass,
@@ -166,12 +168,17 @@
 )
 
 from executorch.exir import ExportedProgram
-from executorch.exir.pass_base import ExportPass
-from executorch.exir.pass_manager import PassManager
+from executorch.exir._program_utils import _get_updated_graph_signature
+from executorch.exir.pass_base import (
+    ExportedProgramPassBase,
+    ExportedProgramPassResult,
+    ExportPass,
+)
+from executorch.exir.pass_manager import ExportedProgramPassManager
 from torch._export.utils import _get_shape_env_from_gm
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
-from torch.nn.modules import Module
+from torch.fx.passes.infra.pass_manager import PassManager as GraphModulePassManager
 
 logger = logging.getLogger(__name__)
 
@@ -187,6 +194,50 @@ class PassInsertions:
 _registered_pass_insertions: dict[type, PassInsertions] = {}
 
 
+def _graph_pass_name(graph_pass: Callable[[GraphModule], PassResult | None]) -> str:
+    if isinstance(graph_pass, ExportPass):
+        return ArmPass.get_name(graph_pass)
+    if hasattr(graph_pass, "__name__"):
+        return graph_pass.__name__
+    return type(graph_pass).__name__
+
+
+class _ExportedProgramGraphPassAdapter(ExportedProgramPassBase):
+    def __init__(self, graph_pass: Callable[[GraphModule], PassResult | None]) -> None:
+        self.graph_pass = graph_pass
+
+    def call(self, exported_program: ExportedProgram) -> ExportedProgramPassResult:
+        graph_pass = cast(Any, self.graph_pass)
+        pass_exported_program = getattr(graph_pass, "exported_program", None)
+        if pass_exported_program is not None:
+            # ExportedProgramPassManager works on a shallow copy; Arm graph
+            # passes that store an ExportedProgram must update that copy.
+            graph_pass.exported_program = exported_program
+
+        try:
+            result = self.graph_pass(exported_program.graph_module)
+        finally:
+            if pass_exported_program is not None:
+                graph_pass.exported_program = pass_exported_program
+
+        if result is None:
+            raise TypeError(
+                f"The result of pass {_graph_pass_name(self.graph_pass)} should be type PassResult."
+            )
+
+        if result.modified:
+            result.graph_module.recompile()
+            exported_program._graph_module = result.graph_module
+            exported_program._graph_signature = _get_updated_graph_signature(
+                exported_program.graph_signature,
+                result.graph_module,
+            )
+            # Range constraints are left as-is here: Arm graph passes do not
+            # change symbolic shape constraints, and metadata-only fake modes
+            # may differ after propagation.
+
+        return ExportedProgramPassResult(exported_program, result.modified)
+
+
 def register_pass_insertions_before(
     target_pass_type: type, passes: list[ExportPass]
 ) -> None:
@@ -210,7 +261,7 @@ def clear_registered_pass_insertions() -> None:
     _registered_pass_insertions.clear()
 
 
-class ArmPassManager(PassManager):
+class ArmPassManager(ExportedProgramPassManager):
     def __init__(self, compile_spec: ArmCompileSpec) -> None:
         self.compile_spec = compile_spec
         self.tosa_spec = compile_spec.tosa_spec
@@ -373,8 +424,39 @@ def _tosa_context(self, graph_module: GraphModule) -> TosaLoweringContext:
         shape_env = _get_shape_env_from_gm(graph_module)
         return TosaLoweringContext(self.tosa_spec, shape_env)
 
-    def _transform(self, graph_module: GraphModule):
-        return self(graph_module).graph_module
+    def _transform_graph_module(self, graph_module: GraphModule):
+        # The transform-for-annotation (TFA) and control-flow submodule paths
+        # operate on bare GraphModules, so there is no standalone
+        # ExportedProgram to keep in sync.
+        return GraphModulePassManager(self.passes)(graph_module).graph_module
+
+    def __call__(  # type: ignore[override]
+        self,
+        module: ExportedProgram | GraphModule,
+        override_verifiers: Any | None = None,
+    ) -> ExportedProgramPassResult | PassResult:
+        if isinstance(module, GraphModule):
+            if override_verifiers is not None:
+                raise ValueError("override_verifiers is only valid for ExportedProgram")
+            return GraphModulePassManager(self.passes)(module)
+        return super().__call__(module, override_verifiers)
+
+    def _transform(
+        self,
+        exported_program: ExportedProgram,
+        graph_module: GraphModule,
+    ) -> GraphModule:
+        if graph_module is exported_program.graph_module:
+            passes: list[
+                ExportedProgramPassBase | Callable[[GraphModule], PassResult | None]
+            ] = [_ExportedProgramGraphPassAdapter(p) for p in self.passes]
+            transformed_program = ExportedProgramPassManager(passes)(
+                exported_program
+            ).exported_program
+            exported_program._graph_module = transformed_program.graph_module
+            exported_program._graph_signature = transformed_program.graph_signature
+            exported_program._range_constraints = transformed_program.range_constraints
+            return exported_program.graph_module
+        return self._transform_graph_module(graph_module)
 
     def add_pass(self, pipeline_pass):
         if type(pipeline_pass) in self._skip_pass_types:
@@ -463,6 +545,7 @@ def _tosa_pipeline(
                 AccumulateIndexPutPass(),
                 DecomposeIndexTensorToGatherPass(),
                 DecomposeAdaptiveAvgPool2dPass(),
+                DecomposeDynamicAdaptiveAvgPool2dPass(),
                 DecomposeAvgPool2dPass(),
                 Conv1dUnsqueezePass(),
             ]
@@ -556,7 +639,7 @@ def _tosa_pipeline(
         self._apply_pass_insertions()
 
         self.validate_constraints_mandatory()
-        return self._transform(graph_module)
+        return self._transform(exported_program, graph_module)
 
     def transform_to_backend_pipeline(
         self, exported_program: ExportedProgram, graph_module: GraphModule
@@ -661,21 +744,4 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
                 ]
             )
 
-            return self._transform(graph_module)
-
-    def __call__(self, module: Module) -> PassResult:
-        try:
-            return super().__call__(module)
-        except Exception as e:
-            first_exception = e.__cause__ or e.__context__ or e
-            import re
-
-            message = e.args[0]
-            m = re.search(r"An error occurred when running the '([^']+)' pass", message)
-            if m:
-                pass_name = m.group(1)
-                first_exception.args = (
-                    f"{pass_name}: {first_exception.args[0]}",
-                    *first_exception.args[1:],
-                )
-            raise first_exception
+            return self._transform_graph_module(graph_module)