pytorch
diff --git a/‎.ci/docker/ci_commit_pins/optimum-executorch.txt‎
Lines changed: 1 addition & 0 deletions b/‎.ci/docker/ci_commit_pins/optimum-executorch.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/android-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/android-perf.yml‎
Lines changed: 4 additions & 12 deletions b/‎.github/workflows/android-perf.yml‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎.github/workflows/apple-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/apple-perf-private-device-experiment.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/apple-perf.yml‎
Lines changed: 4 additions & 12 deletions b/‎.github/workflows/apple-perf.yml‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 32 additions & 11 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 32 additions & 11 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 13 additions & 2 deletions b/‎CMakeLists.txt‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_atan_pass.py‎
Lines changed: 119 additions & 0 deletions b/‎backends/arm/_passes/decompose_atan_pass.py‎
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1 @@
+a3942627f5ac048e06b4b1d703b0a6a53bf6da5b
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
@@ -72,7 +72,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
           CRON_DEFAULT_DEVICES: samsung_galaxy_s22
         run: |
           set -eux
@@ -341,10 +341,11 @@ jobs:
               echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
 
               # Install optimum-executorch
+              OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
               git clone https://github.com/huggingface/optimum-executorch
               pushd optimum-executorch
               # There is no release yet, for CI stability, always test from the same commit on main
-              git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+              git checkout $OPTIMUM_ET_COMMIT
               python install_dev.py --skip_override_torch
               pip list
 
@@ -353,21 +354,12 @@ jobs:
                 "--task" "text-generation"
                 "--recipe" "xnnpack"
                 "--use_custom_sdpa"
+                "--use_custom_kv_cache"
                 "--qlinear"
                 "--qembedding"
                 "--output_dir" ".."
               )
 
-              # Add conditional arguments based on model
-              case "${HF_MODEL_REPO}" in
-                *"google/gemma-3-1b-it"*)
-                  echo "--use_custom_kv_cache can not be used for HybridCache"
-                  ;;
-                *)
-                  ARGS+=("--use_custom_kv_cache")
-                  ;;
-              esac
-
               optimum-cli export executorch "${ARGS[@]}"
               popd
 
 
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
@@ -72,7 +72,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
           CRON_DEFAULT_DEVICES: apple_iphone_15
         run: |
           set -eux
@@ -346,10 +346,11 @@ jobs:
             echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
 
             # Install optimum-executorch
+            OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
             git clone https://github.com/huggingface/optimum-executorch
             pushd optimum-executorch
             # There is no release yet, for CI stability, always test from the same commit on main
-            git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+            git checkout $OPTIMUM_ET_COMMIT
             ${CONDA_RUN} python install_dev.py --skip_override_torch
             pip list
 
@@ -358,21 +359,12 @@ jobs:
               "--task" "text-generation"
               "--recipe" "xnnpack"
               "--use_custom_sdpa"
+              "--use_custom_kv_cache"
               "--qlinear"
               "--qembedding"
               "--output_dir" ".."
             )
 
-            # Add conditional arguments based on model
-            case "${HF_MODEL_REPO}" in
-              *"google/gemma-3-1b-it"*)
-                echo "--use_custom_kv_cache can not be used for HybridCache"
-                ;;
-              *)
-                ARGS+=("--use_custom_kv_cache")
-                ;;
-            esac
-
             ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}"
             popd
 
 
@@ -594,10 +594,11 @@ jobs:
         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+        git checkout $OPTIMUM_ET_COMMIT
         python install_dev.py --skip_override_torch
         popd
         pip list
@@ -614,21 +615,12 @@ jobs:
           "--task" "text-generation"
           "--recipe" "xnnpack"
           "--use_custom_sdpa"
+          "--use_custom_kv_cache"
           "--qlinear"
           "--qembedding"
           "--output_dir" "${OUTPUT_DIR}"
         )
 
-        # Add conditional arguments based on model
-        case "${MODEL_ID}" in
-          *"google/gemma-3-1b-it"*)
-            echo "--use_custom_kv_cache can not be used for HybridCache"
-            ;;
-          *)
-            ARGS+=("--use_custom_kv_cache")
-            ;;
-        esac
-
         optimum-cli export executorch "${ARGS[@]}"
 
         ls -FlAGhp ${OUTPUT_DIR}
@@ -718,3 +710,32 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
+
+  # Builds ExecuTorch with -DEXECUTORCH_BUILD_NXP_NEUTRON=ON and then runs
+  # backends/nxp/run_unittests.sh, via the reusable linux_job_v2 workflow.
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      # On pull_request events check out the PR head commit; otherwise use github.sha.
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      # NOTE(review): unit is presumably minutes - confirm against linux_job_v2 inputs.
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Install test requirements
+        pip install -r backends/nxp/requirements-tests.txt
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
@@ -86,6 +86,17 @@ announce_configured_options(CMAKE_TOOLCHAIN_FILE)
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
 
+# Enable ccache if available. A compiler launcher that is already configured
+# (for example -DCMAKE_CXX_COMPILER_LAUNCHER=sccache on the command line) is
+# left untouched: an unconditional set() here would shadow it.
+find_program(CCACHE_PROGRAM ccache)
+if(CCACHE_PROGRAM)
+  if(NOT CMAKE_CXX_COMPILER_LAUNCHER)
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  endif()
+  if(NOT CMAKE_C_COMPILER_LAUNCHER)
+    set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
+  endif()
+  message(STATUS "ccache found and enabled for faster builds")
+else()
+  message(STATUS "ccache not found, builds will not be cached")
+endif()
+announce_configured_options(CCACHE_PROGRAM)
+
 # Print all the configs that were called with announce_configured_options.
 print_configured_options()
 
@@ -606,9 +617,9 @@ if(EXECUTORCH_BUILD_PYBIND)
   endif()
 
   if(EXECUTORCH_BUILD_XNNPACK)
-    # need to explicitly specify XNNPACK and microkernels-prod here otherwise
+    # need to explicitly specify XNNPACK and xnnpack-microkernels-prod here otherwise
     # uses XNNPACK and microkernel-prod symbols from libtorch_cpu
-    list(APPEND _dep_libs xnnpack_backend XNNPACK microkernels-prod)
+    list(APPEND _dep_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
   endif()
 
   # compile options for pybind
 
@@ -22,6 +22,7 @@
 from .convert_split_to_slice import ConvertSplitToSlicePass  # noqa
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
+from .decompose_atan_pass import DecomposeAtanPass  # noqa
 from .decompose_avg_pool2d import DecomposeAvgPool2d  # noqa
 from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass  # noqa
 from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 
@@ -25,6 +25,7 @@
     ConvertSplitToSlicePass,
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
+    DecomposeAtanPass,
     DecomposeAvgPool2d,
     DecomposeBatchNormNoStatsPass,
     DecomposeCosineSimilarityPass,
@@ -151,6 +152,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
     def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(DecomposeRoundPass())
         self.add_pass(DecomposeSqrtPass())
+        self.add_pass(DecomposeAtanPass())
         self.add_pass(ConvertIntPowToMuls())
         self.add_pass(CastBoolToInt8Pass())
         self.add_pass(DecomposeSinhPass())
 
@@ -0,0 +1,119 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from math import pi
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+edge_atan = exir_ops.edge.aten.atan.default  # MI case
+
+
+def _get_atan_ops(op):
+    """Look up the edge ops that the atan decomposition is assembled from.
+
+    Raises RuntimeError when ``op`` is not the edge atan op. The result is a
+    positional tuple ordered as: mul, mul (scalar), add, add (scalar), sub,
+    abs, gt (scalar), reciprocal, where, neg.
+    """
+    if op is edge_atan:
+        aten = exir_ops.edge.aten
+        return (
+            aten.mul.Tensor,
+            aten.mul.Scalar,
+            aten.add.Tensor,
+            aten.add.Scalar,
+            aten.sub.Tensor,
+            aten.abs.default,
+            aten.gt.Scalar,
+            aten.reciprocal.default,
+            aten.where.self,
+            aten.neg.default,
+        )
+
+    raise RuntimeError(f"Can't decompose atan for op {op}")
+
+
+class DecomposeAtanPass(ArmPass):
+    """Decomposes the atan operator into a rational (Padé) approximation.
+
+    The input is first reduced to z in [0, 1] using
+    atan(x) = pi/2 - atan(1/x) for |x| > 1 and atan(-x) = -atan(x). atan(z) is
+    then approximated by a rational function and the reduction is undone with
+    ``where`` selects.
+    """
+
+    def _rational_approximation(self, z, ops, meta):
+        """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].
+
+        Args:
+            z: the range-reduced input node.
+            ops: the op tuple returned by ``_get_atan_ops``.
+            meta: node metadata forwarded to every emitted operator.
+
+        Returns:
+            The node computing z * (1 + a1*z^2 + a2*z^4) / (1 + b1*z^2).
+        """
+
+        op_mul, op_mul_scalar, op_add, op_add_scalar, _, _, _, op_recip, _, _ = ops
+
+        # Coefficients calculated using minimax on the interval [-1, 1].
+        a1 = 0.3529666667
+        a2 = -0.0287666667
+        b1 = 0.6863
+
+        z2 = super().call_operator(op_mul, (z, z), {}, meta, updated=True)
+        z4 = super().call_operator(op_mul, (z2, z2), {}, meta, updated=True)
+
+        # Numerator: 1 + a1*z^2 + a2*z^4
+        num1 = super().call_operator(op_mul_scalar, (z2, a1), {}, meta, updated=True)
+        num2 = super().call_operator(op_mul_scalar, (z4, a2), {}, meta, updated=True)
+        num = super().call_operator(op_add_scalar, (num1, 1.0), {}, meta, updated=True)
+        num = super().call_operator(op_add, (num, num2), {}, meta, updated=True)
+
+        # Denominator: 1 + b1*z^2
+        den1 = super().call_operator(op_mul_scalar, (z2, b1), {}, meta, updated=True)
+        den = super().call_operator(op_add_scalar, (den1, 1.0), {}, meta, updated=True)
+
+        # The division is expressed as a multiplication by the reciprocal.
+        inv_den = super().call_operator(op_recip, (den,), {}, meta, updated=True)
+
+        prod = super().call_operator(op_mul, (num, inv_den), {}, meta, updated=True)
+        return super().call_operator(op_mul, (z, prod), {}, meta, updated=True)
+
+    def call_operator(self, op, args, kwargs, meta):
+        """Replace an edge atan call with its decomposition.
+
+        Any other op is forwarded unchanged to the parent implementation.
+        """
+        if op is not edge_atan:
+            return super().call_operator(op, args, kwargs, meta, updated=False)
+
+        logging.info(
+            f"Approximating atan. This may introduce small numerical errors. For details, see {__file__}."
+        )
+
+        ops = _get_atan_ops(op)
+        (
+            _,
+            _,
+            _,
+            op_add_scalar,
+            _,
+            op_abs,
+            op_gt,
+            op_recip,
+            op_where,
+            op_neg,
+        ) = ops
+
+        x = args[0]
+
+        # |x| > 1 is reduced to [0, 1] using atan(x) = pi/2 - atan(1/x) and atan(-x) = -atan(x).
+
+        abs_x = super().call_operator(op_abs, (x,), {}, meta, updated=True)
+        mask_hi = super().call_operator(op_gt, (abs_x, 1.0), {}, meta, updated=True)
+
+        inv_x = super().call_operator(op_recip, (abs_x,), {}, meta, updated=True)
+        z = super().call_operator(
+            op_where, (mask_hi, inv_x, abs_x), {}, meta, updated=True
+        )
+
+        atan_z = self._rational_approximation(z, ops, meta)
+
+        # pi/2 - atan(z), built as (-atan(z)) + pi/2. Materialising the constant
+        # as x * 0.0 + pi/2 would produce NaN for x = +/-inf (inf * 0 is NaN)
+        # and turn atan(+/-inf) into NaN instead of +/-pi/2.
+        neg_atan_z = super().call_operator(op_neg, (atan_z,), {}, meta, updated=True)
+        diff = super().call_operator(
+            op_add_scalar, (neg_atan_z, pi / 2), {}, meta, updated=True
+        )
+        atan_abs = super().call_operator(
+            op_where, (mask_hi, diff, atan_z), {}, meta, updated=True
+        )
+
+        # Restore the sign: atan(-x) = -atan(x).
+        mask_pos = super().call_operator(op_gt, (x, 0.0), {}, meta, updated=True)
+        neg_val = super().call_operator(op_neg, (atan_abs,), {}, meta, updated=True)
+
+        return super().call_operator(
+            op_where, (mask_pos, atan_abs, neg_val), {}, meta, updated=True
+        )
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+a3942627f5ac048e06b4b1d703b0a6a53bf6da5b`