Skip to content

Commit 3a81579

Browse files
author
pytorchbot
committed
2025-06-27 nightly release (8dde918)
1 parent c8820bb commit 3a81579

102 files changed

Lines changed: 1430 additions & 915 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
a3942627f5ac048e06b4b1d703b0a6a53bf6da5b

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@ jobs:
5757
id-token: write
5858
contents: read
5959
with:
60-
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
60+
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
6161
devices: samsung_galaxy_s22_private
6262
benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
# Separate default values from the workflow dispatch. To ensure defaults are accessible
7373
# during scheduled runs and to provide flexibility for different defaults between
7474
# on-demand and periodic benchmarking.
75-
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
75+
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
7676
CRON_DEFAULT_DEVICES: samsung_galaxy_s22
7777
run: |
7878
set -eux
@@ -341,10 +341,11 @@ jobs:
341341
echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
342342
343343
# Install optimum-executorch
344+
OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
344345
git clone https://github.com/huggingface/optimum-executorch
345346
pushd optimum-executorch
346347
# There is no release yet, for CI stability, always test from the same commit on main
347-
git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
348+
git checkout $OPTIMUM_ET_COMMIT
348349
python install_dev.py --skip_override_torch
349350
pip list
350351
@@ -353,21 +354,12 @@ jobs:
353354
"--task" "text-generation"
354355
"--recipe" "xnnpack"
355356
"--use_custom_sdpa"
357+
"--use_custom_kv_cache"
356358
"--qlinear"
357359
"--qembedding"
358360
"--output_dir" ".."
359361
)
360362
361-
# Add conditional arguments based on model
362-
case "${HF_MODEL_REPO}" in
363-
*"google/gemma-3-1b-it"*)
364-
echo "--use_custom_kv_cache can not be used for HybridCache"
365-
;;
366-
*)
367-
ARGS+=("--use_custom_kv_cache")
368-
;;
369-
esac
370-
371363
optimum-cli export executorch "${ARGS[@]}"
372364
popd
373365

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,6 @@ jobs:
5757
id-token: write
5858
contents: read
5959
with:
60-
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
60+
models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
6161
devices: apple_iphone_15_private
6262
benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ jobs:
7272
# Separate default values from the workflow dispatch. To ensure defaults are accessible
7373
# during scheduled runs and to provide flexibility for different defaults between
7474
# on-demand and periodic benchmarking.
75-
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
75+
CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
7676
CRON_DEFAULT_DEVICES: apple_iphone_15
7777
run: |
7878
set -eux
@@ -346,10 +346,11 @@ jobs:
346346
echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
347347
348348
# Install optimum-executorch
349+
OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
349350
git clone https://github.com/huggingface/optimum-executorch
350351
pushd optimum-executorch
351352
# There is no release yet, for CI stability, always test from the same commit on main
352-
git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
353+
git checkout $OPTIMUM_ET_COMMIT
353354
${CONDA_RUN} python install_dev.py --skip_override_torch
354355
pip list
355356
@@ -358,21 +359,12 @@ jobs:
358359
"--task" "text-generation"
359360
"--recipe" "xnnpack"
360361
"--use_custom_sdpa"
362+
"--use_custom_kv_cache"
361363
"--qlinear"
362364
"--qembedding"
363365
"--output_dir" ".."
364366
)
365367
366-
# Add conditional arguments based on model
367-
case "${HF_MODEL_REPO}" in
368-
*"google/gemma-3-1b-it"*)
369-
echo "--use_custom_kv_cache can not be used for HybridCache"
370-
;;
371-
*)
372-
ARGS+=("--use_custom_kv_cache")
373-
;;
374-
esac
375-
376368
${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}"
377369
popd
378370

.github/workflows/trunk.yml

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -594,10 +594,11 @@ jobs:
594594
echo "::group::Set up Hugging Face"
595595
pip install -U "huggingface_hub[cli]"
596596
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
597+
OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
597598
git clone https://github.com/huggingface/optimum-executorch
598599
pushd optimum-executorch
599600
# There is no release yet, for CI stability, always test from the same commit on main
600-
git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
601+
git checkout $OPTIMUM_ET_COMMIT
601602
python install_dev.py --skip_override_torch
602603
popd
603604
pip list
@@ -614,21 +615,12 @@ jobs:
614615
"--task" "text-generation"
615616
"--recipe" "xnnpack"
616617
"--use_custom_sdpa"
618+
"--use_custom_kv_cache"
617619
"--qlinear"
618620
"--qembedding"
619621
"--output_dir" "${OUTPUT_DIR}"
620622
)
621623
622-
# Add conditional arguments based on model
623-
case "${MODEL_ID}" in
624-
*"google/gemma-3-1b-it"*)
625-
echo "--use_custom_kv_cache can not be used for HybridCache"
626-
;;
627-
*)
628-
ARGS+=("--use_custom_kv_cache")
629-
;;
630-
esac
631-
632624
optimum-cli export executorch "${ARGS[@]}"
633625
634626
ls -FlAGhp ${OUTPUT_DIR}
@@ -718,3 +710,32 @@ jobs:
718710
build-mode: Release
719711
build-tool: cmake
720712
docker-image: executorch-ubuntu-22.04-clang12
713+
714+
unittest-nxp-neutron:
715+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
716+
permissions:
717+
id-token: write
718+
contents: read
719+
with:
720+
runner: linux.2xlarge
721+
docker-image: executorch-ubuntu-22.04-clang12
722+
submodules: 'recursive'
723+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
724+
timeout: 90
725+
script: |
726+
set -eux
727+
728+
# The generic Linux job chooses to use base env, not the one setup by the image
729+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
730+
conda activate "${CONDA_ENV}"
731+
732+
# Build and install Executorch
733+
PYTHON_EXECUTABLE=python \
734+
CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
735+
.ci/scripts/setup-linux.sh --build-tool "cmake"
736+
737+
# Install test requirements
738+
pip install -r backends/nxp/requirements-tests.txt
739+
740+
# Run pytest
741+
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakeLists.txt

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ announce_configured_options(CMAKE_TOOLCHAIN_FILE)
8686
load_build_preset()
8787
include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
8888

89+
# Enable ccache if available
90+
find_program(CCACHE_PROGRAM ccache)
91+
if(CCACHE_PROGRAM)
92+
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
93+
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
94+
message(STATUS "ccache found and enabled for faster builds")
95+
else()
96+
message(STATUS "ccache not found, builds will not be cached")
97+
endif()
98+
announce_configured_options(CCACHE_PROGRAM)
99+
89100
# Print all the configs that were called with announce_configured_options.
90101
print_configured_options()
91102

@@ -606,9 +617,9 @@ if(EXECUTORCH_BUILD_PYBIND)
606617
endif()
607618

608619
if(EXECUTORCH_BUILD_XNNPACK)
609-
# need to explicitly specify XNNPACK and microkernels-prod here otherwise
620+
# need to explicitly specify XNNPACK and xnnpack-microkernels-prod here otherwise
610621
# uses XNNPACK and microkernel-prod symbols from libtorch_cpu
611-
list(APPEND _dep_libs xnnpack_backend XNNPACK microkernels-prod)
622+
list(APPEND _dep_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
612623
endif()
613624

614625
# compile options for pybind

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from .convert_split_to_slice import ConvertSplitToSlicePass # noqa
2323
from .convert_squeezes_to_view import ConvertSqueezesToViewPass # noqa
2424
from .convert_to_clamp import ConvertToClampPass # noqa
25+
from .decompose_atan_pass import DecomposeAtanPass # noqa
2526
from .decompose_avg_pool2d import DecomposeAvgPool2d # noqa
2627
from .decompose_batch_norm_no_stats import DecomposeBatchNormNoStatsPass # noqa
2728
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
ConvertSplitToSlicePass,
2626
ConvertSqueezesToViewPass,
2727
ConvertToClampPass,
28+
DecomposeAtanPass,
2829
DecomposeAvgPool2d,
2930
DecomposeBatchNormNoStatsPass,
3031
DecomposeCosineSimilarityPass,
@@ -151,6 +152,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
151152
def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
152153
self.add_pass(DecomposeRoundPass())
153154
self.add_pass(DecomposeSqrtPass())
155+
self.add_pass(DecomposeAtanPass())
154156
self.add_pass(ConvertIntPowToMuls())
155157
self.add_pass(CastBoolToInt8Pass())
156158
self.add_pass(DecomposeSinhPass())
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import logging
7+
from math import pi
8+
9+
from executorch.backends.arm._passes import ArmPass
10+
from executorch.exir.dialects._ops import ops as exir_ops
11+
12+
13+
# Edge-dialect atan overload that DecomposeAtanPass matches (by identity) and rewrites.
edge_atan = exir_ops.edge.aten.atan.default # MI case
14+
15+
16+
def _get_atan_ops(op):
    """Return the edge ops used to decompose ``op``.

    The result is a 10-tuple in a fixed order that callers unpack
    positionally: (mul.Tensor, mul.Scalar, add.Tensor, add.Scalar,
    sub.Tensor, abs, gt.Scalar, reciprocal, where.self, neg).

    Raises:
        RuntimeError: if ``op`` is not the edge atan overload.
    """
    if op is edge_atan:
        aten = exir_ops.edge.aten
        primitives = (
            aten.mul.Tensor,
            aten.mul.Scalar,
            aten.add.Tensor,
            aten.add.Scalar,
            aten.sub.Tensor,
            aten.abs.default,
            aten.gt.Scalar,
            aten.reciprocal.default,
            aten.where.self,
            aten.neg.default,
        )
        return primitives

    raise RuntimeError(f"Can't decompose atan for op {op}")
33+
34+
35+
class DecomposeAtanPass(ArmPass):
    """Decomposes the atan operator into a rational (Padé) approximation.

    The input is range-reduced to [0, 1] with the identities
    ``atan(x) = pi/2 - atan(1/x)`` (for ``|x| > 1``) and
    ``atan(-x) = -atan(x)``; the reduced value is then fed to a rational
    approximation built from mul/add/reciprocal ops.
    """

    def _rational_approximation(self, z, ops, meta):
        """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].

        Emits nodes computing
        ``z * (1 + a1*z^2 + a2*z^4) / (1 + b1*z^2)``.

        Args:
            z: graph value holding the range-reduced input.
            ops: the 10-tuple returned by ``_get_atan_ops``.
            meta: node metadata forwarded to every emitted operator.

        Returns:
            The graph value produced by the final multiplication.
        """
        op_mul, op_mul_scalar, op_add, op_add_scalar, _, _, _, op_recip, _, _ = ops
        emit = super().call_operator

        # Coefficients calculated using minimax on the interval [-1, 1].
        a1 = 0.3529666667
        a2 = -0.0287666667
        b1 = 0.6863

        z2 = emit(op_mul, (z, z), {}, meta, updated=True)
        z4 = emit(op_mul, (z2, z2), {}, meta, updated=True)

        # Numerator: (a1*z^2 + 1) + a2*z^4
        num1 = emit(op_mul_scalar, (z2, a1), {}, meta, updated=True)
        num2 = emit(op_mul_scalar, (z4, a2), {}, meta, updated=True)
        num = emit(op_add_scalar, (num1, 1.0), {}, meta, updated=True)
        num = emit(op_add, (num, num2), {}, meta, updated=True)

        # Denominator: b1*z^2 + 1, applied as a reciprocal followed by a multiply.
        den1 = emit(op_mul_scalar, (z2, b1), {}, meta, updated=True)
        den = emit(op_add_scalar, (den1, 1.0), {}, meta, updated=True)
        inv_den = emit(op_recip, (den,), {}, meta, updated=True)

        prod = emit(op_mul, (num, inv_den), {}, meta, updated=True)
        return emit(op_mul, (z, prod), {}, meta, updated=True)

    def call_operator(self, op, args, kwargs, meta):
        """Replace an edge atan call with its decomposition.

        Any op other than the edge atan overload is forwarded unchanged to
        the parent pass. For atan, ``args[0]`` is taken as the input and a
        chain of primitive ops is emitted in its place.

        Returns:
            The graph value of the final ``where`` that selects the signed
            result.
        """
        if op is not edge_atan:
            return super().call_operator(op, args, kwargs, meta, updated=False)

        logging.info(
            f"Approximating atan. This may introduce small numerical errors. For details, see {__file__}."
        )

        ops = _get_atan_ops(op)
        (
            _,
            _,
            _,
            op_add_scalar,
            _,
            op_abs,
            op_gt,
            op_recip,
            op_where,
            op_neg,
        ) = ops
        emit = super().call_operator

        x = args[0]

        # |x| > 1 is reduced to [0, 1] using atan(x) = pi/2 - atan(1/x) and atan(-x) = -atan(x).

        abs_x = emit(op_abs, (x,), {}, meta, updated=True)
        mask_hi = emit(op_gt, (abs_x, 1.0), {}, meta, updated=True)

        inv_x = emit(op_recip, (abs_x,), {}, meta, updated=True)
        z = emit(op_where, (mask_hi, inv_x, abs_x), {}, meta, updated=True)

        atan_z = self._rational_approximation(z, ops, meta)

        # pi/2 - atan(z), written as (-atan(z)) + pi/2. Materialising the
        # constant as x * 0.0 + pi/2 would give NaN for x = +/-inf
        # (inf * 0 is NaN under IEEE 754) and poison the |x| > 1 branch,
        # where atan(+/-inf) should be +/-pi/2.
        neg_atan_z = emit(op_neg, (atan_z,), {}, meta, updated=True)
        diff = emit(op_add_scalar, (neg_atan_z, pi / 2), {}, meta, updated=True)
        atan_abs = emit(op_where, (mask_hi, diff, atan_z), {}, meta, updated=True)

        # Restore the sign: inputs that are not strictly positive take the
        # negated value.
        mask_pos = emit(op_gt, (x, 0.0), {}, meta, updated=True)
        neg_val = emit(op_neg, (atan_abs,), {}, meta, updated=True)

        return emit(op_where, (mask_pos, atan_abs, neg_val), {}, meta, updated=True)

0 commit comments

Comments
 (0)