Merge branch 'main' into feature/standalone-runner

usamahz · web-flow · commit 62583200d74e · 2026-05-20T16:53:38.000+01:00
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
@@ -97,10 +97,6 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
-if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
-  PYTORCH_BUILD_MAX_JOBS=6
-fi
-
 # Copy requirements-lintrunner.txt from root to here
 cp ../../requirements-lintrunner.txt ./
 
@@ -113,7 +109,6 @@ docker build \
   --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
   --build-arg "TORCH_VERSION=${TORCH_VERSION}" \
-  --build-arg "PYTORCH_BUILD_MAX_JOBS=${PYTORCH_BUILD_MAX_JOBS:-}" \
   --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \
   --build-arg "LINTRUNNER=${LINTRUNNER:-}" \
   --build-arg "BUILD_DOCS=${BUILD_DOCS}" \
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-release/2.12
+release/2.11
diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
@@ -76,9 +76,6 @@ init_sccache() {
   # This is the remote cache bucket
   export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
   export SCCACHE_S3_KEY_PREFIX=executorch
-  export SCCACHE_REGION=us-east-1
-  export AWS_REGION=us-east-1
-  export AWS_DEFAULT_REGION=us-east-1
   export SCCACHE_IDLE_TIMEOUT=0
   export SCCACHE_ERROR_LOG=/tmp/sccache_error.log
   export RUST_LOG=sccache::server=error
diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh
@@ -27,20 +27,14 @@ install_pytorch_and_domains() {
   chown -R ci-user .
 
   export _GLIBCXX_USE_CXX11_ABI=1
-  if [[ "$(uname -m)" == "aarch64" ]]; then
-    export BUILD_IGNORE_SVE_UNAVAILABLE=1
-  fi
-  if [[ -n "${PYTORCH_BUILD_MAX_JOBS:-}" ]]; then
-    export MAX_JOBS="${PYTORCH_BUILD_MAX_JOBS}"
-  fi
   # Then build and install PyTorch
   conda_run python setup.py bdist_wheel
   pip_install "$(echo dist/*.whl)"
 
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.27
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
@@ -62,12 +62,9 @@ RUN bash ./install_cache.sh && rm install_cache.sh utils.sh
 ENV SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
 ENV SCCACHE_S3_KEY_PREFIX executorch
 ENV SCCACHE_REGION us-east-1
-ENV AWS_REGION us-east-1
-ENV AWS_DEFAULT_REGION us-east-1
 
 ARG TORCH_VERSION
 ARG SKIP_PYTORCH
-ARG PYTORCH_BUILD_MAX_JOBS
 COPY ./common/install_pytorch.sh install_pytorch.sh
 COPY ./common/utils.sh utils.sh
 RUN if [ -z "${SKIP_PYTORCH}" ]; then bash ./install_pytorch.sh; fi && rm install_pytorch.sh utils.sh
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
@@ -107,7 +107,7 @@ install_pytorch_and_domains() {
   local torch_release=$(cat version.txt)
   # Download key must match the upload key below (basename of dist/*.whl,
   # which always carries setup.py's resolved +gitHASH). Branch-ref pins
-  # like `release/2.12` would otherwise produce `+gitrelease` here and
+  # like `release/2.11` would otherwise produce `+gitrelease` here and
   # never hit the cache.
   local torch_short_hash=$(git rev-parse --short=7 HEAD)
   local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
@@ -132,9 +132,6 @@ install_pytorch_and_domains() {
     # (e.g. executorch's requirements-ci.txt).
     pip install -r requirements-build.txt
     git submodule update --init --recursive
-    if [[ "$(uname -m)" == "aarch64" ]]; then
-      export BUILD_IGNORE_SVE_UNAVAILABLE=1
-    fi
     USE_DISTRIBUTED=1 python setup.py bdist_wheel
     pip install "$(echo dist/*.whl)"
 
@@ -178,7 +175,7 @@ install_pytorch_and_domains() {
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=release/2.11
   export TORCHAUDIO_VERSION
-  TORCHVISION_VERSION=release/0.27
+  TORCHVISION_VERSION=release/0.26
   export TORCHVISION_VERSION
 
   install_domains
diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -120,10 +120,10 @@ jobs:
           --prompt-len 4 \
           --max-new-tokens 5 2>&1)
         echo "$OUTPUT"
-        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 94, 253, 88, 227\]"; then
+        if echo "$OUTPUT" | grep -q "Generated token ids: \[167, 167, 81, 167, 81\]"; then
           echo "Success: Qwen 3.5 MoE MLX export + inference completed with expected output"
         else
-          echo "Failed: unexpected output (expected [167, 94, 253, 88, 227])"
+          echo "Failed: unexpected output (expected [167, 167, 81, 167, 81])"
           exit 1
         fi
         echo "::endgroup::"
diff --git a/backends/arm/README.md b/backends/arm/README.md
@@ -116,6 +116,33 @@ Developers who need local source builds can use:
 The current flow lowers to TOSA and converts to VGF for use in external projects,
 so the `executor_runner` is not typically used here.
 
+### Compiling models with the Python API
+
+Use the Python API as the primary way to compile your own models. It lets you
+keep model construction, export inputs, quantization, custom passes, and artifact
+generation in your application code. The `aot_arm_compiler.py` script is useful
+for simple examples and smoke tests, but production code should call the
+ExecuTorch and Arm backend APIs directly.
+
+The delegated Python API flow is:
+
+1. Prepare the model and representative example inputs.
+2. Create a target-specific Arm compile spec.
+3. Export the model with `torch.export.export`.
+4. Optionally quantize with the target-specific Arm quantizer and re-export the
+   quantized graph.
+5. Create the matching Arm partitioner from the compile spec.
+6. Lower with `to_edge_transform_and_lower`.
+7. Convert to an ExecuTorch program and save the PTE file.
+
+For complete examples of that flow, including quantization and target-specific
+compile specs, see:
+
+- `docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md`
+- `docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md`
+
+Additional examples are available in `examples/arm`.
+
 ### Direct Drive (experimental, Ethos-U85 on Linux) workflow
 
 Direct Drive enables execution on Ethos-U85 via the Linux driver stack.
@@ -159,7 +186,8 @@ scp -P 2222 arm_test/cmake-out/executor_runner root@127.0.0.1:/tmp/
 
 #### Direct Drive model (PTE) workflow
 
-Create a PTE file:
+For a quick test with the example `add` model, use
+`aot_arm_compiler.py`:
 
 ```
 python3 -m backends.arm.scripts.aot_arm_compiler \
@@ -170,16 +198,30 @@ python3 -m backends.arm.scripts.aot_arm_compiler \
   --direct_drive
 ```
 
+For production use, follow the Python API flow described in
+[Compiling models with the Python API](#compiling-models-with-the-python-api).
+Use an Ethos-U85 target and set the Direct Drive `extra_flags` when creating the `EthosUCompileSpec`:
+
+```python
+compile_spec = EthosUCompileSpec(
+    target="ethos-u85-256",
+    extra_flags=["--separate-io-regions", "--cop-format=COP2"],
+)
+```
+
+Then save the generated program as, for example, `model.pte`, or
+update the copy and run commands below to match your output file name.
+
 Copy the `executor_runner` binary and the generated PTE file to the running FVP:
 
 ```
-scp -P 2222 arm_test/cmake-out/executor_runner add_arm_delegate_ethos-u85-256.pte root@127.0.0.1:/tmp/
+scp -P 2222 arm_test/cmake-out/executor_runner model.pte root@127.0.0.1:/tmp/
 ```
 
 Run the model on the FVP:
 
 ```
-ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/add_arm_delegate_ethos-u85-256.pte -num_executions 1"
+ssh -p 2222 root@127.0.0.1 -t "/tmp/executor_runner -model_path /tmp/model.pte -num_executions 1"
 ```
 
 ## Testing
diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py
@@ -9,14 +9,12 @@
 from abc import abstractmethod
 from typing import Any, List, Optional, Set, Type
 
-import torch
 from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.backends.arm.tosa.mapping import TosaSpecialDtype
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
-from torch.utils import _pytree as pytree
 
 
 class ArmPass(ExportPass):
@@ -81,13 +79,6 @@ def get_name(pass_) -> str:
             )
 
     def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False):
-        if (
-            op == exir_ops.edge.aten.bmm.default
-            and isinstance(meta, NodeMetadata)
-            and len(meta.data.get("input_qparams", {})) > 0
-        ):
-            return self._call_quantized_bmm_without_fake_kernel(op, args, kwargs, meta)
-
         if not updated:
             return super().call_operator(op, args, kwargs, meta)
 
@@ -100,35 +91,6 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False)
         new_meta["stack_trace"] = f"{old_stack_trace}\n{traceback.format_stack()[-2]}"
         return super().call_operator(op, args, kwargs, NodeMetadata(new_meta))
 
-    def _call_quantized_bmm_without_fake_kernel(
-        self,
-        op,
-        args: tuple[ProxyValue, ...],
-        kwargs: dict[str, Any],
-        meta: NodeMetadata,
-    ) -> ProxyValue:
-        old_val = meta.data["val"]
-        output_qparams = meta.data.get("output_qparams", {})
-        dtype = (
-            next(iter(output_qparams.values())).dtype
-            if len(output_qparams) > 0
-            else old_val.dtype
-        )
-        res_data = torch.empty_like(old_val, dtype=dtype)
-
-        args_proxy, kwargs_proxy = pytree.tree_map_only(
-            ProxyValue, lambda x: x.proxy, (args, kwargs)
-        )
-        res_proxy = self.tracer.create_proxy(
-            "call_function",
-            op,
-            args_proxy,
-            kwargs_proxy,
-        )
-        res_proxy.node.meta.update(meta.data)
-        self.tracer.set_metadata(res_proxy.node, res_data)
-        return ProxyValue(res_data, res_proxy)
-
     def call_submodule(
         self, graph_module: GraphModule, inputs: tuple[Any, ...]
     ) -> PassResult:
diff --git a/backends/nxp/tests/generic_tests/test_per_channel_conversion.py b/backends/nxp/tests/generic_tests/test_per_channel_conversion.py
@@ -169,19 +169,14 @@ def test_per_channel_convolution(self, _, use_qat: bool):
                 atol=1.0,
             )
 
-            conv_nodes = [
-                node
-                for node in exported_program.graph.nodes
-                if node.target == exir_ops.edge.aten.convolution.default
-            ]
-            assert len(conv_nodes) == 1
-
-            conv_node = conv_nodes[0]
+            nodes = list(exported_program.graph.nodes)
+
             assert (
-                conv_node.args[1].target
+                nodes[8].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
             assert (
-                conv_node.args[2].target
+                nodes[9].target
                 == exir_ops.edge.quantized_decomposed.dequantize_per_channel.default
             )
+            assert nodes[10].target == exir_ops.edge.aten.convolution.default
diff --git a/docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md b/docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md
@@ -20,7 +20,7 @@ In this tutorial you will learn how to export a simple PyTorch model for the Exe
 ```{tip}
 If you are already familiar with this delegate, you may want to jump directly to the examples:
 * [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm)
-* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
+* [A commandline compiler for quick tests and example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
 ```
 
 This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on Arm&reg; Ethos&trade;-U targets. It is based on `ethos_u_minimal_example.ipynb`, provided in Arm’s examples folder.
@@ -142,9 +142,10 @@ save_pte_program(executorch_program_manager, "ethos_u_minimal_example.pte")
 
 
 ```{tip}
-For a quick start, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
+For a quick test, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
 To produce a pte file equivalent to the one above, run
-`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte`
+`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte`.
+For production use, you should instead use the stable Python API shown above.
 ```
 
 ### Runtime:
diff --git a/docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md b/docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md
@@ -26,7 +26,7 @@ You may encounter some rough edges and features which may be documented or plann
 ```{tip}
 If you are already familiar with this delegate, you may want to jump directly to the examples:
 * [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm)
-* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
+* [A commandline compiler for quick tests and example models](https://github.com/pytorch/executorch/blob/main/backends/arm/scripts/aot_arm_compiler.py)
 ```
 
 This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on VGF targets. The tutorial is based on `vgf_minimal_example.ipyb`, provided in Arm's example folder.
@@ -163,9 +163,10 @@ assert os.path.exists(pte_path), "Build failed; no .pte-file found"
 
 
 ```{tip}
-For a quick start, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
+For a quick test, you can use the script `backends/arm/scripts/aot_arm_compiler.py` to produce the pte.
 To produce a pte file equivalent to the one above, run
-`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=simple_example.pte --target=vgf`
+`python -m backends.arm.scripts.aot_arm_compiler --model_name=add --delegate --quantize --output=simple_example.pte --target=vgf`.
+For production use, you should instead use the stable Python API shown above.
 ```
 
 ## Runtime
diff --git a/examples/arm/README.md b/examples/arm/README.md
@@ -37,10 +37,6 @@ this can be changed with --et_build_root=<FOLDER>
 `aot_arm_compiler.py` is used to convert a Python model or a saved .pt model to a PTE file and is used by `run.sh`
 and other test script but can also be used directly.
 
-If you prefer to use the ExecuTorch API, there is also the `ethos_u_minimal_example.ipynb` notebook example.
-This shows the workflow if you prefer to integrate a python torch.export and ExecuTorch flow directly into your
-model codebase. This is particularly useful if you want to perform more complex training, such as quantization
-aware training using the ArmQuantizer.
 
 ## Create a PTE file for Arm backends
 
diff --git a/examples/models/llama3_2_vision/text_decoder/model.py b/examples/models/llama3_2_vision/text_decoder/model.py
@@ -181,19 +181,19 @@ def get_example_kwarg_inputs(self):
             return None
 
     def get_dynamic_shapes(self):
-        static = torch.export.Dim.STATIC
+        batch_size = 1
         dim_seq_len = torch.export.Dim("token_dim", min=1, max=self.max_seq_len)
         # Hardcoding # of tiles to be 2. image tokens per tile is 1601.
         if self.use_kv_cache:
             dynamic_shapes = {
-                "tokens": {0: static, 1: dim_seq_len},
-                "encoder_input": {0: static, 1: static, 2: static},
-                "encoder_mask": {0: static, 1: dim_seq_len, 2: static},
-                "mask": {0: static, 1: dim_seq_len, 2: static},
-                "input_pos": {0: static, 1: dim_seq_len},
+                "tokens": {0: batch_size, 1: dim_seq_len},
+                "encoder_input": None,
+                "encoder_mask": {0: 1, 1: dim_seq_len, 2: None},
+                "mask": {0: batch_size, 1: dim_seq_len, 2: None},
+                "input_pos": {0: batch_size, 1: dim_seq_len},
             }
         else:
             dynamic_shapes = {
-                "tokens": {0: static, 1: dim_seq_len},
+                "tokens": {0: batch_size, 1: dim_seq_len},
             }
         return dynamic_shapes
diff --git a/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py b/examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py
@@ -69,6 +69,7 @@ def test_llama3_2_text_decoder_aoti(self) -> None:
                 encoder,
                 model.get_example_inputs(),
                 kwargs=model.get_example_kwarg_inputs(),
+                dynamic_shapes=model.get_dynamic_shapes(),
                 strict=True,
             )
         with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/examples/models/parakeet/export_parakeet_tdt.py b/examples/models/parakeet/export_parakeet_tdt.py
@@ -360,8 +360,8 @@ def export_all(
         preprocessor_wrapper,
         (sample_audio, sample_length),
         dynamic_shapes={
-            # min=10 frames = 0.1 sec @ 16kHz, max aligned with encoder limit.
-            "audio": {0: Dim.AUTO(min=1600, max=max_audio_samples)},
+            # min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit
+            "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
             "length": {},
         },
         strict=False,
diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleConversationHistoryTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmModuleConversationHistoryTest.kt
diff --git a/extension/llm/modules/attention.py b/extension/llm/modules/attention.py
diff --git a/install_requirements.py b/install_requirements.py
diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h
diff --git a/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h b/runtime/core/portable_type/c10/torch/headeronly/util/BFloat16.h
diff --git a/torch_pin.py b/torch_pin.py