pytorch
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
Lines changed: 1 addition & 1 deletion
diff --git a/.ci/scripts/build-qnn-direct-sdk.sh b/.ci/scripts/build-qnn-direct-sdk.sh
Lines changed: 33 additions & 0 deletions
diff --git a/.ci/scripts/test_cortex_m_e2e.sh b/.ci/scripts/test_cortex_m_e2e.sh
Lines changed: 3 additions & 2 deletions
diff --git a/.ci/scripts/wheel/pre_build_script.sh b/.ci/scripts/wheel/pre_build_script.sh
Lines changed: 14 additions & 0 deletions
diff --git a/.github/workflows/_test_cortex_m_e2e.yml b/.github/workflows/_test_cortex_m_e2e.yml
Lines changed: 8 additions & 2 deletions
diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml
Lines changed: 3 additions & 0 deletions
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
Lines changed: 19 additions & 0 deletions
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.lintrunner.toml b/.lintrunner.toml
Lines changed: 5 additions & 1 deletion
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 30 additions & 11 deletions
@@ -102,7 +102,7 @@ esac
 TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt)
 BUILD_DOCS=1
 
-if [[ "${GCC_VERSION:-}" == "11" && -z "${SKIP_PYTORCH:-}" ]]; then
+if [[ -n "${GCC_VERSION:-}" && -z "${SKIP_PYTORCH:-}" ]]; then
   PYTORCH_BUILD_MAX_JOBS=6
 fi
 
 
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# CI check: build the QNN backend in direct mode, then fail unless the backend
+# library exists and fits the 200 KiB budget. Paths are repo-root relative.
+set -eux
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+
+source backends/qualcomm/scripts/install_qnn_sdk.sh
+setup_android_ndk
+install_qnn
+install_hexagon_sdk
+
+bash backends/qualcomm/scripts/build.sh --build_direct_mode 3 --soc_model SM8750 \
+    --skip_x86_64 --skip_linux_android --release
+
+ARTIFACT="build-direct/backends/qualcomm/libqnn_executorch_backend.so"
+if [ ! -f "${ARTIFACT}" ]; then
+    echo "ERROR: direct-mode build did not produce ${ARTIFACT}" >&2
+    exit 1
+fi
+
+MAX_SIZE_BYTES=$((200 * 1024))
+ARTIFACT_SIZE=$(stat -c%s "${ARTIFACT}")
+if [ "${ARTIFACT_SIZE}" -gt "${MAX_SIZE_BYTES}" ]; then
+    echo "ERROR: ${ARTIFACT} is ${ARTIFACT_SIZE} bytes, exceeds ${MAX_SIZE_BYTES}-byte (200 KiB) limit" >&2
+    exit 1
+fi
+echo "PASSED: direct-mode build produced ${ARTIFACT} (${ARTIFACT_SIZE} bytes, under ${MAX_SIZE_BYTES}-byte limit)"
@@ -14,13 +14,14 @@
 set -eu
 
 MODEL=$1
+TARGET=${2:-cortex-m55}
 script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
 et_root_dir=$(realpath "${script_dir}/../..")
 
-# Quantization is the default for the cortex-m55 target; run.sh's
+# Quantization is the default for cortex-m targets; run.sh's
 # arg parser only recognizes --no_quantize, so we omit any explicit flag.
 export ARM_FVP_INSTALL_I_AGREE_TO_THE_CONTAINED_EULA=True
 bash "${et_root_dir}/examples/arm/run.sh" \
     --model_name="${MODEL}" \
-    --target=cortex-m55 \
+    --target="${TARGET}" \
     --bundleio
@@ -2,6 +2,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
+# Copyright 2026 Arm Limited and/or its affiliates.
+#
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
@@ -57,6 +59,18 @@ fi
 
 "${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
 
+# Enable VGF in pybind wheel builds when the platform-specific build input is
+# available from pip.
+if [[ "$UNAME_S" == "Linux" || "$UNAME_S" == "Darwin" ]]; then
+  if python3 -m pip install -r \
+    "${GITHUB_WORKSPACE}/${REPOSITORY}/backends/arm/requirements-arm-vgf-runtime.txt"; then
+    export EXECUTORCH_PYBIND_ENABLE_VGF=ON
+    echo "EXECUTORCH_PYBIND_ENABLE_VGF=ON" >> "${GITHUB_ENV}"
+  else
+    echo "VGF build dependency unavailable on this platform; building without VGF"
+  fi
+fi
+
 # Download Qualcomm QNN SDK on Linux x86_64 so the wheel build can include the
 # QNN backend.  The SDK is large, so we download it here (outside CMake) rather
 # than during cmake configure.
 
@@ -11,6 +11,11 @@ on:
         description: 'JSON array of model names to run on the Corstone-300 FVP, e.g. ["mv2", "mv3"]'
         required: true
         type: string
+      targets:
+        description: 'JSON array of cortex-m target CPUs to build the runner for, e.g. ["cortex-m55", "cortex-m7", "cortex-m0plus"]'
+        required: false
+        type: string
+        default: '["cortex-m55"]'
       timeout:
         description: 'Per-matrix-entry timeout in minutes'
         required: false
@@ -23,9 +28,10 @@ jobs:
     strategy:
       matrix:
         model: ${{ fromJSON(inputs.models) }}
+        target: ${{ fromJSON(inputs.targets) }}
       fail-fast: false
     with:
-      job-name: ${{ matrix.model }}
+      job-name: ${{ matrix.model }}-${{ matrix.target }}
       runner: linux.2xlarge.memory
       docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
       submodules: 'recursive'
@@ -44,4 +50,4 @@ jobs:
         source examples/arm/arm-scratch/setup_path.sh
 
         # Export and run model on FVP (run.sh internally builds the test runner).
-        bash .ci/scripts/test_cortex_m_e2e.sh ${{ matrix.model }}
+        bash .ci/scripts/test_cortex_m_e2e.sh "${{ matrix.model }}" "${{ matrix.target }}"
@@ -16,6 +16,7 @@ on:
       - .github/workflows/cuda-windows.yml
       - backends/cuda/**
       - backends/aoti/**
+      - extension/cuda/**
   workflow_dispatch:
 
 concurrency:
@@ -49,6 +50,7 @@ jobs:
       (
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, 'extension/cuda') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
@@ -150,6 +152,7 @@ jobs:
       (
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, 'extension/cuda') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
 
@@ -948,6 +948,25 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
 
+  test-qnn-direct-build-linux:
+    name: test-qnn-direct-build-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 30
+      script: |
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-direct-sdk.sh
+
   test-qnn-testsuite-linux:
     name: test-qnn-testsuite-linux
     permissions:
 
@@ -1075,3 +1075,4 @@ jobs:
     uses: ./.github/workflows/_test_cortex_m_e2e.yml
     with:
       models: '["mv2", "mv3"]'
+      targets: '["cortex-m55", "cortex-m7", "cortex-m0plus"]'
@@ -173,6 +173,7 @@ exclude_patterns = [
     'extension/asr/runner/transducer_runner.h',
     'extension/aten_util/**',
     'extension/benchmark/apple/**',
+    'extension/cuda/**',
     'extension/data_loader/**',
     'extension/evalue_util/**',
     'extension/flat_tensor/**',
@@ -195,7 +196,6 @@ exclude_patterns = [
     'kernels/aten/**',
     'kernels/optimized/**',
     'kernels/portable/**',
-    'kernels/prim_ops/**',
     'kernels/quantized/**',
     'kernels/test/**',
 
@@ -226,6 +226,10 @@ command = [
     '--extra-arg=--suppress=toomanyconfigs',
     '--extra-arg=--suppress=unusedFunction:*.h',
     '--extra-arg=--suppress=unusedFunction:*.hpp',
+    # Prim ops use the same ExecuTorch macro idioms as portable kernels.
+    '--extra-arg=--suppress=unknownMacro:*kernels/prim_ops/*',
+    '--extra-arg=--suppress=syntaxError:*kernels/prim_ops/*',
+    '--extra-arg=--suppress=unusedFunction:*kernels/prim_ops/*',
     '--',
     '@{{PATHSFILE}}'
 ]
 
@@ -49,17 +49,6 @@ cmake_minimum_required(VERSION 3.24)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
-# Hexagon toolchain with release build complains about code in third party
-# libraries.
-if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "Hexagon" AND "${CMAKE_BUILD_TYPE}"
-                                                     STREQUAL "Release"
-)
-  add_compile_options(
-    -Wno-error=format -Wno-error=implicit-int-conversion
-    -Wno-error=unused-variable -Wno-error=unused-function
-  )
-endif()
-
 # --- ExecuTorch Version ---
 # Parse version from version.txt (single source of truth)
 file(READ "${EXECUTORCH_ROOT}/version.txt" ET_VERSION_STRING)
@@ -90,6 +79,18 @@ project(executorch
         VERSION "${ET_VERSION_MAJOR}.${ET_VERSION_MINOR}.${ET_VERSION_PATCH}"
 )
 
+# Hexagon toolchain with release build complains about code in third party
+# libraries. Must come after project(), which runs the toolchain file that sets
+# CMAKE_SYSTEM_PROCESSOR, and before add_subdirectory(third-party).
+if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "Hexagon" AND "${CMAKE_BUILD_TYPE}"
+                                                     STREQUAL "Release"
+)
+  add_compile_options(
+    -Wno-error=format -Wno-error=implicit-int-conversion
+    -Wno-error=unused-variable -Wno-error=unused-function
+  )
+endif()
+
 message(
   STATUS
     "ExecuTorch version: ${ET_VERSION_MAJOR}.${ET_VERSION_MINOR}.${ET_VERSION_PATCH}"
@@ -764,6 +765,20 @@ if(EXECUTORCH_BUILD_CUDA
   find_package_torch()
 endif()
 
+# Backend-neutral caller-stream guard consumed by the CUDA AOTI backend (and the
+# vendored torch-tensorrt delegate). Built before backends/aoti and
+# backends/cuda, which link it.
+if(EXECUTORCH_BUILD_CUDA)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/cuda)
+  install(
+    DIRECTORY extension/cuda/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/cuda
+    FILES_MATCHING
+    PATTERN "*.h"
+  )
+  list(APPEND _executorch_extensions extension_cuda)
+endif()
+
 # Build common AOTI functionality if needed by CUDA or Metal backends
 if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
@@ -1026,6 +1041,10 @@ if(EXECUTORCH_BUILD_PYBIND)
     list(APPEND _dep_libs coremldelegate)
   endif()
 
+  if(EXECUTORCH_BUILD_VGF)
+    list(APPEND _dep_libs vgf_backend)
+  endif()
+
   if(EXECUTORCH_BUILD_MPS)
     list(APPEND _dep_libs mpsdelegate)
   endif()
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ on:`
`16`	`16`	`- .github/workflows/cuda-windows.yml`
`17`	`17`	`- backends/cuda/**`
`18`	`18`	`- backends/aoti/**`
	`19`	`+ - extension/cuda/**`
`19`	`20`	`workflow_dispatch:`
`20`	`21`
`21`	`22`	`concurrency:`
`@@ -49,6 +50,7 @@ jobs:`
`49`	`50`	`(`
`50`	`51`	`contains(needs.changed-files.outputs.changed-files, 'backends/cuda') \|\|`
`51`	`52`	`contains(needs.changed-files.outputs.changed-files, 'backends/aoti') \|\|`
	`53`	`+ contains(needs.changed-files.outputs.changed-files, 'extension/cuda') \|\|`
`52`	`54`	`contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') \|\|`
`53`	`55`	`needs.run-decision.outputs.is-full-run == 'true'`
`54`	`56`	`)`
`@@ -150,6 +152,7 @@ jobs:`
`150`	`152`	`(`
`151`	`153`	`contains(needs.changed-files.outputs.changed-files, 'backends/cuda') \|\|`
`152`	`154`	`contains(needs.changed-files.outputs.changed-files, 'backends/aoti') \|\|`
	`155`	`+ contains(needs.changed-files.outputs.changed-files, 'extension/cuda') \|\|`
`153`	`156`	`contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-windows.yml') \|\|`
`154`	`157`	`needs.run-decision.outputs.is-full-run == 'true'`
`155`	`158`	`)`