Xnnpack: Support clone.default with skip_dim_order=True

mansnils · mansnils · commit 63fb2aef060e · 2026-05-21T23:46:59.000+02:00
With the default XNNPACK test config, skip_dim_order=False rewrites
aten.clone.default to dim_order_ops._clone_dim_order.default. That
path is already supported through CloneDimOrderConfig.

Some XNNPACK export flows use skip_dim_order=True, where
aten.clone.default stays as aten.clone.default and is not selected
by the partitioner.

Add CloneConfig for dim-order-preserving aten.clone.default nodes
so this path is partitioned directly.

This reduces delegate splits in the EdgeTAM mask decoder, where
profiling exports use skip_dim_order=True.

Signed-off-by: Måns Nilsson <mans.nilsson@arm.com>
Change-Id: Ic48ec187f26048b68a805c6edd6dad41b3dab481
diff --git a/backends/xnnpack/operators/op_clone.py b/backends/xnnpack/operators/op_clone.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -13,6 +14,7 @@
     NodeVisitor,
     register_node_visitor,
 )
+from executorch.backends.xnnpack.operators.quant_params import QuantParams
 from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
     XNNCopy,
     XNNGraph,
@@ -25,17 +27,26 @@
 class CloneVisitor(NodeVisitor):
     target = "aten.clone.default"
 
-    def __init__(self, *args) -> None:
-        super().__init__(*args)
-
     def define_node(
         self,
         node: torch.fx.Node,
         xnn_graph: XNNGraph,
         vals_to_ids: Dict[torch.fx.Node, int],
         debug_handle: int,
     ) -> None:
-        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+        self.define_tensor(
+            node,
+            xnn_graph,
+            vals_to_ids,
+            quant_params=QuantParams.from_outputs(node),
+        )
+        input_node = get_input_node(node, 0)
+        self.define_tensor(
+            input_node,
+            xnn_graph,
+            vals_to_ids,
+            quant_params=QuantParams.from_inputs(input_node, self._exported_program),
+        )
 
         # Sanity check that the input and output dim order are the same. We don't
         # handle dim order conversions yet.
diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -23,6 +24,7 @@
     CatConfig,
     CeilConfig,
     ClampConfig,
+    CloneConfig,
     CloneDimOrderConfig,
     ConstantPadConfig,
     CosConfig,
@@ -82,6 +84,7 @@
     BMMConfig,
     CatConfig,
     CeilConfig,
+    CloneConfig,
     CloneDimOrderConfig,
     ConstantPadConfig,
     ConvolutionConfig,
diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py
@@ -239,6 +239,27 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
 
 
+class CloneConfig(GenericNodePartitionerConfig):
+    target_name = "clone.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        input_meta = node.args[0].meta["val"]
+        output_meta = node.meta["val"]
+        input_dim_order = list(input_meta.dim_order())
+        output_dim_order = list(output_meta.dim_order())
+        if input_dim_order != output_dim_order:
+            why(node, reason="Only dim-order preserving clones are supported.")
+            return False
+
+        return True
+
+
 class ClampConfig(GenericNodePartitionerConfig):
     target_name = "clamp.default"
 
diff --git a/backends/xnnpack/test/ops/test_clone.py b/backends/xnnpack/test/ops/test_clone.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,7 +10,8 @@
 import unittest
 
 import torch
-from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester import Tester, ToEdgeTransformAndLower
+from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
 
 
 class TestClone(unittest.TestCase):
@@ -62,6 +64,32 @@ def test_fp32_clone(self):
         inputs = (torch.randn(2, 3, 4, 5),)
         self._test_clone_partitioned(inputs)
 
+    def test_fp32_clone_default_partitions_with_skip_dim_order(self):
+        """Test plain aten.clone.default partitioning without dim-order rewrite."""
+        inputs = (torch.randn(2, 3, 4, 5),)
+        (
+            Tester(self.Clone(), inputs)
+            .export()
+            .check_count({"torch.ops.aten.clone.default": 1})
+            .to_edge_transform_and_lower(
+                ToEdgeTransformAndLower(
+                    edge_compile_config=get_xnnpack_edge_compile_config(
+                        skip_dim_order=True
+                    )
+                )
+            )
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .check_not(
+                [
+                    "executorch_exir_dialects_edge__ops_aten_clone_default",
+                    "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default",
+                ]
+            )
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
+
     def test_fp32_clone_2d(self):
         """Test FP32 clone with 2D tensor - should be partitioned"""
         inputs = (torch.randn(10, 20),)