pytorch
diff --git a/‎backends/arm/operator_support/to_copy_support.py
Lines changed: 1 addition & 0 deletions b/‎backends/arm/operator_support/to_copy_support.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/cadence/aot/compiler.py
Lines changed: 5 additions & 1 deletion b/‎backends/cadence/aot/compiler.py
Lines changed: 5 additions & 1 deletion
diff --git a/‎backends/cadence/aot/replace_ops.py
Lines changed: 73 additions & 0 deletions b/‎backends/cadence/aot/replace_ops.py
Lines changed: 73 additions & 0 deletions
diff --git a/‎backends/cadence/fusion_g3/operators/op_exp.cpp
Lines changed: 4 additions & 4 deletions b/‎backends/cadence/fusion_g3/operators/op_exp.cpp
Lines changed: 4 additions & 4 deletions
diff --git a/‎backends/cadence/hifi/kernels/targets.bzl
Lines changed: 5 additions & 0 deletions b/‎backends/cadence/hifi/kernels/targets.bzl
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/cadence/hifi/operators/op_where.cpp
Lines changed: 1 addition & 1 deletion b/‎backends/cadence/hifi/operators/op_where.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
Lines changed: 5 additions & 6 deletions b/‎backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
Lines changed: 5 additions & 6 deletions
@@ -125,6 +125,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
         # Check dim_order (to_dim_order_copy)
         if "dim_order" in node.kwargs:
             dim_order = node.kwargs["dim_order"]
+            # pyre-ignore[6]
             if dim_order != list(range(len(dim_order))):
                 logger.info(
                     f"Argument {dim_order=} is not supported for "
 
@@ -33,6 +33,7 @@
     ExecutorchProgramManager,
     to_edge,
 )
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
@@ -186,14 +187,17 @@ def export_to_edge(
     edge_prog_manager = to_edge(
         expo_program,
         compile_config=EdgeCompileConfig(
-            _skip_dim_order=True,
             # Allow specific non-core aten ops in the IR.
             _core_aten_ops_exception_list=[
                 torch.ops.aten._native_batch_norm_legit_functional.default,
                 torch.ops.aten.linear.default,
                 torch.ops.aten.linalg_vector_norm.default,
                 torch.ops.aten.unfold.default,
                 torch.ops.aten.angle.default,
+                # Cadence replaces to_dim_order_copy with _to_copy for performance.
+                # Exempt the _to_copy op here to get around the dim order check.
+                # We should remove this op once Cadence can support dim order.
+                exir_ops.edge.aten._to_copy.default,
             ],
         ),
         constant_methods=constant_methods,
 
@@ -11,6 +11,7 @@
 
 # pyre-unsafe
 
+import copy
 import math
 from operator import neg
 from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -35,7 +36,12 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
+from executorch.exir.passes.dim_order_ops_registry import (
+    DimOrderOpsMap,
+    MemoryFormatOpsMap,
+)
 from torch._subclasses import FakeTensor
 from torch.fx.node import Argument
 
@@ -1799,6 +1805,72 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
+    """
+    dim_order_ops::to_dim_order_copy is not supported, so this is an opt_level=0 pass.
+    If the dim order is sequential, we don't need the extra work with strides and
+    can just use to_copy.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op not in DimOrderOpsMap:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # new kwargs with dim_order, and no memory_format for the new op
+        nkwargs = dict(copy.deepcopy(kwargs))  # orig kwargs are immutable
+
+        ndim = None
+
+        # can always get the shape, assuming rank is specialized
+
+        # pyre-ignore[16]: `None` has no attribute `to_tensor`
+        if isinstance(args[0], ProxyValue) and args[0].is_tensor():
+            # pyre-ignore[16]: `None` has no attribute `to_tensor`
+            ndim = args[0].to_tensor().dim()
+        elif isinstance(args[0], torch.Tensor):
+            # pyre-ignore[16]: `None` has no attribute `dim`
+            ndim = args[0].dim()
+        elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
+            # pyre-ignore[6]: Incompatible parameter type
+            ndim = len(args[0])
+        else:
+            assert 0, f"Expecting a Tensor or a ProxyValue but got {type(args[0])}"
+
+        # get the "to" memory format for the EdgeOp
+        contiguous_dim_order = list(range(ndim))
+        dim_order = nkwargs.pop("dim_order", None)
+
+        # Cadence only supports contiguous memory format
+        assert (
+            dim_order is None
+            # pyre-ignore[6]: Incompatible parameter type
+            or len(dim_order) == 0
+            or dim_order == contiguous_dim_order
+        ), "Expected dim order in congituous or prevserve memory format, but got {}".format(
+            dim_order
+        )
+
+        # bring back memory format
+        # pyre-ignore[6]: Incompatible parameter type
+        nkwargs["memory_format"] = get_memory_format(dim_order)
+
+        memory_format_op = MemoryFormatOpsMap[op]
+
+        return super().call_operator(
+            memory_format_op,
+            args,
+            nkwargs,
+            meta,
+        )
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceFullLikeWithFullPass(ExportPass):
     """
@@ -2108,4 +2180,5 @@ class CadenceReplaceOpsInGraph:
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
+        ReplaceToDimOrderCopyWithToCopyPass,
     ]
@@ -49,9 +49,9 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       out);
 #endif
 
-  if (out.scalar_type() == ScalarType::Float) {
-    float* const out_data = out.mutable_data_ptr<float>();
-    const float* const in_data = in.const_data_ptr<float>();
+  if (in.scalar_type() == ScalarType::Float) {
+    float* __restrict__ out_data = out.mutable_data_ptr<float>();
+    const float* __restrict__ in_data = in.const_data_ptr<float>();
 
     XT_KERNEL_CHECK(
         ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel());
@@ -66,4 +66,4 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
@@ -2,12 +2,17 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 def define_common_targets():
+    common_deps = [
+        "//executorch/runtime/kernel:kernel_includes",
+    ]
+
     runtime.cxx_library(
         name = "kernels",
         srcs = ["kernels.cpp"],
         exported_headers = [
             "kernels.h",
         ],
+        deps = common_deps,
         visibility = [
             "//executorch/backends/cadence/...",
         ],
 
@@ -28,7 +28,7 @@ namespace impl {
 namespace HiFi {
 namespace native {
 
-Tensor& where_out(
+Tensor& where_self_out(
     RuntimeContext& ctx,
     const Tensor& cond,
     const Tensor& a,
 
@@ -19,12 +19,11 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 ******************************************************************************/
-#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h"
-#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h"
-#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h"
-#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h"
-#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h"
-#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
+#include "xa_type_def.h"
+#include "xa_nnlib_common_fpu.h"
+#include "xa_nn_common.h"
+#include "xa_nnlib_err_chk.h"
+#include "xa_nnlib_kernels_api.h"
 
 
 #if !HAVE_VFPU