Commit 5048523

Update on "[ET-VK] Fixing conv2d dw incorrect output when stride != dilation issue."
This diff keeps the current conv2d depthwise (dw) implementation in the ExecuTorch Vulkan backend as a special case that is used only when stride equals dilation, since that is the only configuration in which this kind of caching is possible. When stride does not equal dilation, the old implementation is used instead. Additional test cases are added to verify that the computation is correct when stride != dilation.

Differential Revision: [D67908916](https://our.internmc.facebook.com/intern/diff/D67908916/)

[ghstack-poisoned]
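As a rough illustration of the dispatch described in the commit message, here is a minimal C++ sketch; the struct, function, and shader names are hypothetical and are not the actual ExecuTorch Vulkan identifiers.

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical convolution parameters relevant to the dispatch decision.
struct Conv2dParams {
  int64_t stride_h, stride_w;
  int64_t dilation_h, dilation_w;
};

// Picks between a cached "output tile" depthwise shader and the general one.
// The caching scheme is assumed to be valid only when stride == dilation,
// mirroring the special-casing described in the commit message.
std::string pick_conv2d_dw_shader(const Conv2dParams& p) {
  const bool stride_equals_dilation =
      p.stride_h == p.dilation_h && p.stride_w == p.dilation_w;
  if (stride_equals_dilation) {
    return "conv2d_dw_output_tile";  // specialized, cached path
  }
  return "conv2d_dw";  // general fallback path
}

int main() {
  Conv2dParams equal{2, 2, 2, 2};
  Conv2dParams unequal{2, 2, 1, 1};
  std::cout << pick_conv2d_dw_shader(equal) << "\n";    // conv2d_dw_output_tile
  std::cout << pick_conv2d_dw_shader(unequal) << "\n";  // conv2d_dw
  return 0;
}
```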
2 parents de44eda + 0cd0fb9 commit 5048523

35 files changed, +2372 −321 lines

.gitmodules

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@
 	url = https://github.com/pybind/pybind11.git
 [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
 	path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
-	url = https://github.com/foss-xtensa/nnlib-FusionG3/
+	url = https://github.com/foss-xtensa/nnlib-FusionG3.git
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git

backends/apple/mps/mps_preprocess.py

Lines changed: 6 additions & 0 deletions

@@ -32,6 +32,9 @@
     CompileSpec,
     PreprocessResult,
 )
+
+from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
+from executorch.exir.program._program import _transform
 from torch.export.exported_program import ExportedProgram

 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -83,6 +86,9 @@ def preprocess(
         # FlatBuffer graph, process the `output` nodes and add their id to
         # the `output_ids` array in the schema.

+        # TODO: Remove this once we have a better support for the dim-order ops.
+        edge_program = _transform(edge_program, DimOrderOpsRevertPass())
+
         mps_graph = MPSGraph(
             version="0",
             mps_nodes=[],

backends/apple/mps/operators/constant_ops.py

Lines changed: 19 additions & 0 deletions

@@ -79,6 +79,25 @@ def define_node(
         )


+@register_node_visitor
+class ToDimOrderEmptyVisitor(NodeVisitor):
+    target = ["dim_order_ops._empty_dim_order.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        # We should never get here, because DimOrderOpsRevertPass replaces this with an aten.empty.memory_format op
+        # But if we do, we can't handle it ATM, so raise an exception
+        raise NotImplementedError(
+            "dim_order_ops._empty_dim_order.default is not supported yet"
+        )
+
+
 @register_node_visitor
 class FullLikeVisitor(NodeVisitor):
     target = "aten.full_like.default"

backends/apple/mps/operators/op_clone.py

Lines changed: 19 additions & 0 deletions

@@ -33,3 +33,22 @@ def define_node(
     )
         input_id = self.define_tensor(get_input_node(node, 0), mps_graph)
         self.tensor_to_id[node] = input_id
+
+
+@register_node_visitor
+class ToDimOrderCopyVisitor(NodeVisitor):
+    target = ["dim_order_ops._to_dim_order_copy.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        # We should never get here, because DimOrderOpsRevertPass replaces this with an aten._to_copy op
+        # But if we do, we can't handle it ATM, so raise an exception
+        raise NotImplementedError(
+            "dim_order_ops._to_dim_order_copy.default is not supported yet"
+        )

backends/apple/mps/test/test_mps.py

Lines changed: 15 additions & 0 deletions

@@ -1829,6 +1829,21 @@ def forward(self, x):
             Clone(), model_inputs, func_name=inspect.stack()[0].function[5:]
         )

+    def test_mps_backend_to_copy(self):
+        class Copy(torch.nn.Module):
+            def forward(self, x):
+                return (
+                    torch.ops.aten._to_copy.default(
+                        x + 2, memory_format=torch.contiguous_format
+                    )
+                    + x
+                )
+
+        model_inputs = (torch.randn(1, 3, 3),)
+        self.lower_and_test_with_partitioner(
+            Copy(), model_inputs, func_name=inspect.stack()[0].function[5:]
+        )
+
     def test_mps_backend_floor(self):
         class Floor(torch.nn.Module):
             def forward(self, x):

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 6 deletions

@@ -26,10 +26,7 @@

 # Config for Capturing the weights, will be moved in the future

-# TODO(T182928844): Delegate dim order op to backend.
-_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(
-    _check_ir_validity=False, _skip_dim_order=True
-)
+_EDGE_COMPILE_CONFIG = exir.EdgeCompileConfig(_check_ir_validity=False)


 class ansi_colors:
@@ -219,7 +216,6 @@ def lower_module_and_test_output(
             dynamic_shapes=dynamic_shapes,
             edge_compile_config=EdgeCompileConfig(
                 _check_ir_validity=False,
-                _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
             ),
         )

@@ -250,7 +246,6 @@ def lower_module_and_test_output(
             export(delegated_program, sample_inputs, strict=True),
             compile_config=exir.EdgeCompileConfig(
                 _check_ir_validity=False,
-                _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
             ),
         ).to_executorch(
             config=ExecutorchBackendConfig(extract_delegate_segments=False)

backends/cadence/aot/functions_fusion_g3.yaml

Lines changed: 20 additions & 6 deletions

@@ -50,12 +50,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out
+      kernel_name: cadence::impl::G3::div_out

 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
+      kernel_name: cadence::impl::G3::div_out_mode

 - op: embedding.out
   kernels:
@@ -71,7 +71,6 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::G3::mul_out
-
 - op: mul.Scalar_out
   kernels:
     - arg_meta: null
@@ -80,7 +79,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: cadence::impl::G3::permute_copy_out

 - op: sigmoid.out
   kernels:
@@ -90,7 +89,7 @@
 - op: slice_copy.Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::slice_copy_Tensor_out
+      kernel_name: cadence::impl::G3::slice_copy_Tensor_out

 - op: split_with_sizes_copy.out
   kernels:
@@ -100,7 +99,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sub_out
+      kernel_name: cadence::impl::G3::sub_out
+
+- op: sub.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::sub_scalar_out

 - op: view_copy.out
   kernels:
@@ -117,6 +121,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::G3::native_layer_norm_out

+- op: mean.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::mean_dim_out
+
+- op: exp.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::exp_out
+
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
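For orientation, each `kernel_name` entry above is resolved against a C++ out-variant kernel at codegen time. The sketch below shows the rough shape of such a declaration for the newly registered `exp.out` entry; the include paths, namespace nesting, and type aliases are assumptions for illustration, not copied from this diff.

```cpp
// Illustrative declaration matching `kernel_name: cadence::impl::G3::exp_out`.
// Out-variant kernels in ExecuTorch take a runtime context first, the inputs
// next, and the pre-allocated output tensor last, returning a reference to it.
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace cadence {
namespace impl {
namespace G3 {

// exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
::executorch::aten::Tensor& exp_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& in,
    ::executorch::aten::Tensor& out);

} // namespace G3
} // namespace impl
} // namespace cadence
```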

backends/cadence/fusion_g3/operators/CMakeLists.txt

Lines changed: 7 additions & 0 deletions

@@ -36,6 +36,12 @@ set(_aten_ops__srcs
     "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_sub.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_div.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
@@ -51,6 +57,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
 )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)

backends/cadence/fusion_g3/operators/op_add.cpp

Lines changed: 4 additions & 2 deletions

@@ -39,6 +39,7 @@ Tensor& add_out(
   ScalarType common_type =
       executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());

+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -62,12 +63,12 @@
       torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
       InvalidArgument,
       out);
+#endif

   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);

-  // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";

   int kTensorDimensionLimit = 5;
@@ -253,6 +254,7 @@ Tensor& add_scalar_out(
       torch::executor::native::utils::promote_type_with_scalar(
           a.scalar_type(), b);

+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -276,7 +278,7 @@
       executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok,
       InvalidArgument,
       out);
-
+#endif
   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);
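The hunks above wrap argument validation (dtype and broadcast/resize checks) in an `OP_ARG_CHECK` guard so it can be compiled out. Below is a minimal, self-contained sketch of that pattern, assuming `OP_ARG_CHECK` is meant to be supplied as a compile definition; the function and checks are illustrative, not the ExecuTorch kernel.

```cpp
// Toy example of compile-time toggled argument checking: the validation code
// only exists in builds compiled with -DOP_ARG_CHECK, so other builds pay
// nothing for it.
#include <cstdio>
#include <stdexcept>

int scaled_add(int a, int b, int scale) {
#ifdef OP_ARG_CHECK
  // Present only when OP_ARG_CHECK is defined at compile time.
  if (scale == 0) {
    throw std::invalid_argument("scale must be non-zero");
  }
#endif
  return (a + b) * scale;
}

int main() {
  std::printf("%d\n", scaled_add(2, 3, 4));  // prints 20 in either build mode
  return 0;
}
```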
