pytorch
diff --git a/‎backends/apple/mps/mps_preprocess.py
Lines changed: 7 additions & 0 deletions b/‎backends/apple/mps/mps_preprocess.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎backends/apple/mps/operators/indexing_ops.py
Lines changed: 74 additions & 2 deletions b/‎backends/apple/mps/operators/indexing_ops.py
Lines changed: 74 additions & 2 deletions
diff --git a/‎backends/apple/mps/operators/unary_ops.py
Lines changed: 3 additions & 0 deletions b/‎backends/apple/mps/operators/unary_ops.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/apple/mps/partition/mps_partitioner.py
Lines changed: 42 additions & 2 deletions b/‎backends/apple/mps/partition/mps_partitioner.py
Lines changed: 42 additions & 2 deletions
diff --git a/‎backends/apple/mps/runtime/MPSDevice.h
Lines changed: 23 additions & 0 deletions b/‎backends/apple/mps/runtime/MPSDevice.h
Lines changed: 23 additions & 0 deletions
diff --git a/‎backends/apple/mps/runtime/MPSDevice.mm
Lines changed: 65 additions & 0 deletions b/‎backends/apple/mps/runtime/MPSDevice.mm
Lines changed: 65 additions & 0 deletions
diff --git a/‎backends/apple/mps/runtime/MPSGraphBuilder.h
Lines changed: 7 additions & 0 deletions b/‎backends/apple/mps/runtime/MPSGraphBuilder.h
Lines changed: 7 additions & 0 deletions
@@ -18,6 +18,7 @@
 from executorch.backends.apple.mps.serialization.mps_graph_schema import (
     MPSGraph,
     MPSTensor,
+    OpType,
 )
 
 from executorch.backends.apple.mps.serialization.mps_graph_serialize import (
@@ -65,6 +66,7 @@ def preprocess(
             input_ids=[],
             output_ids=[],
             constant_ids=[],
+            graph_type=OpType.mps_graph
         )
 
         convert_model_to_fp16 = True
@@ -111,6 +113,11 @@ def handle_call_function(
         mps_graph: MPSGraph,
     ) -> None:
         logging.info(f"Visiting: {node}, {node.target.__name__}")
+
+        if "delegation_tag" in node.meta and "metal_kernel" in node.meta["delegation_tag"]:
+            logging.info(f"Node '{node.target.__name__}' was marked as a Metal kernel by the MPSPartitioner!")
+            mps_graph.graph_type = OpType.metal_kernel
+
         if node.target.__name__ in node_visitors:
             node_visitors[node.target.__name__].define_node(node, mps_graph)
         else:
 
@@ -3,7 +3,7 @@
 #  Provided subject to the LICENSE file in the top level directory.
 #
 
-from typing import cast
+from typing import cast, List
 
 import torch
 from executorch.backends.apple.mps.operators.node_visitor import (
@@ -13,11 +13,13 @@
 from executorch.backends.apple.mps.serialization.mps_graph_schema import (
     MPSEmbedding,
     MPSGraph,
+    MPSIndexTensor,
+    MPSIndexPut,
     MPSIndexSelect,
 )
 from executorch.backends.apple.mps.utils.mps_utils import get_input_node
 from executorch.exir.sym_util import eval_expr
-
+from executorch.backends.transforms import get_shape
 
 @register_node_visitor
 class IndexSelectVisitor(NodeVisitor):
@@ -39,6 +41,76 @@ def define_node(
 
         mps_graph.mps_nodes.append(mps_node)
 
+@register_node_visitor
+class IndexTensorVisitor(NodeVisitor):
+    """Serializes ``aten.index.Tensor`` nodes as ``MPSIndexTensor`` MPS nodes."""
+
+    target = "aten.index.Tensor"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        """Append an ``MPSIndexTensor`` node describing ``node`` to ``mps_graph``.
+
+        Every entry of ``node.args[1]`` is defined as a tensor in ``mps_graph``
+        and the id returned for it is recorded in the node's ``indices_id``.
+        """
+        index_node = self.create_unary_node(node, mps_graph, MPSIndexTensor)
+        index_ids = index_node.mpsnode_union.indices_id
+        for index_tensor in cast(List[torch.fx.Node], node.args[1]):
+            index_ids.append(self.define_tensor(index_tensor, mps_graph))
+
+        mps_graph.mps_nodes.append(index_node)
+
+
+
+# [MPS TODO]: Works on a single iteration of llama2, but subsequent tokens
+# are wrong when using Index put. Disabling it for now.
+@register_node_visitor
+class IndexPutVisitor(NodeVisitor):
+    """Serializes index-put nodes as ``MPSIndexPut`` MPS nodes.
+
+    The visitor is registered under the placeholder target ``"disabled"``
+    instead of the real operator name, so it is never matched by name.
+    """
+
+    # target = "aten.index_put.default"
+    target = "disabled"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def infer_sizes(self, a: List[int], b: List[int]) -> List[int]:
+        """Merge two shapes aligned on their trailing dimensions.
+
+        Args:
+            a: First shape.
+            b: Second shape.
+
+        Returns:
+            A list as long as the longer of ``a`` and ``b``. Walking from the
+            last dimension backwards, each entry is the size from ``b`` when
+            ``b`` has that dimension, otherwise the size from ``a``.
+        """
+        dims_a = len(a)
+        dims_b = len(b)
+        ndim = max(dims_a, dims_b)
+        expanded_sizes = [0] * ndim
+        for offset in range(ndim):
+            dim_a = dims_a - 1 - offset
+            dim_b = dims_b - 1 - offset
+            # -1 marks "this shape has no dimension at this position".
+            size_a = a[dim_a] if dim_a >= 0 else -1
+            size_b = b[dim_b] if dim_b >= 0 else -1
+            expanded_sizes[ndim - 1 - offset] = size_a if size_b == -1 else size_b
+
+        return expanded_sizes
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        mps_graph: MPSGraph,
+    ) -> None:
+        """Append an ``MPSIndexPut`` node describing ``node`` to ``mps_graph``.
+
+        ``node.args[0]`` is the input, ``node.args[1]`` the list of index
+        tensors and ``node.args[2]`` the update values.
+        """
+        mps_node = self.create_unary_node(node, mps_graph, MPSIndexPut)
+        updates_shape = get_shape(node.args[2])
+        input_shape = get_shape(node.args[0])
+        # An explicit values shape is only recorded when the updates are
+        # neither 1-D nor already of the same rank as the input.
+        if len(updates_shape) != 1 and len(updates_shape) != len(input_shape):
+            mps_node.mpsnode_union.values_shape = self.infer_sizes(
+                input_shape, updates_shape
+            )
+
+        for tensor in cast(List[torch.fx.Node], node.args[1]):
+            mps_node.mpsnode_union.indices_id.append(self.define_tensor(tensor, mps_graph))
+
+        mps_node.mpsnode_union.values_id = self.define_tensor(
+            get_input_node(node, 2), mps_graph
+        )
+        mps_graph.mps_nodes.append(mps_node)
 
 @register_node_visitor
 class EmbeddingVisitor(NodeVisitor):
 
@@ -41,6 +41,7 @@
     MPSSqrt,
     MPSTan,
     MPSTanh,
+    MPSLogicalNot,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
 
@@ -79,6 +80,7 @@ class UnaryOpVisitor(NodeVisitor):
         "aten.isnan.default",
         "aten.isinf.default",
         "aten.round.default",
+        "aten.logical_not.default",
     ]
 
     def __init__(self, *args) -> None:
@@ -115,6 +117,7 @@ def __init__(self, *args) -> None:
             exir_ops.edge.aten.isnan.default: MPSIsnan,
             exir_ops.edge.aten.isinf.default: MPSIsinf,
             exir_ops.edge.aten.round.default: MPSRound,
+            exir_ops.edge.aten.logical_not.default: MPSLogicalNot,
         }
 
     def define_node(
 
@@ -4,7 +4,7 @@
 #
 
 import logging
-from typing import Any, Dict, List, Union
+from typing import cast, Any, Dict, List, Union
 
 import torch
 from executorch.backends.apple.mps.mps_preprocess import MPSBackend
@@ -23,11 +23,19 @@
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import Partition
 from torch.fx.passes.operator_support import OperatorSupportBase
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.backends.transforms import get_shape
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.DEBUG, format=FORMAT)
 
 
+# ops implemented as Metal kernels.
+METAL_KERNELS = [
+    exir_ops.edge.aten.index.Tensor,
+    exir_ops.edge.aten.index_put.default,
+]
+
 class MPSOperatorSupport(OperatorSupportBase):
     def __init__(self, edge_program: torch.export.ExportedProgram, compiler_specs):
         self.node_visitors = get_node_visitors(edge_program)
@@ -65,10 +73,42 @@ def generate_partitions(self, edge_program: ExportedProgram) -> List[Any]:
             op_support=self.supported_ops,
         )
 
+    def mps_graph_advanced_indexing_support(self, node: torch.fx.Node):
+        """Return True when the indexing ``node`` can be dispatched to MPSGraph.
+
+        MPSGraph is used when the number of non-``None`` index tensors in
+        ``node.args[1]`` equals the number of dimensions of the indexed
+        tensor ``node.args[0]``, or when exactly one index tensor is present.
+        All other cases fall back to a Metal kernel.
+        """
+        index_tensors = cast(List[torch.fx.Node], node.args[1])
+        indexed_tensor = cast(torch.fx.Node, node.args[0])
+        num_indices = sum(1 for index in index_tensors if index is not None)
+        return num_indices == len(get_shape(indexed_tensor)) or num_indices == 1
+
+    def use_metal_kernel(self, node: torch.fx.Node):
+        """Return True when ``node`` has to run as a Metal kernel.
+
+        That is the case for the indexing ops listed in ``METAL_KERNELS``
+        whenever ``mps_graph_advanced_indexing_support`` rejects the node.
+        """
+        if node.target not in METAL_KERNELS:
+            return False
+
+        is_indexing_op = (
+            node.target == exir_ops.edge.aten.index.Tensor
+            or node.target == exir_ops.edge.aten.index_put.default
+        )
+        if not is_indexing_op:
+            return False
+
+        return not self.mps_graph_advanced_indexing_support(node)
+
     def tag_nodes(self, partitions: List[Partition]) -> None:
+        # Write a "delegation_tag" into the meta of every node of every partition
+        # and register each tag in self.partition_tags with the delegation spec.
+        # Nodes that must run as Metal kernels get a tag of their own.
         for partition in partitions:
-            for node in partition.nodes:
+            # Running index appended to the tag; it restarts at 0 per partition.
+            crt_partition_counter = 0
+            # NOTE(review): sorted() needs torch.fx.Node to be orderable -- confirm
+            # this holds for the torch versions this backend supports.
+            for node in sorted(partition.nodes):
                 delegation_tag = f"mps_{partition.id}"
+                if self.use_metal_kernel(node):
+                    logging.warning(f"[WARNING] Using Metal kernel for op {node.name}!")
+                    # Partition the Metal kernel into a separate partition
+                    crt_partition_counter += 1
+                    delegation_tag = f"{delegation_tag}_metal_kernel_{crt_partition_counter}"
+                    # Advance past the kernel's index so the nodes that follow
+                    # are tagged with a new counter value.
+                    crt_partition_counter += 1
+                else:
+                    delegation_tag = f"{delegation_tag}_{crt_partition_counter}"
+
                 node.meta["delegation_tag"] = delegation_tag
                 self.partition_tags[delegation_tag] = self.delegation_spec
 
 
@@ -5,10 +5,19 @@
 
 #pragma once
 
+// Obj-C headers
 #include <Foundation/Foundation.h>
 #include <Metal/Metal.h>
+
+// Runtime headers
+#include <executorch/runtime/backend/interface.h>
+
+// MPS headers
 #include <MetalPerformanceShaders/MetalPerformanceShaders.h>
 
+#include <unordered_map>
+#include <vector>
+
 #define MB(x) (x * 1048576UL)
 
 namespace torch {
@@ -25,6 +34,11 @@ enum class MacOSVersion : uint32_t {
   MACOS_VER_14_0_PLUS,
 };
 
+// Identifies a Metal library; used as the key of MPSDevice's library cache.
+enum class LibraryType : uint32_t {
+  INDEXING_KERNELS = 0,
+  // MAX currently aliases INDEXING_KERNELS, the only library type declared here.
+  MAX = INDEXING_KERNELS,
+};
+
 class MPSDevice {
  public:
   /**
@@ -53,9 +67,18 @@ class MPSDevice {
 
   ~MPSDevice();
 
+  /**
+   * Compile a PSO for a given library type.
+   * Once compiled, the library and PSOs are cached.
+   */
+  Error compilePSO(LibraryType libraryType, const char* kernelName);
+  Error compileLibrary(LibraryType);
+
  private:
   static MPSDevice* _device;
   id<MTLDevice> _mtl_device;
+  std::unordered_map<LibraryType, id<MTLLibrary>> _m_library_cache;
+  std::unordered_map<std::string, id<MTLComputePipelineState>> _m_pso_cache;
   MPSDevice();
 };
 
 
@@ -16,6 +16,20 @@
 static std::unique_ptr<MPSDevice> mps_device;
 static std::once_flag mpsdev_init;
 
+// Picks the Metal language version used to compile shader libraries.
+// Aborts (via ET_CHECK_MSG) when the device does not support MTLGPUFamilyMac2.
+static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device, bool macOS13Plus) {
+  ET_CHECK_MSG([device supportsFamily:MTLGPUFamilyMac2], "Missing Metal support for MTLGPUFamilyMac2");
+
+#if defined(__MAC_13_0)
+  // Metal 3.0 is only selected when built against a macOS 13+ SDK and running on macOS 13+.
+  if (macOS13Plus) {
+    return MTLLanguageVersion3_0;
+  }
+#endif
+
+  // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
+  // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+)
+  return MTLLanguageVersion2_3;
+}
+
 MPSDevice::~MPSDevice() {
   [_mtl_device release];
   _mtl_device = nil;
@@ -79,6 +93,57 @@
   }
 }
 
+// Returns the C string that is compiled as the source of the given library type.
+// NOTE(review): INDEXING_KERNELS maps to the literal "Hello", which looks like a
+// placeholder rather than Metal shader source -- confirm before relying on it.
+const char* getLibraryCString(LibraryType libraryType) {
+  switch (libraryType) {
+    case LibraryType::INDEXING_KERNELS:
+      return "Hello";
+    default:
+      // Every other library type fails this check.
+      ET_CHECK_MSG(false, "Unhandled library type!");
+  }
+}
+
+/**
+ * Compile the Metal library for `libraryType` and store it in the library cache.
+ *
+ * Returns Error::Ok on success, or Error::Internal when Metal fails to
+ * create the library from its source.
+ */
+Error
+MPSDevice::compileLibrary(LibraryType libraryType) {
+  NSError* error = nil;
+  MTLCompileOptions* options = [MTLCompileOptions new];
+  [options setLanguageVersion:getMetalLanguageVersion(_mtl_device, isMacOS13Plus(MacOSVersion::MACOS_VER_13_0_PLUS))];
+  [options setFastMathEnabled:YES];
+  id<MTLLibrary> lib =
+      [_mtl_device newLibraryWithSource:[NSString stringWithCString:getLibraryCString(libraryType)
+                                                           encoding:NSASCIIStringEncoding]
+                                options:options
+                                  error:&error];
+  // `new` hands back an owned object and this file uses manual retain/release,
+  // so the options must be released here, before the early return below.
+  [options release];
+
+  ET_CHECK_OR_RETURN_ERROR(
+    lib != nil,
+    Internal,
+    "Failed to create indexing library, error: %s", [[error description] UTF8String]
+  );
+
+  _m_library_cache[libraryType] = lib;
+  return Error::Ok;
+}
+
+/**
+ * Make sure the library for `libraryType` is compiled and cached.
+ *
+ * Compiling the pipeline state object for `kernelName` is not implemented
+ * yet: a missing cache entry is only logged and nothing is added to the
+ * PSO cache.
+ *
+ * Returns Error::Ok, or Error::Internal when the library fails to compile.
+ */
+Error
+MPSDevice::compilePSO(LibraryType libraryType, const char* kernelName) {
+  if (_m_library_cache.find(libraryType) == _m_library_cache.end()) {
+    // A scoped enum is not promoted to int when passed as a format argument,
+    // so convert it explicitly to match the %d specifier.
+    ET_LOG(Debug, "Compiling library type: %d", static_cast<int>(libraryType));
+    Error err = compileLibrary(libraryType);
+    ET_CHECK_OR_RETURN_ERROR(
+      err == Error::Ok,
+      Internal,
+      "An error occurred while compiling library %d", static_cast<int>(libraryType)
+    );
+  }
+  if (_m_pso_cache.find(kernelName) == _m_pso_cache.end()) {
+    ET_LOG(Debug, "Compiling kernel: %s", kernelName);
+    // TODO: create the compute pipeline state for kernelName and cache it.
+  }
+  return Error::Ok;
+}
+
 bool isMacOS13OrNewer(MacOSVersion version) {
   return MPSDevice::getInstance()->isMacOS13Plus(version);
 }
 
@@ -109,6 +109,7 @@ class MPSGraphBuilder {
   _DEFINE_MPS_OP(Isnan);
   _DEFINE_MPS_OP(Isinf);
   _DEFINE_MPS_OP(Round);
+  _DEFINE_MPS_OP(LogicalNot);
   _DEFINE_MPS_OP(NormCdf);
   // Clamp ops
   _DEFINE_MPS_OP(Clamp);
@@ -120,6 +121,8 @@ class MPSGraphBuilder {
   // Indexing ops
   _DEFINE_MPS_OP(IndexSelect);
   _DEFINE_MPS_OP(Embedding);
+  _DEFINE_MPS_OP(IndexTensor);
+  _DEFINE_MPS_OP(IndexPut);
   // Linear algebra ops
   _DEFINE_MPS_OP(MatMul);
   _DEFINE_MPS_OP(Addmm);
@@ -153,6 +156,7 @@ class MPSGraphBuilder {
 
   // Helper functions
   Error addNodeToMPSGraph(NodePtr nodePtr);
+  Error compileMetalKernel(NodePtr nodePtr);
   MPSShape *getMPSShape(int32_t id);
   MPSShape *getMPSShape(const flatbuffers::Vector<int32_t> *shape);
   int64_t numel(const flatbuffers::Vector<int32_t> *shape);
@@ -161,6 +165,8 @@ class MPSGraphBuilder {
   MPSGraphTensor *getMPSGraphTensor(int32_t id);
   NSData *getConstantData(int32_t id);
   std::pair<float, float> getMinMaxValues(NodePtr nodePtr);
+  Error compileMPSGraph();
+  Error compileMetalKernel();
 
   // Each MPSGraph op result in at least MPSGraphTensor being
   // produced, which will be stored in this structure. Other ops
@@ -172,6 +178,7 @@ class MPSGraphBuilder {
   // FlatBuffer raw bytes of the serialized MPS model.
   const void *_buffer_pointer;
 
+  bool _metal_kernel;
   MPSGraph *_mpsGraph;
   MPSGraphExecutable *_mpsGraphExecutable;
   NSMutableDictionary<MPSGraphTensor *, MPSGraphShapedType *> *_feeds;
Original file line number	Diff line number	Diff line change
`@@ -41,6 +41,7 @@`
`41`	`41`	`MPSSqrt,`
`42`	`42`	`MPSTan,`
`43`	`43`	`MPSTanh,`
	`44`	`+ MPSLogicalNot,`
`44`	`45`	`)`
`45`	`46`	`from executorch.exir.dialects._ops import ops as exir_ops`
`46`	`47`
`@@ -79,6 +80,7 @@ class UnaryOpVisitor(NodeVisitor):`
`79`	`80`	`"aten.isnan.default",`
`80`	`81`	`"aten.isinf.default",`
`81`	`82`	`"aten.round.default",`
	`83`	`+ "aten.logical_not.default",`
`82`	`84`	`]`
`83`	`85`
`84`	`86`	`def __init__(self, *args) -> None:`
`@@ -115,6 +117,7 @@ def __init__(self, *args) -> None:`
`115`	`117`	`exir_ops.edge.aten.isnan.default: MPSIsnan,`
`116`	`118`	`exir_ops.edge.aten.isinf.default: MPSIsinf,`
`117`	`119`	`exir_ops.edge.aten.round.default: MPSRound,`
	`120`	`+ exir_ops.edge.aten.logical_not.default: MPSLogicalNot,`
`118`	`121`	`}`
`119`	`122`
`120`	`123`	`def define_node(`