Skip to content

Commit 23e04e2

Browse files
committed
[ET-VK][13/n] split_with_sizes with more test codegen
Pull Request resolved: #3389 Life is fun when the code-gen is more challenging than writing the operator itself. 1. Test codegen update to include vector of Tensor as output: `Tensor(a)[]`. 2. `aten.split_with_sizes.default` 3. `aten.split.Tensor` 4. Improve `DimUtils` for better dimension reasoning. ghstack-source-id: 224258792 @exported-using-ghexport Differential Revision: [D56660525](https://our.internmc.facebook.com/intern/diff/D56660525/)
1 parent ee02c32 commit 23e04e2

File tree

6 files changed

+351
-35
lines changed

6 files changed

+351
-35
lines changed
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
10+
11+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/Copy.h>
12+
13+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
14+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
15+
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
16+
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
17+
18+
namespace vkcompute {
19+
20+
/*
 * Adds a node that splits the input tensor `in` along `dim` into chunks whose
 * sizes are given by `split_sizes`, writing chunk i into the i-th tensor of
 * the value list referenced by `out_list_ref`.
 *
 * Only the channels-packed memory layout is supported for the input and all
 * outputs, and each output's extent along the split dimension must equal the
 * corresponding entry of `split_sizes`.
 */
void add_split_with_sizes_default_node(
    ComputeGraph& graph,
    ValueRef in,
    const std::vector<int64_t>& split_sizes,
    int64_t dim,
    ValueRef out_list_ref) {
  vTensorPtr t_in = graph.get_tensor(in);

  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));

  ValueListPtr out_list = graph.get_value_list(out_list_ref);

  // Normalize the user-facing dim to a canonical NCHW dimension so tensors of
  // different ranks can be handled uniformly below.
  NchwDim nchw_dim = normalize_to_nchw_dim(*t_in, dim);

  VK_CHECK_COND(out_list->size() == split_sizes.size());

  // Validate every output before emitting any copy nodes.
  for (size_t split_idx = 0; split_idx < split_sizes.size(); split_idx++) {
    int64_t split_size = split_sizes[split_idx];
    ValueRef out_ref = (*out_list)[split_idx];

    vTensorPtr t_out = graph.get_tensor(out_ref);
    VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
    VK_CHECK_COND(dim_at(*t_out, nchw_dim) == split_size);
  }

  if (nchw_dim == DimWidth || nchw_dim == DimHeight || nchw_dim == DimBatch) {
    // Width, height and batch splits each walk along a single axis of the
    // texture extents: x for width, y for height, z for batch. The three
    // cases share one loop that advances the source offset along that axis.
    const int32_t axis =
        nchw_dim == DimWidth ? 0 : (nchw_dim == DimHeight ? 1 : 2);

    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

    for (ValueRef out_ref : *out_list) {
      // Doesn't need to use split_size since we have already verified that
      // the output tensor's size matches with the split_size.
      vTensorPtr t_out = graph.get_tensor(out_ref);
      api::utils::ivec3 range = t_out->texture_limits();
      add_copy_offset_node(graph, in, range, src_offset, dst_offset, out_ref);

      src_offset.data[axis] += range.data[axis];
    }
  } else if (nchw_dim == DimChannel) {
    // Channel splits cannot be expressed as a plain texel offset under
    // channels packing, so use the dedicated channel-offset copy node and
    // track the offset in channel units instead of texels.
    int32_t src_offset = 0;
    int32_t dst_offset = 0;

    for (ValueRef out_ref : *out_list) {
      vTensorPtr t_out = graph.get_tensor(out_ref);
      int32_t range = dim_at<Dim4D::Channel>(t_out->sizes());
      add_copy_channel_offset_node(
          graph, in, range, src_offset, dst_offset, out_ref);
      src_offset += range;
    }
  } else {
    VK_THROW("not implemented");
  }
}
96+
97+
/*
 * Wrapper overload: unpacks the split-sizes int list and the dim scalar from
 * their graph value references, then dispatches to the main implementation.
 */
void add_split_with_sizes_default_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef split_sizes_ref,
    ValueRef dim_ref,
    ValueRef out) {
  const std::vector<int64_t> sizes = *(graph.get_int_list(split_sizes_ref));
  const int64_t split_dim = graph.extract_scalar<int64_t>(dim_ref);

  add_split_with_sizes_default_node(graph, in, sizes, split_dim, out);
}
108+
109+
// Operator entry point for aten.split_with_sizes.default.
// args layout: {input, split_sizes, dim, output tensor list}.
void split_with_sizes_default(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  const ValueRef input = args[0];
  const ValueRef split_sizes = args[1];
  const ValueRef split_dim = args[2];
  const ValueRef out_list = args[3];
  add_split_with_sizes_default_node(graph, input, split_sizes, split_dim, out_list);
}
114+
115+
/*
 * Implements aten.split.Tensor: splits `in` along `dim` into chunks of
 * `split_size`. Per ATen semantics the trailing chunk is smaller when the
 * dimension size is not evenly divisible by split_size.
 */
void add_split_tensor_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef split_size_ref,
    ValueRef dim_ref,
    ValueRef out) {
  int64_t split_size = graph.extract_scalar<int64_t>(split_size_ref);
  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);

  // Guard against division by zero below.
  VK_CHECK_COND(split_size > 0);

  vTensorPtr t_in = graph.get_tensor(in);
  NchwDim nchw_dim = normalize_to_nchw_dim(*t_in, dim);
  int64_t size = dim_at(*t_in, nchw_dim);

  // Expand the single split_size into an explicit per-chunk size list. The
  // remainder chunk matches aten.split.Tensor semantics; previously it was
  // silently dropped when size % split_size != 0.
  std::vector<int64_t> split_sizes(size / split_size, split_size);
  if (const int64_t remainder = size % split_size; remainder != 0) {
    split_sizes.push_back(remainder);
  }

  add_split_with_sizes_default_node(graph, in, split_sizes, dim, out);
}
131+
132+
// Operator entry point for aten.split.Tensor.
// args layout: {input, split_size, dim, output tensor list}.
void split_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef input = args[0];
  const ValueRef split_size = args[1];
  const ValueRef split_dim = args[2];
  const ValueRef out_list = args[3];
  add_split_tensor_node(graph, input, split_size, split_dim, out_list);
}
135+
136+
// Bind the graph builders above to their ATen operator names so the Vulkan
// delegate can look them up at graph-build time.
REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.split_with_sizes.default, split_with_sizes_default);
  VK_REGISTER_OP(aten.split.Tensor, split_tensor);
}
140+
141+
} // namespace vkcompute

backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,27 @@
1212

1313
namespace vkcompute {
1414

15+
// A canonical way to represent dimensions as an enum. Motivation behind a
// canonical enum is that the user tensor uses a "big-endian"-ish mechanism to
// reference a dimension in an nchw-tensor, so tensors of different rank have
// different mappings from dim to the underlying texture dimension. For
// instance, for a 2d (height x width) tensor, dim 0 refers to height and
// dim 1 refers to width; for a 4d (batch x channel x height x width) tensor,
// dim 0 refers to batch and dim 1 refers to channel. Using this canonical
// enum brings clarity to the code.

enum NchwDim : uint32_t {
  DimWidth = 1u,
  DimHeight = 2u,
  DimChannel = 3u,
  DimBatch = 4u,
};

// Convert a dim provided by the user into the canonical enum by counting from
// the innermost dimension: dim() - dim == 1 is width, 2 is height, etc.
// NOTE(review): assumes 0 <= dim < v_in.dim(); negative dims do not appear to
// be normalized here — confirm callers pre-normalize.
inline NchwDim normalize_to_nchw_dim(const vTensor& v_in, int32_t dim) {
  return static_cast<NchwDim>(v_in.dim() - dim);
}
35+
1536
/*
1637
* Maps a semantic dimension name to an integer that
1738
* corresponds to its innermost ordering in a 4D tensor in
@@ -20,10 +41,10 @@ namespace vkcompute {
2041
* corresponds to 2, and so on.
2142
*/
2243
struct Dim4D {
23-
static constexpr uint32_t Width = 1u;
24-
static constexpr uint32_t Height = 2u;
25-
static constexpr uint32_t Channel = 3u;
26-
static constexpr uint32_t Batch = 4u;
44+
static constexpr uint32_t Width = DimWidth;
45+
static constexpr uint32_t Height = DimHeight;
46+
static constexpr uint32_t Channel = DimChannel;
47+
static constexpr uint32_t Batch = DimBatch;
2748
};
2849

2950
/*
@@ -65,34 +86,20 @@ uint32_t dim_at(const std::vector<int64_t>& sizes) {
6586
return dims < N ? 1 : api::utils::safe_downcast<uint32_t>(sizes[dims - N]);
6687
}
6788

89+
inline uint32_t dim_at(const std::vector<int64_t>& sizes, NchwDim nchw_dim) {
90+
const uint32_t dims = sizes.size();
91+
return dims < nchw_dim
92+
? 1
93+
: api::utils::safe_downcast<uint32_t>(sizes[dims - nchw_dim]);
94+
}
95+
6896
template <uint32_t N>
6997
uint32_t dim_at(const vTensor& v_in) {
7098
return dim_at<N>(v_in.sizes());
7199
}
72100

73-
// A canonical way to represent dimensions as enum. Intended to use the same
74-
// value as Dim4D for potential future refactoring.
75-
76-
enum NchwDim {
77-
DimWidth = 1,
78-
DimHeight = 2,
79-
DimChannel = 3,
80-
DimBatch = 4,
81-
};
82-
83-
/* This function return a NchwDim
84-
* given a Tensor and a user provided dim. The reason for this normalization is
85-
* that in the user tensor coordinate, it is using a "big-endian" mechanism when
86-
* referring to a nchw dimension, in that dim=0 refers to the batch dimension in
87-
* a 4d tensor but dim=0 reference to height in a 2d tensor. Despite in a common
88-
* texture representation of channel packing, a 2d tensor has exactly the same
89-
* layout as a 4d with the batch and channel size equals to 1. This function
90-
* returns a canonical dimension to simplify dimension reasoning in the code.
91-
*
92-
*/
93-
94-
inline NchwDim normalize_to_nchw_dim(const vTensor& v_in, int32_t dim) {
95-
return static_cast<NchwDim>(v_in.dim() - dim);
101+
inline uint32_t dim_at(const vTensor& v_in, NchwDim nchw_dim) {
102+
return dim_at(v_in.sizes(), nchw_dim);
96103
}
97104

98105
inline std::ostream& operator<<(std::ostream& os, NchwDim nchw_dim) {

backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88

99
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
1010

11-
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
12-
1311
namespace vkcompute {
1412

1513
//

backends/vulkan/test/op_tests/cases.py

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,7 @@ def get_cat_inputs():
438438
([(3, 5), (4, 5)], 0),
439439
([(3, 5), (4, 5), (1, 5)], 0),
440440
(
441-
[
442-
(3, 5),
443-
],
441+
[(3, 5)],
444442
0,
445443
),
446444
# Cat on Width
@@ -449,9 +447,7 @@ def get_cat_inputs():
449447
([(5, 3), (5, 4)], 1),
450448
([(5, 3), (5, 4), (5, 1)], 1),
451449
(
452-
[
453-
(5, 4),
454-
],
450+
[(5, 4)],
455451
1,
456452
),
457453
([(5,), (6,)], 0),
@@ -474,6 +470,91 @@ def get_cat_inputs():
474470
return test_suite
475471

476472

473+
def get_split_with_sizes_inputs():
    """Build the test suite for aten.split_with_sizes.default.

    Each case is (input shape, per-chunk sizes along `dim`, split dim).
    """
    # Renamed from "VkSliceTest" (copy-paste from the slice suite) so test
    # reprs identify the correct operator.
    Test = namedtuple("VkSplitTest", ["self", "sizes", "dim"])
    test_cases = [
        # Split on Width
        Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3),
        Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
        Test(self=(7, 10, 10), sizes=[1, 9], dim=2),
        Test(self=(10, 10), sizes=[1, 9], dim=1),
        Test(self=(10,), sizes=[1, 9], dim=0),
        # Split on Height
        Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2),
        Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1),
        Test(self=(7, 10, 10), sizes=[10], dim=1),
        Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1),
        Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0),
        # Split on Batch
        Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0),
        Test(self=(10, 7, 10, 10), sizes=[10], dim=0),
        # Split on Channel
        Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1),
        Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1),
        Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0),
        Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0),
        Test(self=(13, 4, 8), sizes=[13], dim=0),
    ]
    test_suite = VkTestSuite([tuple(tc) for tc in test_cases])

    test_suite.layouts = [
        "api::kChannelsPacked",
    ]
    test_suite.data_gen = "make_seq_tensor"
    test_suite.dtypes = ["at::kFloat"]
    return test_suite
506+
507+
508+
def get_split_tensor_inputs():
    """Build the test suite for aten.split.Tensor.

    Each case is (input shape, split_size, dim).
    """
    cases = [
        # Split on Width
        ((S1, 7, 10, 12), 12, 3),
        ((S1, 7, 10, 12), 3, 3),
        ((S1, 7, 10, 12), 1, 3),
        ((7, 10, 12), 12, 2),
        ((7, 10, 12), 3, 2),
        ((7, 10, 12), 1, 2),
        ((10, 12), 12, 1),
        ((10, 12), 3, 1),
        ((10, 12), 1, 1),
        ((12,), 12, 0),
        ((12,), 3, 0),
        ((12,), 1, 0),
        # Split on Height
        ((S1, 7, 12, 8), 12, 2),
        ((S1, 7, 12, 8), 3, 2),
        ((S1, 7, 12, 8), 1, 2),
        ((7, 12, 8), 12, 1),
        ((7, 12, 8), 3, 1),
        ((7, 12, 8), 1, 1),
        ((12, 8), 12, 0),
        ((12, 8), 3, 0),
        ((12, 8), 1, 0),
        # Split on Batch
        ((12, 7, 10, 10), 12, 0),
        ((12, 7, 10, 10), 3, 0),
        ((12, 7, 10, 10), 1, 0),
        # Split on Channel
        ((7, 15, 10, 10), 15, 1),
        ((7, 15, 10, 10), 5, 1),
        ((7, 15, 10, 10), 3, 1),
        ((7, 15, 10, 10), 1, 1),
        ((15, 10, 10), 15, 0),
        ((15, 10, 10), 5, 0),
        ((15, 10, 10), 3, 0),
        ((15, 10, 10), 1, 0),
    ]
    test_suite = VkTestSuite(cases)

    test_suite.layouts = [
        "api::kChannelsPacked",
    ]
    test_suite.data_gen = "make_seq_tensor"
    test_suite.dtypes = ["at::kFloat"]
    return test_suite
556+
557+
477558
test_suites = {
478559
"aten.add.Tensor": get_binary_elementwise_inputs(),
479560
"aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -494,4 +575,6 @@ def get_cat_inputs():
494575
"aten.clone.default": get_clone_inputs(),
495576
"aten.repeat.default": get_repeat_inputs(),
496577
"aten.cat.default": get_cat_inputs(),
578+
"aten.split_with_sizes.default": get_split_with_sizes_inputs(),
579+
"aten.split.Tensor": get_split_tensor_inputs(),
497580
}

0 commit comments

Comments
 (0)