
Commit 7a46ac7

[ET-VK][12/n] aten.cat with more codegen
1. The `aten.cat` operation is straightforward to implement using the `copy_*_node` helpers.
2. The complexity comes from the codegen. We need to introduce an `AT_TENSOR_LIST` type, which contains a list of `AT_TENSOR` values with `is_in=True`. The tensor list itself, as a container, is not an `IOValueRef`, but the elements inside it are. This leads to some ugly if-then-else in the codegen.

Differential Revision: [D56626865](https://our.internmc.facebook.com/intern/diff/D56626865/)

[ghstack-poisoned]
1 parent d8191ee commit 7a46ac7
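To make point 2 concrete, here is a minimal sketch of the if-then-else the message describes, in plain Python with hypothetical names (`Arg`, the `emit_*` helpers, and the emitted C++ calls are illustrative stand-ins, not the actual generator code in backends/vulkan/test/op_tests):

# Illustrative sketch only. `Arg` and the emit_* helpers are hypothetical
# stand-ins for the real test-code generator; the emitted C++ is schematic.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Arg:
    name: str
    vk_type: str  # e.g. "AT_TENSOR" or "AT_TENSOR_LIST"
    elements: List[str] = field(default_factory=list)


def emit_input_decl(arg: Arg) -> str:
    if arg.vk_type == "AT_TENSOR_LIST":
        # The list itself is a plain container, not an IOValueRef, but
        # every element inside it is staged as an input (is_in=True).
        elem_decls = "\n".join(
            f"IOValueRef {e} = graph.add_input_tensor(/* ... */);"
            for e in arg.elements
        )
        refs = ", ".join(f"{e}.value" for e in arg.elements)
        return f"{elem_decls}\nValueRef {arg.name} = graph.add_value_list({{{refs}}});"
    if arg.vk_type == "AT_TENSOR":
        # A standalone input tensor is itself one IOValueRef.
        return f"IOValueRef {arg.name} = graph.add_input_tensor(/* ... */);"
    return f"ValueRef {arg.name} = graph.add_scalar(/* ... */);"


print(emit_input_decl(Arg("tensors", "AT_TENSOR_LIST", ["t0", "t1"])))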

File tree

7 files changed, +309 -24 lines changed
backends/vulkan/runtime/graph/ops/impl/Cat.cpp

Lines changed: 103 additions & 0 deletions

@@ -0,0 +1,103 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/api/api.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Copy.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

void add_cat_default_node(
    ComputeGraph& graph,
    ValueRef in_list_ref,
    ValueRef dim_ref,
    ValueRef out) {
  ValueListPtr input_list = graph.get_value_list(in_list_ref);

  // Every input tensor must use the channels-packed memory layout.
  for (ValueRef input_ref : *input_list) {
    vTensorPtr t_in = graph.get_tensor(input_ref);
    VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
  }

  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
  vTensorPtr t_out = graph.get_tensor(out);

  NchwDim nchw_dim = normalize_to_nchw_dim(*t_out, dim);

  // TODO: Find ways to factor out the similar code for width, height, and batch
  if (nchw_dim == DimWidth) {
    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

    for (ValueRef input_ref : *input_list) {
      vTensorPtr t_in = graph.get_tensor(input_ref);
      api::utils::ivec3 range = t_in->texture_limits();
      add_copy_offset_node(
          graph, input_ref, range, src_offset, dst_offset, out);
      dst_offset.data[0] += range.data[0];
    }
  } else if (nchw_dim == DimHeight) {
    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

    for (ValueRef input_ref : *input_list) {
      vTensorPtr t_in = graph.get_tensor(input_ref);
      api::utils::ivec3 range = t_in->texture_limits();
      add_copy_offset_node(
          graph, input_ref, range, src_offset, dst_offset, out);
      dst_offset.data[1] += range.data[1];
    }
  } else if (nchw_dim == DimBatch) {
    api::utils::ivec3 src_offset = api::utils::make_ivec3({0, 0, 0}, false);
    api::utils::ivec3 dst_offset = api::utils::make_ivec3({0, 0, 0}, false);

    for (ValueRef input_ref : *input_list) {
      vTensorPtr t_in = graph.get_tensor(input_ref);
      api::utils::ivec3 range = t_in->texture_limits();
      add_copy_offset_node(
          graph, input_ref, range, src_offset, dst_offset, out);
      dst_offset.data[2] += range.data[2];
    }
  } else if (nchw_dim == DimChannel) {
    // Channel concatenation cannot be expressed as a plain texel-offset
    // copy under channels packing, so it works in channel counts and uses
    // the dedicated channel-offset copy node.
    int32_t src_offset = 0;
    int32_t dst_offset = 0;

    for (ValueRef input_ref : *input_list) {
      vTensorPtr t_in = graph.get_tensor(input_ref);
      int32_t range = dim_at<Dim4D::Channel>(t_in->sizes());
      add_copy_channel_offset_node(
          graph, input_ref, range, src_offset, dst_offset, out);
      dst_offset += range;
    }
  } else {
    VK_THROW("Unexpected value of nchw_dim=", nchw_dim);
  }
}

void cat_default(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  add_cat_default_node(graph, args[0], args[1], args[2]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.cat.default, cat_default);
}

} // namespace vkcompute
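In the width, height, and batch branches above, the destination offset advances along the matching texture axis by each input's extent, while the channel branch accumulates channel counts from the tensor sizes. A quick sketch of the accumulation for a width-dim cat (plain Python; the widths are assumed values for illustration, not taken from the tests):

# Sketch: dst offsets produced by the DimWidth branch for three inputs
# whose x-extents (widths) are 3, 4, and 1.
widths = [3, 4, 1]
dst_offset = 0
for w in widths:
    print(f"copy x-range {w} into the output at x-offset {dst_offset}")
    dst_offset += w  # each input lands right after the previous one
# Prints offsets 0, 3, and 7; the concatenated output has width 8.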

backends/vulkan/runtime/graph/ops/impl/Copy.cpp

Lines changed: 15 additions & 2 deletions
@@ -96,10 +96,23 @@ void add_copy_channel_offset_node(
   VK_CHECK_COND(
       dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
-      "Source channel plus range should be less than or equal to input tensor's channel size");
+      "Src channel (",
+      src_channel_offset,
+      ") and range (",
+      channel_range,
+      ") should be less than or equal to input tensor's channel size (",
+      dim_at<Dim4D::Channel>(in_sizes),
+      ")");
+
   VK_CHECK_COND(
       dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
-      "Source channel and range should be less than or equal to input tensor's channel size");
+      "Dst channel (",
+      dst_channel_offset,
+      ") and range (",
+      channel_range,
+      ") should be less than or equal to output tensor's channel size (",
+      dim_at<Dim4D::Channel>(out_sizes),
+      ")");

   VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
   VK_CHECK_COND(

backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h

Lines changed: 46 additions & 0 deletions
@@ -70,4 +70,50 @@ uint32_t dim_at(const vTensor& v_in) {
   return dim_at<N>(v_in.sizes());
 }

+// A canonical way to represent dimensions as an enum. Intended to use the
+// same values as Dim4D for potential future refactoring.
+enum NchwDim {
+  DimWidth = 1,
+  DimHeight = 2,
+  DimChannel = 3,
+  DimBatch = 4,
+};
+
+/*
+ * This function returns an NchwDim given a tensor and a user-provided dim.
+ * The normalization is needed because user-facing dims index NCHW
+ * dimensions in "big-endian" order: dim=0 refers to the batch dimension of
+ * a 4-d tensor but to the height dimension of a 2-d tensor. Yet in the
+ * common channels-packed texture representation, a 2-d tensor has exactly
+ * the same layout as a 4-d tensor whose batch and channel sizes equal 1.
+ * Returning a canonical dimension simplifies dimension reasoning in the
+ * code.
+ */
+inline NchwDim normalize_to_nchw_dim(const vTensor& v_in, int32_t dim) {
+  return static_cast<NchwDim>(v_in.dim() - dim);
+}
+
+inline std::ostream& operator<<(std::ostream& os, NchwDim nchw_dim) {
+  switch (nchw_dim) {
+    case DimWidth:
+      os << "DimWidth";
+      break;
+    case DimHeight:
+      os << "DimHeight";
+      break;
+    case DimChannel:
+      os << "DimChannel";
+      break;
+    case DimBatch:
+      os << "DimBatch";
+      break;
+    default:
+      os << "DimUnknown";
+      break;
+  }
+  return os;
+}
+
 } // namespace vkcompute
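Since `normalize_to_nchw_dim` is just `v_in.dim() - dim`, the mapping can be spot-checked with a few worked values. A sketch mirroring the C++ enum above:

# Sketch of normalize_to_nchw_dim: the NchwDim value is ndim - dim.
NCHW_DIM = {1: "DimWidth", 2: "DimHeight", 3: "DimChannel", 4: "DimBatch"}


def normalize_to_nchw_dim(ndim: int, dim: int) -> str:
    return NCHW_DIM[ndim - dim]


assert normalize_to_nchw_dim(4, 0) == "DimBatch"    # dim=0 of a 4-d tensor
assert normalize_to_nchw_dim(2, 0) == "DimHeight"   # dim=0 of a 2-d tensor
assert normalize_to_nchw_dim(3, 0) == "DimChannel"  # dim=0 of a 3-d tensor
assert normalize_to_nchw_dim(4, 3) == "DimWidth"    # the last dim is width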

backends/vulkan/test/op_tests/cases.py

Lines changed: 47 additions & 0 deletions
@@ -428,6 +428,52 @@ def get_repeat_inputs():
     return test_suite


+def get_cat_inputs():
+    # TensorList must be specified as list of tuples
+    test_suite = VkTestSuite(
+        [
+            # Cat on Height
+            ([(S1, S1, 3, 5), (S1, S1, 4, 5)], 2),
+            ([(S1, 3, 5), (S1, 4, 5)], 1),
+            ([(3, 5), (4, 5)], 0),
+            ([(3, 5), (4, 5), (1, 5)], 0),
+            (
+                [
+                    (3, 5),
+                ],
+                0,
+            ),
+            # Cat on Width
+            ([(S1, S1, 5, 3), (S1, S1, 5, 4)], 3),
+            ([(S1, 5, 3), (S1, 5, 4)], 2),
+            ([(5, 3), (5, 4)], 1),
+            ([(5, 3), (5, 4), (5, 1)], 1),
+            (
+                [
+                    (5, 4),
+                ],
+                1,
+            ),
+            ([(5,), (6,)], 0),
+            # Cat on Batch
+            ([(S, S1, 5, 4), (S1, S1, 5, 4)], 0),
+            ([(S, XS, 5, 4), (S1, XS, 5, 4)], 0),
+            ([(S, S2, 5, 4), (S1, S2, 5, 4)], 0),
+            # Cat on Channel
+            ([(S, 5, 4), (S1, 5, 4), (S2, 5, 4)], 0),
+            ([(XS, 5, 4), (XS, 5, 4), (S2, 5, 4)], 0),
+            ([(XS, S, 5, 4), (XS, S1, 5, 4), (XS, S2, 5, 4)], 1),
+            ([(XS, XS, 5, 4), (XS, XS, 5, 4), (XS, S2, 5, 4)], 1),
+        ]
+    )
+    test_suite.layouts = [
+        "api::kChannelsPacked",
+    ]
+    test_suite.data_gen = "make_seq_tensor"
+    test_suite.dtypes = ["at::kFloat"]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -447,4 +493,5 @@ def get_repeat_inputs():
     "aten.unsqueeze_copy.default": get_unsqueeze_inputs(),
     "aten.clone.default": get_clone_inputs(),
     "aten.repeat.default": get_repeat_inputs(),
+    "aten.cat.default": get_cat_inputs(),
 }
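Each test case pairs a list of input shapes with the dim to concatenate on. For instance, the `([(3, 5), (4, 5)], 0)` height case corresponds to this eager-mode reference (a sketch, assuming PyTorch is available):

# Sketch: the eager-PyTorch behavior the ([(3, 5), (4, 5)], 0) case checks.
import torch

inputs = [torch.randn(3, 5), torch.randn(4, 5)]
out = torch.cat(inputs, dim=0)  # dim 0 of a 2-d tensor is height
assert out.shape == (7, 5)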

backends/vulkan/test/op_tests/generate_op_tests.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,7 @@
     TestSuite,
     TestSuiteGen,
 )
+from torchgen import local

 from torchgen.gen import parse_native_yaml, ParsedYaml
 from torchgen.model import DispatchKey, NativeFunction
@@ -45,6 +46,9 @@ def process_test_suites(
         cpp_generator.add_suite(registry_name, f, op_test_suite)


+@local.parametrize(
+    use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False
+)
 def generate_cpp(
     native_functions_yaml_path: str, tags_path: str, output_dir: str
 ) -> None:
