[ET-VK][9/n] clone node

yipjustin · yipjustin · commit b27a82e94da5 · 2024-04-22T15:07:01.000-07:00
Introduce a clone node for copy operations, and register `aten.clone` to this node. Note that during model export it is possible to point the lvalue of `aten.clone` to the underlying shared object of the rvalue, which achieves the same result with no copy. Differential Revision: [D56441547](https://our.internmc.facebook.com/intern/diff/D56441547/) ghstack-source-id: 223471608 Pull Request resolved: #3219
diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl b/backends/vulkan/runtime/graph/ops/glsl/clone.glsl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  // Only invocations strictly inside out_limits copy their texel.
+  if (all(lessThan(pos, out_limits))) {
+    imageStore(image_out, pos, texelFetch(image_in, pos, 0));
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml b/backends/vulkan/runtime/graph/ops/glsl/clone.yaml
@@ -0,0 +1,10 @@
+clone:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3  # clone.glsl uses this to index IMAGE_T for image_out
+  generate_variant_forall:
+    DTYPE:  # NOTE(review): presumably one generated shader per value listed
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: clone  # base kernel name; Clone.cpp appends a dtype suffix to it
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/Logging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void add_clone_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef out) {
+  // The kernel name suffix and the global work size both come from the output.
+  vTensorPtr dst = graph.get_tensor(out);
+  std::string variant_name("clone");
+  add_dtype_suffix(variant_name, *dst);
+
+  api::utils::uvec3 global_wg = dst->extents();
+  api::utils::uvec3 local_wg = adaptive_work_group_size(global_wg);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(variant_name),
+      global_wg,
+      local_wg,
+      {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+      {dst->texture_limits_ubo()}));
+}
+
+void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[1] is ignored: the Vulkan delegate cannot change memory format.
+  add_clone_node(graph, /*in=*/args[0], /*out=*/args[2]);
+}
+
+// Registers the clone node for aten.clone.default. This is not the most
+// efficient way to implement the op: Vulkan export can instead point the output
+// at the input's underlying shared object and avoid the copy. The node exists
+// to provide a "copy" mechanism when there is no alternative (e.g. a Tensor
+// must be copied during direct ComputeGraph manipulation).
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.clone.default, clone);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -312,6 +312,25 @@ def get_slice_inputs():
     return test_suite
 
 
+def get_clone_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((S2, S1, S2, S1),),
+            ((S2, S1, S2),),
+            ((S2, S1),),
+            ((S2,),),
+            ((XS, S1, XS, S1),),
+            ((XS, S1, XS),),
+            ((S1, XS, S1),),
+            ((XS, S1),),
+            ((S1, XS),),
+            ((S1,),),
+            ((XS,),),
+        ]
+    )
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -328,4 +347,5 @@ def get_slice_inputs():
     "aten.permute_copy.default": get_permute_inputs(),
     "aten.view_copy.default": get_view_inputs(),
     "aten.slice_copy.Tensor": get_slice_inputs(),
+    "aten.clone.default": get_clone_inputs(),
 }
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -21,6 +21,7 @@
     OPT_DEVICE,
     OPT_INT64,
     OPT_LAYOUT,
+    OPT_MEMORYFORMAT,
     OPT_SCALARTYPE,
     TestSuite,
     TestSuiteGen,
@@ -254,6 +255,7 @@ def create_value_for(self, ref: ValueRefList) -> str:  # noqa: C901
             or ref.src_cpp_type == OPT_LAYOUT
             or ref.src_cpp_type == OPT_DEVICE
             or ref.src_cpp_type == OPT_BOOL
+            or ref.src_cpp_type == OPT_MEMORYFORMAT
         ):
             ret_str += "add_none(); \n"
         elif ref.src_cpp_type == TWO_TENSOR_TUPLE:
diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py
@@ -25,6 +25,7 @@
 OPT_INT64 = "::std::optional<int64_t>"
 OPT_DEVICE = "::std::optional<at::Device>"
 OPT_LAYOUT = "::std::optional<at::Layout>"
+OPT_MEMORYFORMAT = "::std::optional<at::MemoryFormat>"
 OPT_SCALARTYPE = "::std::optional<at::ScalarType>"
 TWO_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor>"
 THREE_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor,at::Tensor>"
@@ -153,6 +154,7 @@ def create_input_data(self, arg: Argument, data: Any) -> str:  # noqa: C901
             or cpp_type == OPT_LAYOUT
             or cpp_type == OPT_DEVICE
             or cpp_type == OPT_BOOL
+            or cpp_type == OPT_MEMORYFORMAT
         ):
             ret_str += "std::nullopt;"
         else: