pytorch · yipjustin · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024
diff --git a/backends/vulkan/runtime/api/Tensor.h b/backends/vulkan/runtime/api/Tensor.h
@@ -220,6 +220,10 @@ class vTensor final {
    */
   const api::BufferBindInfo texture_limits_ubo();
 
+  inline const api::utils::ivec3 texture_limits() const {
+    return texture_limits_.limits; // by-value copy of the stored limits
+  }
+
   inline size_t numel() const {
     return api::utils::multiply_integers(sizes());
   }

diff --git a/backends/vulkan/runtime/api/Utils.h b/backends/vulkan/runtime/api/Utils.h
@@ -262,12 +262,23 @@ inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const ivec3& v) {
+  const auto& d = v.data; // components are printed as "(a, b, c)"
+  return os << "(" << d[0u] << ", " << d[1u] << ", " << d[2u] << ")";
+}
+
 inline std::ostream& operator<<(std::ostream& os, const uvec4& v) {
   os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
      << v.data[3u] << ")";
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const ivec4& v) {
+  const auto& d = v.data; // components are printed as "(a, b, c, d)"
+  os << "(" << d[0u] << ", " << d[1u] << ", " << d[2u] << ", " << d[3u] << ")";
+  return os;
+}
+
 //
 // std::vector<T> Handling
 //
@@ -298,6 +309,25 @@ inline ivec2 make_ivec2(
   }
 }
 
+// Converts a vector of exactly three int64_t values into an ivec3, narrowing
+// each element with safe_downcast<int32_t>. When `reverse` is set the first
+// and last elements trade places; the middle element never moves.
+inline ivec3 make_ivec3(
+    const std::vector<int64_t>& ints,
+    bool reverse = false) {
+  VK_CHECK_COND(ints.size() == 3);
+
+  // Which input elements feed the first and third output components.
+  const int first_src = reverse ? 2 : 0;
+  const int third_src = reverse ? 0 : 2;
+
+  return {
+      safe_downcast<int32_t>(ints[first_src]),
+      safe_downcast<int32_t>(ints[1]),
+      safe_downcast<int32_t>(ints[third_src]),
+  };
+}
+
 inline ivec4 make_ivec4(
     const std::vector<int64_t>& ints,
     bool reverse = false) {
@@ -338,6 +368,13 @@ inline ivec3 make_ivec3(uvec3 ints) {
       safe_downcast<int32_t>(ints.data[2u])};
 }
 
+inline uvec3 make_uvec3(ivec3 ints) { // signed -> unsigned, per component
+  const auto c0 = safe_downcast<uint32_t>(ints.data[0u]);
+  const auto c1 = safe_downcast<uint32_t>(ints.data[1u]);
+  const auto c2 = safe_downcast<uint32_t>(ints.data[2u]);
+  return {c0, c1, c2};
+}
+
 /*
  * Given an vector of up to 4 uint64_t representing the sizes of a tensor,
  * constructs a uvec4 containing those elements in reverse order.

diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h
@@ -34,6 +34,14 @@ inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) {
   return api::utils::operator<<(os, v);
 }
 
+inline std::ostream& operator<<(std::ostream& os, const api::utils::ivec3& v) {
+  return api::utils::operator<<(os, v); // delegate to the api::utils printer
+}
+
+inline std::ostream& operator<<(std::ostream& os, const api::utils::ivec4& v) {
+  return api::utils::operator<<(os, v); // delegate to the api::utils printer
+}
+
 template <typename T>
 inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& opt) {
   os << "[";

diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
+  ivec3 out_limits; // NOTE(review): not referenced by main() in this shader
+};
+
+layout(set = 0, binding = 3) uniform PRECISION restrict InLimits {
+  ivec3 in_limits; // NOTE(review): not referenced by main() in this shader
+};
+
+
+
+layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs {
+  ivec3 range; // copy extent; invocations at or beyond it exit early
+  int unused0; // never read; presumably pads `range` to 16 bytes
+  ivec3 src_offset; // added to the invocation id to get the read position
+  int unused1; // never read; presumably pads `src_offset` to 16 bytes
+  ivec3 dst_offset; // added to the invocation id to get the write position
+  int unused2; // never read; presumably pads `dst_offset` to 16 bytes
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Invocations outside the requested copy extent have nothing to do.
+  if (any(greaterThanEqual(pos, range))) {
+    return;
+  }
+
+  // Read at the source offset, write the same texel at the destination offset.
+  imageStore(
+      image_out, pos + dst_offset, texelFetch(image_in, pos + src_offset, 0));
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml
@@ -0,0 +1,10 @@
+copy_offset:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: copy_offset
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
+layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
+
+layout(set = 0, binding = 2) uniform PRECISION restrict RepeatArgs {
+  // With input_size (n, c_i, h, w) and repeat r
+  // out_size == (n, c_i * r, h, w)
+  ivec4 out_sizes; // main() bounds-checks the output index against this
+  ivec4 in_sizes; // main() wraps the source .z index modulo in_sizes.z
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = C_DIM;
+
+
+void main() {
+  const ivec3 texel_pos = ivec3(gl_GlobalInvocationID);
+  const ivec4 base_idx = to_tensor_idx(texel_pos, out_sizes, packed_dim);
+
+  // Nothing to write when the tensor index falls outside the output sizes.
+  if (any(greaterThanEqual(base_idx, out_sizes))) {
+    return;
+  }
+
+  // Assemble the output texel one lane at a time. Lane `lane` takes its value
+  // from the input element whose .z index is (base_idx.z + lane) wrapped
+  // around in_sizes.z. Neighbouring lanes will often resolve to the same input
+  // texel, so the same texel may be fetched more than once; this is simple
+  // rather than fast.
+  VEC4_T texel;
+  for (int lane = 0; lane < 4; ++lane) {
+    // Same x, y and w as the output index; only z is remapped.
+    const ivec4 src_idx =
+        ivec4(base_idx.xy, (base_idx.z + lane) % in_sizes.z, base_idx.w);
+    const ivec4 src_elem = to_texture_elem_pos(src_idx, in_sizes, packed_dim);
+    texel[lane] = VEC4_T(texelFetch(image_in, src_elem.xyz, 0))[src_elem.w];
+  }
+
+  imageStore(image_out, texel_pos, texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml
@@ -0,0 +1,10 @@
+repeat_channel:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: repeat_channel
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+// Queues a "copy_offset" shader node whose global size is `range`; invocation
+// p copies the texel at in[src_offset + p] to out[dst_offset + p].
+void add_copy_offset_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const api::utils::ivec3& range,
+    const api::utils::ivec3& src_offset,
+    const api::utils::ivec3& dst_offset,
+    const ValueRef out) {
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+
+  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));
+
+  std::string kernel_name = "copy_offset";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *t_out);
+
+  api::utils::uvec3 global_size = api::utils::make_uvec3(range);
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  const struct Block final { // same field order as CopyArgs in the shader
+    api::utils::ivec3 range;
+    int32_t unused0;
+    api::utils::ivec3 src_offset;
+    int32_t unused1;
+    api::utils::ivec3 dst_offset;
+    int32_t unused2;
+  } offset_params{
+      range,
+      0,
+      src_offset,
+      0,
+      dst_offset,
+      0,
+  };
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      // Inputs and Outputs
+      {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
+      // Parameter buffers
+      {t_out->texture_limits_ubo(),
+       t_in->texture_limits_ubo(),
+       graph.create_params_buffer(offset_params)},
+      // Specialization Constants
+      {}));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+
+namespace vkcompute {
+
+void add_copy_offset_node(
+    ComputeGraph& graph,
+    const ValueRef in, // tensor that is read
+    const api::utils::ivec3& range, // extent of the copied region
+    const api::utils::ivec3& src_offset, // where reading starts in `in`
+    const api::utils::ivec3& dst_offset, // where writing starts in `out`
+    const ValueRef out); // tensor that is written
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -21,6 +21,8 @@ using api::utils::ivec3;
 using api::utils::uvec2;
 using api::utils::uvec4;
 
+namespace {
+
 void check_args(
     const vTensor& in,
     const std::vector<int64_t>& permute_dims,
@@ -39,6 +41,8 @@ void check_args(
       "Output tensor dim size must match argument");
 }
 
+} // namespace
+
 void add_permute_node(
     ComputeGraph& graph,
     ValueRef in,