pytorch · yipjustin · Apr 17, 2024 · Apr 17, 2024
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
@@ -101,6 +101,24 @@ void Context::register_shader_dispatch(
   cmd_.dispatch(effective_global_wg);
 }
 
+
+void Context::register_copy(
+      PipelineBarrier& pipeline_barrier,
+      const VulkanImage& src,
+      const VulkanImage& dst,
+      const api::utils::uvec3& copy_range,
+      const api::utils::uvec3& src_offset,
+      const api::utils::uvec3& dst_offset) {
+  cmd_.insert_barrier(pipeline_barrier);
+  cmd_.copy_texture_to_texture(
+      src,
+      dst,
+      copy_range,
+      src_offset,
+      dst_offset);
+}
+
+
 void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
   if (cmd_) {
     cmd_.end();

diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
@@ -180,6 +180,14 @@ class Context final {
       const ShaderInfo&,
       const utils::uvec3&);
 
+  void register_copy(
+      PipelineBarrier&,
+      const VulkanImage& src,
+      const VulkanImage& dst,
+      const api::utils::uvec3& copy_range,
+      const api::utils::uvec3& src_offset,
+      const api::utils::uvec3& dst_offset);
+
   template <class S, class D>
   bool submit_copy(
       PipelineBarrier&,

diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp
@@ -33,22 +33,67 @@ ExecuteNode::ExecuteNode(
   graph.update_descriptor_counts(shader, /*execute = */ true);
 }
 
-void ExecuteNode::encode(ComputeGraph* graph) {
+ExecuteNode::ExecuteNode(
+      ComputeGraph& graph,
+      const ArgGroup& src,
+      const ArgGroup& dst,
+      const api::utils::uvec3& copy_range,
+      const api::utils::uvec3& src_offset,
+      const api::utils::uvec3& dst_offset)
+  :
+    src_(src), dst_(dst), copy_range_(copy_range),
+    src_offset_(src_offset), dst_offset_(dst_offset) {
+  // TODO: Update descriptor counts in graph.
+}
+
+
+void ExecuteNode::encode_shader(ComputeGraph* graph) {
   api::Context* const context = graph->context();
   api::PipelineBarrier pipeline_barrier{};
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
   api::DescriptorSet descriptor_set =
-      context->get_descriptor_set(shader_, local_workgroup_size_);
+      context->get_descriptor_set(shader_, *local_workgroup_size_);
 
   uint32_t idx = 0;
   idx = bind_values_to_descriptor_set(
       graph, args_, pipeline_barrier, descriptor_set, idx);
   bind_params_to_descriptor_set(params_, descriptor_set, idx);
 
   context->register_shader_dispatch(
-      descriptor_set, pipeline_barrier, shader_, global_workgroup_size_);
+      descriptor_set, pipeline_barrier, shader_, *global_workgroup_size_);
+}
+
+void ExecuteNode::encode_copy(ComputeGraph* graph) {
+  api::Context* const context = graph->context();
+  api::PipelineBarrier pipeline_barrier{};
+
+  vTensorPtr src_v_t = graph->get_tensor(src_->refs[0]);
+  api::VulkanImage& src_image = src_v_t->image(
+      pipeline_barrier,
+      api::PipelineStage::COMPUTE, api::MemoryAccessType::READ);
+
+  vTensorPtr dst_v_t = graph->get_tensor(dst_->refs[0]);
+  api::VulkanImage& dst_image = dst_v_t->image(
+      pipeline_barrier,
+      api::PipelineStage::COMPUTE, api::MemoryAccessType::WRITE);
+
+  context->register_copy(
+      pipeline_barrier,
+      src_image,
+      dst_image,
+      *copy_range_,
+      *src_offset_,
+      *dst_offset_);
+}
+
+void ExecuteNode::encode(ComputeGraph* graph) {
+  if (shader_.src_code.size > 0) {
+    return encode_shader(graph);
+  } else {
+    return encode_copy(graph);
+  }
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -1,3 +1,5 @@
+//123123
+
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
@@ -12,6 +14,8 @@
 
 #include <executorch/backends/vulkan/runtime/graph/containers/Value.h>
 
+#include <optional>
+
 namespace vkcompute {
 
 class ComputeGraph;
@@ -24,6 +28,8 @@ struct ArgGroup {
   ArgGroup(const ValueRef ref, const api::MemoryAccessType access)
       : refs{ref}, access(access) {}
 
+  ArgGroup(const ArgGroup& ag): refs(ag.refs), access(ag.access) {}
+
   ArgGroup(
       const std::vector<ValueRef>& refs,
       const api::MemoryAccessType access)
@@ -58,6 +64,14 @@ class ExecuteNode final {
       const ResizeFunction& resize_fn = nullptr,
       const std::vector<ValueRef>& resize_args = {});
 
+  ExecuteNode(
+      ComputeGraph& graph,
+      const ArgGroup& src,
+      const ArgGroup& dst,
+      const api::utils::uvec3& copy_range,
+      const api::utils::uvec3& src_offset,
+      const api::utils::uvec3& dst_offset);
+
   ~ExecuteNode() = default;
 
   void encode(ComputeGraph* graph);
@@ -70,12 +84,22 @@ class ExecuteNode final {
 
  protected:
   const api::ShaderInfo shader_;
-  const api::utils::uvec3 global_workgroup_size_;
-  const api::utils::uvec3 local_workgroup_size_;
+  const std::optional<api::utils::uvec3> global_workgroup_size_;
+  const std::optional<api::utils::uvec3> local_workgroup_size_;
   const std::vector<ArgGroup> args_;
   std::vector<std::shared_ptr<api::UniformParamsBuffer>> params_;
   const ResizeFunction resize_fn_;
   const std::vector<ValueRef> resize_args_;
+
+  const std::optional<ArgGroup> src_;
+  const std::optional<ArgGroup> dst_;
+  const std::optional<api::utils::uvec3> copy_range_;
+  const std::optional<api::utils::uvec3> src_offset_;
+  const std::optional<api::utils::uvec3> dst_offset_;
+
+ private:
+  void encode_shader(ComputeGraph *graph);
+  void encode_copy(ComputeGraph *graph);
 };
 
 } // namespace vkcompute
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -445,6 +445,60 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) {
   }
 }
 
+TEST_F(VulkanComputeAPITest, copy_test) {
+  std::vector<int64_t> sizes = {1, 3, 4};  // c, h, w
+  vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
+  api::MemoryAllocation a_mem = allocate_memory_for(a);
+  a.image().bind_allocation(a_mem);
+
+  std::vector<float> data_a(a.gpu_numel());
+  std::iota(data_a.begin(), data_a.end(), 0.0);
+  fill_vtensor(a, data_a);
+
+  vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false);
+  api::MemoryAllocation b_mem = allocate_memory_for(b);
+  b.image().bind_allocation(b_mem);
+
+	// Force clear memory
+  fill_vtensor(b, 0.0);
+
+  auto context = api::context();
+
+  // These operates on texture coordinate in (x, y, z), corresponds to (w, h,
+  // packed_c).
+  api::utils::uvec3 copy_range{2, 2, 1};
+  api::utils::uvec3 src_offset{0, 0, 0};
+  api::utils::uvec3 dst_offset{2, 1, 0};
+
+  vkcompute::api::PipelineBarrier pipeline_barrier{};
+
+  context->register_copy(
+      pipeline_barrier,
+      a.image(),
+      b.image(),
+      copy_range,
+      src_offset,
+      dst_offset
+      );
+
+  submit_to_gpu();
+
+  // Fetch result back
+  std::vector<float> data_out(b.gpu_numel());
+  extract_vtensor(b, data_out);
+
+  // w shifted by 2, h shifted by 1.
+  std::vector<float> expected{
+    0, 0, 0, 0,
+    0, 0, 0, 1,
+    0, 0, 4, 5,
+  };
+
+  for (size_t i = 0; i < expected.size(); i++) {
+    CHECK_VALUE(data_out, i, expected[i]);
+  }
+}
+
 //
 // Compute Graph Tests
 //
@@ -793,6 +847,57 @@ TEST(VulkanComputeGraphTest, test_large_graph) {
   }
 }
 
+
+TEST(VulkanComputeGraphTest, test_register_copy) {
+  GraphConfig config;
+  ComputeGraph graph(config);
+
+  std::vector<int64_t> size = {1, 3, 4};
+
+  auto memory_layout = api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
+  IOValueRef a = graph.add_input_tensor(size, api::kFloat, memory_layout);
+
+  IOValueRef out = {};
+  out.value = graph.add_tensor(size, api::kFloat, memory_layout);
+
+  api::utils::uvec3 copy_range{2, 2, 1};
+  api::utils::uvec3 src_offset{0, 0, 0};
+  api::utils::uvec3 dst_offset{2, 1, 0};
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+    graph,
+    {a.value, api::MemoryAccessType::READ},
+    {out.value, api::MemoryAccessType::WRITE},
+    copy_range,
+    src_offset,
+    dst_offset));
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_execute();
+
+  // The tensor region that is not within the dst_offset + copy_range region is
+  // undefined, since they are outside the copy region. Hence we set the target
+  // value from 1.0. In the expected value, 0.0 are the don't-care values.
+  fill_vtensor(graph, a, 1.0, /* iota = */ true);
+
+  graph.execute();
+  EXTRACT_TENSOR(out);
+
+  std::vector<float> expected{
+    0, 0, 0, 0,
+    0, 0, 1, 2,
+    0, 0, 5, 6,
+  };
+
+  for (size_t i = 0; i < expected.size(); i++) {
+    if (expected[i] > 0){
+      CHECK_VALUE(data_out, i, expected[i]);
+    }
+  }
+}
+
 class VulkanToFromGPUShaderTest : public ::testing::Test {
  public:
   void SetUp() override {