Introduce virtual_clone API to support view of view use cases + fix synchronization hazard with view tensors (#5753)

SS-JIA · facebook-github-bot · commit a5a76f7b4c51 · 2024-09-30T14:48:08.000-07:00
Summary: Pull Request resolved: #5753 ## Context This diff fixes some hazards (not necessarily) bugs with view tensors. ### `virtual_clone` API Consider the following sequence of calls which may be common in the view of view use case. ``` // Note the below does not reflect accurate API usage; it is mostly pseudocode. t1 = graph.add_tensor(...); // t2 will have the same metadata as t1 t2 = graph.add_tensor_view(t1); // t3 will also have the same metadata as t2 at this point. t3 = graph.add_tensor_view(t2); // t2 metadata will be updated correctly. t2 = add_transpose_view_node(t1, 0, 1, t2); // Unfortunately, this node will have an assumption that t3 has the same metadata as t2 to start. However, this is not true. // As a result, t3 will have incorrect metadata after this node. t3 = add_transpose_view_node(t2, 1, 2, t3); ``` To address this, the `virtual_clone` API is introduced which will allow view nodes to set the metadata of the output equal to the input before modifying the output. ### WAW synchronization hazards `vTensorStorage` maintains a `last_access` state which facilitates inserting the correct memory barriers for the underlying `vkImage` or `vkBuffer`. However, when we create a tensor view, `last_access` is not shared between `vTensor` instances that use the same resource. As as result, writing into a `vTensor` will not update the `last_access` of its views, and vice versa. Therefore, sebsequent accesses of the other tensor that references the same resource will result in a synchronization hazard. This diff fixes this hazard in a bit of a crude way; if the `vTensor` is a copy, or has copies, then cowardly assume that it has been written to before the current access so that appropriate memory barriers are inserted. This was the selected solution because I thought that adding a map to track last access of tensors that share resources is a bit overkill when the assumption that the underlying resource has been written to before the current access should hold most of the time. ghstack-source-id: 245517197 exported-using-ghexport Reviewed By: jorgep31415 Differential Revision: D63642092 fbshipit-source-id: 5ca76347946fa2b2f1b07709cbc7d9afa3aaf00d
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -277,10 +277,11 @@ vTensorStorage::vTensorStorage(
           storage_type_,
           dtype,
           allocate_memory)),
-      last_access_{} {}
+      last_access_{},
+      has_copies_{false} {}
 
 vTensorStorage::vTensorStorage(
-    const vTensorStorage& other,
+    vTensorStorage& other,
     const int64_t buffer_offset)
     : context_(other.context_),
       storage_type_{other.storage_type_},
@@ -289,7 +290,10 @@ vTensorStorage::vTensorStorage(
       buffer_offset_{buffer_offset},
       image_(other.image_),
       buffer_(other.buffer_, buffer_offset),
-      last_access_{other.last_access_} {}
+      last_access_{other.last_access_},
+      has_copies_{false} {
+  other.has_copies_ = true;
+}
 
 vTensorStorage::~vTensorStorage() {
   flush();
@@ -312,6 +316,21 @@ void vTensorStorage::transition(
   vkapi::PipelineStageFlags prev_stage = last_access_.stage;
   vkapi::MemoryAccessFlags prev_access = last_access_.access;
 
+  // If the underlying resource is a copy of another tensor's resource the
+  // last_access may not be accurate, since the original storage may have been
+  // written to as part of the original tensor. Likewise, if the underlying
+  // resource has copies, then the resource may have been updated as part of the
+  // view tensors.
+  //
+  // If the resource is a copy, or has copies of it, then cowardly assume that
+  // it has previously been written to as part of a compute shader before the
+  // current access event so that the appropriate memory barriers may be
+  // inserted.
+  if (is_copy() || has_copies_) {
+    prev_stage = vkapi::PipelineStage::COMPUTE;
+    prev_access = vkapi::kWrite;
+  }
+
   const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
 
   VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -358,6 +377,13 @@ void vTensorStorage::transition(
   last_access_.access = cur_access;
 }
 
+bool vTensorStorage::is_copy() const {
+  if (storage_type_ == utils::kBuffer) {
+    return buffer_.is_copy();
+  }
+  return image_.is_copy();
+}
+
 bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {
   if (storage_type_ == utils::kBuffer) {
     return buffer_.is_copy_of(other.buffer_);
@@ -418,7 +444,8 @@ vTensor::vTensor(
   }
 }
 
-vTensor::vTensor(const vTensor& other)
+// NOLINTNEXTLINE
+vTensor::vTensor(vTensor& other)
     : dtype_(other.dtype_),
       // Copy tensor size metadata
       sizes_(other.sizes_.begin(), other.sizes_.end()),
@@ -443,7 +470,7 @@ vTensor::vTensor(const vTensor& other)
       storage_(other.storage_) {}
 
 vTensor::vTensor(
-    const vTensor& other,
+    vTensor& other,
     const std::vector<int64_t>& sizes,
     const std::vector<int64_t>& dim_order,
     const int64_t offset_numel)
@@ -671,6 +698,14 @@ void vTensor::virtual_reconfigure(
   update_metadata();
 }
 
+void vTensor::virtual_clone(const vTensor& other) {
+  VK_CHECK_COND(is_view_of(other));
+  sizes_ = other.sizes_;
+  dim_order_ = other.dim_order_;
+  axis_map_ = other.axis_map_;
+  packed_dim_ = other.packed_dim_;
+}
+
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
   VK_CHECK_COND(
       new_sizes.size() == dim_order_.size(),
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -104,7 +104,7 @@ class vTensorStorage final {
    * because this behaviour is unsafe, since the original tensor may be
    * destroyed before the copy is destroyed.
    */
-  vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0);
+  vTensorStorage(vTensorStorage& other, const int64_t buffer_offset = 0);
 
  public:
   // To discourage creating copies, the assignment operator is still deleted.
@@ -134,6 +134,8 @@ class vTensorStorage final {
 
   // Last Access - used to insert memory barriers
   LastAccess last_access_;
+  // Indicates whether copies of this vTensorStorage have been made
+  bool has_copies_;
 
  private:
   // Registers underlying memory for cleanup
@@ -153,6 +155,11 @@ class vTensorStorage final {
     return image_.format();
   }
 
+  /*
+   * Check if the underlying resource is a copy of another resource
+   */
+  bool is_copy() const;
+
   /*
    * Used for checking if this vTensorStorage is a copy of another instance
    */
@@ -176,6 +183,8 @@ class vTensor final {
       const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked,
       const bool allocate_memory = true);
 
+  vTensor(const vTensor& other) = delete;
+
   /*
    * This constructor allows for the creation of a vTensor that references the
    * same buffer resource of another vTensor, with the same sizes and strides
@@ -185,7 +194,7 @@ class vTensor final {
    * Once created, the sizes and strides of the aliased vTensor can be changed
    * using the `virtual_reconfigure` member function.
    */
-  vTensor(const vTensor& other);
+  vTensor(vTensor& other);
 
   /*
    * This constructor allows for the creation of a vTensor that references the
@@ -202,7 +211,7 @@ class vTensor final {
    * buffer.
    */
   vTensor(
-      const vTensor& other,
+      vTensor& other,
       const std::vector<int64_t>& sizes,
       const std::vector<int64_t>& dim_order,
       const int64_t offset_numel = 0);
@@ -511,6 +520,11 @@ class vTensor final {
       const std::vector<int64_t>& new_sizes,
       const std::vector<int64_t>& new_dim_order);
 
+  /*
+   * Set all metadata of this tensor to match the metadata of another tensor.
+   */
+  void virtual_clone(const vTensor& other);
+
   /*
    * Perform a virtual resize of the vTensor by modifying the size metadata that
    * gets used in compute shaders. This allows the shader to treat the
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp
@@ -62,6 +62,8 @@ void add_transpose_view_node(
   const int64_t dim1 = graph.extract_scalar<int64_t>(dim1_ref);
 
   check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref);
+  const vTensorPtr in = graph.get_tensor(input_ref);
+  graph.get_tensor(out_ref)->virtual_clone(*in);
   graph.get_tensor(out_ref)->virtual_transpose(dim0, dim1);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -301,6 +301,35 @@ TEST_F(VulkanComputeAPITest, virtual_transpose_test) {
   }
 }
 
+TEST_F(VulkanComputeAPITest, view_of_view_test) {
+  constexpr int N = 3;
+  constexpr int C = 5;
+  constexpr int H = 17;
+  constexpr int W = 19;
+
+  std::vector<int64_t> sizes = {N, C, H, W};
+
+  vTensor t1 = vTensor(
+      context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked);
+
+  vTensor t2 = vTensor(t1);
+  EXPECT_TRUE(t2.sizes() == sizes);
+  vTensor t3 = vTensor(t2);
+  EXPECT_TRUE(t2.sizes() == sizes);
+
+  t2.virtual_transpose(1, 2);
+  std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
+  EXPECT_TRUE(t2.sizes() == expected_t2_sizes);
+
+  // Because t3 was created before t2's metadata was updated, we need to first
+  // update t3's metadata to match t2's metadata. Then the transpose will yield
+  // the correct metadata.
+  t3.virtual_clone(t2);
+  t3.virtual_transpose(2, 3);
+  std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
+  EXPECT_TRUE(t3.sizes() == expected_t3_sizes);
+}
+
 utils::ivec3 make_temp_ivec3(int x, int y, int z) {
   return utils::ivec3{x, y, z};
 }
@@ -1274,6 +1303,49 @@ TEST(VulkanComputeGraphTest, test_simple_graph_with_view) {
   }
 }
 
+TEST(VulkanComputeGraphTest, test_graph_view_of_view) {
+  GraphConfig config;
+  config.set_storage_type_override(utils::kTexture3D);
+  ComputeGraph graph(config);
+
+  constexpr int N = 3;
+  constexpr int C = 5;
+  constexpr int H = 17;
+  constexpr int W = 19;
+
+  std::vector<int64_t> orig_sizes = {N, C, H, W};
+
+  // Test a common view of view usage pattern. In delegate execution, the values
+  // of the graph are created first; then operators are added. As a result,
+  // creating views of views is a bit tricky because metadata updates to a view
+  // does not update the metadata of the view's views. Nonetheless, view
+  // operators have an implicit assumption that the metadata of the output is
+  // equivalent to the metadata of the input. Therefore, view operators must
+  // account for unseen updates to the input view by first calling
+  // `virtual_clone()` to make the output equivalent to the input before.
+  // modifying metadata.
+
+  ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat);
+  ValueRef t2 = graph.add_tensor_view(t1);
+  ValueRef t3 = graph.add_tensor_view(t2);
+
+  ValueRef channels = graph.add_scalar<int64_t>(1);
+  ValueRef height = graph.add_scalar<int64_t>(2);
+  ValueRef width = graph.add_scalar<int64_t>(3);
+
+  auto opFn = VK_GET_OP_FN("aten.transpose.int");
+
+  opFn(graph, {t1, channels, height, t2});
+  std::vector<int64_t> t2_sizes = graph.sizes_of(t2);
+  std::vector<int64_t> expected_t2_sizes = {N, H, C, W};
+  EXPECT_TRUE(t2_sizes == expected_t2_sizes);
+
+  opFn(graph, {t2, height, width, t3});
+  std::vector<int64_t> t3_sizes = graph.sizes_of(t3);
+  std::vector<int64_t> expected_t3_sizes = {N, H, W, C};
+  EXPECT_TRUE(t3_sizes == expected_t3_sizes);
+}
+
 TEST(VulkanComputeGraphTest, test_simple_graph) {
   GraphConfig config;
   ComputeGraph graph(config);