[ET-VK] Introduce virtual_clone API to support view of view use cases + fix synchronization hazard with view tensors

SS-JIA · SS-JIA · commit bd11b1151bab · 2024-09-30T09:29:57.000-07:00
## Context This diff fixes some hazards (not necessarily) bugs with view tensors. ### `virtual_clone` API Consider the following sequence of calls which may be common in the view of view use case. ``` t1 = graph.add_tensor(...); // t2 will have the same metadata as t1 t2 = graph.add_tensor_view(t1); // t3 will also have the same metadata as t2 at this point. t3 = graph.add_tensor_view(t2); // t2 metadata will be updated correctly. t2 = add_transpose_view_node(t1, 0, 1, t2); // Unfortunately, this node will have an assumption that t3 has the same metadata as t2 to start. However, this is not true. // As a result, t3 will have incorrect metadata after this node. t3 = add_transpose_view_node(t2, 1, 2, t3); ``` To address this, the `virtual_clone` API is introduced which will allow view nodes to set the metadata of the output equal to the input before modifying the output. ### WAW synchronization hazards `vTensorStorage` maintains a `last_access` state which facilitates inserting the correct memory barriers for the underlying `vkImage` or `vkBuffer`. However, when we create a tensor view, `last_access` is not shared between `vTensor` instances that use the same resource. As as result, writing into a `vTensor` will not update the `last_access` of its views, and vice versa. Therefore, sebsequent accesses of the other tensor that references the same resource will result in a synchronization hazard. This diff fixes this hazard in a bit of a crude way; if the `vTensor` is a copy, or has copies, then cowardly assume that it has been written to before the current access so that appropriate memory barriers are inserted. This was the selected solution because I thought that adding a map to track last access of tensors that share resources is a bit overkill when the assumption that the underlying resource has been written to before the current access should hold most of the time. Differential Revision: [D63642092](https://our.internmc.facebook.com/intern/diff/D63642092/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -277,10 +277,11 @@ vTensorStorage::vTensorStorage(
           storage_type_,
           dtype,
           allocate_memory)),
-      last_access_{} {}
+      last_access_{},
+      has_copies_{false} {}
 
 vTensorStorage::vTensorStorage(
-    const vTensorStorage& other,
+    vTensorStorage& other,
     const int64_t buffer_offset)
     : context_(other.context_),
       storage_type_{other.storage_type_},
@@ -289,7 +290,10 @@ vTensorStorage::vTensorStorage(
       buffer_offset_{buffer_offset},
       image_(other.image_),
       buffer_(other.buffer_, buffer_offset),
-      last_access_{other.last_access_} {}
+      last_access_{other.last_access_},
+      has_copies_{false} {
+  other.has_copies_ = true;
+}
 
 vTensorStorage::~vTensorStorage() {
   flush();
@@ -312,6 +316,21 @@ void vTensorStorage::transition(
   vkapi::PipelineStageFlags prev_stage = last_access_.stage;
   vkapi::MemoryAccessFlags prev_access = last_access_.access;
 
+  // If the underlying resource is a copy of another tensor's resource the
+  // last_access may not be accurate, since the original storage may have been
+  // written to as part of the original tensor. Likewise, if the underlying
+  // resource has copies, then the resource may have been updated as part of the
+  // view tensors.
+  //
+  // If the resource is a copy, or has copies of it, then cowardly assume that
+  // it has previously been written to as part of a compute shader before the
+  // current access event so that the appropriate memory barriers may be
+  // inserted.
+  if (is_copy() || has_copies_) {
+    prev_stage = vkapi::PipelineStage::COMPUTE;
+    prev_access = vkapi::kWrite;
+  }
+
   const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
 
   VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -358,6 +377,13 @@ void vTensorStorage::transition(
   last_access_.access = cur_access;
 }
 
+bool vTensorStorage::is_copy() const {
+  if (storage_type_ == utils::kBuffer) {
+    return buffer_.is_copy();
+  }
+  return image_.is_copy();
+}
+
 bool vTensorStorage::is_copy_of(const vTensorStorage& other) const {
   if (storage_type_ == utils::kBuffer) {
     return buffer_.is_copy_of(other.buffer_);
@@ -418,7 +444,7 @@ vTensor::vTensor(
   }
 }
 
-vTensor::vTensor(const vTensor& other)
+vTensor::vTensor(vTensor& other)
     : dtype_(other.dtype_),
       // Copy tensor size metadata
       sizes_(other.sizes_.begin(), other.sizes_.end()),
@@ -443,7 +469,7 @@ vTensor::vTensor(const vTensor& other)
       storage_(other.storage_) {}
 
 vTensor::vTensor(
-    const vTensor& other,
+    vTensor& other,
     const std::vector<int64_t>& sizes,
     const std::vector<int64_t>& dim_order,
     const int64_t offset_numel)
@@ -671,6 +697,14 @@ void vTensor::virtual_reconfigure(
   update_metadata();
 }
 
+void vTensor::virtual_clone(const vTensor& other) {
+  VK_CHECK_COND(is_view_of(other));
+  sizes_ = other.sizes_;
+  dim_order_ = other.dim_order_;
+  axis_map_ = other.axis_map_;
+  packed_dim_ = other.packed_dim_;
+}
+
 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
   VK_CHECK_COND(
       new_sizes.size() == dim_order_.size(),
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -104,7 +104,7 @@ class vTensorStorage final {
    * because this behaviour is unsafe, since the original tensor may be
    * destroyed before the copy is destroyed.
    */
-  vTensorStorage(const vTensorStorage& other, const int64_t buffer_offset = 0);
+  vTensorStorage(vTensorStorage& other, const int64_t buffer_offset = 0);
 
  public:
   // To discourage creating copies, the assignment operator is still deleted.
@@ -134,6 +134,8 @@ class vTensorStorage final {
 
   // Last Access - used to insert memory barriers
   LastAccess last_access_;
+  // Indicates whether copies of this vTensorStorage have been made
+  bool has_copies_;
 
  private:
   // Registers underlying memory for cleanup
@@ -153,6 +155,11 @@ class vTensorStorage final {
     return image_.format();
   }
 
+  /*
+   * Check if the underlying resource is a copy of another resource
+   */
+  bool is_copy() const;
+
   /*
    * Used for checking if this vTensorStorage is a copy of another instance
    */
@@ -185,7 +192,7 @@ class vTensor final {
    * Once created, the sizes and strides of the aliased vTensor can be changed
    * using the `virtual_reconfigure` member function.
    */
-  vTensor(const vTensor& other);
+  vTensor(vTensor& other);
 
   /*
    * This constructor allows for the creation of a vTensor that references the
@@ -202,7 +209,7 @@ class vTensor final {
    * buffer.
    */
   vTensor(
-      const vTensor& other,
+      vTensor& other,
       const std::vector<int64_t>& sizes,
       const std::vector<int64_t>& dim_order,
       const int64_t offset_numel = 0);
@@ -511,6 +518,11 @@ class vTensor final {
       const std::vector<int64_t>& new_sizes,
       const std::vector<int64_t>& new_dim_order);
 
+  /*
+   * Set all metadata of this tensor to match the metadata of another tensor.
+   */
+  void virtual_clone(const vTensor& other);
+
   /*
    * Perform a virtual resize of the vTensor by modifying the size metadata that
    * gets used in compute shaders. This allows the shader to treat the
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp
@@ -62,6 +62,8 @@ void add_transpose_view_node(
   const int64_t dim1 = graph.extract_scalar<int64_t>(dim1_ref);
 
   check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref);
+  const vTensorPtr in = graph.get_tensor(input_ref);
+  graph.get_tensor(out_ref)->virtual_clone(*in);
   graph.get_tensor(out_ref)->virtual_transpose(dim0, dim1);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(