Skip to content

[ET-VK] Add binary op support for height and width packing GPU layouts #2516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,27 +84,68 @@ void ComputeGraph::update_descriptor_counts(
}
}

/*
 * Determine the storage type to use when constructing new `vTensor`s. An
 * explicit override in the graph config takes precedence; otherwise the
 * default of 3D texture storage is used.
 */
api::StorageType ComputeGraph::suggested_storage_type() {
  return config_.enableStorageTypeOverride ? config_.storageTypeOverride
                                           : api::StorageType::TEXTURE_3D;
}

/*
 * Pick a GPU memory layout for a tensor with the given sizes. An explicit
 * override in the graph config takes precedence. Otherwise, tensors with
 * fewer than 3 dims — or whose channels dim (index -3, NCHW order) is 1 —
 * are width packed; all other tensors are channels packed.
 */
api::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
    const std::vector<int64_t>& sizes) {
  if (config_.enableMemoryLayoutOverride) {
    return config_.memoryLayoutOverride;
  }
  const bool prefer_width_packed =
      sizes.size() < 3 || api::utils::val_at(-3, sizes) == 1;
  return prefer_width_packed ? api::GPUMemoryLayout::TENSOR_WIDTH_PACKED
                             : api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
}

/*
 * Add a `vTensor` value to the graph with the given sizes, dtype, storage
 * type, and memory layout. If `shared_object_idx` is non-negative, memory is
 * not allocated up front and the tensor is registered as a user of the shared
 * object at that index instead. Returns the index of the new value.
 *
 * NOTE(review): this span contained unresolved diff residue (both the removed
 * and added lines of the diff were present); resolved to the post-change code.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::StorageType storage_type,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx) {
  // A negative shared object index means the tensor owns its own memory.
  bool allocate_memory = shared_object_idx < 0;

  ValueRef idx(static_cast<int>(values_.size()));
  values_.emplace_back(vTensor(
      context(), sizes, dtype, storage_type, memory_layout, allocate_memory));

  if (!allocate_memory) {
    get_shared_object(shared_object_idx).add_user(this, idx);
  }
  return idx;
}

/*
 * Convenience overload: add a `vTensor` with the given memory layout, using
 * the graph's suggested storage type.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx) {
  const api::StorageType storage = suggested_storage_type();
  return add_tensor(sizes, dtype, storage, memory_layout, shared_object_idx);
}

/*
 * Convenience overload: add a `vTensor` using both the graph's suggested
 * storage type and its suggested memory layout for the given sizes.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const int64_t shared_object_idx) {
  const api::StorageType storage = suggested_storage_type();
  const api::GPUMemoryLayout layout = suggested_memory_layout(sizes);
  return add_tensor(sizes, dtype, storage, layout, shared_object_idx);
}

ValueRef ComputeGraph::add_tensorref(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
Expand Down
91 changes: 89 additions & 2 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,13 @@ class ComputeGraph final {
bool execute);

/*
 * Returns the value at a particular index in the graph. If storing this
 * function's return value in a lvalue reference, it is imperative that no
 * values are added to the graph while the reference is in scope, otherwise
 * the underlying value may have been moved as part of a vector resize.
 *
 * NOTE(review): this span contained unresolved diff residue (both the old
 * `values_[idx]` and new `values_.at(idx)` lines); resolved to the bounds-
 * checked post-change code.
 */
inline Value& get_val(ValueRef idx) {
  return values_.at(idx);
}

inline const std::vector<int64_t>& get_val_sizes(ValueRef idx) {
Expand Down Expand Up @@ -127,18 +130,88 @@ class ComputeGraph final {
return execute_nodes_;
}

//
// Utility functions
//

/*
* Returns a suggested storage type (i.e. buffer or texture) that can be used
* to construct `vTensor`s. The storage type is typically determined by the
* GPU reported by the Vulkan context, unless a storage type override is
* defined in the graph configuration. Some GPU architectures work better with
buffer storage, and others with texture storage. Currently only texture
* storage is supported.
*/
api::StorageType suggested_storage_type();

/*
* Returns a suggested memory layout (i.e. channels, width, or height packed)
* that can be used to construct `vTensor`s. The memory layout impacts which
* dimension will be treated as the vectorized dimension. For texture storage,
* elements along the vectorized dimension are packed into texels. The
* suggested memory layout is determined based on the sizes of the tensor,
* unless a memory layout override is defined in the graph configuration.
*/
api::GPUMemoryLayout suggested_memory_layout(
const std::vector<int64_t>& sizes);

/*
 * Fetch the GPU memory layout of the Tensor value stored at `idx`.
 */
inline api::GPUMemoryLayout memory_layout_of(ValueRef idx) {
  const api::GPUMemoryLayout layout =
      get_val(idx).toTensor().gpu_memory_layout();
  return layout;
}

//
// Graph Building
//

/*
* Add a `vTensor` value to the graph with the specified properties. There are
* various convenience overloads of this function that may be used instead.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const api::StorageType storage_type,
const api::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx);

/*
* Add a `vTensor` value to the graph with the specified properties. The
* suggested storage type will be used to construct the `vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const api::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx = -1);

/*
* Add a `vTensor` value to the graph with the specified properties. The
* suggested storage type and memory layout will be used to construct the
* `vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype = api::ScalarType::Float,
const int64_t shared_object_idx = -1);

/*
Add a `TensorRef` value to the graph with the specified properties. A
* `TensorRef` is a reference to a `vTensor` whose data is stored in an
* external CPU buffer.
*/
ValueRef add_tensorref(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const void* const data);

/*
* Add a staging buffer to the graph. Staging buffers are data buffers that
* use memory that is visible to both the CPU and GPU, and therefore is used
as an intermediary when transferring data between the CPU and GPU.
*/
ValueRef add_staging(const api::ScalarType dtype, const size_t numel);

ValueRef add_none();
Expand Down Expand Up @@ -176,6 +249,20 @@ class ComputeGraph final {
return {t, staging};
}

/*
 * Convenience function: create an input tensor with an explicit memory
 * layout and its associated staging buffer in a single call.
 */
inline IOValueRef add_input_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx = -1) {
  const ValueRef tensor_ref =
      add_tensor(sizes, dtype, memory_layout, shared_object_idx);
  const ValueRef staging_ref = set_input_tensor(tensor_ref);
  return {tensor_ref, staging_ref};
}

SharedObject& get_shared_object(const int64_t idx);

//
Expand Down
20 changes: 20 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,26 @@ GraphConfig::GraphConfig() {
// Empirically selected safety factor. If descriptor pools start running out
// of memory, increase this safety factor.
descriptorPoolSafetyFactor = 1.25;

// For now, force TEXTURE_3D storage as we are still developing shader
// support for buffer storage type.
enableStorageTypeOverride = true;
storageTypeOverride = api::StorageType::TEXTURE_3D;

// For now, force TENSOR_CHANNELS_PACKED memory layout by default as we are
// still developing support for other memory layouts.
enableMemoryLayoutOverride = true;
memoryLayoutOverride = api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
}

/*
 * Force all suggested storage type queries to return `storage_type`.
 */
void GraphConfig::setStorageTypeOverride(api::StorageType storage_type) {
  storageTypeOverride = storage_type;
  enableStorageTypeOverride = true;
}

/*
 * Force all suggested memory layout queries to return `memory_layout`.
 */
void GraphConfig::setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout) {
  memoryLayoutOverride = memory_layout;
  enableMemoryLayoutOverride = true;
}

} // namespace vulkan
Expand Down
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,17 @@ struct GraphConfig final {
// risk.
float descriptorPoolSafetyFactor;

bool enableStorageTypeOverride;
api::StorageType storageTypeOverride;

bool enableMemoryLayoutOverride;
api::GPUMemoryLayout memoryLayoutOverride;

// Generate a default graph config with pre-configured settings
explicit GraphConfig();

void setStorageTypeOverride(api::StorageType storage_type);
void setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout);
};

} // namespace vulkan
Expand Down
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,10 @@ void main() {
COORD_TO_POS_${PACKING}(other_coord, other_sizes.data),
0);

// Detect broadcasting
if (PACKED_DIM_${PACKING}(other_sizes.data) < PACKED_DIM_${PACKING}(in_sizes.data)) {
other_texel = other_texel.xxxx;
}

imageStore(image_out, pos, OP(in_texel, other_texel, alpha.data));
}
15 changes: 11 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,18 @@ binary_op:
DTYPE: float
PACKING: CHANNELS_PACKED
generate_variant_forall:
PACKING:
- VALUE: CHANNELS_PACKED
SUFFIX: C_packed
- VALUE: WIDTH_PACKED
SUFFIX: W_packed
- VALUE: HEIGHT_PACKED
SUFFIX: H_packed
DTYPE:
- VALUE: "half"
SUFFIX: "half"
- VALUE: "float"
SUFFIX: "float"
- VALUE: half
SUFFIX: half
- VALUE: float
SUFFIX: float
shader_variants:
- NAME: binary_add
- NAME: binary_sub
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) {
ivec4 in_coord = out_coord;
for (int i = 0; i < 4; ++i) {
if (in_sizes[i] == 1) {
if (out_coord[i] >= in_sizes[i]) {
in_coord[i] = 0;
}
}
Expand Down
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@
#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)

#define COORD_TO_POS_WIDTH_PACKED(coord, sizes) \
ivec3(coord.x / 4, coord.y, (coord.z + coord.w * sizes.z))

#define COORD_TO_POS_HEIGHT_PACKED(coord, sizes) \
ivec3(coord.x, coord.y / 4, (coord.z + coord.w * sizes.z))

#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)

#define COORD_TO_BUFFER_IDX(coord, sizes) \
coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \
coord.w* sizes.z* sizes.y* sizes.x;
Expand Down
29 changes: 19 additions & 10 deletions backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ namespace at {
namespace native {
namespace vulkan {

/*
 * Validate the arguments of a binary op: all three tensors must share a
 * memory layout, the inputs must be broadcast-compatible, and the output
 * must already be sized to the broadcasted shape.
 */
void check_binary_op_args(
    const vTensor& self,
    const vTensor& other,
    const vTensor& out) {
  VK_CHECK_COND(check_same_memory_layout(self, other, out));
  VK_CHECK_COND(check_broadcastable(self, other));

  const std::vector<int64_t> expected_out_sizes =
      calculate_broadcasted_output_size(self, other);
  VK_CHECK_COND(out.sizes() == expected_out_sizes);
}

void resize_binary_op_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
Expand All @@ -28,15 +39,8 @@ void resize_binary_op_node(
vTensor& self = graph->get_val(args[1].refs[0]).toTensor();
vTensor& other = graph->get_val(args[1].refs[1]).toTensor();

std::vector<int64_t> new_out_sizes(
std::max(self.sizes().size(), other.sizes().size()));

// Match the sizes in reverse because sizes are in NCHW order
for (int i = -1; i >= -new_out_sizes.size(); --i) {
new_out_sizes.at(new_out_sizes.size() + i) = std::max(
api::utils::val_at(i, self.sizes()),
api::utils::val_at(i, other.sizes()));
}
std::vector<int64_t> new_out_sizes =
calculate_broadcasted_output_size(self, other);

out.virtual_resize(new_out_sizes);
}
Expand All @@ -49,12 +53,16 @@ void add_binary_op_node(
const ValueRef out,
const std::string& op_name) {
ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
ValueRef arg2 = prepack_if_tensor_ref(graph, in2);
ValueRef arg2 =
prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1));

vTensor& t_in1 = graph.get_val(arg1).toTensor();
vTensor& t_in2 = graph.get_val(arg2).toTensor();

vTensor& t_out = graph.get_val(out).toTensor();

check_binary_op_args(t_in1, t_in2, t_out);

api::utils::uvec3 global_size = t_out.virtual_extents();
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

Expand All @@ -67,6 +75,7 @@ void add_binary_op_node(

std::stringstream kernel_name;
kernel_name << "binary_" << op_name;
apply_memory_layout_suffix(kernel_name, t_out);
apply_dtype_suffix(kernel_name, t_out);

graph.execute_nodes().emplace_back(new ExecuteNode(
Expand Down
Loading