Skip to content

[ET-VK][Op Redesign][5/n] Merge StagingNode into ExecuteNode #2260

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ ValueRef ComputeGraph::set_input_tensor(
if (use_staging) {
vTensor& tensor = get_val(idx).toTensor();
ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel());
execute_nodes_.emplace_back(new StagingNode(staging_idx, idx));
add_staging_to_tensor_node(*this, staging_idx, idx);
inputs_.push_back(staging_idx);
return staging_idx;
}
Expand All @@ -95,7 +95,7 @@ ValueRef ComputeGraph::set_output_tensor(
if (use_staging) {
vTensor& tensor = get_val(idx).toTensor();
ValueRef staging_idx = add_staging(tensor.dtype(), tensor.gpu_numel());
execute_nodes_.emplace_back(new StagingNode(idx, staging_idx));
add_tensor_to_staging_node(*this, idx, staging_idx);
outputs_.push_back(staging_idx);
return staging_idx;
}
Expand Down
23 changes: 9 additions & 14 deletions backends/vulkan/runtime/graph/ops/ExecuteNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,10 @@ class ComputeGraph;
* encoding of the shader corresponding to the op into the command buffer of a
* ComputeGraph.
*/
class ExecuteNode {
class ExecuteNode final {
friend class ComputeGraph;

public:
ExecuteNode(ValueRef input, ValueRef output)
: outputs_{output}, inputs_{input} {}

ExecuteNode(
const api::ShaderInfo& shader,
const api::utils::uvec3& global_workgroup_size,
Expand All @@ -49,21 +46,19 @@ class ExecuteNode {
inputs_(inputs),
params_(std::move(params)) {}

virtual ~ExecuteNode() = default;
~ExecuteNode() = default;

void encode(ComputeGraph* graph);

protected:
// TODO: Consider making members const after we remove StagingNode.
api::ShaderInfo shader_;
api::utils::uvec3 global_workgroup_size_;
api::utils::uvec3 local_workgroup_size_;
std::vector<ValueRef> outputs_;
std::vector<ValueRef> inputs_;
const api::ShaderInfo shader_;
const api::utils::uvec3 global_workgroup_size_;
const api::utils::uvec3 local_workgroup_size_;
const std::vector<ValueRef> outputs_;
const std::vector<ValueRef> inputs_;
// TODO(T180906086): pass multiple buffers and index with ValueRef.
// TODO(T180906457): allow re-computing param buffers.
api::UniformParamsBuffer params_;

public:
virtual void encode(ComputeGraph* graph);
};

} // namespace vulkan
Expand Down
12 changes: 10 additions & 2 deletions backends/vulkan/runtime/graph/ops/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ void bind_tensor_to_descriptor_set(
}
}

/*
 * Binds the VkBuffer backing a staging buffer to slot `idx` of the given
 * descriptor set. Unlike bind_tensor_to_descriptor_set, no pipeline barrier
 * is involved here: staging buffers are plain storage buffers, so a direct
 * buffer bind is sufficient.
 */
void bind_staging_to_descriptor_set(
    api::StorageBuffer& staging,
    api::DescriptorSet& descriptor_set,
    const uint32_t idx) {
  auto& ssbo = staging.buffer();
  descriptor_set.bind(idx, ssbo);
}

uint32_t bind_values_to_descriptor_set(
ComputeGraph* graph,
const std::vector<ValueRef>& args,
Expand All @@ -48,9 +55,10 @@ uint32_t bind_values_to_descriptor_set(
for (auto& arg : args) {
Value& val = graph->get_val(arg);
if (val.isTensor()) {
vTensor& tensor = val.toTensor();
bind_tensor_to_descriptor_set(
tensor, pipeline_barrier, accessType, descriptor_set, idx++);
val.toTensor(), pipeline_barrier, accessType, descriptor_set, idx++);
} else if (val.isStaging()) {
bind_staging_to_descriptor_set(val.toStaging(), descriptor_set, idx++);
} else {
VK_THROW("Unsupported type: ", val.type());
}
Expand Down
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/graph/ops/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ void bind_tensor_to_descriptor_set(
api::DescriptorSet& descriptor_set,
const uint32_t idx);

void bind_staging_to_descriptor_set(
api::StorageBuffer& staging,
api::DescriptorSet& descriptor_set,
const uint32_t idx);

uint32_t bind_values_to_descriptor_set(
ComputeGraph* graph,
const std::vector<ValueRef>& args,
Expand Down
210 changes: 180 additions & 30 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <ATen/native/vulkan/impl/Common.h>
#include <ATen/native/vulkan/impl/Packing.h>

namespace at {
Expand Down Expand Up @@ -72,7 +73,7 @@ void encode_copy_to_vtensor(
api::Context* context,
api::StorageBuffer& staging,
vTensor& tensor) {
api::ShaderInfo shader = packing::get_nchw_to_image_shader(tensor);
api::ShaderInfo shader = get_nchw_to_image_shader(tensor);
api::PipelineBarrier pipeline_barrier{};
packing::record_nchw_to_image_op(
context,
Expand All @@ -83,41 +84,190 @@ void encode_copy_to_vtensor(
VK_NULL_HANDLE);
}

void encode_copy_from_vtensor(
api::Context* context,
vTensor& tensor,
api::StorageBuffer& staging) {
api::ShaderInfo shader = packing::get_image_to_nchw_shader(tensor);
api::PipelineBarrier pipeline_barrier{};
packing::record_image_to_nchw_op(
context,
// Uniform parameter block passed to the staging <-> image copy shaders (see
// create_staging_params below for how each field is derived).
// NOTE(review): field order presumably must match the uniform block layout
// declared in the shaders — verify before reordering.
struct StagingParams final {
  // Tensor extents as an ivec3 (from vTensor::extents()).
  api::utils::ivec3 extents;
  // Number of texels per channel plane: height * width.
  int32_t plane_size;
  // {ceil(channels / 4), channels} — packed channel depth and raw channel count.
  api::utils::ivec2 channel_info;
};

/*
 * Builds the StagingParams uniform block for a tensor: extents straight from
 * the tensor, plane_size = H * W, and channel_info = {ceil(C / 4), C}.
 */
StagingParams create_staging_params(const vTensor& t) {
  const int32_t h = api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Height>(t));
  const int32_t w = api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Width>(t));
  const int32_t c =
      api::utils::safe_downcast<int32_t>(dim_at<Dim4D::Channel>(t));

  StagingParams sp{};
  sp.extents = api::utils::make_ivec3(t.extents());
  sp.plane_size = h * w;
  sp.channel_info = {api::utils::div_up(c, 4), c};
  return sp;
}

void add_staging_to_tensor_node(
ComputeGraph& graph,
const ValueRef in_staging,
const ValueRef out_tensor) {
vTensor& t_out = graph.get_val(out_tensor).toTensor();
VK_CHECK_COND(graph.get_val(in_staging).isStaging());

api::ShaderInfo shader = get_nchw_to_image_shader(t_out);

api::utils::uvec3 global_size = t_out.extents();
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

api::UniformParamsBuffer params(
graph.context(), create_staging_params(t_out));

graph.execute_nodes().emplace_back(new ExecuteNode(
shader,
tensor,
staging.buffer(),
pipeline_barrier,
VK_NULL_HANDLE);
global_size,
local_size,
{out_tensor},
{in_staging},
std::move(params)));
}

StagingNode::StagingNode(ValueRef from, ValueRef to) : ExecuteNode(from, to) {}
void add_tensor_to_staging_node(
ComputeGraph& graph,
const ValueRef in_tensor,
const ValueRef out_staging) {
vTensor& t_in = graph.get_val(in_tensor).toTensor();
VK_CHECK_COND(graph.get_val(out_staging).isStaging());

void StagingNode::encode(ComputeGraph* graph) {
Value& in_val = graph->get_val(inputs_[0]);
Value& out_val = graph->get_val(outputs_[0]);
api::ShaderInfo shader = get_image_to_nchw_shader(t_in);

api::utils::uvec3 global_size = t_in.extents();
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

StagingParams sp = create_staging_params(t_in);
api::UniformParamsBuffer params(graph.context(), sp);

// TODO(T181194784): These are workgroup sizes for special cases. Refactor the
// calculation of workgroup sizes to a standalone function. We should use
// scalar type to get the shader name, and use the shader name to get the
// workgroup size.
if (t_in.dtype() == api::ScalarType::QUInt8 ||
t_in.dtype() == api::ScalarType::QInt8 || t_in.dtype() == api::kBool) {
if (sp.plane_size % 4 == 0) {
global_size.data[0u] = sp.plane_size / 4;
global_size.data[1u] = 1;
local_size.data[0u] *= local_size.data[1u];
local_size.data[1u] = 1;
} else {
uint32_t numel = t_in.numel();
global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u};
local_size = {64u, 1u, 1u};
}
}

graph.execute_nodes().emplace_back(new ExecuteNode(
shader,
global_size,
local_size,
{in_tensor},
{out_staging},
std::move(params)));
}

api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) {
if (v_dst.is_quantized()) {
switch (v_dst.storage_type()) {
case api::StorageType::TEXTURE_3D:
switch (v_dst.dtype()) {
case api::ScalarType::QUInt8:
return VK_KERNEL(nchw_to_image_uint8);
case api::ScalarType::QInt8:
return VK_KERNEL(nchw_to_image_int8);
case api::ScalarType::QInt32:
return VK_KERNEL(nchw_to_image_int32);
default:
VK_THROW(
"Vulkan quantization currently not supported for dtype ",
v_dst.dtype());
}
case api::StorageType::TEXTURE_2D:
switch (v_dst.dtype()) {
case api::ScalarType::QUInt8:
return VK_KERNEL(nchw_to_image2d_uint8);
case api::ScalarType::QInt8:
return VK_KERNEL(nchw_to_image2d_int8);
case api::ScalarType::QInt32:
return VK_KERNEL(nchw_to_image2d_int32);
default:
VK_THROW(
"Vulkan quantization currently not supported for dtype ",
v_dst.dtype());
}
default:
VK_THROW("No kernel available!");
case api::StorageType::BUFFER:
case api::StorageType::UNKNOWN:
VK_THROW("Requested storage type must be a texture type.");
}
}

if (v_dst.dtype() == api::kFloat) {
switch (v_dst.storage_type()) {
case api::StorageType::TEXTURE_3D:
return VK_KERNEL(nchw_to_image);
case api::StorageType::TEXTURE_2D:
return VK_KERNEL(nchw_to_image2d);
default:
VK_THROW("No kernel available!");
}
} else if (v_dst.dtype() == api::kBool) {
switch (v_dst.storage_type()) {
case api::StorageType::TEXTURE_3D:
return VK_KERNEL(nchw_to_image_bool);
default:
VK_THROW("No kernel available!");
}
} else {
VK_THROW("Unsupported dtype!");
}
}

api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src) {
if (v_src.is_quantized() || v_src.dtype() == api::kBool) {
auto plane_size =
dim_at<Dim4D::Height>(v_src) * dim_at<Dim4D::Width>(v_src);
switch (v_src.storage_type()) {
case api::StorageType::TEXTURE_3D:
switch (v_src.dtype()) {
case api::ScalarType::QUInt8:
case api::ScalarType::QInt8:
case api::kBool:
return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
: VK_KERNEL(image_to_nchw_uint);
case api::ScalarType::QInt32:
return VK_KERNEL(image_to_nchw_int32);
default:
VK_THROW(
"Vulkan quantization currently not supported for dtype ",
v_src.dtype());
}
default:
VK_THROW("No kernel available!");
case api::StorageType::BUFFER:
case api::StorageType::UNKNOWN:
VK_THROW("Requested storage type must be a texture type.");
}
}

if (in_val.isStaging() && out_val.isTensor()) {
api::StorageBuffer& from_staging = graph->get_val(inputs_[0]).toStaging();
vTensor& to_tensor = graph->get_val(outputs_[0]).toTensor();
encode_copy_to_vtensor(graph->context(), from_staging, to_tensor);
} else if (in_val.isTensor() && out_val.isStaging()) {
vTensor& from_tensor = graph->get_val(inputs_[0]).toTensor();
api::StorageBuffer& to_staging = graph->get_val(outputs_[0]).toStaging();
encode_copy_from_vtensor(graph->context(), from_tensor, to_staging);
if (v_src.dtype() == api::kFloat) {
switch (v_src.storage_type()) {
case api::StorageType::TEXTURE_3D:
return VK_KERNEL(image_to_nchw);
case api::StorageType::TEXTURE_2D:
return VK_KERNEL(image2d_to_nchw);
default:
VK_THROW("No kernel available!");
}
} else {
VK_THROW(
"Unexpected input value type ",
in_val.type(),
" and output value type ",
out_val.type());
VK_THROW("Unsupported dtype!");
}
}

Expand Down
32 changes: 19 additions & 13 deletions backends/vulkan/runtime/graph/ops/impl/Staging.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

#include <string.h>
#include <cstring>

namespace at {
namespace native {
Expand Down Expand Up @@ -76,20 +76,26 @@ void encode_copy_to_vtensor(
api::Context* context,
api::StorageBuffer& staging,
vTensor& tensor);
void encode_copy_from_vtensor(
api::Context* context,
vTensor& tensor,
api::StorageBuffer& staging);

/*
* OpNode that allows copying data into and out of a staging buffer.
*/
class StagingNode : public virtual ExecuteNode {
public:
explicit StagingNode(ValueRef from, ValueRef to);
//
// Functions to initialize ExecuteNode
//

void add_staging_to_tensor_node(
ComputeGraph& graph,
const ValueRef in_staging,
const ValueRef out_tensor);
void add_tensor_to_staging_node(
ComputeGraph& graph,
const ValueRef in_tensor,
const ValueRef out_staging);

//
// Functions to get shaders
//

void encode(ComputeGraph* graph) override;
};
api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst);
api::ShaderInfo get_image_to_nchw_shader(const vTensor& v_src);

} // namespace vulkan
} // namespace native
Expand Down
Loading