[ET-VK][Ops] aten.convolution (Bias=False)

jorgep31415 · jorgep31415 · commit 54a3c55ec20a · 2024-04-09T13:37:16.000-07:00
Pull Request resolved: #2887 The final touches to get ET-VK convolution on par with ATen-VK's convolution. ## Idea In our shaders, we add the bias to our sum. ``` ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0); ``` To keep our shaders as is, we implement the no-bias case by allocating a buffer of zeros. Then, our shader adds zero to our sum. ## Issue If `Bias=False`, the dummy buffer of zeros is not serialized with the graph. The bias ValueRef is deserialized in the runtime as `TypeTag::NONE`, not `TypeTag::TENSORREF`. ## Solution If `TypeTag::NONE` is given, (1) create the `vTensor` using the `out_channels` value from the weights, (2) allocate a staging `StorageBuffer` of that size, and (3) `memset` its data to zero. Failure to do (3) will result in undefined behavior. ghstack-source-id: 221887675 @exported-using-ghexport Differential Revision: [D55814589](https://our.internmc.facebook.com/intern/diff/D55814589/)
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -32,17 +32,33 @@ PrepackNode::PrepackNode(
   graph.update_descriptor_counts(shader, /*execute = */ false);
 }
 
-void PrepackNode::encode(ComputeGraph* graph) {
-  api::Context* const context = graph->context();
-  api::PipelineBarrier pipeline_barrier{};
-
-  TensorRef& tref = graph->get_val(tref_).toTensorRef();
+api::StorageBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   vTensor& packed = graph->get_val(packed_).toTensor();
 
+  // tref_ is NONE when no data was serialized (e.g. conv2d with bias=False):
+  // stage zeros instead, sized and typed from the packed vTensor's metadata.
+  if (graph->get_val(tref_).isNone()) {
+    size_t numel = api::utils::multiply_integers(packed.sizes());
+    api::StorageBuffer staging(graph->context(), packed.dtype(), numel);
+    size_t nbytes = numel * api::element_size(packed.dtype());
+    copy_zeros_to_staging(staging, nbytes); // skipping the zero-fill is UB
+    return staging;
+  }
+
+  TensorRef& tref = graph->get_val(tref_).toTensorRef();
   size_t numel = api::utils::multiply_integers(tref.sizes);
   api::StorageBuffer staging(graph->context(), tref.dtype, numel);
   size_t nbytes = numel * api::element_size(tref.dtype);
   copy_ptr_to_staging(tref.data, staging, nbytes);
+  return staging; // filled from tref.data by the copy above
+}
+
+void PrepackNode::encode(ComputeGraph* graph) {
+  api::Context* const context = graph->context();
+  api::PipelineBarrier pipeline_barrier{};
+
+  vTensor& packed = graph->get_val(packed_).toTensor();
+  api::StorageBuffer staging = create_staging_buffer(graph);
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h
@@ -47,6 +47,9 @@ class PrepackNode final {
   const ValueRef packed_;
   // TODO(T180906457): allow re-computing param buffers.
   std::vector<std::shared_ptr<api::UniformParamsBuffer>> params_;
+
+ private:
+  api::StorageBuffer create_staging_buffer(ComputeGraph* graph);
 };
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2d.cpp
@@ -52,13 +52,17 @@ void resize_conv2d_node(
   out.virtual_resize(new_out_sizes);
 }
 
-ValueRef prepack_biases(ComputeGraph& graph, const ValueRef vref) {
-  if (graph.get_val(vref).isNone()) {
-    VK_THROW("aten.convolution.default: Null bias is not supported yet!");
-  }
+ValueRef prepack_biases(
+    ComputeGraph& graph,
+    const ValueRef vref,
+    const ValueRef weight,
+    const bool transposed) {
+  TensorRef& tref = graph.get_val(weight).toTensorRef();
+  const int64_t out_channels = transposed ? tref.sizes.at(1) : tref.sizes.at(0);
 
-  ValueRef v = graph.add_tensor_like(
-      vref,
+  ValueRef v = graph.add_tensor(
+      {out_channels},
+      tref.dtype,
       api::StorageType::TEXTURE_2D,
       api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);
   vTensor& t = graph.get_val(v).toTensor();
@@ -301,7 +305,7 @@ void add_conv2d_node(
 
   ValueRef arg_in = prepack_if_tensor_ref(graph, in);
   ValueRef arg_weight = prepack_weights(graph, weight, method);
-  ValueRef arg_bias = prepack_biases(graph, bias);
+  ValueRef arg_bias = prepack_biases(graph, bias, weight, transposed_val);
 
   vTensor& t_in = graph.get_val(arg_in).toTensor();
   vTensor& t_out = graph.get_val(out).toTensor();
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
@@ -89,6 +89,13 @@ void copy_staging_to_ptr(
   memcpy_from_mapping(mapping, dst, nbytes, staging.dtype());
 }
 
+void copy_zeros_to_staging(api::StorageBuffer& staging, const size_t nbytes) {
+  void* const zeros = calloc(1, nbytes); // scratch block of nbytes zero bytes
+  if (!zeros && nbytes > 0) { VK_THROW("copy_zeros_to_staging: alloc failed"); }
+  copy_ptr_to_staging(zeros, staging, nbytes);
+  free(zeros); // free(nullptr) is a no-op, which covers a null 0-byte result
+}
+
 api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) {
   if (v_dst.is_quantized()) {
     VK_THROW("Quantized Tensors are currently not supported!");
diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
@@ -25,6 +25,8 @@ void copy_staging_to_ptr(
     void* dst,
     const size_t nbytes);
 
+void copy_zeros_to_staging(api::StorageBuffer& staging, const size_t nbytes);
+
 //
 // Functions to get shaders
 //
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -601,3 +601,30 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_conv2d_bias_false(self):
+        class Conv2dModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(
+                    in_channels=6,
+                    out_channels=8,
+                    kernel_size=(3, 3),
+                    padding=(2, 3),  # height/width values differ on purpose
+                    stride=(1, 2),
+                    dilation=1,
+                    groups=1,
+                    bias=False,  # the case under test: no bias tensor in the graph
+                )
+
+            def forward(self, x):
+                return self.conv(x)
+
+        conv2d_module = Conv2dModule()  # built before the inputs are drawn
+        sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(  # presumably compares vs. eager output
+            conv2d_module,
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )