Fix BinaryOp broadcasting for packed dim (#2653)

jorgep31415 · facebook-github-bot · commit 25c5b67d7739 · 2024-03-27T15:47:49.000-07:00
Summary: Pull Request resolved: #2653 As copyrightly pointed out, broadcasting was not working properly for the example below. I root-caused this to confusion between `sizes()` vs `gpu_sizes()` once again! These concepts are explained in #2520. We should use the CPU size, not the GPU size, to detect when we should broadcast across the packed-dim texel's elements. # Example Given inputs `torch.ones(2, 3)` and `torch.ones(2, 1)` and `GPUMemoryLayout::WIDTH_PACKED`, we have CPU widths 3 and 1, respectively. These are aligned up to GPU widths 4 and 4, and hence we were failing to broadcast along the packed-dim texel's elements. ## torch.ones(2, 3) ``` (2, 3) = (H, W) = sizes [[1 1 1] [1 1 1]] -> (W, H) = (3, 2) → (4, 2) = gpu_sizes -> extents = (1, 2) [1 1 1 0] [1 1 1 0] ``` ## torch.ones(2, 1) ``` (2, 1) = (H, W) = sizes [[1] [1]] -> (W, H) = (1, 2) → (4, 2) = gpu_sizes -> extents = (1, 2) [1 0 0 0] [1 0 0 0] -> (broadcast from this change) [1 1 1 1] [1 1 1 1] ``` ## torch.ones(2, 3) + torch.ones(2, 1) Ignore the final element of each texel as it's just padding we never read. ``` No broadcast: [1 1 1 0] [1 1 1 0] + [1 0 0 0] [1 0 0 0] = [2 1 1 0] [2 1 1 0] Broadcast: [1 1 1 0] [1 1 1 0] + [1 1 1 1] [1 1 1 1] = [2 2 2 1] [2 2 2 1] ``` # Cleanup Remove unneeded `check_broadcastable()` since non-broadcastable inputs are caught earlier in the PyTorch compiler pipeline. For example, `torch.ones(2, 3) + torch.ones(2, 2)` triggers this error: ``` TorchRuntimeError: Failed running call_function <built-in function add>(*(FakeTensor(..., size=(2, 3)), FakeTensor(..., size=(2, 2))), **{}): Attempting to broadcast a dimension of length 2 at -1! Mismatching argument at index 1 had torch.Size([2, 2]); but expected shape should be broadcastable to [2, 3] ``` bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D55278527 fbshipit-source-id: abb8a83924370b21dbbabdd5f1f4af8f502edc1f
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
@@ -36,7 +36,12 @@ layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes {
 }
 other_sizes;
 
-layout(set = 0, binding = 6) uniform PRECISION restrict Alpha {
+layout(set = 0, binding = 6) uniform PRECISION restrict BroadcastParams {
+  ivec2 data;
+}
+broadcast_params;
+
+layout(set = 0, binding = 7) uniform PRECISION restrict Alpha {
   float data;
 }
 alpha;
@@ -63,8 +68,11 @@ void main() {
     COORD_TO_POS_${PACKING}(other_coord, other_sizes.data),
     0));
 
-  // Detect broadcasting
-  if (PACKED_DIM_${PACKING}(other_sizes.data) < PACKED_DIM_${PACKING}(in_sizes.data)) {
+  // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment.
+  if (broadcast_params.data.x > 0) {
+    in_texel = in_texel.xxxx;
+  }
+  if (broadcast_params.data.y > 0) {
     other_texel = other_texel.xxxx;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -24,7 +24,6 @@ void check_binary_op_args(
     const vTensor& other,
     const vTensor& out) {
   VK_CHECK_COND(check_same_memory_layout(self, other, out));
-  VK_CHECK_COND(check_broadcastable(self, other));
   std::vector<int64_t> broadcasted_sizes =
       calculate_broadcasted_output_size(self, other);
   VK_CHECK_COND(out.sizes() == broadcasted_sizes);
@@ -36,6 +35,8 @@ void resize_binary_op_node(
     const std::vector<ValueRef>& extra_args) {
   (void)extra_args;
   vTensor& out = graph->get_val(args[0].refs[0]).toTensor();
+
+  // TODO(T183442143): Verify tensors are broadcastable.
   vTensor& self = graph->get_val(args[1].refs[0]).toTensor();
   vTensor& other = graph->get_val(args[1].refs[1]).toTensor();
 
@@ -73,6 +74,9 @@ void add_binary_op_node(
     alpha_val = extract_scalar<float>(graph.get_val(alpha));
   }
 
+  const api::utils::ivec2 broadcast_params =
+      create_broadcast_params(t_in1, t_in2);
+
   std::stringstream kernel_name;
   kernel_name << "binary_" << op_name;
   apply_memory_layout_suffix(kernel_name, t_out);
@@ -90,6 +94,7 @@ void add_binary_op_node(
       {t_out.gpu_sizes_ubo(),
        t_in1.gpu_sizes_ubo(),
        t_in2.gpu_sizes_ubo(),
+       graph.create_params_buffer(broadcast_params),
        graph.create_params_buffer(alpha_val)},
       // Resizing
       resize_binary_op_node));
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp
@@ -72,28 +72,35 @@ bool check_same_memory_layout(
   return (t1.gpu_memory_layout() == t3.gpu_memory_layout());
 }
 
-bool check_broadcastable(const vTensor& t1, const vTensor& t2) {
-  size_t ndim = std::max(t1.sizes().size(), t2.sizes().size());
+//
+// Broadcast flag functions
+//
 
-  // Match the sizes in reverse because sizes are in NCHW order
-  for (int i = -1; i >= -ndim; --i) {
-    int64_t t1_size = api::utils::val_at(i, t1.sizes());
-    int64_t t2_size = api::utils::val_at(i, t2.sizes());
-    // If the sizes are not equal, one of them must be 1
-    if (t1_size != t2_size) {
-      if (t1_size > 1 && t2_size != 1) {
-        return false;
-      } else if (t2_size > 1 && t1_size != 1) {
-        return false;
-      }
-    }
+bool is_packed_dim_broadcasted(const vTensor& sndr, const vTensor& rcvr) {
+  // We assume that the tensors are broadcastable. If values aren't equal at
+  // some index, then the value of rcvr is 1 and hence should be broadcasted.
+  switch (sndr.gpu_memory_layout()) {
+    case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED:
+      return api::utils::val_at(-3, sndr.sizes()) >
+          api::utils::val_at(-3, rcvr.sizes());
+    case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
+      return api::utils::val_at(-2, sndr.sizes()) >
+          api::utils::val_at(-2, rcvr.sizes());
+    case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED:
+      return api::utils::val_at(-1, sndr.sizes()) >
+          api::utils::val_at(-1, rcvr.sizes());
   }
+}
 
-  return true;
+api::utils::ivec2 create_broadcast_params(
+    const vTensor& t1,
+    const vTensor& t2) {
+  return api::utils::make_ivec2(
+      {is_packed_dim_broadcasted(t2, t1), is_packed_dim_broadcasted(t1, t2)});
 }
 
 //
-// Work Group Size Calculation Utilities
+// Work group size calculation functions
 //
 
 api::utils::uvec3 adaptive_work_group_size(
diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h
@@ -47,10 +47,14 @@ bool check_same_memory_layout(
     const vTensor& t2,
     const vTensor& t3);
 
-bool check_broadcastable(const vTensor& t1, const vTensor& t2);
+//
+// Broadcast flag functions
+//
+
+api::utils::ivec2 create_broadcast_params(const vTensor& t1, const vTensor& t2);
 
 //
-// Work Group Size Calculation Utilities
+// Work group size calculation functions
 //
 
 api::utils::uvec3 adaptive_work_group_size(
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -146,16 +146,20 @@ class AddModule(torch.nn.Module):
             def __init__(self):
                 super().__init__()
 
-            def forward(self, x, y):
+            def forward(self, x, y, w):
                 z = x + y
                 z = z + x
                 z = z + x
+                z = z + w
+                z = w + z
+                z = z + 3  # test scalar broadcasting
                 return z
 
         add_module = AddModule()
         sample_inputs = (
             torch.rand(size=(2, 3), dtype=torch.float32),
             torch.rand(size=(2, 3), dtype=torch.float32),
+            torch.rand(size=(2, 1), dtype=torch.float32),  # test broadcasting
         )
 
         self.lower_module_and_test_output(add_module, sample_inputs)
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -549,7 +549,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   std::vector<int64_t> size_big = {12, 64, 64};
   std::vector<int64_t> size_small = {12, 64, 64};
 
-  // Build graph
+  // Build graph and regularly check allocation counts
 
   IOValueRef a = graph.add_input_tensor(
       size_big,
@@ -560,9 +560,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       api::kFloat,
       /*shared_object_idx = */ 4);
 
-  // Allocation count will be 6:
-  // 4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader
-  // 2: staging buffer for each input tensor
+  // +4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader
+  // +2: staging buffer for each input tensor
   EXPECT_TRUE(get_vma_allocation_count() == 6);
 
   ValueRef c = graph.add_tensor(
@@ -578,11 +577,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
       api::kFloat,
       /*shared_object_idx = */ 2);
 
-  // Allocation count will be 11, 5 are new:
-  // 2: out.gpu_sizes_ubo(), alpha UBO for arithmetic shader
-  // 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader
-  // 1: staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 11);
+  // +3: out.gpu_sizes_ubo(), alpha UBO, broadcast UBO for arithmetic shader
+  // +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader
+  // +1: staging buffer for the input tensor
+  EXPECT_TRUE(get_vma_allocation_count() == 12);
 
   ValueRef e = graph.add_tensor(
       size_big,
@@ -596,18 +594,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   out.value = e;
   out.staging = graph.set_output_tensor(out.value);
 
-  // Allocation count will be 15, 4 are new:
-  // 1: alpha UBO for arithmetic shader
-  // 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader
-  // 1 staging buffer for the input tensor
-  EXPECT_TRUE(get_vma_allocation_count() == 15);
+  // +2: alpha UBO, broadcast UBO for arithmetic shader
+  // +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader
+  // +1: staging buffer for the input tensor
+  EXPECT_TRUE(get_vma_allocation_count() == 17);
 
   graph.prepare();
   graph.encode_execute();
 
-  // Allocation count will be 18, 3 are new:
-  // 3: shared memory allocations for tensors
-  EXPECT_TRUE(get_vma_allocation_count() == 18);
+  // +3: shared memory allocations for tensors
+  EXPECT_TRUE(get_vma_allocation_count() == 20);
 
   // Run graph