Skip to content

Commit 1f16f5e

Browse files
jorgep31415 authored and facebook-github-bot committed
Fix BinaryOp broadcasting for packed dim
Summary: As copyrightly pointed out, broadcasting was not working properly for the example below. I root-caused it to confusion between `sizes()` vs `gpu_sizes()` once again! These concepts are explained in #2520 We should use the CPU size, not the GPU size, to detect when we should broadcast across the packed-dim texel's elements. For example, given inputs `torch.ones(2, 3)` and `torch.ones(2, 1)` and `GPUMemoryLayout::WIDTH_PACKED`, we have CPU widths 3 and 1, respectively. These are aligned up to GPU widths 4 and 4, and hence we were failing to broadcast along the packed-dim texel's elements. ## torch.ones(2, 3) ``` (2, 3) = (H, W) = sizes [[1 1 1] [1 1 1]] -> (W, H) = (3, 2) → (4, 2) = gpu_sizes -> extents = (1, 2) [1 1 1 0] [1 1 1 0] ``` ## torch.ones(2, 1) ``` (2, 1) = (H, W) = sizes [[1] [1]] -> (W, H) = (1, 2) → (4, 2) = gpu_sizes -> extents = (1, 2) [1 0 0 0] [1 0 0 0] -> (broadcast from this change) [1 1 1 1] [1 1 1 1] ``` ## torch.ones(2, 3) + torch.ones(2, 1) Ignore the final element of each texel as it's just padding we never read. ``` No broadcast: [1 1 1 0] [1 1 1 0] + [1 0 0 0] [1 0 0 0] = [2 1 1 0] [2 1 1 0] Broadcast: [1 1 1 0] [1 1 1 0] + [1 1 1 1] [1 1 1 1] = [2 2 2 1] [2 2 2 1] ``` Differential Revision: D55278527
1 parent 6a0a6c7 commit 1f16f5e

File tree

6 files changed

+53
-23
lines changed

6 files changed

+53
-23
lines changed

backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,12 @@ layout(set = 0, binding = 5) uniform PRECISION restrict OtherSizes {
3636
}
3737
other_sizes;
3838

39-
layout(set = 0, binding = 6) uniform PRECISION restrict Alpha {
39+
layout(set = 0, binding = 6) uniform PRECISION restrict BroadcastFlag {
40+
bool data;
41+
}
42+
broadcast_flag;
43+
44+
layout(set = 0, binding = 7) uniform PRECISION restrict Alpha {
4045
float data;
4146
}
4247
alpha;
@@ -63,8 +68,7 @@ void main() {
6368
COORD_TO_POS_${PACKING}(other_coord, other_sizes.data),
6469
0));
6570

66-
// Detect broadcasting
67-
if (PACKED_DIM_${PACKING}(other_sizes.data) < PACKED_DIM_${PACKING}(in_sizes.data)) {
71+
if (broadcast_flag.data) {
6872
other_texel = other_texel.xxxx;
6973
}
7074

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ void add_binary_op_node(
7373
alpha_val = extract_scalar<float>(graph.get_val(alpha));
7474
}
7575

76+
const bool broadcast_flag = is_packed_dim_broadcasted(t_in1, t_in2);
77+
7678
std::stringstream kernel_name;
7779
kernel_name << "binary_" << op_name;
7880
apply_memory_layout_suffix(kernel_name, t_out);
@@ -90,6 +92,7 @@ void add_binary_op_node(
9092
{t_out.gpu_sizes_ubo(),
9193
t_in1.gpu_sizes_ubo(),
9294
t_in2.gpu_sizes_ubo(),
95+
graph.create_params_buffer(broadcast_flag),
9396
graph.create_params_buffer(alpha_val)},
9497
// Resizing
9598
resize_binary_op_node));

backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,25 @@ bool check_broadcastable(const vTensor& t1, const vTensor& t2) {
9393
}
9494

9595
//
96-
// Work Group Size Calculation Utilities
96+
// Broadcast flag functions
97+
//
98+
99+
bool is_packed_dim_broadcasted(const vTensor& t1, const vTensor& t2) {
100+
switch (t1.gpu_memory_layout()) {
101+
case api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED:
102+
return api::utils::val_at(-3, t1.sizes()) >
103+
api::utils::val_at(-3, t2.sizes());
104+
case api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED:
105+
return api::utils::val_at(-2, t1.sizes()) >
106+
api::utils::val_at(-2, t2.sizes());
107+
case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED:
108+
return api::utils::val_at(-1, t1.sizes()) >
109+
api::utils::val_at(-1, t2.sizes());
110+
}
111+
}
112+
113+
//
114+
// Work group size calculation functions
97115
//
98116

99117
api::utils::uvec3 adaptive_work_group_size(

backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,13 @@ bool check_same_memory_layout(
5050
bool check_broadcastable(const vTensor& t1, const vTensor& t2);
5151

5252
//
53-
// Work Group Size Calculation Utilities
53+
// Broadcast flag functions
54+
//
55+
56+
bool is_packed_dim_broadcasted(const vTensor& t1, const vTensor& t2);
57+
58+
//
59+
// Work group size calculation functions
5460
//
5561

5662
api::utils::uvec3 adaptive_work_group_size(

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,16 +146,19 @@ class AddModule(torch.nn.Module):
146146
def __init__(self):
147147
super().__init__()
148148

149-
def forward(self, x, y):
149+
def forward(self, x, y, w):
150150
z = x + y
151151
z = z + x
152152
z = z + x
153+
z = z + w
154+
z = z + 3 # test scalar broadcasting
153155
return z
154156

155157
add_module = AddModule()
156158
sample_inputs = (
157159
torch.rand(size=(2, 3), dtype=torch.float32),
158160
torch.rand(size=(2, 3), dtype=torch.float32),
161+
torch.rand(size=(2, 1), dtype=torch.float32), # test broadcasting
159162
)
160163

161164
self.lower_module_and_test_output(add_module, sample_inputs)

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
549549
std::vector<int64_t> size_big = {12, 64, 64};
550550
std::vector<int64_t> size_small = {12, 64, 64};
551551

552-
// Build graph
552+
// Build graph and regularly check allocation counts
553553

554554
IOValueRef a = graph.add_input_tensor(
555555
size_big,
@@ -560,9 +560,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
560560
api::kFloat,
561561
/*shared_object_idx = */ 4);
562562

563-
// Allocation count will be 6:
564-
// 4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader
565-
// 2: staging buffer for each input tensor
563+
// +4: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for each staging shader
564+
// +2: staging buffer for each input tensor
566565
EXPECT_TRUE(get_vma_allocation_count() == 6);
567566

568567
ValueRef c = graph.add_tensor(
@@ -578,11 +577,10 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
578577
api::kFloat,
579578
/*shared_object_idx = */ 2);
580579

581-
// Allocation count will be 11, 5 are new:
582-
// 2: out.gpu_sizes_ubo(), alpha UBO for arithmetic shader
583-
// 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader
584-
// 1: staging buffer for the input tensor
585-
EXPECT_TRUE(get_vma_allocation_count() == 11);
580+
// +3: out.gpu_sizes_ubo(), alpha UBO, broadcast UBO for arithmetic shader
581+
// +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() uniform buffer for staging shader
582+
// +1: staging buffer for the input tensor
583+
EXPECT_TRUE(get_vma_allocation_count() == 12);
586584

587585
ValueRef e = graph.add_tensor(
588586
size_big,
@@ -596,18 +594,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
596594
out.value = e;
597595
out.staging = graph.set_output_tensor(out.value);
598596

599-
// Allocation count will be 15, 4 are new:
600-
// 1: alpha UBO for arithmetic shader
601-
// 2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader
602-
// 1 staging buffer for the input tensor
603-
EXPECT_TRUE(get_vma_allocation_count() == 15);
597+
// +2: alpha UBO, broadcast UBO for arithmetic shader
598+
// +2: t.gpu_sizes_ubo(), t.cpu_sizes_ubo() for staging shader
599+
// +1 staging buffer for the input tensor
600+
EXPECT_TRUE(get_vma_allocation_count() == 17);
604601

605602
graph.prepare();
606603
graph.encode_execute();
607604

608-
// Allocation count will be 18, 3 are new:
609-
// 3: shared memory allocations for tensors
610-
EXPECT_TRUE(get_vma_allocation_count() == 18);
605+
// +3: shared memory allocations for tensors
606+
EXPECT_TRUE(get_vma_allocation_count() == 20);
611607

612608
// Run graph
613609

0 commit comments

Comments
 (0)