pytorch · SS-JIA · Dec 18, 2024 · Dec 17, 2024
@@ -46,12 +46,15 @@ class PushConstantDataInfo {
     payload_.attr = attr;
   }
 
-  explicit PushConstantDataInfo(const void* data, uint32_t dataLen)
+  explicit PushConstantDataInfo(  // Raw-bytes constructor: copies dataLen bytes from data into the inline payload.
+      const void* data,
+      uint32_t dataLen,
+      uint32_t pushConstantLen = 0)  // 0 (the default) means "record dataLen as the payload size".
       : tensorUniformData(nullptr) {
     VK_CHECK_COND(
         dataLen <= 16, "Single push constant data size must be <= 16 bytes");
-    payload_.dataSize = dataLen;
-    memcpy(payload_.data, data, payload_.dataSize);
+    payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen;  // Recorded size may differ from the number of bytes copied below.
+    memcpy(payload_.data, data, dataLen);  // NOTE(review): only dataLen is range-checked above; confirm callers keep pushConstantLen within payload_.data's capacity.
   }
 
   /*

@@ -19,11 +19,6 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
-${layout_declare_ubo(B, "ivec4", "out_sizes")}
-${layout_declare_ubo(B, "ivec4", "in_sizes")}
-${layout_declare_ubo(B, "ivec4", "other_sizes")}
-${layout_declare_ubo(B, "ivec2", "broadcast_params")}
-${layout_declare_ubo(B, "float", "alpha")}
 
 #include "broadcasting_utils.h"
 #include "indexing_utils.h"
@@ -40,6 +35,14 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
 ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 other_axis_map = unhash_axis_map(other_layout);
 
+layout(push_constant) uniform restrict Block {  // Shader parameters supplied as push constants.
+  ivec4 out_sizes;  // Passed to lpos_to_tidx() in main() to derive the output tensor index.
+  ivec4 in_sizes;  // Presumably the sizes of t_in; usage is not visible in this hunk.
+  ivec4 other_sizes;  // Presumably the sizes of t_other; usage is not visible in this hunk.
+  ivec2 broadcast_params;
+  float alpha;  // NOTE(review): member order presumably has to match the order in which the host supplies push constants; confirm against the dispatch code.
+};
+
 void main() {
   const ivec3 lpos = ivec3(gl_GlobalInvocationID);
   const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim);

@@ -67,7 +67,10 @@ void add_binary_op_node(
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  const utils::ivec2 broadcast_params = create_broadcast_params(*t_in1, *t_in2);
+  const struct BinaryOpsParams {  // Bundles the broadcast flags and alpha into one object so they can be handed over as a single push constant entry.
+    const utils::ivec2 broadcast_params;
+    const float alpha_val;  // NOTE(review): assumes no padding between/after members so sizeof() matches the shader-side layout; confirm.
+  } binary_ops_params{create_broadcast_params(*t_in1, *t_in2), alpha_val};  // Initialized in member order: broadcast params first, then alpha.
 
   std::string kernel_name("binary_");
   kernel_name.reserve(kShaderNameReserve);
@@ -83,16 +86,16 @@ void add_binary_op_node(
       {{out, vkapi::MemoryAccessType::WRITE},
        {{arg1, arg2}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(),
-       t_in1->sizes_ubo(),
-       t_in2->sizes_ubo(),
-       graph.create_params_buffer(broadcast_params),
-       graph.create_params_buffer(alpha_val)},
+      {},
       // Specialization Constants
       {t_out->hashed_layout(), t_in1->hashed_layout(), t_in2->hashed_layout()},
       // Resizing Logic
       resize_binary_op_node,
-      {}));
+      {},
+      {{graph.sizes_pc_of(out),
+        graph.sizes_pc_of(arg1),
+        graph.sizes_pc_of(arg2),
+        PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}}));
 }
 
 #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name)                          \

@@ -1601,9 +1601,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
   addFn(graph, {a.value, b.value, kDummyValueRef, c});
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
-  // +1: t.sizes_ubo() for arithmetic shader output c
-  expected_vma_allocation_count += 3;
+  // no new allocations if binary op uses push constants
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef d = graph.add_input_tensor(
@@ -1624,17 +1622,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
   mulFn(graph, {c, d.value, e});
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
-  // +1: t.sizes_ubo() for arithmetic shader output e
-  expected_vma_allocation_count += 3;
+  // no new allocations if binary op uses push constants
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef out = {};
   out.value = e;
   out.staging = graph.set_output_tensor(out.value);
 
+  // +1: staging buffer for the input tensor
   // +1: staging buffer for the output tensor
-  expected_vma_allocation_count += 1;
+  expected_vma_allocation_count += 2;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   graph.prepare();