[ET-VK] Migrate workgroup API for trivial cases

jorgep31415 · jorgep31415 · commit 810ea6e1fa30 · 2024-06-25T09:46:39.000-07:00
This will allow us to override local workgroup sizes with #4046.

## Before

```
vTensorPtr t_out = graph.get_tensor(out);
api::utils::uvec3 global_size = t_out->image_extents();
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

graph.execute_nodes().emplace_back(new ExecuteNode(
    ...,
    global_size,
    local_size,
    ...,
);
```

## After

```
graph.execute_nodes().emplace_back(new ExecuteNode(
    ...,
    graph.create_global_wg_size(out),
    graph.create_local_wg_size(out),
    ...,
);
```

Note that we do not migrate cases where the global size is nontrivial (the non-naive MatMul and Linear nodes, Conv1D, Repeat) or where the image isn't a ValueRef (MaxPool2D, NativeLayerNorm). We should first align on an API design for those cases.

Differential Revision: [D59011492](https://our.internmc.facebook.com/intern/diff/D59011492/)

[ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp b/backends/vulkan/runtime/graph/ops/impl/Arange.cpp
@@ -84,19 +84,15 @@ void add_arange_node(
 
   vTensorPtr t_out = graph.get_tensor(out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name("arange");
   kernel_name.reserve(kShaderNameReserve);
-
   add_dtype_suffix(kernel_name, *t_out);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}},
       // Shader params buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp
@@ -77,17 +77,14 @@ void add_native_batch_norm_node(
   std::string kernel_name = "batchnorm";
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   int32_t num_texel_per_batch =
       api::utils::div_up_4((dim_at<kChannel4D>(t_in->sizes())));
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out_ref),
+      graph.create_local_wg_size(out_ref),
       {{out_ref, api::MemoryAccessType::WRITE},
        {{in_ref, arg_weight, arg_bias, arg_mean, arg_var},
         api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -61,9 +61,6 @@ void add_binary_op_node(
 
   check_binary_op_args(*t_in1, *t_in2, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   float alpha_val = 1.0f;
   // String is checked since floor_div passes in an unused string argument in
   // place of alpha
@@ -82,8 +79,8 @@ void add_binary_op_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {{arg1, arg2}, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -25,14 +25,11 @@ void add_clone_node(
   std::string kernel_name = "clone";
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       {t_out->texture_limits_ubo()}));
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp
@@ -32,9 +32,6 @@ void add_copy_offset_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  uvec3 global_size = api::utils::make_uvec3(range);
-  uvec3 local_size = adaptive_work_group_size(global_size);
-
   const struct Block final {
     ivec3 range;
     int32_t unused0;
@@ -56,8 +53,8 @@ void add_copy_offset_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {
           {out, api::MemoryAccessType::WRITE},
@@ -141,7 +138,6 @@ void add_copy_channel_offset_node(
         api::utils::safe_downcast<uint32_t>(dim_at<kWidth4D>(in_sizes)),
         api::utils::safe_downcast<uint32_t>(dim_at<kHeight4D>(in_sizes)),
         api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
-
     uvec3 local_size = adaptive_work_group_size(global_size);
 
     const struct Block final {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
@@ -41,14 +41,11 @@ void add_embedding_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       {{out, api::MemoryAccessType::WRITE},
        {{in, weight}, api::MemoryAccessType::READ}},
       {t_out->sizes_ubo()}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp
@@ -39,9 +39,6 @@ void add_full_node(
   float fill_value_val = graph.extract_scalar<float>(fill_value);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name("full");
   kernel_name.reserve(kShaderNameReserve);
 
@@ -50,8 +47,8 @@ void add_full_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}},
       // Shader params buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp
@@ -41,14 +41,11 @@ void add_index_select_channel_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       {{out, api::MemoryAccessType::WRITE},
        {{in, idx}, api::MemoryAccessType::READ}},
       {t_out->sizes_ubo(), t_in->sizes_ubo()}));
@@ -93,14 +90,11 @@ void add_index_select_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       {{out, api::MemoryAccessType::WRITE},
        {{in, idx}, api::MemoryAccessType::READ}},
       {t_out->sizes_ubo(), graph.create_params_buffer(params)}));
diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp
@@ -101,9 +101,6 @@ void add_addmm_naive_node(
   ValueRef self = prepack_if_tensor_ref(graph, self_data, api::kWidthPacked);
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
-  api::utils::uvec3 global_size = graph.image_extents_of(out);
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name =
       graph.get_bool(mat2_is_transposed) ? "linear_naive" : "addmm_naive";
   kernel_name.reserve(kShaderNameReserve);
@@ -114,8 +111,8 @@ void add_addmm_naive_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {{mat1, mat2, self}, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -72,9 +72,6 @@ void add_matmul_naive_node(
     const ValueRef mat2_is_transposed) {
   ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, api::kHeightPacked);
 
-  api::utils::uvec3 global_size = graph.image_extents_of(out);
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name = graph.get_bool(mat2_is_transposed)
       ? "matmul_transposed_naive"
       : "matmul_naive";
@@ -86,8 +83,8 @@ void add_matmul_naive_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {{mat1, mat2}, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp
@@ -65,9 +65,6 @@ void add_constant_pad_nd_node(
   vTensorPtr t_in = graph.get_tensor(in);
   vTensorPtr t_out = graph.get_tensor(out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name = "";
   PadParam pad_param = creat_pad_param(*pad_vec);
 
@@ -84,8 +81,8 @@ void add_constant_pad_nd_node(
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       // Shader params buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -85,14 +85,11 @@ void add_permute_node(
       {out_c_aligned, in_c_aligned},
   };
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       {t_out->texture_limits_ubo(),
        t_out->sizes_ubo(),
diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
@@ -102,15 +102,12 @@ void add_select_int_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   // TODO: add resizing to support dynamic shapes.
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       // Parameter buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -80,9 +80,6 @@ void add_slice_tensor_out_node(
     kernel_name.reserve(kShaderNameReserve);
     add_dtype_suffix(kernel_name, *t_out);
 
-    api::utils::uvec3 global_size = t_out->image_extents();
-    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
     const struct Block final {
       int offset;
       int step;
@@ -94,8 +91,8 @@ void add_slice_tensor_out_node(
     graph.execute_nodes().emplace_back(new ExecuteNode(
         graph,
         VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
+        graph.create_global_wg_size(out),
+        graph.create_local_wg_size(out),
         {{out, api::MemoryAccessType::WRITE},
          {in, api::MemoryAccessType::READ}},
         {t_out->sizes_ubo(),
diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
@@ -43,7 +43,6 @@ void add_softmax_node(
   softmax_dim = normalize(softmax_dim, in_dim);
 
   vTensorPtr t_out = graph.get_tensor(out);
-  uvec3 global_size = t_out->image_extents();
 
   api::ShaderInfo shader_descriptor;
   std::string kernel_name = in_dim - softmax_dim == 3
@@ -55,14 +54,12 @@ void add_softmax_node(
     kernel_name = "log_" + kernel_name;
   }
 
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       // shader_descriptor,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {in_arg, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/Sum.cpp b/backends/vulkan/runtime/graph/ops/impl/Sum.cpp
@@ -68,22 +68,18 @@ void add_sum_dim_node(
       in_dim > 2 ? static_cast<int32_t>(t_input->sizes()[in_dim - 3]) : 1;
   uint32_t dim_size = t_input->sizes()[dim];
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   std::string kernel_name("sum_dim");
   kernel_name.reserve(kShaderNameReserve);
   if (keepdim) {
     kernel_name += "_keepdim";
   }
-
   add_dtype_suffix(kernel_name, *t_out);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {arg, api::MemoryAccessType::READ}},
       // Shader params buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp
@@ -69,19 +69,16 @@ void add_upsample_nearest2d_node(
   }
 
   vTensorPtr t_out = graph.get_tensor(out);
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
   std::string kernel_name("upsample_nearest2d");
   kernel_name.reserve(kShaderNameReserve);
-
   add_dtype_suffix(kernel_name, *t_out);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE},
        {arg_in, api::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp
@@ -65,14 +65,11 @@ void add_view_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  api::utils::uvec3 global_size = t_out->image_extents();
-  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
   graph.execute_nodes().emplace_back(new ExecuteNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      graph.create_global_wg_size(out),
+      graph.create_local_wg_size(out),
       // Inputs and Outputs
       {{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
       // Parameter Buffers