Update on "[ET-VK] Simplifying conv1d op shader by changing it to process one output texel per thread."

trivedivivek · trivedivivek · commit 7c9255188919 · 2025-05-02T14:57:38.000-07:00
This diff changes the conv1d shader to process one output texel per thread, increasing GPU occupancy and improving performance. Differential Revision: [D74097560](https://our.internmc.facebook.com/intern/diff/D74097560/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl
@@ -59,10 +59,6 @@ const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout);
 // This implementation performs N x out_C x out_L shader invocations, where each invocation
 // calculates the rolling kernel of the length dimension for each batch, i.e.,
 // computes out_L results.
-//
-// Note that we can rewrite this implementation as out_L * out_C * ceil(N / 4)
-// shader invocations, where each invocation computes 1 result. But that
-// performs worse.
 void main() {
   const ivec3 lpos = ivec3(gl_GlobalInvocationID);
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -520,7 +520,7 @@ void add_conv1d_node(
       // out channels
       static_cast<uint32_t>(out_channels),
       // out batches
-      graph.size_at<uint32_t>(-3, out)};
+      utils::div_up_4(graph.size_at<uint32_t>(-3, out))};
   const utils::uvec3 local_size = graph.create_local_wg_size(global_size);
 
   Kernel1dParams kernel_params = {