ssjia has previously written two implementations of convolution weight prepacking for CPU (before and after [PyTorch PR #84973](pytorch/pytorch#84973)). Originally, I translated the second implementation to GPU since it is more readable. This diff translates the first implementation to GPU and switches to it, since it requires fewer steps.
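For context, prepacking here means rearranging the weight tensor into a GPU-friendly layout ahead of time. Below is a minimal host-side sketch of the general idea, assuming an OIHW source layout packed into 4-channel texel blocks; the function name, destination layout, and padding scheme are illustrative assumptions, not the actual ET-VK shader logic.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch of conv2d weight prepacking: rearrange an OIHW float
// tensor so that groups of 4 output channels are contiguous, matching the
// RGBA texel layout a GPU compute shader would read. All names and layout
// choices here are assumptions for illustration only.
std::vector<float> prepack_conv2d_weights(
    const std::vector<float>& src, // OIHW, size OC*IC*KH*KW
    size_t OC, size_t IC, size_t KH, size_t KW) {
  const size_t OC4 = (OC + 3) / 4; // output channels padded to a multiple of 4
  std::vector<float> dst(OC4 * 4 * IC * KH * KW, 0.0f);
  for (size_t oc = 0; oc < OC; ++oc) {
    for (size_t ic = 0; ic < IC; ++ic) {
      for (size_t kh = 0; kh < KH; ++kh) {
        for (size_t kw = 0; kw < KW; ++kw) {
          const size_t src_idx = ((oc * IC + ic) * KH + kh) * KW + kw;
          // Destination layout [OC/4][IC][KH][KW][4]: each texel holds the
          // same (ic, kh, kw) element for 4 consecutive output channels.
          const size_t dst_idx =
              ((((oc / 4) * IC + ic) * KH + kh) * KW + kw) * 4 + (oc % 4);
          dst[dst_idx] = src[src_idx];
        }
      }
    }
  }
  return dst;
}
```

In the GPU versions compared here, this rearrangement runs as a compute shader at model load, so the complexity of the shader directly affects how long its pipeline takes to create.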
The second implementation was complex enough that creating its pipelines during model load took >1500ms. In the test plan's Before, the per-shader timings sum to 1905ms:
```
[334ms] P::encode-conv2d_prepack_weights_float, (16, 4, 1)
[110ms] P::encode-conv2d_dw_prepack_weights_float, (16, 4, 1)
[270ms] P::encode-conv2d_prepack_weights_float, (8, 8, 1)
[94ms] P::encode-conv2d_dw_prepack_weights_float, (8, 8, 1)
[609ms] P::encode-conv_transpose2d_prepack_weights_float, (8, 8, 1)
[488ms] P::encode-conv_transpose2d_prepack_weights_float, (16, 4, 1)
```
With the first implementation, pipeline creation now takes <700ms. In the test plan's After, the per-shader timings sum to 598ms:
```
[135ms] P::encode-conv2d_prepack_weights_float, (16, 4, 1)
[83ms] P::encode-conv2d_dw_prepack_weights_float, (16, 4, 1)
[102ms] P::encode-conv2d_prepack_weights_float, (8, 8, 1)
[69ms] P::encode-conv2d_dw_prepack_weights_float, (8, 8, 1)
[115ms] P::encode-conv_transpose2d_prepack_weights_float, (8, 8, 1)
[94ms] P::encode-conv_transpose2d_prepack_weights_float, (16, 4, 1)
```
Internal:
This diff addresses the convolution-related Next Steps from [ET-VK Model-Load Benchmarks](https://docs.google.com/document/d/11JIBPuCI-u6Xe15GKzFC8pQaEW5F3ipBljWm7Nu_1KM/edit#heading=h.hlhgkp1f0o05) to reduce model-load time.
Differential Revision: [D56617129](https://our.internmc.facebook.com/intern/diff/D56617129/)
ghstack-source-id: 224037564
Pull Request resolved: #3368