
Commit 6c06f26

jorgep31415 authored and facebook-github-bot committed
Simplify conv2d weight prepacking (>2x pipeline-creation speedup) (#3368)
Summary: Pull Request resolved: #3368

SS-JIA previously wrote two implementations of convolution weight prepacking for CPU (before and after [PyTorch PR #84973](pytorch/pytorch#84973)). Originally, I translated the second implementation to GPU since it is more readable. Now, I translate the first implementation to GPU and switch to it, since it requires fewer steps.

Reviewed By: SS-JIA

Differential Revision: D56617129

fbshipit-source-id: b84533a45e1daf31f7fdec6707a518f3de57ce4c
1 parent 92b5aea commit 6c06f26

File tree

5 files changed (+82, -219 lines)

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl

Lines changed: 23 additions & 48 deletions
@@ -26,49 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
   BUF_T buffer_in[];
 };
 
-// Corresponds to {1,4,3,9} in the example below.
 layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-// Corresponds to {3,3,1,11} in the example below.
 layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
   ivec4 original_sizes;
 };
 
-// Corresponds to {1,12} in the example below.
-layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
-  ivec2 padded_sizes;
-};
-
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;
 
 /*
  * Computes special prepacking for a depthwise convolution. Each shader invocation
  * calculates the input buffer location to read into the desired texel. This
- * packing was originally developed on CPU and that approach is described in the
- * rest of this comment. Refer to the code-level comments for how we translate
- * it to GPU by reversing the steps.
- *
- * Consider an example weight tensor of size {11,1,3,3}. The following
- * transformations will be applied.
- *
- * 1. Pad the N dim so that it is a multiple of 4. In this case, 1
- *    batch of padding is added, producing a tensor of size {12,1,3,3}.
- *    at::pad(x, {0,0,0,0,0,0,0,1}, "constant", 0);
- *
- * 2. Flatten the last two dims by reshaping the tensor:
- *    x.reshape({12,1,9});
- *
- * 3. "Fold" the N dim into the C dim. Split the tensor along the N dim so that
- *    each split has 4 channels.
- *    x.reshape({3,4,1,9});
- *
- * 4. Stack the batches on each other vertically by permuting the N and C dims
- *    and reshaping the tensor.
- *    x.permute({1,0,2,3}).reshape({4,3,9});
+ * packing was originally developed on CPU here:
+ * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L58-L118
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -78,39 +52,40 @@ void main() {
     return;
   }
 
-  // As in usual staging shaders, map from GPU texel position to normal CPU
-  // buffer indices: (9,3) -> (4,3,9)
+  // Map tensor_idx to normal buffer_i
   const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);
 
-  // Re-map the normal CPU buffer indices to special indices, through a series
-  // of mappings: reshape is a no-op to the underlying indices, so we only map
-  // for pad and permute.
-  const int Np = padded_sizes.x;
+  // Compute modified tensor_idx by inverting the CPU function
   const int N = original_sizes.w;
   const int C = original_sizes.z;
   const int H = original_sizes.y;
   const int W = original_sizes.x;
+  const int Y = sizes.y;
+
+  const ivec4 p1 = p0 / W;
+  const ivec4 p2 = p1 / H;
 
-  // Undo step 3 permute: (4,3,1,9) -> (3,4,1,9)
-  const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (C * H * W));
+  const ivec4 n = (p2 % Y) * 4 + (p2 / Y);
+  const ivec4 h = p1 % H;
+  const ivec4 w = p0 % W;
 
-  // Undo step 1 pad: (12,1,3,3) -> (11,1,3,3)
-  // For values in the padded region, write zero instead of buffer data.
-  const ivec4 n = p1 / (C * H * W);
-  const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N)));
+  // Map modified tensor_idx to modified buffer_i
+  // Zero out if modified tensor idx is out of bounds
+  const ivec4 buf_i = n * C*H*W + h * W + w;
+  const bvec4 mask = bvec4(lessThan(n, ivec4(N)));
 
   VEC4_T texel = VEC4_T(0);
-  if (mask.x == 0) {
-    texel.x = SCALAR_T(buffer_in[p1.x]);
+  if (mask.x) {
+    texel.x = SCALAR_T(buffer_in[buf_i.x]);
   }
-  if (mask.y == 0) {
-    texel.y = SCALAR_T(buffer_in[p1.y]);
+  if (mask.y) {
+    texel.y = SCALAR_T(buffer_in[buf_i.y]);
   }
-  if (mask.z == 0) {
-    texel.z = SCALAR_T(buffer_in[p1.z]);
+  if (mask.z) {
+    texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w == 0) {
-    texel.w = SCALAR_T(buffer_in[p1.w]);
+  if (mask.w) {
+    texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
 
   imageStore(image_out, pos.xy, texel);
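To see why the inversion works, here is a minimal standalone PyTorch sketch (illustrative code with assumed names such as unpack_index, not taken from either repo). It replays the four packing steps that the deleted comment described for the {11,1,3,3} example and checks that the shader's division/modulo chain recovers every element's source index:

# Hypothetical check, not repo code: depthwise weight prepacking and its inverse.
import torch

N, C, H, W = 11, 1, 3, 3   # depthwise weights have a single channel per filter
Y = (N + 3) // 4           # batch groups after padding N up to a multiple of 4

x = torch.arange(N * C * H * W, dtype=torch.float32).reshape(N, C, H, W)

# Forward CPU packing (steps 1-4 of the deleted comment):
padded = torch.nn.functional.pad(x, (0, 0, 0, 0, 0, 0, 0, Y * 4 - N))
packed = padded.reshape(Y, 4, C, H * W).permute(1, 0, 2, 3).reshape(4, Y, C * H * W)

# The shader's inverse mapping, scalarized (None marks the padded region):
def unpack_index(p0):
    p1 = p0 // W
    p2 = p1 // H
    n, h, w = (p2 % Y) * 4 + (p2 // Y), p1 % H, p0 % W
    return None if n >= N else n * C * H * W + h * W + w

flat_in, flat_out = x.flatten(), packed.flatten()
for p0 in range(flat_out.numel()):
    i = unpack_index(p0)
    assert flat_out[p0] == (0.0 if i is None else flat_in[i])

Note how the new shader derives everything from sizes and original_sizes alone, which is why the PaddedSizes uniform could be dropped.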

backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl

Lines changed: 29 additions & 74 deletions
@@ -26,63 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
   BUF_T buffer_in[];
 };
 
-// Corresponds to {1,4,9,24} in the example below.
 layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-// Corresponds to {3,3,7,10} in the example below.
 layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
   ivec4 original_sizes;
 };
 
-// Corresponds to {8,12} in the example below.
-layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
-  ivec2 padded_sizes;
-};
-
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;
 
 /*
  * Computes special prepacking for a 2D convolution. Each shader invocation
- * calculates the input buffer location to read into the desired texel. This
- * packing was originally developed on CPU and that approach is described in the
- * rest of this comment. Refer to the code-level comments for how we translate
- * it to GPU by reversing the steps.
- *
- * Consider an example weight tensor of size {10,7,3,3}. The following
- * transformations will be applied.
- *
- * 1. Pad the N and C dims so that both are a multiple of 4. In this case, 2
- *    batches and 1 channel of padding are added, producing a tensor of size
- *    {12,8,3,3}.
- *    at::pad(x, {0,0,0,0,0,1,0,2}, "constant", 0);
- *
- * 2. Split the tensor along the C dim so that each split has 4 channels.
- *    x.reshape({12,2,4,3,3});
- *
- * 3. For each split, "fold" the C dim into the W dim. Suppose the first rows
- *    at H=0 of the split have values
- *    0,1,2 | 10,11,12 | 20,21,22 | 30,31,32
- *
- *    where | denotes a channel boundary. Then, the goal is to combine those
- *    rows into one row with the values
- *    0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32
- *
- *    x.permute({0,1,3,4,2}).reshape({12,2,3,12});
- *
- * 4. Stack the splits belonging to the same batch horizontally by swapping the
- *    C and H dims.
- *    x.permute({0,2,1,3}).reshape({12,3,24});
- *
- * 5. Repeat a similar process to "fold" the N dim into the C dim. Split along
- *    the N dim so that each split has 4 batches.
- *    x.reshape({3,4,3,24});
- *
- * 6. Stack the batches on each other vertically by swapping the N and C dims.
- *    x.permute({1,0,2,3}).reshape({4,9,24});
+ * calculates the input buffer locations to read into the desired texel. This
+ * packing was originally developed on CPU here:
+ * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -92,49 +52,44 @@ void main() {
     return;
   }
 
-  // As in usual staging shaders, map from GPU texel position to normal CPU
-  // buffer indices: (24,9) -> (4,9,24)
+  // Map tensor_idx to normal buffer_i
   const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);
 
-  // Re-map the normal CPU buffer indices to special indices, through a series
-  // of mappings: reshape is a no-op to the underlying indices, so we only map
-  // for pad and permute.
-  const int Np = padded_sizes.y;
-  const int Cp = padded_sizes.x;
+  // Compute modified tensor_idx by inverting the CPU function
   const int N = original_sizes.w;
   const int C = original_sizes.z;
   const int H = original_sizes.y;
   const int W = original_sizes.x;
+  const int J = sizes.x / (4*W);
+  const int K = sizes.y / H;
+
+  const ivec4 p1 = p0 / 4;
+  const ivec4 p2 = p1 / W;
+  const ivec4 p3 = p2 / J;
+  const ivec4 p4 = p3 / H;
+
+  const ivec4 n = (p4 % K) * 4 + (p4 / K);
+  const ivec4 c = (p2 % J) * 4 + (p0 % 4);
+  const ivec4 h = p3 % H;
+  const ivec4 w = p1 % W;
 
-  // Undo step 6 permute: (4,3,3,24) -> (3,4,3,24)
-  // Undo step 4 permute: (12,3,2,12) -> (12,2,3,12)
-  // Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w)
-  // Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w)
-  const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (H * Cp * W));
-  const ivec4 p2 = swap_adj_dims(p1, H, (Cp / 4), (W * 4));
-  const ivec4 p3 = swap_adj_dims(p2, W, 4, 1);
-  const ivec4 p4 = swap_adj_dims(p3, H, 4, W);
-
-  // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
-  // For values in the padded region, write zero instead of buffer data.
-  const ivec4 c = p4 % (Cp * H * W) / (H * W);
-  const ivec4 n = p4 / (Cp * H * W);
-  const ivec4 p5 = p4 - n * (Cp - C) * H * W;
-  const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
-      ivec4(greaterThanEqual(n, ivec4(N)));
+  // Map modified tensor_idx to modified buffer_i
+  // Zero out if modified tensor idx is out of bounds
+  const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
+  const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));
 
   VEC4_T texel = VEC4_T(0);
-  if (mask.x == 0) {
-    texel.x = SCALAR_T(buffer_in[p5.x]);
+  if (mask.x) {
+    texel.x = SCALAR_T(buffer_in[buf_i.x]);
   }
-  if (mask.y == 0) {
-    texel.y = SCALAR_T(buffer_in[p5.y]);
+  if (mask.y) {
+    texel.y = SCALAR_T(buffer_in[buf_i.y]);
   }
-  if (mask.z == 0) {
-    texel.z = SCALAR_T(buffer_in[p5.z]);
+  if (mask.z) {
+    texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w == 0) {
-    texel.w = SCALAR_T(buffer_in[p5.w]);
+  if (mask.w) {
+    texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
 
   imageStore(image_out, pos.xy, texel);
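The full conv2d packing folds both C and N, so the inverse needs two group counts: J = Cp/4 (recovered in the shader as sizes.x / (4*W)) and K = Np/4 (sizes.y / H). A similar hedged PyTorch sketch (assumed standalone code, not from the repo) replays the deleted comment's six steps on the {10,7,3,3} example and verifies the shader's chain of divisions:

# Hypothetical check, not repo code: full conv2d weight prepacking and inverse.
import torch

N, C, H, W = 10, 7, 3, 3
K, J = (N + 3) // 4, (C + 3) // 4          # batch groups and channel groups
Np, Cp = 4 * K, 4 * J                      # padded N and C

x = torch.arange(N * C * H * W, dtype=torch.float32).reshape(N, C, H, W)

# Steps 1-6 from the deleted comment:
p = torch.nn.functional.pad(x, (0, 0, 0, 0, 0, Cp - C, 0, Np - N))
p = (p.reshape(Np, J, 4, H, W)             # 2. split C into (J, 4)
       .permute(0, 1, 3, 4, 2)             # 3. fold the 4-channel group into W
       .reshape(Np, J, H, 4 * W)
       .permute(0, 2, 1, 3)                # 4. stack channel splits horizontally
       .reshape(Np, H, J * 4 * W)
       .reshape(K, 4, H, J * 4 * W)        # 5. split N into (K, 4)
       .permute(1, 0, 2, 3)                # 6. stack batch splits vertically
       .reshape(4, K * H, J * 4 * W))

# The shader's inverse, scalarized:
def unpack_index(p0):
    p1, p2 = p0 // 4, p0 // 4 // W
    p3, p4 = p2 // J, p2 // J // H
    n = (p4 % K) * 4 + (p4 // K)
    c = (p2 % J) * 4 + (p0 % 4)
    h, w = p3 % H, p1 % W
    if n >= N or c >= C:                   # padded region -> zero
        return None
    return ((n * C + c) * H + h) * W + w

flat_in, flat_out = x.flatten(), p.flatten()
for p0 in range(flat_out.numel()):
    i = unpack_index(p0)
    assert flat_out[p0] == (0.0 if i is None else flat_in[i])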

backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl

Lines changed: 28 additions & 56 deletions
@@ -26,35 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
   BUF_T buffer_in[];
 };
 
-// Corresponds to {1,4,6,36} in the example below.
 layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
   ivec4 sizes;
 };
 
-// Corresponds to {3,3,7,10} in the example below.
 layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
   ivec4 original_sizes;
 };
 
-// Corresponds to {8,12} in the example below.
-layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
-  ivec2 padded_sizes;
-};
-
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 layout(constant_id = 3) const int packed_dim = C_DIM;
 
 /*
  * Computes special prepacking for a 2D transpose convolution. Each shader
- * invocation calculates the input buffer location to read into the desired
- * texel.
- *
- * For details, refer to conv2d_prepack_weights.glsl which uses a similar
- * approach. For transpose, there are slight differences to reflect the data
- * access pattern in the shader. First, the weight tensor is flipped along the H
- * and W dims. Second, steps 3 and 4 are slightly different so that the splits
- * are interleaved.
+ * invocation calculates the input buffer locations to read into the desired
+ * texel. This packing was originally developed on CPU here:
+ * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
  */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -64,59 +52,43 @@ void main() {
     return;
  }
 
-  // As in usual staging shaders, map from GPU texel position to normal CPU
-  // buffer indices: (36,6) -> (4,6,36)
+  // Map tensor_idx to normal buffer_i
   const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);
 
-  // Re-map the normal CPU buffer indices to special indices, through a series
-  // of mappings: reshape is a no-op to the underlying indices, so we only map
-  // for flip, pad, and permute.
-  const int Np = padded_sizes.y;
-  const int Cp = padded_sizes.x;
+  // Compute modified tensor_idx by inverting the CPU function
   const int N = original_sizes.w;
   const int C = original_sizes.z;
   const int H = original_sizes.y;
   const int W = original_sizes.x;
+  const int J = sizes.y / H;
+  const int K = sizes.x / (4*W);
+
+  const ivec4 p1 = p0 / (4*K);
+  const ivec4 p2 = p1 / W;
+  const ivec4 p3 = p2 / H;
+
+  const ivec4 n = p0 % (4*K);
+  const ivec4 c = (p3 % J) * 4 + (p3 / J);
+  const ivec4 h = H-1 - p2 % H;
+  const ivec4 w = W-1 - p1 % W;
 
-  // Undo step 6 permute: (4,2,3,36) -> (2,4,3,36)
-  // In the following comments, a=b=c=3.
-  // Undo step 3 permute, part 1: (8,a,b,c,4) -> (8,a,c,b,4)
-  // Undo step 3 permute, part 2: (8,a,c,b,4) -> (8,c,a,b,4)
-  // Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b)
-  // Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b)
-  const ivec4 p1 = swap_adj_dims(p0, 4, (Cp / 4), (H * Np * W));
-  const ivec4 p2 = swap_adj_dims(p1, W, (Np / 4), 4);
-  const ivec4 p3 = swap_adj_dims(p2, H, (Np / 4), (W * 4));
-  const ivec4 p4 = swap_adj_dims(p3, W, 4, 1);
-  const ivec4 p5 = swap_adj_dims(p4, H, 4, W);
-
-  // Undo step 0 permute: (8,12,3,3) -> (12,8,3,3)
-  const ivec4 p6 = swap_adj_dims(p5, Cp, Np, (W * H));
-  // Undo step 0 flip: (2,3)
-  const ivec4 w = p6 % W;
-  const ivec4 h = p6 % (H * W) / W;
-  const ivec4 p7 = p6 + W - 1 - 2 * w + W * (H - 1 - 2 * h);
-
-  // Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
-  // For values in the padded region, write zero instead of buffer data.
-  const ivec4 c = p7 % (Cp * H * W) / (H * W);
-  const ivec4 n = p7 / (Cp * H * W);
-  const ivec4 p8 = p7 - n * (Cp - C) * H * W;
-  const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
-      ivec4(greaterThanEqual(n, ivec4(N)));
+  // Map modified tensor_idx to modified buffer_i
+  // Zero out if modified tensor idx is out of bounds
+  const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
+  const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));
 
   VEC4_T texel = VEC4_T(0);
-  if (mask.x == 0) {
-    texel.x = SCALAR_T(buffer_in[p8.x]);
+  if (mask.x) {
+    texel.x = SCALAR_T(buffer_in[buf_i.x]);
   }
-  if (mask.y == 0) {
-    texel.y = SCALAR_T(buffer_in[p8.y]);
+  if (mask.y) {
+    texel.y = SCALAR_T(buffer_in[buf_i.y]);
   }
-  if (mask.z == 0) {
-    texel.z = SCALAR_T(buffer_in[p8.z]);
+  if (mask.z) {
+    texel.z = SCALAR_T(buffer_in[buf_i.z]);
   }
-  if (mask.w == 0) {
-    texel.w = SCALAR_T(buffer_in[p8.w]);
+  if (mask.w) {
+    texel.w = SCALAR_T(buffer_in[buf_i.w]);
   }
 
   imageStore(image_out, pos.xy, texel);
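For transpose convolution the CPU packing additionally swaps N with C, flips H and W, and interleaves the batch splits (per the deleted comments), so the shader reads n from the fastest-varying position (p0 % (4*K)) and mirrors h and w. Rather than replay those steps, this small pure-Python sketch (sizes assumed from the deleted comments, not repo code) checks the property the shader depends on: its inverse mapping hits every element of the original {10,7,3,3} tensor exactly once, and every other packed position is masked to zero:

# Hypothetical check, not repo code: the transpose-conv inverse mapping is a
# bijection onto the original tensor; all remaining packed positions are masked.
N, C, H, W = 10, 7, 3, 3
J, K = (C + 3) // 4, (N + 3) // 4   # channel groups and batch groups
rows, cols = J * H, K * 4 * W       # packed layout is {4, rows, cols}

seen = set()
for p0 in range(4 * rows * cols):
    p1 = p0 // (4 * K)
    p2 = p1 // W
    p3 = p2 // H
    n = p0 % (4 * K)                # batches are interleaved, not grouped
    c = (p3 % J) * 4 + (p3 // J)
    h = H - 1 - p2 % H              # undo the H flip
    w = W - 1 - p1 % W              # undo the W flip
    if n < N and c < C:             # inside the unpadded region
        seen.add((n, c, h, w))

assert len(seen) == N * C * H * W   # every source element read exactly once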

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 0 additions & 15 deletions
@@ -154,18 +154,3 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
   pos.w = idx[packed_dim] % 4;
   return pos;
 }
-
-//
-// Miscellaneous Utility Functions and Macros
-//
-
-// Given a buffer(1-D) index cur, compute a new index where the corresponding
-// tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane
-// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a
-// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and
-// plane=2*24=48.
-#define swap_adj_dims(cur, x, y, plane) \
-  cur + \
-    plane * \
-      ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
-        (x - 1) * ((cur % (y * plane)) / plane))
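The deleted swap_adj_dims macro was the workhorse of the old approach: given a flat buffer index cur, it returned the flat index of the same element after swapping two adjacent dims of sizes x and y, where plane is the product of all dims to their right. A quick Python transcription (assumed helper name matches the macro; not repo code) checks the formula against an actual permute:

# Hypothetical check, not repo code: the removed macro as a Python function.
import torch

def swap_adj_dims(cur, x, y, plane):
    return cur + plane * (
        (1 - y) * ((cur % (x * y * plane)) // (y * plane))
        + (x - 1) * ((cur % (y * plane)) // plane)
    )

# The macro's own example: swap dims 0,1 of {4,3,2,24} -> {3,4,2,24},
# so x=4, y=3, plane=2*24=48.
t = torch.arange(4 * 3 * 2 * 24)
swapped = t.reshape(4, 3, 2, 24).permute(1, 0, 2, 3).flatten()
for cur in range(t.numel()):
    assert swapped[swap_adj_dims(cur, 4, 3, 2 * 24)] == t[cur]

The depthwise, conv2d, and transpose shaders previously chained one, four, and six of these calls per texel element, respectively; the new shaders replace the chains with a handful of divisions and mods, so the macro is no longer needed.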
