[ET-VK] Simplify conv2d weight prepacking (>2x pipeline-creation speedup) #3368

@@ -26,49 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
BUF_T buffer_in[];
};

// Corresponds to {1,4,3,9} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
ivec4 sizes;
};

// Corresponds to {3,3,1,11} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
ivec4 original_sizes;
};

// Corresponds to {1,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
ivec2 padded_sizes;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Computes special prepacking for a depthwise convolution. Each shader invocation
* calculates the input buffer location to read into the desired texel. This
* packing was originally developed on CPU and that approach is described in the
* rest of this comment. Refer to the code-level comments for how we translate
* it to GPU by reversing the steps.
*
* Consider an example weight tensor of size {11,1,3,3}. The following
* transformations will be applied.
*
* 1. Pad the N dim so that it is a multiple of 4. In this case, 1
* batch of padding is added, producing a tensor of size {12,1,3,3}.
* at::pad(x, {0,0,0,0,0,0,0,1}, "constant", 0);
*
* 2. Flatten the last two dims by reshaping the tensor:
* x.reshape({12,1,9});
*
* 3. "Fold" the N dim into the C dim. Split the tensor along the N dim so that
* each split has 4 channels.
* x.reshape({3,4,1,9});
*
* 4. Stack the batches on each other vertically by permuting the N and C dims
* and reshaping the tensor.
* x.permute({1,0,2,3}).reshape({4,3,9});
* packing was originally developed on CPU here:
* https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L58-L118
*/
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -78,39 +52,40 @@ void main() {
return;
}

// As in usual staging shaders, map from GPU texel position to normal CPU
// buffer indices: (9,3) -> (4,3,9)
// Map tensor_idx to normal buffer_i
const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);

// Re-map the normal CPU buffer indices to special indices, through a series
// of mappings: reshape is a no-op to the underlying indices, so we only map
// for pad and permute.
const int Np = padded_sizes.x;
// Compute modified tensor_idx by inverting the CPU function
const int N = original_sizes.w;
const int C = original_sizes.z;
const int H = original_sizes.y;
const int W = original_sizes.x;
const int Y = sizes.y;

const ivec4 p1 = p0 / W;
const ivec4 p2 = p1 / H;

// Undo step 3 permute: (4,3,1,9) -> (3,4,1,9)
const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (C * H * W));
const ivec4 n = (p2 % Y) * 4 + (p2 / Y);
const ivec4 h = p1 % H;
const ivec4 w = p0 % W;

// Undo step 1 pad: (12,1,3,3) -> (11,1,3,3)
// For values in the padded region, write zero instead of buffer data.
const ivec4 n = p1 / (C * H * W);
const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N)));
// Map modified tensor_idx to modified buffer_i
// Zero out if modified tensor idx is out of bounds
const ivec4 buf_i = n * C*H*W + h * W + w;
const bvec4 mask = bvec4(lessThan(n, ivec4(N)));

VEC4_T texel = VEC4_T(0);
if (mask.x == 0) {
texel.x = SCALAR_T(buffer_in[p1.x]);
if (mask.x) {
texel.x = SCALAR_T(buffer_in[buf_i.x]);
}
if (mask.y == 0) {
texel.y = SCALAR_T(buffer_in[p1.y]);
if (mask.y) {
texel.y = SCALAR_T(buffer_in[buf_i.y]);
}
if (mask.z == 0) {
texel.z = SCALAR_T(buffer_in[p1.z]);
if (mask.z) {
texel.z = SCALAR_T(buffer_in[buf_i.z]);
}
if (mask.w == 0) {
texel.w = SCALAR_T(buffer_in[p1.w]);
if (mask.w) {
texel.w = SCALAR_T(buffer_in[buf_i.w]);
}

imageStore(image_out, pos.xy, texel);
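
To make the new index math concrete, below is a minimal, self-contained Python sketch (not part of this PR) that reproduces the removed CPU-style steps for the {11,1,3,3} depthwise example and checks them against the direct inversion the shader now performs. The helper names (cpu_prepack_dw, invert_index_dw) are illustrative only.

N, C, H, W = 11, 1, 3, 3       # original weight sizes (NCHW)
Y = (N + 3) // 4               # number of N-groups of 4 -> 3

def cpu_prepack_dw(weight):
    # Old CPU-style steps: pad N to a multiple of 4, flatten H/W,
    # fold N into C, and stack the groups -> shape (4, Y, H*W).
    packed = [[[0.0] * (H * W) for _ in range(Y)] for _ in range(4)]
    for n in range(N):
        g, cc = n // 4, n % 4  # N-group index / slot within the group of 4
        for h in range(H):
            for w in range(W):
                packed[cc][g][h * W + w] = weight[n][0][h][w]
    return packed

def invert_index_dw(p0):
    # Mirror of the new shader math: flat index into the packed (4, Y, H*W)
    # buffer -> flat index into the original (N, C, H, W) buffer,
    # or None for elements that fall in the zero padding.
    p1 = p0 // W
    p2 = p1 // H
    n = (p2 % Y) * 4 + (p2 // Y)
    h = p1 % H
    w = p0 % W
    if n >= N:
        return None
    return n * C * H * W + h * W + w

weight = [[[[n * 100 + h * 10 + w + 1.0 for w in range(W)]
            for h in range(H)] for _ in range(C)] for n in range(N)]
flat = [weight[n][0][h][w] for n in range(N) for h in range(H) for w in range(W)]
packed = cpu_prepack_dw(weight)
for cc in range(4):
    for g in range(Y):
        for x in range(H * W):
            p0 = cc * (Y * H * W) + g * (H * W) + x
            buf_i = invert_index_dw(p0)
            assert packed[cc][g][x] == (0.0 if buf_i is None else flat[buf_i])
print("depthwise prepack: direct index inversion matches the CPU reference")
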
103 changes: 29 additions & 74 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
@@ -26,63 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
BUF_T buffer_in[];
};

// Corresponds to {1,4,9,24} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
ivec4 sizes;
};

// Corresponds to {3,3,7,10} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
ivec4 original_sizes;
};

// Corresponds to {8,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
ivec2 padded_sizes;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Computes special prepacking for a 2D convolution. Each shader invocation
* calculates the input buffer location to read into the desired texel. This
* packing was originally developed on CPU and that approach is described in the
* rest of this comment. Refer to the code-level comments for how we translate
* it to GPU by reversing the steps.
*
* Consider an example weight tensor of size {10,7,3,3}. The following
* transformations will be applied.
*
* 1. Pad the N and C dims so that both are a multiple of 4. In this case, 2
* batches and 1 channel of padding are added, producing a tensor of size
* {12,8,3,3}.
* at::pad(x, {0,0,0,0,0,1,0,2}, "constant", 0);
*
* 2. Split the tensor along the C dim so that each split has 4 channels.
* x.reshape({12,2,4,3,3});
*
* 3. For each split, "fold" the C dim into the W dim. Suppose the first rows
* at H=0 of the split have values
* 0,1,2 | 10,11,12 | 20,21,22 | 30,31,32
*
* where | denotes a channel boundary. Then, the goal is to combine those rows
* into one row with the values
* 0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32
*
* x.permute({0,1,3,4,2}).reshape({12,2,3,12});
*
* 4. Stack the splits belonging to the same batch horizontally by swapping the
* C and H dims.
* x.permute({0,2,1,3}).reshape({12,3,24});
*
* 5. Repeat a similar process to "fold" the N dim into the C dim. Split along
* the N dim so that each split has 4 batches.
* x.reshape({3,4,3,24});
*
* 6. Stack the batches on each other vertically by swapping the N and C dims.
* x.permute({1,0,2,3}).reshape({4,9,24});
* calculates the input buffer locations to read into the desired texel. This
* packing was originally developed on CPU here:
* https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
*/
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -92,49 +52,44 @@ void main() {
return;
}

// As in usual staging shaders, map from GPU texel position to normal CPU
// buffer indices: (24,9) -> (4,9,24)
// Map tensor_idx to normal buffer_i
const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);

// Re-map the normal CPU buffer indices to special indices, through a series
// of mappings: reshape is a no-op to the underlying indices, so we only map
// for pad and permute.
const int Np = padded_sizes.y;
const int Cp = padded_sizes.x;
// Compute modified tensor_idx by inverting the CPU function
const int N = original_sizes.w;
const int C = original_sizes.z;
const int H = original_sizes.y;
const int W = original_sizes.x;
const int J = sizes.x / (4*W);
const int K = sizes.y / H;

const ivec4 p1 = p0 / 4;
const ivec4 p2 = p1 / W;
const ivec4 p3 = p2 / J;
const ivec4 p4 = p3 / H;

const ivec4 n = (p4 % K) * 4 + (p4 / K);
const ivec4 c = (p2 % J) * 4 + (p0 % 4);
const ivec4 h = p3 % H;
const ivec4 w = p1 % W;

// Undo step 6 permute: (4,3,3,24) -> (3,4,3,24)
// Undo step 4 permute: (12,3,2,12) -> (12,2,3,12)
// Undo step 3 permute, part 1: (12,2,3h,3w,4) -> (12,2,3h,4,3w)
// Undo step 3 permute, part 2: (12,2,3h,4,3w) -> (12,2,4,3h,3w)
const ivec4 p1 = swap_adj_dims(p0, 4, (Np / 4), (H * Cp * W));
const ivec4 p2 = swap_adj_dims(p1, H, (Cp / 4), (W * 4));
const ivec4 p3 = swap_adj_dims(p2, W, 4, 1);
const ivec4 p4 = swap_adj_dims(p3, H, 4, W);

// Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
// For values in the padded region, write zero instead of buffer data.
const ivec4 c = p4 % (Cp * H * W) / (H * W);
const ivec4 n = p4 / (Cp * H * W);
const ivec4 p5 = p4 - n * (Cp - C) * H * W;
const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
ivec4(greaterThanEqual(n, ivec4(N)));
// Map modified tensor_idx to modified buffer_i
// Zero out if modified tensor idx is out of bounds
const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));

VEC4_T texel = VEC4_T(0);
if (mask.x == 0) {
texel.x = SCALAR_T(buffer_in[p5.x]);
if (mask.x) {
texel.x = SCALAR_T(buffer_in[buf_i.x]);
}
if (mask.y == 0) {
texel.y = SCALAR_T(buffer_in[p5.y]);
if (mask.y) {
texel.y = SCALAR_T(buffer_in[buf_i.y]);
}
if (mask.z == 0) {
texel.z = SCALAR_T(buffer_in[p5.z]);
if (mask.z) {
texel.z = SCALAR_T(buffer_in[buf_i.z]);
}
if (mask.w == 0) {
texel.w = SCALAR_T(buffer_in[p5.w]);
if (mask.w) {
texel.w = SCALAR_T(buffer_in[buf_i.w]);
}

imageStore(image_out, pos.xy, texel);
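
A similar hedged sketch for the general conv2d prepack above, assuming the {10,7,3,3} example from the removed comment; J and K mirror the shader's channel-group and batch-group counts, and the helper names are again illustrative rather than taken from the codebase.

N, C, H, W = 10, 7, 3, 3        # original weight sizes (NCHW)
K = (N + 3) // 4                # N-groups of 4 -> 3
J = (C + 3) // 4                # C-groups of 4 -> 2

def cpu_prepack(weight):
    # Old CPU-style steps: pad N and C to multiples of 4, fold C into W,
    # stack C-groups horizontally and N-groups vertically.
    # Result shape: (4, K*H, J*W*4).
    packed = [[[0.0] * (J * W * 4) for _ in range(K * H)] for _ in range(4)]
    for n in range(N):
        g, nn = n // 4, n % 4           # N-group / slot within the group
        for c in range(C):
            j, cc = c // 4, c % 4       # C-group / slot within the group
            for h in range(H):
                for w in range(W):
                    packed[nn][g * H + h][j * W * 4 + w * 4 + cc] = weight[n][c][h][w]
    return packed

def invert_index(p0):
    # Mirror of the new shader math in conv2d_prepack_weights.glsl.
    p1 = p0 // 4
    p2 = p1 // W
    p3 = p2 // J
    p4 = p3 // H
    n = (p4 % K) * 4 + (p4 // K)
    c = (p2 % J) * 4 + (p0 % 4)
    h = p3 % H
    w = p1 % W
    if n >= N or c >= C:
        return None
    return n * C * H * W + c * H * W + h * W + w

weight = [[[[((n * C + c) * H + h) * W + w + 1.0 for w in range(W)]
            for h in range(H)] for c in range(C)] for n in range(N)]
flat = [weight[n][c][h][w] for n in range(N) for c in range(C)
        for h in range(H) for w in range(W)]
packed = cpu_prepack(weight)
rows, cols = K * H, J * W * 4
for nn in range(4):
    for r in range(rows):
        for s in range(cols):
            p0 = nn * (rows * cols) + r * cols + s
            buf_i = invert_index(p0)
            assert packed[nn][r][s] == (0.0 if buf_i is None else flat[buf_i])
print("conv2d prepack: direct index inversion matches the CPU reference")
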
@@ -26,35 +26,23 @@ layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
BUF_T buffer_in[];
};

// Corresponds to {1,4,6,36} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
ivec4 sizes;
};

// Corresponds to {3,3,7,10} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
ivec4 original_sizes;
};

// Corresponds to {8,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
ivec2 padded_sizes;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
* Computes special prepacking for a 2D transpose convolution. Each shader
* invocation calculates the input buffer location to read into the desired
* texel.
*
* For details, refer to conv2d_prepack_weights.glsl which uses a similar
* approach. For transpose, there are slight differences to reflect the data
* access pattern in the shader. First, the weight tensor is flipped along the H
* and W dims. Second, steps 3 and 4 are slightly different so that the splits
* are interleaved.
* invocation calculates the input buffer locations to read into the desired
* texel. This packing was originally developed on CPU here:
* https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211
*/
void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
@@ -64,59 +52,43 @@ void main() {
return;
}

// As in usual staging shaders, map from GPU texel position to normal CPU
// buffer indices: (36,6) -> (4,6,36)
// Map tensor_idx to normal buffer_i
const ivec4 p0 = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim);

// Re-map the normal CPU buffer indices to special indices, through a series
// of mappings: reshape is a no-op to the underlying indices, so we only map
// for flip, pad, and permute.
const int Np = padded_sizes.y;
const int Cp = padded_sizes.x;
// Compute modified tensor_idx by inverting the CPU function
const int N = original_sizes.w;
const int C = original_sizes.z;
const int H = original_sizes.y;
const int W = original_sizes.x;
const int J = sizes.y / H;
const int K = sizes.x / (4*W);

const ivec4 p1 = p0 / (4*K);
const ivec4 p2 = p1 / W;
const ivec4 p3 = p2 / H;

const ivec4 n = p0 % (4*K);
const ivec4 c = (p3 % J) * 4 + (p3 / J);
const ivec4 h = H-1 - p2 % H;
const ivec4 w = W-1 - p1 % W;

// Undo step 6 permute: (4,2,3,36) -> (2,4,3,36)
// In the following comments, a=b=c=3.
// Undo step 3 permute, part 1: (8,a,b,c,4) -> (8,a,c,b,4)
// Undo step 3 permute, part 2: (8,a,c,b,4) -> (8,c,a,b,4)
// Undo step 3 permute, part 3: (8,c,a,b,4) -> (8,c,a,4,b)
// Undo step 3 permute, part 4: (8,c,a,4,b) -> (8,c,4,a,b)
const ivec4 p1 = swap_adj_dims(p0, 4, (Cp / 4), (H * Np * W));
const ivec4 p2 = swap_adj_dims(p1, W, (Np / 4), 4);
const ivec4 p3 = swap_adj_dims(p2, H, (Np / 4), (W * 4));
const ivec4 p4 = swap_adj_dims(p3, W, 4, 1);
const ivec4 p5 = swap_adj_dims(p4, H, 4, W);

// Undo step 0 permute: (8,12,3,3) -> (12,8,3,3)
const ivec4 p6 = swap_adj_dims(p5, Cp, Np, (W * H));
// Undo step 0 flip: (2,3)
const ivec4 w = p6 % W;
const ivec4 h = p6 % (H * W) / W;
const ivec4 p7 = p6 + W - 1 - 2 * w + W * (H - 1 - 2 * h);

// Undo step 1 pad: (12,8,3,3) -> (10,7,3,3)
// For values in the padded region, write zero instead of buffer data.
const ivec4 c = p7 % (Cp * H * W) / (H * W);
const ivec4 n = p7 / (Cp * H * W);
const ivec4 p8 = p7 - n * (Cp - C) * H * W;
const ivec4 mask = ivec4(greaterThanEqual(c, ivec4(C))) |
ivec4(greaterThanEqual(n, ivec4(N)));
// Map modified tensor_idx to modified buffer_i
// Zero out if modified tensor idx is out of bounds
const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w;
const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C))));

VEC4_T texel = VEC4_T(0);
if (mask.x == 0) {
texel.x = SCALAR_T(buffer_in[p8.x]);
if (mask.x) {
texel.x = SCALAR_T(buffer_in[buf_i.x]);
}
if (mask.y == 0) {
texel.y = SCALAR_T(buffer_in[p8.y]);
if (mask.y) {
texel.y = SCALAR_T(buffer_in[buf_i.y]);
}
if (mask.z == 0) {
texel.z = SCALAR_T(buffer_in[p8.z]);
if (mask.z) {
texel.z = SCALAR_T(buffer_in[buf_i.z]);
}
if (mask.w == 0) {
texel.w = SCALAR_T(buffer_in[p8.w]);
if (mask.w) {
texel.w = SCALAR_T(buffer_in[buf_i.w]);
}

imageStore(image_out, pos.xy, texel);
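
The transpose-conv shader can be checked the same way, but since the original CPU steps are only summarized above, the forward layout in pack_position below (channel slot in the texel lane, flipped H and W, input dim interleaved in the innermost axis) is derived by inverting the shader's arithmetic; treat it as an assumption for illustration, using the {3,3,7,10} / {1,4,6,36} example sizes from the removed uniform comments.

N, C, H, W = 10, 7, 3, 3
K = (N + 3) // 4                 # N-groups of 4  -> sizes.x / (4*W) = 3
J = (C + 3) // 4                 # C-groups of 4  -> sizes.y / H     = 2

def pack_position(n, c, h, w):
    # (n, c, h, w) in the original tensor -> (lane, row, col) in the packed
    # (4, J*H, 4*K*W) buffer, per the layout implied by the shader's inverse.
    lane = c % 4
    row = (c // 4) * H + (H - 1 - h)          # H is flipped
    col = (W - 1 - w) * (4 * K) + n           # W is flipped, N interleaved
    return lane, row, col

def invert_index(p0):
    # Mirror of the new shader math for the transpose-conv prepack.
    p1 = p0 // (4 * K)
    p2 = p1 // W
    p3 = p2 // H
    n = p0 % (4 * K)
    c = (p3 % J) * 4 + (p3 // J)
    h = H - 1 - p2 % H
    w = W - 1 - p1 % W
    if n >= N or c >= C:
        return None
    return n, c, h, w

rows, cols = J * H, 4 * K * W
for n in range(N):
    for c in range(C):
        for h in range(H):
            for w in range(W):
                lane, row, col = pack_position(n, c, h, w)
                p0 = lane * (rows * cols) + row * cols + col
                assert invert_index(p0) == (n, c, h, w)
print("transpose-conv prepack: index inversion round-trips")
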
15 changes: 0 additions & 15 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -154,18 +154,3 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
pos.w = idx[packed_dim] % 4;
return pos;
}

//
// Miscellaneous Utility Functions and Macros
//

// Given a buffer(1-D) index cur, compute a new index where the corresponding
// tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane
// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a
// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and
// plane=2*24=48.
#define swap_adj_dims(cur, x, y, plane) \
cur + \
plane * \
((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
(x - 1) * ((cur % (y * plane)) / plane))
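
For reference, the deleted swap_adj_dims macro can be transcribed to Python and checked against an explicit adjacent-dimension swap for the {4,3,2,24} example in its comment; this sketch is an illustration of the removed helper, not code from the repository.

def swap_adj_dims(cur, x, y, plane):
    # Python transcription of the deleted GLSL macro: given a flat index
    # `cur` into a tensor whose adjacent dims have extents x and y (with
    # `plane` trailing elements per y-step), return the flat index of the
    # same element after swapping those two dims.
    return cur + plane * ((1 - y) * ((cur % (x * y * plane)) // (y * plane))
                          + (x - 1) * ((cur % (y * plane)) // plane))

x, y, plane = 4, 3, 2 * 24       # swap dims 0,1 of shape {4,3,2,24} -> {3,4,2,24}
for i in range(x):
    for j in range(y):
        for r in range(plane):
            cur = (i * y + j) * plane + r        # flat index in {4,3,2,24}
            new = (j * x + i) * plane + r        # flat index in {3,4,2,24}
            assert swap_adj_dims(cur, x, y, plane) == new
print("swap_adj_dims matches an explicit adjacent-dim swap")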