Skip to content

Commit c4ac14c

Browse files
jorgep31415 authored and facebook-github-bot committed
aten.convolution (Depthwise) (#2884)
Summary: Pull Request resolved: #2884 ## Summary We introduce support for the convolution cases covered by [ATen-VK's default Depthwise implementation](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L68). This is achieved by - reusing the [existing `conv2d_dw.glsl`](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl), and - [moving special weights prepacking from CPU](https://github.com/pytorch/pytorch/blob/09c72eaa3f69f90402c86a30abf4fc621298578c/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L80-L132) to the GPU in `conv2d_dw_prepack_weights.glsl`. The implementation is on-par with ATen-VK's Depthwise. This means it only covers: - `in_channels == groups`, `out_channels == groups` A full implementation would cover, for any positive integer K: - `in_channels == groups`, `out_channels == groups * K` ghstack-source-id: 221721752 exported-using-ghexport bypass-github-export-checks Reviewed By: SS-JIA Differential Revision: D55813511 fbshipit-source-id: c0726798bd36cc5ff2326836c28a5f7d23494f5e
1 parent 8a6427e commit c4ac14c

File tree

6 files changed

+335
-23
lines changed

6 files changed

+335
-23
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

// Output image, input image, prepacked weights (one flattened filter per row,
// see the comment in the inner loop below), and per-channel bias.
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;

layout(set = 0, binding = 4) uniform PRECISION restrict OutExtents {
  uvec4 data;
}
out_extents;

layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
  uvec4 data;
}
in_extents;

layout(set = 0, binding = 6) uniform PRECISION restrict Params {
  ivec2 kernel_size;
  ivec2 stride;
  ivec2 padding;
  ivec2 dilation;
}
params;

// If fields are separated, SwiftShader cannot identify in_group_size.
layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
  ivec2 overlay_region;
  int in_group_size;
}
extra_params;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * Computes a depthwise convolution. Each shader invocation calculates the
 * output at a single output location.
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // Invocations outside the output extents have no work to do.
  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
    return;
  }

  // Compute the index of the top-left element of the overlay region. Negative
  // indices indicate that the top-left element is in a region added by padding.
  const ivec2 ipos = pos.xy * params.stride - params.padding;

  // Compute the start and end of the input indices to load. Padding is assumed
  // to be constant 0 padding, so reads from the padding region are skipped.
  const ivec2 start = ipos;
  const ivec2 end = ipos + extra_params.overlay_region.xy;

  // Seed the accumulator with the bias for this output channel (pos.z).
  ${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
  // kx walks the flattened filter row left-to-right across both spatial loops.
  int kx = 0;
  for (int y = start.y; y < end.y; y += params.dilation.y) {
    for (int x = start.x; x < end.x; x += params.dilation.x) {
      // The weight kernel was rearranged so that every NxN filter is flattened
      // to fit in one row. Each filter was then stacked on top of each other
      // vertically.
      const ${VEC4_T[DTYPE]} in_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
      sum = fma(in_texel, texelFetch(kernel_in, ivec2(kx, pos.z), 0), sum);
      ++kx;
    }
  }

  imageStore(image_out, pos, sum);
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for conv2d_dw.glsl. One shader variant is generated per
# DTYPE listed under generate_variant_forall; the SUFFIX is appended to the
# variant name.
conv2d_dw:
  parameter_names_with_default_values:
    NDIM: 3
    DTYPE: float
  generate_variant_forall:
    DTYPE:
      - VALUE: half
        SUFFIX: half
      - VALUE: float
        SUFFIX: float
  shader_variants:
    - NAME: conv2d_dw
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  ${T[DTYPE]} data[];
}
buffer_in;

// Corresponds to {1,4,3,9} in the example below.
layout(set = 0, binding = 2) uniform PRECISION restrict GpuSizes {
  ivec4 data;
}
gpu_sizes;

// Corresponds to {3,3,1,11} in the example below.
layout(set = 0, binding = 3) uniform PRECISION restrict OriginalSizes {
  ivec4 data;
}
original_sizes;

// Corresponds to {1,12} in the example below.
layout(set = 0, binding = 4) uniform PRECISION restrict PaddedSizes {
  ivec2 data;
}
padded_sizes;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

/*
 * Computes special prepacking for a depthwise convolution. Each shader invocation
 * calculates the input buffer location to read into the desired texel. This
 * packing was originally developed on CPU and that approach is described in the
 * rest of this comment. Refer to the code-level comments, for how we translate
 * it to GPU by reversing the steps.
 *
 * Consider an example weight tensor of size {11,1,3,3}. The following
 * transformations will be applied.
 *
 * 1. Pad the N dim so that it is a multiple of 4. In this case, 1
 *    batch of padding is added, producing a tensor of size {12,1,3,3}.
 *    at::pad(x, {0,0,0,0,0,0,0,1}, "constant", 0);
 *
 * 2. Flatten the last two dims by reshaping the tensor:
 *    x.reshape({12,1,9});
 *
 * 3. "Fold" the N dim into the C dim. Split the tensor along the N dim so that
 *    each split has 4 channels.
 *    x.reshape({3,4,1,9});
 *
 * 4. Stack the batches on each other vertically by permuting the N and C dims
 *    and reshaping the tensor.
 *    x.permute({1,0,2,3}).reshape({4,3,9});
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec4 coord = POS_TO_COORD_CHANNELS_PACKED(pos, gpu_sizes.data);

  // Invocations outside the packed GPU tensor have no work to do.
  if (any(greaterThanEqual(coord, gpu_sizes.data))) {
    return;
  }

  // As in usual staging shaders, map from GPU texel position to normal CPU
  // buffer indices: (9,3) -> (4,3,9)
  const int base_index = COORD_TO_BUFFER_IDX(coord, gpu_sizes.data);
  const ivec4 p0 =
      base_index + ivec4(0, 1, 2, 3) * STRIDE_CHANNELS_PACKED(gpu_sizes.data);

  // Re-map the normal CPU buffer indices to special indices, through a series
  // of mappings: reshape is a no-op to the underlying indices, so we only map
  // for pad and permute.
  const int Np = padded_sizes.data.x;
  const int N = original_sizes.data.w;
  const int C = original_sizes.data.z;
  const int H = original_sizes.data.y;
  const int W = original_sizes.data.x;

  // Undo step 4 permute: (4,3,1,9) -> (3,4,1,9)
  const ivec4 p1 = SWAP_ADJ_DIMS(p0, 4, (Np / 4), (C * H * W));

  // Undo step 1 pad: (12,1,3,3) -> (11,1,3,3)
  // For values in the padded region, write zero instead of buffer data.
  // n is the (padded) batch index of each of the four buffer reads; entries
  // with n >= N fall in the padding and are masked to zero below.
  const ivec4 n = p1 / (C * H * W);
  const ivec4 mask = ivec4(greaterThanEqual(n, ivec4(N)));

  ${T[DTYPE]} val_x = mix(buffer_in.data[p1.x], 0, mask.x);
  ${T[DTYPE]} val_y = mix(buffer_in.data[p1.y], 0, mask.y);
  ${T[DTYPE]} val_z = mix(buffer_in.data[p1.z], 0, mask.z);
  ${T[DTYPE]} val_w = mix(buffer_in.data[p1.w], 0, mask.w);

  ${VEC4_T[DTYPE]} texel = ${VEC4_T[DTYPE]}(val_x, val_y, val_z, val_w);

  imageStore(image_out, pos.xy, texel);
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Codegen config for conv2d_dw_prepack_weights.glsl. One shader variant is
# generated per DTYPE listed under generate_variant_forall; the SUFFIX is
# appended to the variant name.
conv2d_dw_prepack_weights:
  parameter_names_with_default_values:
    DTYPE: float
  generate_variant_forall:
    DTYPE:
      - VALUE: half
        SUFFIX: half
      - VALUE: float
        SUFFIX: float
  shader_variants:
    - NAME: conv2d_dw_prepack_weights

0 commit comments

Comments
 (0)