Skip to content

Commit e1de9eb

Browse files
committed
[ET-VK] Bring back extents_ubo() as texture_limits_ubo()
## Context #3181 deprecated the `gpu_sizes_ubo()` and `extents_ubo()` functions of `vTensor` in order to standardize how shaders consume shape/size metadata of input tensors. However, this came at the cost of increasing the overhead required for bounds checking, which is needed to support dynamic shapes, as shaders now needed to convert the input sizes to texture limits before checking if a given texel position is out of bounds. Benchmarking revealed that this overhead can be quite significant, especially on lower-power mobile GPUs. In the interest of preserving performance, `extents_ubo()` is re-introduced since bounds checking is an operation that is common to every single shader. However, some improvements are made: * instead of `extents`, the nomenclature `texture_limits` is used in order to differentiate from physical image extents of the texture. * `texture_limits` is represented via an `ivec3` (previously `uvec4`); this means that to use it for bounds checking, there does not need to be an implicit cast from `uvec` to `ivec` and there is also no need for swizzling. Also introduced in this changeset is the convention of passing both the texture limits and tensor sizes instead of using `pos_out_of_bounds()`. Passing in the texture limits is probably cheaper than using `pos_out_of_bounds()`. There are some exceptions, though, where I chose not to migrate to this pattern to avoid passing in too many variants of tensor metadata. ### What about `gpu_sizes_ubo`? I will hold off on re-introducing `gpu_sizes_ubo` for now since converting `sizes` to `gpu_sizes` is much cheaper compared to `pos_out_of_bounds()`: ``` ivec4 sizes[packed_dim] = alignup4(sizes[packed_dim]) ``` Will perform some additional benchmarking on this to see if the overhead of the alignment warrants an explicit API for passing in GPU sizes to shaders. Differential Revision: [D56435574](https://our.internmc.facebook.com/intern/diff/D56435574/) ghstack-source-id: 223453651 Pull Request resolved: #3217
1 parent 8dc54d5 commit e1de9eb

31 files changed

+202
-135
lines changed

backends/vulkan/runtime/api/Tensor.cpp

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,10 @@ vTensor::vTensor(
139139
// Calculate sizes and strides
140140
sizes_(sizes.begin(), sizes.end()),
141141
gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
142-
// Utility Uniform Buffer that can be passed to shaders as arguments
143-
sizes_uniform_(context, api::utils::make_whcn_ivec4(sizes_)),
142+
texture_limits_{{0, 0, 0}},
143+
// Utility Uniform Buffers that can be passed to shaders as arguments
144+
sizes_uniform_(),
145+
texture_limits_uniform_(),
144146
// Construct Tensor storage
145147
storage_(
146148
context,
@@ -149,6 +151,13 @@ vTensor::vTensor(
149151
gpu_sizes_,
150152
dtype_,
151153
allocate_memory) {
154+
if (storage_type != api::kBuffer) {
155+
texture_limits_.limits = api::utils::ivec3{
156+
api::utils::safe_downcast<int32_t>(storage_.extents_.data[0]),
157+
api::utils::safe_downcast<int32_t>(storage_.extents_.data[1]),
158+
api::utils::safe_downcast<int32_t>(storage_.extents_.data[2])};
159+
}
160+
152161
if (dtype == api::kHalf) {
153162
VK_CHECK_COND(
154163
api::context()->adapter_ptr()->has_16bit_storage(),
@@ -187,6 +196,22 @@ api::VulkanBuffer& vTensor::buffer(
187196
return storage_.buffer_;
188197
}
189198

199+
const api::BufferBindInfo vTensor::sizes_ubo() {
200+
if (!sizes_uniform_.buffer()) {
201+
sizes_uniform_ = api::UniformParamsBuffer(
202+
storage_.context_, api::utils::make_whcn_ivec4(sizes_));
203+
}
204+
return api::BufferBindInfo(sizes_uniform_.buffer());
205+
}
206+
207+
const api::BufferBindInfo vTensor::texture_limits_ubo() {
208+
if (!texture_limits_uniform_.buffer()) {
209+
texture_limits_uniform_ =
210+
api::UniformParamsBuffer(storage_.context_, texture_limits_);
211+
}
212+
return api::BufferBindInfo(texture_limits_uniform_.buffer());
213+
}
214+
190215
VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
191216
switch (storage_type()) {
192217
case api::kBuffer:
@@ -224,7 +249,25 @@ void vTensor::bind_allocation(const api::MemoryAllocation& allocation) {
224249
void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
225250
sizes_ = new_sizes;
226251
gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
227-
sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
252+
253+
if (storage_type() != api::kBuffer) {
254+
// Calculate the extents of the image texture that would have been required
255+
// for a tensor of the new sizes.
256+
api::utils::uvec3 virtual_extents =
257+
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
258+
// Update the texture limits to reflect the new virtual extents.
259+
texture_limits_.limits = api::utils::ivec3{
260+
api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
261+
api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
262+
api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
263+
}
264+
265+
if (sizes_uniform_.buffer()) {
266+
sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
267+
}
268+
if (texture_limits_uniform_.buffer()) {
269+
texture_limits_uniform_.update(texture_limits_);
270+
}
228271
}
229272

230273
void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
@@ -236,6 +279,8 @@ void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
236279
}
237280

238281
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
282+
// For texture storage check that the current texture is large enough for the
283+
// new sizes of the tensor.
239284
if (storage_type() != api::kBuffer) {
240285
api::utils::uvec3 virtual_extents =
241286
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);

backends/vulkan/runtime/api/Tensor.h

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,13 @@ class vTensorStorage final {
9494
};
9595

9696
class vTensor final {
97+
struct TextureLimits {
98+
// Alignment is required to conform with Vulkan specification; a 3 or 4
99+
// component vector with components of size N must have base alignment of
100+
// 4N.
101+
alignas(16) api::utils::ivec3 limits;
102+
};
103+
97104
public:
98105
explicit vTensor(
99106
api::Context* context,
@@ -115,11 +122,18 @@ class vTensor final {
115122

116123
std::vector<int64_t> sizes_;
117124
std::vector<int64_t> gpu_sizes_;
125+
TextureLimits texture_limits_;
118126

119-
// A Vulkan uniform buffer containing the tensor sizes in WHCN that can be
120-
// passed into a shader.
127+
// A Vulkan uniform buffer containing the (W, H, C, N) tensor sizes that can
128+
// be passed into a shader.
121129
api::UniformParamsBuffer sizes_uniform_;
122130

131+
// A Vulkan uniform buffer containing the texture limits derived from the
132+
// tensor's current size information that can be passed into a shader. Note
133+
// that the texture limits may be different from the texture's extents if the
134+
// tensor has been resized with `virtual_resize()`.
135+
api::UniformParamsBuffer texture_limits_uniform_;
136+
123137
vTensorStorage storage_;
124138

125139
public:
@@ -194,11 +208,17 @@ class vTensor final {
194208

195209
/*
196210
* Get the binding information for the uniform buffer object containing the
197-
* tensor sizes to use in a compute shader.
211+
* tensor sizes to use in a compute shader. Note that the GPU buffer will be
212+
* allocated the first time this function is called.
198213
*/
199-
inline const api::BufferBindInfo sizes_ubo() {
200-
return api::BufferBindInfo(sizes_uniform_.buffer());
201-
}
214+
const api::BufferBindInfo sizes_ubo();
215+
216+
/*
217+
* Get the binding information for the uniform buffer object containing the
218+
* texture limits to use in a compute shader. Note that the GPU buffer will be
219+
* allocated the first time this function is called.
220+
*/
221+
const api::BufferBindInfo texture_limits_ubo();
202222

203223
inline size_t numel() const {
204224
return api::utils::multiply_integers(sizes());

backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2121
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
2222
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
2323

24-
layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
25-
ivec4 out_sizes;
24+
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
25+
ivec3 out_limits;
2626
};
2727

2828
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
4444

4545
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4646

47-
layout(constant_id = 3) const int packed_dim = C_DIM;
48-
4947
/*
5048
* Computes a 2D convolution. Each shader invocation calculates the output at
5149
* a single output location.
5250
*/
5351
void main() {
5452
const ivec3 pos = ivec3(gl_GlobalInvocationID);
5553

56-
if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
54+
if (any(greaterThanEqual(pos, out_limits))) {
5755
return;
5856
}
5957

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2121
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
2222
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
2323

24-
layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
25-
ivec4 out_sizes;
24+
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
25+
ivec3 out_limits;
2626
};
2727

2828
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
4444

4545
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4646

47-
layout(constant_id = 3) const int packed_dim = C_DIM;
48-
4947
/*
5048
* Computes a depthwise convolution. Each shader invocation calculates the
5149
* output at a single output location.
5250
*/
5351
void main() {
5452
const ivec3 pos = ivec3(gl_GlobalInvocationID);
5553

56-
if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
54+
if (any(greaterThanEqual(pos, out_limits))) {
5755
return;
5856
}
5957

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2121
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
2222
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
2323

24-
layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
25-
ivec4 out_sizes;
24+
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
25+
ivec3 out_limits;
2626
};
2727

2828
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,16 +44,14 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
4444

4545
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4646

47-
layout(constant_id = 3) const int packed_dim = C_DIM;
48-
4947
/*
5048
* Computes a depthwise convolution. Each shader invocation calculates the
5149
* output at a single output location.
5250
*/
5351
void main() {
5452
const ivec3 pos = ivec3(gl_GlobalInvocationID);
5553

56-
if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
54+
if (any(greaterThanEqual(pos, out_limits))) {
5755
return;
5856
}
5957

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2121
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
2222
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
2323

24-
layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
25-
ivec4 out_sizes;
24+
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
25+
ivec3 out_limits;
2626
};
2727

2828
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
@@ -44,8 +44,6 @@ layout(set = 0, binding = 7) uniform PRECISION restrict ExtraParams {
4444

4545
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4646

47-
layout(constant_id = 3) const int packed_dim = C_DIM;
48-
4947
/*
5048
* Computes a 2D pointwise convolution of an NxN output tile. Calculating an
5149
* output tile for pointwise convolution is more efficient because the kernel
@@ -71,7 +69,7 @@ void main() {
7169

7270
// If the top left position is out of bounds, then this invocation will have
7371
// no work to do.
74-
if (pos_out_of_bounds(pos[0], out_sizes, packed_dim)) {
72+
if (any(greaterThanEqual(pos[0], out_limits))) {
7573
return;
7674
}
7775

@@ -146,7 +144,7 @@ void main() {
146144
}
147145

148146
for (int i = 0; i < ${TILE_SIZE * TILE_SIZE}; ++i) {
149-
if (!pos_out_of_bounds(pos[i], out_sizes, packed_dim)) {
147+
if (all(lessThan(pos[i], out_limits))) {
150148
imageStore(image_out, pos[i], sum[i]);
151149
}
152150
}

backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
2121
layout(set = 0, binding = 2) uniform PRECISION sampler2D kernel_in;
2222
layout(set = 0, binding = 3) uniform PRECISION sampler2D bias_in;
2323

24-
layout(set = 0, binding = 4) uniform PRECISION restrict OutSizes {
25-
ivec4 out_sizes;
24+
layout(set = 0, binding = 4) uniform PRECISION restrict OutLimits {
25+
ivec3 out_limits;
2626
};
2727

28-
layout(set = 0, binding = 5) uniform PRECISION restrict InExtents {
28+
layout(set = 0, binding = 5) uniform PRECISION restrict InSizes {
2929
ivec4 in_sizes;
3030
};
3131

@@ -54,7 +54,7 @@ layout(constant_id = 3) const int packed_dim = C_DIM;
5454
void main() {
5555
const ivec3 pos = ivec3(gl_GlobalInvocationID);
5656

57-
if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
57+
if (any(greaterThanEqual(pos, out_limits))) {
5858
return;
5959
}
6060

backends/vulkan/runtime/graph/ops/glsl/matmul.glsl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
1616
layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
1717
layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
1818

19-
layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
20-
ivec4 out_sizes;
19+
layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
20+
ivec3 out_limits;
2121
};
2222

2323
layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -26,12 +26,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
2626

2727
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
2828

29-
layout(constant_id = 3) const int out_packed_dim = C_DIM;
30-
3129
void main() {
3230
const ivec3 pos = ivec3(gl_GlobalInvocationID);
3331

34-
if (pos_out_of_bounds(pos, out_sizes, out_packed_dim)) {
32+
if (any(greaterThanEqual(pos, out_limits))) {
3533
return;
3634
}
3735

backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict
1919
layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
2020
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
2121

22-
layout(set = 0, binding = 3) uniform PRECISION restrict OutSizes {
23-
ivec4 out_sizes;
22+
layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
23+
ivec3 out_limits;
2424
};
2525

2626
layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
@@ -36,12 +36,10 @@ layout(set = 0, binding = 5) uniform PRECISION restrict Params {
3636

3737
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
3838

39-
layout(constant_id = 3) const int packed_dim = C_DIM;
40-
4139
void main() {
4240
const ivec3 pos = ivec3(gl_GlobalInvocationID);
4341

44-
if (pos_out_of_bounds(pos, out_sizes, packed_dim)) {
42+
if (any(greaterThanEqual(pos, out_limits))) {
4543
return;
4644
}
4745

backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,24 @@ layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
2525
layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
2626
layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
2727

28-
layout(set = 0, binding = 6) uniform PRECISION restrict Sizes {
28+
layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
29+
ivec3 out_limits;
30+
};
31+
32+
layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
2933
ivec4 sizes;
3034
};
3135

32-
layout(set = 0, binding = 7) uniform PRECISION restrict Epsilon {
36+
layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
3337
float epsilon;
3438
};
3539

3640
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
3741

38-
layout(constant_id = 3) const int packed_dim = C_DIM;
39-
4042
void main() {
4143
const ivec3 pos = ivec3(gl_GlobalInvocationID);
4244

43-
if (pos_out_of_bounds(pos, sizes, packed_dim)) {
45+
if (any(greaterThanEqual(pos, out_limits))) {
4446
return;
4547
}
4648

0 commit comments

Comments
 (0)