
Commit c670e6f

SS-JIA authored and facebook-github-bot committed
Add transfer shaders for buffer storage tensors (#3684)
Summary:

## Context

Add transfer shaders for tensors that use buffer storage, in preparation for quantization support.

Differential Revision: D57577019
1 parent a707550 commit c670e6f

18 files changed, +754 -179 lines changed

backends/vulkan/runtime/api/Tensor.cpp

Lines changed: 100 additions & 138 deletions
@@ -11,117 +11,87 @@
 
 namespace vkcompute {
 
-namespace {
-
-/*
- * When stored on the GPU, one dimension will be aligned to the next multiple of
- * 4 in order to take advantage of vec4 data types. The dimension that is
- * packed is denoted by the GPUMemoryLayout. This function adjusts one of
- * the dimensions based on the desired memory format and storage type and
- * returns a sizes array describing the dimensions of the memory used to store
- * the tensor data on the GPU.
- */
-std::vector<int64_t> calc_gpu_sizes(
+std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
     const api::GPUMemoryLayout memory_layout,
-    const api::StorageType storage_type) {
-  std::vector<int64_t> gpu_sizes;
-  if (storage_type == api::kBuffer) {
-    gpu_sizes.resize(sizes.size());
-    for (size_t i = 0; i < sizes.size(); i++) {
-      gpu_sizes.at(i) = sizes.at(i);
+    const bool texel_strides) {
+  const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+  const int64_t last_dim = sizes.size() - dim_offset;
+  VK_CHECK_COND(last_dim >= 0);
+
+  size_t ndim = sizes.size();
+  std::vector<int64_t> strides(ndim);
+
+  const int64_t last_dim_size = texel_strides
+      ? api::utils::div_up(sizes.at(last_dim), INT64_C(4))
+      : sizes.at(last_dim);
+
+  for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) {
+    strides.at(stride_d) = 1;
+    if (stride_d == last_dim) {
+      continue;
     }
-  }
-  // For texture storage, tensors are typically stored using 3D image textures.
-  // Batches are stacked along the depth dimension. To represent the physical
-  // 3 dimensionality of the image texture (with concatenated batches) GPU sizes
-  // will be fixed to 4 dimensions when using texture storage.
-  else {
-    VK_CHECK_COND(
-        sizes.size() >= 0 && sizes.size() <= 4,
-        "Texture storage only valid for 0 <= ndim <= 4, received: ",
-        sizes.size());
-
-    gpu_sizes.resize(4);
-    gpu_sizes.at(0) = api::utils::val_at(-4, sizes);
-    gpu_sizes.at(1) = api::utils::val_at(-3, sizes);
-    gpu_sizes.at(2) = api::utils::val_at(-2, sizes);
-    gpu_sizes.at(3) = api::utils::val_at(-1, sizes);
-  }
-
-  size_t ndim = gpu_sizes.size();
-  switch (memory_layout) {
-    case api::kWidthPacked:
-      if (ndim >= 1) {
-        gpu_sizes.at(ndim - 1) =
-            api::utils::align_up(api::utils::val_at(-1, sizes), INT64_C(4));
+    strides.at(stride_d) = last_dim_size;
+    for (int size_d = ndim - 1; size_d > stride_d; size_d--) {
+      if (size_d != last_dim) {
+        strides.at(stride_d) *= sizes.at(size_d);
       }
-      break;
+    }
+  }
+  return strides;
+}
 
-    case api::kHeightPacked:
-      if (ndim >= 2) {
-        gpu_sizes.at(ndim - 2) =
-            api::utils::align_up(api::utils::val_at(-2, sizes), INT64_C(4));
-      }
-      break;
+std::vector<int64_t> calculate_padded_sizes(
+    const std::vector<int64_t>& sizes,
+    const api::GPUMemoryLayout memory_layout) {
+  int64_t ndim = sizes.size();
+  if (ndim == 0) {
+    ndim = 1;
+  }
 
-    case api::kChannelsPacked:
-      if (ndim >= 3) {
-        gpu_sizes.at(ndim - 3) =
-            api::utils::align_up(api::utils::val_at(-3, sizes), INT64_C(4));
-      }
-      break;
+  // Tensor sizes will be unsqueezed up to the next multiple of 4
+  const int64_t ndim_up4 = api::utils::align_up(ndim, INT64_C(4));
+  std::vector<int64_t> gpu_sizes(ndim_up4);
+  for (int64_t i = 0; i < ndim_up4; ++i) {
+    gpu_sizes.at(i) = api::utils::val_at(i - ndim_up4, sizes);
   }
 
+  // Pad the packed dim to the next multiple of 4.
+  const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+  const int64_t padded_dim_size = api::utils::val_at(-dim_offset, sizes);
+  gpu_sizes.at(ndim_up4 - dim_offset) =
+      api::utils::align_up(padded_dim_size, INT64_C(4));
+
   return gpu_sizes;
 }
 
-/*
- * Creates a uvec3 denoting the extents of the image texture that will be
- * created to store a tensor of a given size.
- */
-api::utils::uvec3 create_image_extents(
+api::utils::uvec3 calculate_texture_limits(
     const std::vector<int64_t>& gpu_sizes,
-    const api::StorageType storage_type,
     const api::GPUMemoryLayout memory_layout) {
-  size_t ndim = gpu_sizes.size();
+  VK_CHECK_COND(gpu_sizes.size() == 4);
 
-  if (storage_type == api::kBuffer) {
-    // image extents do not apply to buffer storage
-    return {0u, 0u, 0u};
-  } else {
-    VK_CHECK_COND(
-        ndim >= 1 && ndim <= 4,
-        "Texture storage only valid for 1 <= ndim <= 4!");
-
-    using namespace api::utils;
-    uint32_t width = safe_downcast<uint32_t>(val_at(-1, gpu_sizes));
-    uint32_t height = safe_downcast<uint32_t>(val_at(-2, gpu_sizes));
-    uint32_t channels = safe_downcast<uint32_t>(val_at(-3, gpu_sizes));
-    uint32_t batch = safe_downcast<uint32_t>(val_at(-4, gpu_sizes));
-
-    switch (memory_layout) {
-      case api::kWidthPacked:
-        VK_CHECK_COND(width % 4 == 0, "Width must be divisible by 4!");
-        width /= 4;
-        break;
-      case api::kHeightPacked:
-        VK_CHECK_COND(height % 4 == 0, "Height must be divisible by 4!");
-        height /= 4;
-        break;
-      case api::kChannelsPacked:
-        VK_CHECK_COND(channels % 4 == 0, "Channels must be divisible by 4!");
-        channels /= 4;
-        break;
-      default:
-        VK_THROW("Invalid memory format used!");
-    }
+  uint32_t N = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(0));
+  uint32_t C = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(1));
+  uint32_t H = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(2));
+  uint32_t W = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(3));
 
-    return {width, height, batch * channels};
+  switch (memory_layout) {
+    case api::kWidthPacked:
+      VK_CHECK_COND(W % 4 == 0);
+      W /= 4;
+      break;
+    case api::kHeightPacked:
+      VK_CHECK_COND(H % 4 == 0);
+      H /= 4;
+      break;
+    case api::kChannelsPacked:
+      VK_CHECK_COND(C % 4 == 0);
+      C /= 4;
+      break;
   }
-}
 
-} // namespace
+  return {W, H, C * N};
+}
 
 //
 // vTensor
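To make the new helpers concrete, the following standalone sketch reproduces their arithmetic for a width-packed tensor of sizes {2, 3, 5} (an illustrative example, not part of the commit; it re-implements align_up/div_up locally instead of calling api::utils): the sizes are unsqueezed and padded to {1, 2, 3, 8}, the texel strides over the padded sizes come out to {12, 6, 2, 1} in NCHW order, and the texture limits are {2, 3, 2}.

// Illustrative sketch only (not part of the commit): reproduces the padding,
// texel-stride, and texture-limit arithmetic above for a width-packed tensor
// of sizes {2, 3, 5}, using local helpers in place of api::utils.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t align_up(int64_t v, int64_t m) { return (v + m - 1) / m * m; }
int64_t div_up(int64_t v, int64_t m) { return (v + m - 1) / m; }

int main() {
  const int64_t dim_offset = 1; // width-packed: the last (W) dim is packed

  // calculate_padded_sizes: unsqueeze {2, 3, 5} to 4 dims and pad W to 8.
  std::vector<int64_t> gpu_sizes = {1, 2, 3, 5}; // N, C, H, W
  gpu_sizes[4 - dim_offset] = align_up(gpu_sizes[4 - dim_offset], INT64_C(4));

  // calculate_strides with texel_strides = true: the packed dim advances one
  // texel at a time, and its padded extent of 8 elements spans 2 texels.
  const int64_t last_dim = gpu_sizes.size() - dim_offset;
  const int64_t last_dim_size = div_up(gpu_sizes[last_dim], INT64_C(4));
  std::vector<int64_t> strides(gpu_sizes.size(), 1);
  for (int64_t d = gpu_sizes.size() - 1; d >= 0; d--) {
    if (d == last_dim) continue;
    strides[d] = last_dim_size;
    for (int64_t s = gpu_sizes.size() - 1; s > d; s--) {
      if (s != last_dim) strides[d] *= gpu_sizes[s];
    }
  }

  // calculate_texture_limits for width-packed: {W / 4, H, C * N} texels.
  const int64_t limits[3] = {
      gpu_sizes[3] / 4, gpu_sizes[2], gpu_sizes[1] * gpu_sizes[0]};

  // Prints: padded sizes 1 2 3 8 | strides 12 6 2 1 | limits 2 3 2
  std::cout << "padded sizes";
  for (int64_t v : gpu_sizes) std::cout << " " << v;
  std::cout << " | strides";
  for (int64_t v : strides) std::cout << " " << v;
  std::cout << " | limits " << limits[0] << " " << limits[1] << " "
            << limits[2] << std::endl;
  return 0;
}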
@@ -138,12 +108,12 @@ vTensor::vTensor(
       memory_layout_(memory_layout),
       // Calculate sizes and strides
       sizes_(sizes.begin(), sizes.end()),
-      gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
+      gpu_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       texture_limits_{{0, 0, 0}},
       // Utility Uniform Buffers that can be passed to shaders as arguments
       sizes_uniform_(),
       texture_limits_uniform_(),
-      packed_dim_meta_(),
+      buffer_meta_uniform_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -197,6 +167,16 @@ api::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
+vTensor::BufferMetadata vTensor::make_buffer_metadata() {
+  auto strides = calculate_strides(gpu_sizes_, memory_layout_);
+  return BufferMetadata{
+      api::utils::make_whcn_ivec4(sizes_),
+      api::utils::make_whcn_ivec4(
+          calculate_strides(gpu_sizes_, memory_layout_)),
+      texel_numel(),
+      packed_dim_ntexels()};
+}
+
 const api::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ = api::UniformParamsBuffer(
213193
return api::BufferBindInfo(texture_limits_uniform_.buffer());
214194
}
215195

216-
vTensor::PackedDimMeta vTensor::make_packed_dim_metadata() const {
217-
int64_t packed_dim = gpu_memory_layout_int();
218-
int32_t dim_size = api::utils::val_at(-(packed_dim + 1), sizes_);
219-
int32_t dim_size_padded = api::utils::val_at(-(packed_dim + 1), gpu_sizes_);
220-
int32_t dim_texel_len =
221-
api::utils::safe_downcast<int32_t>(extents().data[packed_dim]);
222-
int32_t padding = dim_size_padded - dim_size;
223-
224-
return {
225-
dim_size,
226-
dim_size_padded,
227-
dim_texel_len,
228-
padding,
229-
};
230-
}
231-
232-
const api::BufferBindInfo vTensor::packed_dim_meta_ubo() {
233-
if (!packed_dim_meta_.buffer()) {
234-
packed_dim_meta_ =
235-
api::UniformParamsBuffer(storage_.context_, make_packed_dim_metadata());
196+
const api::BufferBindInfo vTensor::buffer_meta_ubo() {
197+
if (!buffer_meta_uniform_.buffer()) {
198+
sizes_uniform_ =
199+
api::UniformParamsBuffer(storage_.context_, make_buffer_metadata());
236200
}
237-
return api::BufferBindInfo(packed_dim_meta_.buffer());
201+
return api::BufferBindInfo(sizes_uniform_.buffer());
238202
}
239203

240204
VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
@@ -273,45 +237,44 @@ void vTensor::bind_allocation(const api::Allocation& allocation) {
273237

274238
void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
275239
sizes_ = new_sizes;
276-
gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
240+
gpu_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
277241

278-
if (storage_type() != api::kBuffer) {
279-
// Calculate the extents of the image texture that would have been required
280-
// for a tensor of the new sizes.
281-
api::utils::uvec3 virtual_extents =
282-
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
283-
// Update the texture limits to reflect the new virtual extents.
284-
texture_limits_.limits = api::utils::ivec3{
285-
api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
286-
api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
287-
api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
288-
}
242+
// Calculate the extents of the image texture that would have been required
243+
// for a tensor of the new sizes.
244+
api::utils::uvec3 virtual_extents =
245+
calculate_texture_limits(gpu_sizes_, memory_layout_);
246+
247+
// Update the texture limits to reflect the new virtual extents.
248+
texture_limits_.limits = api::utils::ivec3{
249+
api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
250+
api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
251+
api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
289252

290253
if (sizes_uniform_.buffer()) {
291254
sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
292255
}
293256
if (texture_limits_uniform_.buffer()) {
294257
texture_limits_uniform_.update(texture_limits_);
295258
}
296-
if (packed_dim_meta_.buffer()) {
297-
packed_dim_meta_.update(make_packed_dim_metadata());
259+
if (buffer_meta_uniform_.buffer()) {
260+
buffer_meta_uniform_.update(make_buffer_metadata());
298261
}
299262
}
300263

301264
void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
302265
update_size_metadata(new_sizes);
303266
storage_.discard_and_reallocate(
304-
calc_gpu_sizes(new_sizes, memory_layout_, storage_type()),
267+
calculate_padded_sizes(new_sizes, memory_layout_),
305268
memory_layout_,
306269
dtype_);
307270
}
308271

309272
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
310-
// For texture storage check that the current texture is large enough for the
311-
// new sizes of the tensor.
312273
if (storage_type() != api::kBuffer) {
274+
// For texture storage check that the current texture is large enough for
275+
// the new sizes of the tensor.
313276
api::utils::uvec3 virtual_extents =
314-
create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
277+
calculate_texture_limits(gpu_sizes_, memory_layout_);
315278

316279
bool valid_resize = virtual_extents.data[0] <= extents().data[0];
317280
valid_resize = valid_resize && virtual_extents.data[1] <= extents().data[1];
@@ -403,8 +366,7 @@ vTensorStorage::vTensorStorage(
403366
const bool allocate_memory)
404367
: context_(context),
405368
storage_type_{storage_type},
406-
extents_(
407-
create_image_extents(gpu_sizes, storage_type, gpu_memory_layout)),
369+
extents_(calculate_texture_limits(gpu_sizes, gpu_memory_layout)),
408370
buffer_length_{api::utils::multiply_integers(gpu_sizes)},
409371
image_(allocate_image(
410372
context_,
@@ -496,7 +458,7 @@ void vTensorStorage::discard_and_reallocate(
496458

497459
flush();
498460

499-
extents_ = create_image_extents(gpu_sizes, storage_type_, gpu_memory_layout);
461+
extents_ = calculate_texture_limits(gpu_sizes, gpu_memory_layout);
500462
image_ = allocate_image(
501463
context_,
502464
extents_,
