 namespace vkcompute {

-namespace {
-
-/*
- * When stored on the GPU, one dimension will be aligned to the next multiple of
- * 4 in order to take advantage of vec4 data types. The dimension that is
- * packed is denoted by the GPUMemoryLayout. This function adjusts one of
- * the dimensions based on the desired memory format and storage type and
- * returns a sizes array describing the dimensions of the memory used to store
- * the tensor data on the GPU.
- */
-std::vector<int64_t> calc_gpu_sizes(
+std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
     const api::GPUMemoryLayout memory_layout,
-    const api::StorageType storage_type) {
-  std::vector<int64_t> gpu_sizes;
-  if (storage_type == api::kBuffer) {
-    gpu_sizes.resize(sizes.size());
-    for (size_t i = 0; i < sizes.size(); i++) {
-      gpu_sizes.at(i) = sizes.at(i);
+    const bool texel_strides) {
+  const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+  const int64_t last_dim = sizes.size() - dim_offset;
+  VK_CHECK_COND(last_dim >= 0);
+
+  size_t ndim = sizes.size();
+  std::vector<int64_t> strides(ndim);
+
+  const int64_t last_dim_size = texel_strides
+      ? api::utils::div_up(sizes.at(last_dim), INT64_C(4))
+      : sizes.at(last_dim);
+
+  for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) {
+    strides.at(stride_d) = 1;
+    if (stride_d == last_dim) {
+      continue;
     }
-  }
-  // For texture storage, tensors are typically stored using 3D image textures.
-  // Batches are stacked along the depth dimension. To represent the physical
-  // 3 dimensionality of the image texture (with concatenated batches) GPU sizes
-  // will be fixed to 4 dimensions when using texture storage.
-  else {
-    VK_CHECK_COND(
-        sizes.size() >= 0 && sizes.size() <= 4,
-        "Texture storage only valid for 0 <= ndim <= 4, received: ",
-        sizes.size());
-
-    gpu_sizes.resize(4);
-    gpu_sizes.at(0) = api::utils::val_at(-4, sizes);
-    gpu_sizes.at(1) = api::utils::val_at(-3, sizes);
-    gpu_sizes.at(2) = api::utils::val_at(-2, sizes);
-    gpu_sizes.at(3) = api::utils::val_at(-1, sizes);
-  }
-
-  size_t ndim = gpu_sizes.size();
-  switch (memory_layout) {
-    case api::kWidthPacked:
-      if (ndim >= 1) {
-        gpu_sizes.at(ndim - 1) =
-            api::utils::align_up(api::utils::val_at(-1, sizes), INT64_C(4));
+    strides.at(stride_d) = last_dim_size;
+    for (int size_d = ndim - 1; size_d > stride_d; size_d--) {
+      if (size_d != last_dim) {
+        strides.at(stride_d) *= sizes.at(size_d);
       }
-      break;
+    }
+  }
+  return strides;
+}
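`calculate_strides` gives the packed dim stride 1 and, when `texel_strides` is set, counts that dim in vec4 texels; every other stride is the product of the remaining sizes, with the packed extent replaced by its texel count. Below is a minimal standalone sketch of the same rule, with plain C++ standing in for the `api::utils` helpers and assuming `to_dim_offset` maps width/height/channels packing to offsets 1/2/3, which is what the padding logic elsewhere in this diff implies:

```cpp
// Standalone sketch of the stride rule (not the library code).
#include <cstdint>
#include <iostream>
#include <vector>

enum class Layout { kWidthPacked = 1, kHeightPacked = 2, kChannelsPacked = 3 };

std::vector<int64_t> strides_sketch(
    const std::vector<int64_t>& sizes, Layout layout, bool texel_strides) {
  const int64_t dim_offset = static_cast<int64_t>(layout);
  const int64_t last_dim = static_cast<int64_t>(sizes.size()) - dim_offset;
  // Bounds checking (VK_CHECK_COND) omitted for brevity.
  const int64_t last_dim_size =
      texel_strides ? (sizes.at(last_dim) + 3) / 4 : sizes.at(last_dim);

  std::vector<int64_t> strides(sizes.size(), 1);
  for (int64_t d = sizes.size() - 1; d >= 0; d--) {
    if (d == last_dim) {
      continue; // the packed dim always has stride 1
    }
    strides.at(d) = last_dim_size;
    for (int64_t s = sizes.size() - 1; s > d; s--) {
      if (s != last_dim) {
        strides.at(d) *= sizes.at(s);
      }
    }
  }
  return strides;
}

int main() {
  // NCHW sizes {2, 4, 4, 8}, channels-packed, texel strides:
  // the channel dim contributes ceil(4 / 4) = 1 texel.
  for (int64_t s : strides_sketch({2, 4, 4, 8}, Layout::kChannelsPacked, true)) {
    std::cout << s << " "; // prints: 32 1 8 1
  }
  std::cout << "\n";
}
```

Keeping the packed dim at stride 1 is what lets a buffer-backed kernel address whole texels with a plain dot product of position and strides.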

-    case api::kHeightPacked:
-      if (ndim >= 2) {
-        gpu_sizes.at(ndim - 2) =
-            api::utils::align_up(api::utils::val_at(-2, sizes), INT64_C(4));
-      }
-      break;
+std::vector<int64_t> calculate_padded_sizes(
+    const std::vector<int64_t>& sizes,
+    const api::GPUMemoryLayout memory_layout) {
+  int64_t ndim = sizes.size();
+  if (ndim == 0) {
+    ndim = 1;
+  }

-    case api::kChannelsPacked:
-      if (ndim >= 3) {
-        gpu_sizes.at(ndim - 3) =
-            api::utils::align_up(api::utils::val_at(-3, sizes), INT64_C(4));
-      }
-      break;
+  // Tensor sizes will be unsqueezed up to the next multiple of 4
+  const int64_t ndim_up4 = api::utils::align_up(ndim, INT64_C(4));
+  std::vector<int64_t> gpu_sizes(ndim_up4);
+  for (int64_t i = 0; i < ndim_up4; ++i) {
+    gpu_sizes.at(i) = api::utils::val_at(i - ndim_up4, sizes);
   }

+  // Pad the packed dim to the next multiple of 4.
+  const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+  const int64_t padded_dim_size = api::utils::val_at(-dim_offset, sizes);
+  gpu_sizes.at(ndim_up4 - dim_offset) =
+      api::utils::align_up(padded_dim_size, INT64_C(4));
+
   return gpu_sizes;
 }
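`calculate_padded_sizes` now always returns a rank that is a multiple of 4, unsqueezing with leading dims, and pads only the packed dim up to a multiple of 4; the old buffer/texture split is gone because texture extents are derived separately below. As a worked example (assuming `val_at` returns 1 for the implicit leading dims, as the removed `calc_gpu_sizes` relied on): sizes {3, 5} (H, W) under `kWidthPacked` become {1, 1, 3, 8}, while under `kChannelsPacked` the implicit channel dim is padded instead, giving {1, 4, 3, 5}. A zero-dim tensor is first promoted to ndim 1, so it still yields a full 4-dim shape such as {1, 1, 1, 4} for width packing.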

-/*
- * Creates a uvec3 denoting the extents of the image texture that will be
- * created to store a tensor of a given size.
- */
-api::utils::uvec3 create_image_extents(
+api::utils::uvec3 calculate_texture_limits(
     const std::vector<int64_t>& gpu_sizes,
-    const api::StorageType storage_type,
     const api::GPUMemoryLayout memory_layout) {
-  size_t ndim = gpu_sizes.size();
+  VK_CHECK_COND(gpu_sizes.size() == 4);

-  if (storage_type == api::kBuffer) {
-    // image extents do not apply to buffer storage
-    return {0u, 0u, 0u};
-  } else {
-    VK_CHECK_COND(
-        ndim >= 1 && ndim <= 4,
-        "Texture storage only valid for 1 <= ndim <= 4!");
-
-    using namespace api::utils;
-    uint32_t width = safe_downcast<uint32_t>(val_at(-1, gpu_sizes));
-    uint32_t height = safe_downcast<uint32_t>(val_at(-2, gpu_sizes));
-    uint32_t channels = safe_downcast<uint32_t>(val_at(-3, gpu_sizes));
-    uint32_t batch = safe_downcast<uint32_t>(val_at(-4, gpu_sizes));
-
-    switch (memory_layout) {
-      case api::kWidthPacked:
-        VK_CHECK_COND(width % 4 == 0, "Width must be divisible by 4!");
-        width /= 4;
-        break;
-      case api::kHeightPacked:
-        VK_CHECK_COND(height % 4 == 0, "Height must be divisible by 4!");
-        height /= 4;
-        break;
-      case api::kChannelsPacked:
-        VK_CHECK_COND(channels % 4 == 0, "Channels must be divisible by 4!");
-        channels /= 4;
-        break;
-      default:
-        VK_THROW("Invalid memory format used!");
-    }
+  uint32_t N = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(0));
+  uint32_t C = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(1));
+  uint32_t H = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(2));
+  uint32_t W = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(3));

-    return {width, height, batch * channels};
+  switch (memory_layout) {
+    case api::kWidthPacked:
+      VK_CHECK_COND(W % 4 == 0);
+      W /= 4;
+      break;
+    case api::kHeightPacked:
+      VK_CHECK_COND(H % 4 == 0);
+      H /= 4;
+      break;
+    case api::kChannelsPacked:
+      VK_CHECK_COND(C % 4 == 0);
+      C /= 4;
+      break;
   }
-}

-} // namespace
+  return {W, H, C * N};
+}
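`calculate_texture_limits` now expects a padded, 4-dim `gpu_sizes` (hence the `VK_CHECK_COND`) and maps it straight to {W, H, C * N}, dividing whichever dim is packed by 4 since four of its elements share one texel. Continuing the example above: {1, 4, 3, 5} with `kChannelsPacked` gives N=1, C=4, H=3, W=5, so the limits are {5, 3, 1 * 1} = {5, 3, 1}; the same tensor width-packed, {1, 1, 3, 8}, gives {2, 3, 1}.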

 //
 // vTensor
@@ -138,12 +108,14 @@ vTensor::vTensor(
       memory_layout_(memory_layout),
       // Calculate sizes and strides
       sizes_(sizes.begin(), sizes.end()),
-      gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
+      gpu_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       texture_limits_{{0, 0, 0}},
       // Utility Uniform Buffers that can be passed to shaders as arguments
       sizes_uniform_(),
       texture_limits_uniform_(),
-      packed_dim_meta_(),
+      strides_uniform_(),
+      ntexels_uniform_(),
+      buffer_meta_uniform_(),
       // Construct Tensor storage
       storage_(
           context,
@@ -197,6 +169,16 @@ api::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }

+vTensor::BufferMetadata vTensor::make_buffer_metadata() {
+  const auto strides = calculate_strides(gpu_sizes_, memory_layout_);
+  return BufferMetadata{
+      api::utils::make_whcn_ivec4(sizes_),
+      api::utils::make_whcn_ivec4(strides),
+      texel_numel(),
+      packed_dim_ntexels()};
+}
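`make_buffer_metadata` bundles everything a buffer-backed kernel needs into one UBO: WHCN sizes, WHCN texel strides, the total texel count, and the texel count along the packed dim. A sketch of how a kernel could consume it follows; the field names and layout here are illustrative assumptions, not the actual `BufferMetadata` definition:

```cpp
#include <cstdint>

// Illustrative stand-in for vTensor::BufferMetadata (assumed layout).
struct BufferMetadataSketch {
  int32_t sizes[4];            // tensor sizes in WHCN order
  int32_t strides[4];          // texel strides in WHCN order; packed dim is 1
  int32_t ntexels;             // total number of vec4 texels in the buffer
  int32_t packed_dim_ntexels;  // texels along the packed dim
};

// Convert a WHCN position (packed dim measured in texels) to a texel index.
int32_t texel_index(const BufferMetadataSketch& m, const int32_t pos[4]) {
  int32_t idx = 0;
  for (int d = 0; d < 4; ++d) {
    idx += pos[d] * m.strides[d];
  }
  return idx;  // < m.ntexels for any in-bounds position
}
```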
+
 const api::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ = api::UniformParamsBuffer(
@@ -213,28 +195,30 @@ const api::BufferBindInfo vTensor::texture_limits_ubo() {
   return api::BufferBindInfo(texture_limits_uniform_.buffer());
 }

-vTensor::PackedDimMeta vTensor::make_packed_dim_metadata() const {
-  int64_t packed_dim = gpu_memory_layout_int();
-  int32_t dim_size = api::utils::val_at(-(packed_dim + 1), sizes_);
-  int32_t dim_size_padded = api::utils::val_at(-(packed_dim + 1), gpu_sizes_);
-  int32_t dim_texel_len =
-      api::utils::safe_downcast<int32_t>(extents().data[packed_dim]);
-  int32_t padding = dim_size_padded - dim_size;
-
-  return {
-      dim_size,
-      dim_size_padded,
-      dim_texel_len,
-      padding,
-  };
+const api::BufferBindInfo vTensor::strides_ubo() {
+  if (!strides_uniform_.buffer()) {
+    strides_uniform_ = api::UniformParamsBuffer(
+        storage_.context_,
+        api::utils::make_whcn_ivec4(
+            calculate_strides(gpu_sizes_, memory_layout_)));
+  }
+  return api::BufferBindInfo(strides_uniform_.buffer());
 }

-const api::BufferBindInfo vTensor::packed_dim_meta_ubo() {
-  if (!packed_dim_meta_.buffer()) {
-    packed_dim_meta_ =
-        api::UniformParamsBuffer(storage_.context_, make_packed_dim_metadata());
+const api::BufferBindInfo vTensor::ntexels_ubo() {
+  if (!ntexels_uniform_.buffer()) {
+    ntexels_uniform_ =
+        api::UniformParamsBuffer(storage_.context_, texel_numel());
   }
-  return api::BufferBindInfo(packed_dim_meta_.buffer());
+  return api::BufferBindInfo(ntexels_uniform_.buffer());
+}
+
+const api::BufferBindInfo vTensor::buffer_meta_ubo() {
+  if (!buffer_meta_uniform_.buffer()) {
+    buffer_meta_uniform_ =
+        api::UniformParamsBuffer(storage_.context_, make_buffer_metadata());
+  }
+  return api::BufferBindInfo(buffer_meta_uniform_.buffer());
 }

 VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
@@ -273,45 +257,51 @@ void vTensor::bind_allocation(const api::Allocation& allocation) {

 void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
   sizes_ = new_sizes;
-  gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
+  gpu_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);

-  if (storage_type() != api::kBuffer) {
-    // Calculate the extents of the image texture that would have been required
-    // for a tensor of the new sizes.
-    api::utils::uvec3 virtual_extents =
-        create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
-    // Update the texture limits to reflect the new virtual extents.
-    texture_limits_.limits = api::utils::ivec3{
-        api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
-        api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
-        api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
-  }
+  // Calculate the extents of the image texture that would have been required
+  // for a tensor of the new sizes.
+  api::utils::uvec3 virtual_extents =
+      calculate_texture_limits(gpu_sizes_, memory_layout_);
+
+  // Update the texture limits to reflect the new virtual extents.
+  texture_limits_.limits = api::utils::ivec3{
+      api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
+      api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
+      api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};

   if (sizes_uniform_.buffer()) {
     sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
   }
   if (texture_limits_uniform_.buffer()) {
     texture_limits_uniform_.update(texture_limits_);
   }
-  if (packed_dim_meta_.buffer()) {
-    packed_dim_meta_.update(make_packed_dim_metadata());
+  if (strides_uniform_.buffer()) {
+    strides_uniform_.update(api::utils::make_whcn_ivec4(
+        calculate_strides(gpu_sizes_, memory_layout_)));
+  }
+  if (ntexels_uniform_.buffer()) {
+    ntexels_uniform_.update(texel_numel());
+  }
+  if (buffer_meta_uniform_.buffer()) {
+    buffer_meta_uniform_.update(make_buffer_metadata());
   }
 }

 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
   update_size_metadata(new_sizes);
   storage_.discard_and_reallocate(
-      calc_gpu_sizes(new_sizes, memory_layout_, storage_type()),
+      calculate_padded_sizes(new_sizes, memory_layout_),
       memory_layout_,
       dtype_);
 }

 void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
-  // For texture storage check that the current texture is large enough for the
-  // new sizes of the tensor.
   if (storage_type() != api::kBuffer) {
+    // For texture storage check that the current texture is large enough for
+    // the new sizes of the tensor.
     api::utils::uvec3 virtual_extents =
-        create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
+        calculate_texture_limits(gpu_sizes_, memory_layout_);

     bool valid_resize = virtual_extents.data[0] <= extents().data[0];
     valid_resize = valid_resize && virtual_extents.data[1] <= extents().data[1];
@@ -403,8 +393,7 @@ vTensorStorage::vTensorStorage(
     const bool allocate_memory)
     : context_(context),
       storage_type_{storage_type},
-      extents_(
-          create_image_extents(gpu_sizes, storage_type, gpu_memory_layout)),
+      extents_(calculate_texture_limits(gpu_sizes, gpu_memory_layout)),
       buffer_length_{api::utils::multiply_integers(gpu_sizes)},
       image_(allocate_image(
           context_,
@@ -496,7 +485,7 @@ void vTensorStorage::discard_and_reallocate(

   flush();

-  extents_ = create_image_extents(gpu_sizes, storage_type_, gpu_memory_layout);
+  extents_ = calculate_texture_limits(gpu_sizes, gpu_memory_layout);
   image_ = allocate_image(
       context_,
       extents_,