namespace vkcompute {

- namespace {
-
- /*
-  * When stored on the GPU, one dimension will be aligned to the next multiple of
-  * 4 in order to take advantage of vec4 data types. The dimension that is
-  * packed is denoted by the GPUMemoryLayout. This function adjusts one of
-  * the dimensions based on the desired memory format and storage type and
-  * returns a sizes array describing the dimensions of the memory used to store
-  * the tensor data on the GPU.
-  */
- std::vector<int64_t> calc_gpu_sizes(
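+ /*
+  * Calculates the strides of a tensor from its sizes and GPUMemoryLayout. The
+  * packed dimension selected by the memory layout is treated as the innermost,
+  * contiguous dimension. If texel_strides is true, the extent of the packed
+  * dimension is counted in texels (groups of 4 elements) rather than in
+  * individual elements.
+  */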
+ std::vector<int64_t> calculate_strides(
      const std::vector<int64_t>& sizes,
      const api::GPUMemoryLayout memory_layout,
-     const api::StorageType storage_type) {
-   std::vector<int64_t> gpu_sizes;
-   if (storage_type == api::kBuffer) {
-     gpu_sizes.resize(sizes.size());
-     for (size_t i = 0; i < sizes.size(); i++) {
-       gpu_sizes.at(i) = sizes.at(i);
+     const bool texel_strides) {
+   const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+   const int64_t last_dim = sizes.size() - dim_offset;
+   VK_CHECK_COND(last_dim >= 0);
+
+   size_t ndim = sizes.size();
+   std::vector<int64_t> strides(ndim);
+
+   const int64_t last_dim_size = texel_strides
+       ? api::utils::div_up(sizes.at(last_dim), INT64_C(4))
+       : sizes.at(last_dim);
+
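+   // Each dimension's stride is last_dim_size multiplied by the sizes of all
+   // dimensions that come after it, excluding the packed dimension itself,
+   // which always has a stride of 1.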
+   for (int stride_d = ndim - 1; stride_d >= 0; stride_d--) {
+     strides.at(stride_d) = 1;
+     if (stride_d == last_dim) {
+       continue;
    }
-   }
-   // For texture storage, tensors are typically stored using 3D image textures.
-   // Batches are stacked along the depth dimension. To represent the physical
-   // 3 dimensionality of the image texture (with concatenated batches) GPU sizes
-   // will be fixed to 4 dimensions when using texture storage.
-   else {
-     VK_CHECK_COND(
-         sizes.size() >= 0 && sizes.size() <= 4,
-         "Texture storage only valid for 0 <= ndim <= 4, received: ",
-         sizes.size());
-
-     gpu_sizes.resize(4);
-     gpu_sizes.at(0) = api::utils::val_at(-4, sizes);
-     gpu_sizes.at(1) = api::utils::val_at(-3, sizes);
-     gpu_sizes.at(2) = api::utils::val_at(-2, sizes);
-     gpu_sizes.at(3) = api::utils::val_at(-1, sizes);
-   }
-
-   size_t ndim = gpu_sizes.size();
-   switch (memory_layout) {
-     case api::kWidthPacked:
-       if (ndim >= 1) {
-         gpu_sizes.at(ndim - 1) =
-             api::utils::align_up(api::utils::val_at(-1, sizes), INT64_C(4));
+     strides.at(stride_d) = last_dim_size;
+     for (int size_d = ndim - 1; size_d > stride_d; size_d--) {
+       if (size_d != last_dim) {
+         strides.at(stride_d) *= sizes.at(size_d);
      }
-       break;
+     }
+   }
+   return strides;
+ }

-     case api::kHeightPacked:
-       if (ndim >= 2) {
-         gpu_sizes.at(ndim - 2) =
-             api::utils::align_up(api::utils::val_at(-2, sizes), INT64_C(4));
-       }
-       break;
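+ /*
+  * Returns a padded sizes vector in which the number of dimensions is rounded
+  * up to the next multiple of 4 and the packed dimension selected by the
+  * GPUMemoryLayout is aligned up to the next multiple of 4. The result
+  * describes the extent of the memory used to store the tensor on the GPU.
+  */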
+ std::vector<int64_t> calculate_padded_sizes(
+     const std::vector<int64_t>& sizes,
+     const api::GPUMemoryLayout memory_layout) {
+   int64_t ndim = sizes.size();
+   if (ndim == 0) {
+     ndim = 1;
+   }

-     case api::kChannelsPacked:
-       if (ndim >= 3) {
-         gpu_sizes.at(ndim - 3) =
-             api::utils::align_up(api::utils::val_at(-3, sizes), INT64_C(4));
-       }
-       break;
+   // Tensor sizes will be unsqueezed up to the next multiple of 4
+   const int64_t ndim_up4 = api::utils::align_up(ndim, INT64_C(4));
+   std::vector<int64_t> gpu_sizes(ndim_up4);
+   for (int64_t i = 0; i < ndim_up4; ++i) {
+     gpu_sizes.at(i) = api::utils::val_at(i - ndim_up4, sizes);
  }

+   // Pad the packed dim to the next multiple of 4.
+   const int64_t dim_offset = api::to_dim_offset<int64_t>(memory_layout);
+   const int64_t padded_dim_size = api::utils::val_at(-dim_offset, sizes);
+   gpu_sizes.at(ndim_up4 - dim_offset) =
+       api::utils::align_up(padded_dim_size, INT64_C(4));
+
  return gpu_sizes;
}

- /*
-  * Creates a uvec3 denoting the extents of the image texture that will be
-  * created to store a tensor of a given size.
-  */
- api::utils::uvec3 create_image_extents(
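+ /*
+  * Creates a uvec3 containing the extents of the image texture required to
+  * store a tensor with the given padded sizes. The packed dimension selected
+  * by the GPUMemoryLayout is divided by 4, since each texel holds 4 elements,
+  * and batches are concatenated along the depth axis.
+  */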
+ api::utils::uvec3 calculate_texture_limits(
      const std::vector<int64_t>& gpu_sizes,
-     const api::StorageType storage_type,
      const api::GPUMemoryLayout memory_layout) {
-   size_t ndim = gpu_sizes.size();
+   VK_CHECK_COND(gpu_sizes.size() == 4);

-   if (storage_type == api::kBuffer) {
-     // image extents do not apply to buffer storage
-     return {0u, 0u, 0u};
-   } else {
-     VK_CHECK_COND(
-         ndim >= 1 && ndim <= 4,
-         "Texture storage only valid for 1 <= ndim <= 4!");
-
-     using namespace api::utils;
-     uint32_t width = safe_downcast<uint32_t>(val_at(-1, gpu_sizes));
-     uint32_t height = safe_downcast<uint32_t>(val_at(-2, gpu_sizes));
-     uint32_t channels = safe_downcast<uint32_t>(val_at(-3, gpu_sizes));
-     uint32_t batch = safe_downcast<uint32_t>(val_at(-4, gpu_sizes));
-
-     switch (memory_layout) {
-       case api::kWidthPacked:
-         VK_CHECK_COND(width % 4 == 0, "Width must be divisible by 4!");
-         width /= 4;
-         break;
-       case api::kHeightPacked:
-         VK_CHECK_COND(height % 4 == 0, "Height must be divisible by 4!");
-         height /= 4;
-         break;
-       case api::kChannelsPacked:
-         VK_CHECK_COND(channels % 4 == 0, "Channels must be divisible by 4!");
-         channels /= 4;
-         break;
-       default:
-         VK_THROW("Invalid memory format used!");
-     }
+   uint32_t N = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(0));
+   uint32_t C = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(1));
+   uint32_t H = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(2));
+   uint32_t W = api::utils::safe_downcast<uint32_t>(gpu_sizes.at(3));

-     return {width, height, batch * channels};
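+   // Divide the packed dimension by 4, since each texel covers 4 of its
+   // elements.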
+   switch (memory_layout) {
+     case api::kWidthPacked:
+       VK_CHECK_COND(W % 4 == 0);
+       W /= 4;
+       break;
+     case api::kHeightPacked:
+       VK_CHECK_COND(H % 4 == 0);
+       H /= 4;
+       break;
+     case api::kChannelsPacked:
+       VK_CHECK_COND(C % 4 == 0);
+       C /= 4;
+       break;
  }
- }

- } // namespace
+   return {W, H, C * N};
+ }

//
// vTensor
@@ -138,12 +108,12 @@ vTensor::vTensor(
      memory_layout_(memory_layout),
      // Calculate sizes and strides
      sizes_(sizes.begin(), sizes.end()),
-       gpu_sizes_{calc_gpu_sizes(sizes, memory_layout_, storage_type)},
+       gpu_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
      texture_limits_{{0, 0, 0}},
      // Utility Uniform Buffers that can be passed to shaders as arguments
      sizes_uniform_(),
      texture_limits_uniform_(),
-       packed_dim_meta_(),
+       buffer_meta_uniform_(),
      // Construct Tensor storage
      storage_(
          context,
@@ -197,6 +167,16 @@ api::VulkanBuffer& vTensor::buffer(
  return storage_.buffer_;
}

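+ /*
+  * Assembles the BufferMetadata uniform data describing this tensor's buffer
+  * layout: its sizes and strides in WHCN order, its texel count, and the
+  * number of texels along the packed dimension.
+  */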
+ vTensor::BufferMetadata vTensor::make_buffer_metadata() {
+   const std::vector<int64_t> strides =
+       calculate_strides(gpu_sizes_, memory_layout_);
+   return BufferMetadata{
+       api::utils::make_whcn_ivec4(sizes_),
+       api::utils::make_whcn_ivec4(strides),
+       texel_numel(),
+       packed_dim_ntexels()};
+ }
+
const api::BufferBindInfo vTensor::sizes_ubo() {
  if (!sizes_uniform_.buffer()) {
    sizes_uniform_ = api::UniformParamsBuffer(
@@ -213,28 +193,12 @@ const api::BufferBindInfo vTensor::texture_limits_ubo() {
  return api::BufferBindInfo(texture_limits_uniform_.buffer());
}

- vTensor::PackedDimMeta vTensor::make_packed_dim_metadata() const {
-   int64_t packed_dim = gpu_memory_layout_int();
-   int32_t dim_size = api::utils::val_at(-(packed_dim + 1), sizes_);
-   int32_t dim_size_padded = api::utils::val_at(-(packed_dim + 1), gpu_sizes_);
-   int32_t dim_texel_len =
-       api::utils::safe_downcast<int32_t>(extents().data[packed_dim]);
-   int32_t padding = dim_size_padded - dim_size;
-
-   return {
-       dim_size,
-       dim_size_padded,
-       dim_texel_len,
-       padding,
-   };
- }
-
- const api::BufferBindInfo vTensor::packed_dim_meta_ubo() {
-   if (!packed_dim_meta_.buffer()) {
-     packed_dim_meta_ =
-         api::UniformParamsBuffer(storage_.context_, make_packed_dim_metadata());
+ const api::BufferBindInfo vTensor::buffer_meta_ubo() {
+   if (!buffer_meta_uniform_.buffer()) {
+     buffer_meta_uniform_ =
+         api::UniformParamsBuffer(storage_.context_, make_buffer_metadata());
  }
-   return api::BufferBindInfo(packed_dim_meta_.buffer());
+   return api::BufferBindInfo(buffer_meta_uniform_.buffer());
}

VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
@@ -273,45 +237,44 @@ void vTensor::bind_allocation(const api::Allocation& allocation) {

void vTensor::update_size_metadata(const std::vector<int64_t>& new_sizes) {
  sizes_ = new_sizes;
-   gpu_sizes_ = calc_gpu_sizes(sizes_, memory_layout_, storage_type());
+   gpu_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);

-   if (storage_type() != api::kBuffer) {
-     // Calculate the extents of the image texture that would have been required
-     // for a tensor of the new sizes.
-     api::utils::uvec3 virtual_extents =
-         create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
-     // Update the texture limits to reflect the new virtual extents.
-     texture_limits_.limits = api::utils::ivec3{
-         api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
-         api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
-         api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};
-   }
+   // Calculate the extents of the image texture that would have been required
+   // for a tensor of the new sizes.
+   api::utils::uvec3 virtual_extents =
+       calculate_texture_limits(gpu_sizes_, memory_layout_);
+
+   // Update the texture limits to reflect the new virtual extents.
+   texture_limits_.limits = api::utils::ivec3{
+       api::utils::safe_downcast<int32_t>(virtual_extents.data[0]),
+       api::utils::safe_downcast<int32_t>(virtual_extents.data[1]),
+       api::utils::safe_downcast<int32_t>(virtual_extents.data[2])};

  if (sizes_uniform_.buffer()) {
    sizes_uniform_.update(api::utils::make_whcn_ivec4(sizes_));
  }
  if (texture_limits_uniform_.buffer()) {
    texture_limits_uniform_.update(texture_limits_);
  }
-   if (packed_dim_meta_.buffer()) {
-     packed_dim_meta_.update(make_packed_dim_metadata());
+   if (buffer_meta_uniform_.buffer()) {
+     buffer_meta_uniform_.update(make_buffer_metadata());
  }
}

void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
  update_size_metadata(new_sizes);
  storage_.discard_and_reallocate(
-       calc_gpu_sizes(new_sizes, memory_layout_, storage_type()),
+       calculate_padded_sizes(new_sizes, memory_layout_),
      memory_layout_,
      dtype_);
}

void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
-   // For texture storage check that the current texture is large enough for the
-   // new sizes of the tensor.
  if (storage_type() != api::kBuffer) {
+     // For texture storage check that the current texture is large enough for
+     // the new sizes of the tensor.
    api::utils::uvec3 virtual_extents =
-         create_image_extents(gpu_sizes_, storage_type(), memory_layout_);
+         calculate_texture_limits(gpu_sizes_, memory_layout_);

    bool valid_resize = virtual_extents.data[0] <= extents().data[0];
    valid_resize = valid_resize && virtual_extents.data[1] <= extents().data[1];
@@ -403,8 +366,7 @@ vTensorStorage::vTensorStorage(
    const bool allocate_memory)
    : context_(context),
      storage_type_{storage_type},
-       extents_(
-           create_image_extents(gpu_sizes, storage_type, gpu_memory_layout)),
+       extents_(calculate_texture_limits(gpu_sizes, gpu_memory_layout)),
      buffer_length_{api::utils::multiply_integers(gpu_sizes)},
      image_(allocate_image(
          context_,
@@ -496,7 +458,7 @@ void vTensorStorage::discard_and_reallocate(

  flush();

-   extents_ = create_image_extents(gpu_sizes, storage_type_, gpu_memory_layout);
+   extents_ = calculate_texture_limits(gpu_sizes, gpu_memory_layout);
  image_ = allocate_image(
      context_,
      extents_,