
Commit 7c6d58a

SS-JIA authored and facebook-github-bot committed
vTensor cleanup 6/N - Do not use gpu_memory_layout as a source of truth, use packed_dim_whcn_idx directly (#5479)
Summary:
Pull Request resolved: #5479

## Context

`GPUMemoryLayout` is not a sufficient description of how a tensor is laid out in GPU memory. For buffer backed tensors, this has been true ever since strides were introduced; for texture backed tensors, it has been true since the introduction of `axis_map`.

For buffer backed tensors, the `strides` of the tensor are required to fully represent how the data of the tensor is laid out in GPU memory. For texture backed tensors, the `axis_map` and `packed_dim_whcn_idx` are required to fully represent the layout of the tensor as an image texture. Furthermore, with the introduction of functions like `virtual_transpose()`, tensor layouts may be produced which cannot be captured cleanly by an enum.

This diff decouples `GPUMemoryLayout` from `vTensor`. Rather than storing it as a tensor property, it is only used during construction to determine the initial tensor layout metadata. The layout of a tensor can be estimated afterwards using `estimate_memory_layout()`, but this is only a "best effort" at producing a comparable memory layout.

`GPUMemoryLayout` was helpful as a compact representation of the `packed_dim_whcn_idx` of the tensor, which identifies the "fastest moving dimension" in buffer backed tensors, or the dim that is packed along a texel for texture backed tensors. Whenever `GPUMemoryLayout` is referenced, what is really of interest is the packed dim index. Accordingly, this diff also replaces references to `memory_layout()` with references to `packed_dim_whcn_idx()`.

ghstack-source-id: 243563521
exported-using-ghexport

Reviewed By: jorgep31415

Differential Revision: D62995121

fbshipit-source-id: 435063a4aa30b266bb948f0ff4987fb46e86dd56
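To make the relationship described above concrete, here is a small standalone sketch of the convention: a memory-layout enum value is just a compact encoding of the packed WHCN dim index (width = 0, height = 1, channels = 2), while recovering a layout from the index is only an estimate. The enum and helper names below are illustrative stand-ins, not the backend's actual definitions.

```cpp
#include <cstdint>
#include <stdexcept>

// Illustrative stand-in only -- assumed to be defined so that each enum value
// is numerically equal to the WHCN index of its packed dim.
enum class Layout : int32_t {
  kWidthPacked = 0,
  kHeightPacked = 1,
  kChannelsPacked = 2,
};

// Exact: a layout names exactly one packed ("fastest moving") dim.
int32_t to_packed_dim_whcn_idx(Layout layout) {
  return static_cast<int32_t>(layout);
}

// Best effort only: after virtual_transpose() and similar calls, the tensor's
// strides / axis_map may not match any enum value, so the most that can be
// recovered is *a* layout with the same packed dim index.
Layout estimate_layout(int32_t packed_dim_whcn_idx) {
  switch (packed_dim_whcn_idx) {
    case 0:
      return Layout::kWidthPacked;
    case 1:
      return Layout::kHeightPacked;
    case 2:
      return Layout::kChannelsPacked;
    default:
      throw std::invalid_argument("no equivalent memory layout");
  }
}
```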
1 parent 47f4f07 commit 7c6d58a

27 files changed: +241 −201 lines changed

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 43 additions & 66 deletions
```diff
@@ -13,33 +13,15 @@
 namespace vkcompute {
 namespace api {
 
-/*
- * Given the strides of a buffer-backed tensor, estimate the equivalent memory
- * layout enum value by identifying the fastest moving dimension.
- */
-utils::GPUMemoryLayout estimate_memory_layout(
-    const std::vector<int64_t>& dim_order) {
-  int64_t fastest_dim_whcn = dim_order.size() - 1 - dim_order.back();
-  if (fastest_dim_whcn >= 0 && fastest_dim_whcn < 3) {
-    return utils::GPUMemoryLayout(fastest_dim_whcn);
-  }
-
-  // TODO(ssjia) find a way to gracefully recover from this case by i.e. adding
-  // a UNKOWN GPUMemoryLayout. This is not high priority though because we don't
-  // expect this to ever come up in practice.
-  VK_THROW("No compatible GPUMemoryLayout value");
-}
-
 std::vector<int64_t> calculate_dim_order(
     const size_t ndim,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim_whcn_idx) {
   // Special case for zero dim tensors
   if (ndim == 0) {
     return {0};
   }
   std::vector<int64_t> dim_order(ndim);
-  int64_t last_dim =
-      ndim - utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  int64_t last_dim = ndim - 1 - packed_dim_whcn_idx;
 
   int64_t cur_dim = 0;
   for (int d = 0; d < ndim; ++d) {
@@ -149,7 +131,7 @@ std::vector<int64_t> unsqueeze_strides(
 
 std::vector<int64_t> calculate_padded_sizes(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim_whcn_idx) {
   int64_t ndim = sizes.size();
   if (ndim == 0) {
     ndim = 1;
@@ -163,8 +145,7 @@ std::vector<int64_t> calculate_padded_sizes(
   }
 
   // Pad the packed dim to the next multiple of 4.
-  const int64_t dim_offset =
-      utils::to_packed_dim_nchw_offset<int64_t>(memory_layout);
+  const int64_t dim_offset = packed_dim_whcn_idx + 1;
   const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes);
   padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size);
 
@@ -174,7 +155,7 @@
 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
     const std::vector<int64_t>& axis_map,
-    const utils::GPUMemoryLayout memory_layout) {
+    const int32_t packed_dim_whcn_idx) {
   VK_CHECK_COND(padded_sizes.size() == 4);
   VK_CHECK_COND(axis_map.size() == 4);
 
@@ -195,21 +176,8 @@ utils::uvec3 calculate_image_extents(
   // Multiply the extents of the batch axis by the batch size.
   extents[batch_axis] *= padded_sizes.at(0);
 
-  switch (memory_layout) {
-    case utils::kWidthPacked:
-      VK_CHECK_COND(extents[axis_map.at(0)] % 4 == 0);
-      extents[axis_map.at(0)] /= 4;
-      break;
-    case utils::kHeightPacked:
-      VK_CHECK_COND(extents[axis_map.at(1)] % 4 == 0);
-      extents[axis_map.at(1)] /= 4;
-      break;
-    case utils::kChannelsPacked:
-      VK_CHECK_COND(extents[axis_map.at(2)] % 4 == 0);
-      extents[axis_map.at(2)] /= 4;
-      break;
-  }
-
+  VK_CHECK_COND(extents[axis_map.at(packed_dim_whcn_idx)] % 4 == 0);
+  extents[axis_map.at(packed_dim_whcn_idx)] /= 4;
   return extents;
 }
 
@@ -285,15 +253,15 @@ vkapi::VulkanBuffer allocate_buffer(
 vTensorStorage::vTensorStorage(
     Context* const context,
     const utils::StorageType storage_type,
-    const utils::GPUMemoryLayout gpu_memory_layout,
     const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim_whcn_idx,
     const std::vector<int64_t>& padded_sizes,
     const vkapi::ScalarType dtype,
     const bool allocate_memory)
     : context_(context),
       storage_type_{storage_type},
       image_extents_(
-          calculate_image_extents(padded_sizes, axis_map, gpu_memory_layout)),
+          calculate_image_extents(padded_sizes, axis_map, packed_dim_whcn_idx)),
       buffer_length_{utils::multiply_integers(padded_sizes)},
       buffer_offset_{0},
       image_(allocate_image(
@@ -408,14 +376,15 @@ vTensor::vTensor(
     const utils::GPUMemoryLayout memory_layout,
     const bool allocate_memory)
     : dtype_(dtype),
-      memory_layout_(memory_layout),
       // Calculate tensor metadata
       sizes_(sizes.begin(), sizes.end()),
-      dim_order_(calculate_dim_order(sizes_.size(), memory_layout_)),
+      packed_dim_whcn_idx_(
+          utils::to_packed_dim_whcn_idx<int32_t>(memory_layout)),
+      dim_order_(calculate_dim_order(sizes_.size(), packed_dim_whcn_idx_)),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_whcn_idx_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_{{0, 0, 0}},
@@ -429,8 +398,8 @@ vTensor::vTensor(
       storage_(
           context,
           storage_type,
-          memory_layout_,
          axis_map_,
+          packed_dim_whcn_idx_,
          padded_sizes_,
          dtype_,
          allocate_memory) {
@@ -451,9 +420,9 @@ vTensor::vTensor(
 
 vTensor::vTensor(const vTensor& other)
     : dtype_(other.dtype_),
-      memory_layout_(other.memory_layout_),
       // Copy tensor size metadata
       sizes_(other.sizes_.begin(), other.sizes_.end()),
+      packed_dim_whcn_idx_{other.packed_dim_whcn_idx_},
       dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
       axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
       strides_(other.strides_.begin(), other.strides_.end()),
@@ -479,14 +448,14 @@ vTensor::vTensor(
     const std::vector<int64_t>& dim_order,
     const int64_t offset_numel)
     : dtype_(other.dtype_),
-      memory_layout_(estimate_memory_layout(dim_order)),
       // Copy tensor size metadata
       sizes_(sizes.begin(), sizes.end()),
+      packed_dim_whcn_idx_(other.packed_dim_whcn_idx_),
       dim_order_(dim_order.begin(), dim_order.end()),
       axis_map_(default_axis_map()),
       strides_(calculate_strides(sizes_, dim_order_)),
       numel_(utils::multiply_integers(sizes_)),
-      padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
+      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_whcn_idx_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
       logical_limits_(other.logical_limits_),
@@ -542,6 +511,19 @@ void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
   logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
 }
 
+utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
+  switch (packed_dim_whcn_idx_) {
+    case WHCN::kWidthDim:
+      return utils::kWidthPacked;
+    case WHCN::kHeightDim:
+      return utils::kHeightPacked;
+    case WHCN::kChannelsDim:
+      return utils::kChannelsPacked;
+    default:
+      VK_THROW("Invalid packed dim");
+  }
+}
+
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
   if (!sizes_uniform_.buffer()) {
     sizes_uniform_ =
@@ -618,21 +600,16 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
 
 void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
-  // Only update the memory layout for buffer-backed tensors. Strides are
-  // meaningless for texture-backed tensors and do not impact the memory layout.
-  if (storage_type() == utils::kBuffer) {
-    memory_layout_ = estimate_memory_layout(dim_order_);
-  }
   numel_ = utils::multiply_integers(sizes_);
 
-  padded_sizes_ = calculate_padded_sizes(sizes_, memory_layout_);
+  padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_whcn_idx_);
   unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
   padded_numel_ = utils::multiply_integers(padded_sizes_);
 
   // Calculate the image extents that would have been used to allocate a texture
   // withthe current sizes, and use that to set the logical limits.
   set_logical_limits(
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_));
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_whcn_idx_));
 
   if (sizes_uniform_.buffer()) {
     sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
@@ -656,7 +633,7 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
   // For texture storage check that the current texture is large enough for
   // the new sizes of the tensor.
   utils::uvec3 virtual_extents =
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
+      calculate_image_extents(padded_sizes_, axis_map_, packed_dim_whcn_idx);
 
   bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0];
   valid_resize =
@@ -725,23 +702,23 @@ void transpose_dim_order_inplace(
 
 void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
   std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1);
+
+  const int dim0_whcn = sizes_.size() - 1 - dim0;
+  const int dim1_whcn = sizes_.size() - 1 - dim1;
+  if (packed_dim_whcn_idx_ == dim0_whcn) {
+    packed_dim_whcn_idx_ = dim1_whcn;
+  }
+  if (packed_dim_whcn_idx_ == dim1_whcn) {
+    packed_dim_whcn_idx_ = dim0_whcn;
+  }
+
   if (storage_type() == utils::kBuffer) {
     transpose_dim_order_inplace(dim_order_, dim0, dim1);
   } else {
-    const int dim0_whcn = sizes_.size() - 1 - dim0;
-    const int dim1_whcn = sizes_.size() - 1 - dim1;
     // Cannot transpose batch dimension for texture storage
     VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);
-
     std::iter_swap(
         axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);
-
-    if (packed_dim_whcn_idx() == dim0_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim1_whcn);
-    }
-    if (packed_dim_whcn_idx() == dim1_whcn) {
-      memory_layout_ = utils::GPUMemoryLayout(dim0_whcn);
-    }
   }
   update_metadata();
 }
```
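As a quick sanity check on the new parameterization, the following standalone sketch (a simplified reimplementation, not the backend code: it skips the zero-dim special case and the padding of the rank up to 4) shows what the `packed_dim_whcn_idx` argument does to the dim order and padded sizes for a channels-packed NCHW tensor.

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// NCHW dim order whose last entry is the packed (fastest-moving) dimension,
// mirroring the last_dim = ndim - 1 - packed_dim_whcn_idx relationship above.
std::vector<int64_t> dim_order_for(size_t ndim, int32_t packed_dim_whcn_idx) {
  std::vector<int64_t> order(ndim);
  std::iota(order.begin(), order.end(), 0);   // {0, 1, ..., ndim - 1}
  const int64_t packed_nchw = ndim - 1 - packed_dim_whcn_idx;
  order.erase(order.begin() + packed_nchw);   // pull the packed dim out...
  order.push_back(packed_nchw);               // ...and make it innermost
  return order;
}

// Pad the packed dim to the next multiple of 4 (one texel), counted from the
// end of the sizes: dim_offset = packed_dim_whcn_idx + 1.
std::vector<int64_t> padded_sizes_for(
    std::vector<int64_t> sizes, int32_t packed_dim_whcn_idx) {
  const size_t idx = sizes.size() - 1 - packed_dim_whcn_idx;
  sizes[idx] = (sizes[idx] + 3) / 4 * 4;
  return sizes;
}

int main() {
  // NCHW sizes {2, 3, 5, 7}, channels-packed (WHCN index 2).
  const std::vector<int64_t> sizes = {2, 3, 5, 7};
  for (int64_t d : dim_order_for(sizes.size(), 2)) std::cout << d << ' ';
  std::cout << '\n';  // 0 2 3 1 -> channels dim (NCHW index 1) is innermost
  for (int64_t s : padded_sizes_for(sizes, 2)) std::cout << s << ' ';
  std::cout << '\n';  // 2 4 5 7 -> channels padded from 3 up to 4
}
```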

backends/vulkan/runtime/api/containers/Tensor.h

Lines changed: 23 additions & 13 deletions
```diff
@@ -26,7 +26,7 @@ namespace api {
  */
 std::vector<int64_t> calculate_dim_order(
     const size_t ndim,
-    const utils::GPUMemoryLayout memory_layout);
+    const int32_t packed_dim_whcn_idx);
 
 /*
  * Given the sizes of a tensor and the dim order of the tensor (both in NCHW)
@@ -57,15 +57,15 @@ std::vector<int64_t> unsqueeze_strides(
  */
 std::vector<int64_t> calculate_padded_sizes(
     const std::vector<int64_t>& sizes,
-    const utils::GPUMemoryLayout memory_layout);
+    const int32_t packed_dim_whcn_idx);
 
 /*
  * Calculate the image extents required of a texture backed tensor.
  */
 utils::uvec3 calculate_image_extents(
     const std::vector<int64_t>& padded_sizes,
     const std::vector<int64_t>& axis_map,
-    const utils::GPUMemoryLayout memory_layout);
+    const int32_t packed_dim_whcn_idx);
 
 struct LastAccess {
   vkapi::PipelineStageFlags stage;
@@ -89,8 +89,8 @@ class vTensorStorage final {
   vTensorStorage(
       Context* context,
       const utils::StorageType storage_type,
-      const utils::GPUMemoryLayout gpu_memory_layout,
       const std::vector<int64_t>& axis_map,
+      const int32_t packed_dim_whcn_idx,
       const std::vector<int64_t>& padded_sizes,
       const vkapi::ScalarType dtype,
       const bool allocate_memory = true);
@@ -221,13 +221,14 @@
 
   // Whether the tensor has elements of type float, int, etc.
   vkapi::ScalarType dtype_;
-  // Describes which dimension is "tightly packed". For texture backed tensors,
-  // this describes which dimension is packed along a texel. For buffer backed
-  // tensors, this describes which dimension has a stride of 1 (i.e. is last in
-  // the dim order).
-  utils::GPUMemoryLayout memory_layout_;
   // sizes of the tensor in NCHW dimension order
   std::vector<int64_t> sizes_;
+  // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for
+  // width, 1 for height, etc.). For texture backed tensors, this describes
+  // which dimension is packed along a texel. For buffer backed tensors, this
+  // describes which dimension has a stride of 1 (i.e. is last in the dim
+  // order).
+  int32_t packed_dim_whcn_idx_;
 
   /*
    * "Layout" metadata. These describe with further detail how tensor data is
@@ -371,12 +372,18 @@ class vTensor final {
     return dtype_;
   }
 
-  inline utils::GPUMemoryLayout gpu_memory_layout() const {
-    return memory_layout_;
-  }
+  /*
+   * Provide a "best guess" of a memory layout that can be used to construct a
+   * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this
+   * tensor. In some scenarios, the exact layout of the tensor may not be able
+   * to be replicated due to calling `virtual_*()` functions after construction;
+   * however, this function will provide a memory layout that will produce the
+   * same `packed_dim_whcn_idx` as this tensor.
+   */
+  utils::GPUMemoryLayout estimate_memory_layout() const;
 
   inline int32_t packed_dim_whcn_idx() const {
-    return static_cast<int32_t>(memory_layout_);
+    return packed_dim_whcn_idx_;
   }
 
   inline const std::vector<int64_t>& sizes() const {
@@ -496,6 +503,9 @@ class vTensor final {
    *
    * This function can only be used for buffer-backed tensors, since texture
    * backed buffers cannot change dimensionality or memory layout.
+   *
+   * TODO(ssjia): delete this API. prefer functions such as virtual_transpose
+   * instead.
    */
   void virtual_reconfigure(
       const std::vector<int64_t>& new_sizes,
```
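The doc-comment added above calls `estimate_memory_layout()` a "best guess" because operations like `virtual_transpose()` move the packed dim without necessarily leaving the tensor in a state that any single layout enum fully describes. The packed-dim bookkeeping itself is simple; below is a hedged standalone sketch of the rule (a free function for illustration only, not the `vTensor` member).

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Sketch: when NCHW dims dim0 and dim1 are swapped, the packed dim index
// (expressed in WHCN order) follows whichever of the two it currently names.
int32_t packed_dim_after_transpose(
    int32_t packed_dim_whcn_idx, size_t ndim, int64_t dim0, int64_t dim1) {
  const int32_t dim0_whcn = static_cast<int32_t>(ndim - 1 - dim0);
  const int32_t dim1_whcn = static_cast<int32_t>(ndim - 1 - dim1);
  if (packed_dim_whcn_idx == dim0_whcn) {
    return dim1_whcn;
  }
  if (packed_dim_whcn_idx == dim1_whcn) {
    return dim0_whcn;
  }
  return packed_dim_whcn_idx;  // packed dim untouched by the transpose
}

int main() {
  // 4-D width-packed tensor (WHCN index 0); transposing W (NCHW 3) with
  // H (NCHW 2) moves the packed dim to the height slot (WHCN index 1).
  assert(packed_dim_after_transpose(0, 4, 3, 2) == 1);
  // Transposing N and C leaves a width-packed tensor width-packed.
  assert(packed_dim_after_transpose(0, 4, 0, 1) == 0);
  return 0;
}
```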

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 3 additions & 2 deletions
```diff
@@ -307,8 +307,9 @@ class ComputeGraph final {
         .is_view_of(values_.at(base).toConstTensor());
   }
 
-  inline utils::GPUMemoryLayout memory_layout_of(const ValueRef idx) const {
-    return values_.at(idx).toConstTensor().gpu_memory_layout();
+  inline utils::GPUMemoryLayout estimate_memory_layout_of(
+      const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().estimate_memory_layout();
   }
 
   inline int32_t packed_dim_whcn_idx_of(const ValueRef idx) const {
```

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -21,7 +21,7 @@ void check_binary_op_args(
     const api::vTensor& self,
     const api::vTensor& other,
     const api::vTensor& out) {
-  VK_CHECK_COND(check_same_memory_layout(self, other, out));
+  VK_CHECK_COND(check_same_packed_dim(self, other, out));
   std::vector<int64_t> broadcasted_sizes =
       calculate_broadcasted_output_size(self, other);
   VK_CHECK_COND(out.sizes() == broadcasted_sizes);
@@ -53,7 +53,7 @@ void add_binary_op_node(
     const std::string& op_name) {
   ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
   ValueRef arg2 =
-      prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1));
+      prepack_if_tensor_ref(graph, in2, graph.estimate_memory_layout_of(arg1));
 
   vTensorPtr t_in1 = graph.get_tensor(arg1);
   vTensorPtr t_in2 = graph.get_tensor(arg2);
```
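The renamed check above only has to compare packed dim indices across the operands. Its implementation is not part of this diff; the snippet below is a hypothetical, self-contained sketch of what such a check amounts to, using a stand-in tensor type rather than `api::vTensor`.

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for api::vTensor, exposing only the accessor relevant here.
struct FakeTensor {
  int32_t packed_dim;
  int32_t packed_dim_whcn_idx() const { return packed_dim; }
};

// Hedged sketch of the kind of check check_same_packed_dim() performs: binary
// ops only require the operands to agree on the packed dim index, not on a
// full GPUMemoryLayout value.
template <typename T>
bool same_packed_dim(const T& self, const T& other, const T& out) {
  return self.packed_dim_whcn_idx() == other.packed_dim_whcn_idx() &&
      other.packed_dim_whcn_idx() == out.packed_dim_whcn_idx();
}

int main() {
  FakeTensor a{0}, b{0}, c{0}, d{2};
  assert(same_packed_dim(a, b, c));   // all width-packed: accepted
  assert(!same_packed_dim(a, b, d));  // mixed packed dims: rejected
  return 0;
}
```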

backends/vulkan/runtime/graph/ops/impl/Cat.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -25,7 +25,7 @@ void add_cat_default_node(
 
   for (ValueRef input_ref : *input_list) {
     vTensorPtr t_in = graph.get_tensor(input_ref);
-    VK_CHECK_COND(check_memory_layout_is(*t_in, utils::kChannelsPacked));
+    VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
   }
 
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
```

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -222,8 +222,8 @@ ValueRef prepack_weights(
 }
 
 void check_conv_args(const api::vTensor& in, const api::vTensor& out) {
-  VK_CHECK_COND(check_memory_layout_is(in, utils::kChannelsPacked));
-  VK_CHECK_COND(check_memory_layout_is(out, utils::kChannelsPacked));
+  VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim));
+  VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim));
 }
 
 struct Conv2dParams final {
```
