Skip to content

[ET-VK] Add binary op support for height and width packing GPU layouts #2516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,27 +84,68 @@ void ComputeGraph::update_descriptor_counts(
}
}

/*
 * Determine the storage type to use when constructing new `vTensor`s. An
 * explicit override in the graph config takes precedence; otherwise the
 * default of 3D texture storage is used.
 */
api::StorageType ComputeGraph::suggested_storage_type() {
  return config_.enableStorageTypeOverride ? config_.storageTypeOverride
                                           : api::StorageType::TEXTURE_3D;
}

/*
 * Pick a GPU memory layout for a tensor with the given sizes. An explicit
 * override in the graph config takes precedence. Otherwise, tensors with
 * fewer than 3 dims — or whose channels dim (index -3, NCHW order) is 1 —
 * are width packed; all other tensors are channels packed.
 */
api::GPUMemoryLayout ComputeGraph::suggested_memory_layout(
    const std::vector<int64_t>& sizes) {
  if (config_.enableMemoryLayoutOverride) {
    return config_.memoryLayoutOverride;
  }
  const bool prefer_width_packed =
      sizes.size() < 3 || api::utils::val_at(-3, sizes) == 1;
  return prefer_width_packed ? api::GPUMemoryLayout::TENSOR_WIDTH_PACKED
                             : api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
}

/*
 * Add a `vTensor` value to the graph with the given sizes, dtype, storage
 * type, and memory layout. If `shared_object_idx` is non-negative, memory is
 * not allocated up front and the tensor is registered as a user of the shared
 * object at that index instead. Returns the index of the new value.
 *
 * NOTE(review): this span contained unresolved diff residue (both the removed
 * and added lines of the diff were present); resolved to the post-change code.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::StorageType storage_type,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx) {
  // A negative shared object index means the tensor owns its own memory.
  bool allocate_memory = shared_object_idx < 0;

  ValueRef idx(static_cast<int>(values_.size()));
  values_.emplace_back(vTensor(
      context(), sizes, dtype, storage_type, memory_layout, allocate_memory));

  if (!allocate_memory) {
    get_shared_object(shared_object_idx).add_user(this, idx);
  }
  return idx;
}

/*
 * Convenience overload: add a `vTensor` with the given memory layout, using
 * the graph's suggested storage type.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx) {
  const api::StorageType storage = suggested_storage_type();
  return add_tensor(sizes, dtype, storage, memory_layout, shared_object_idx);
}

/*
 * Convenience overload: add a `vTensor` using both the graph's suggested
 * storage type and its suggested memory layout for the given sizes.
 */
ValueRef ComputeGraph::add_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const int64_t shared_object_idx) {
  const api::StorageType storage = suggested_storage_type();
  const api::GPUMemoryLayout layout = suggested_memory_layout(sizes);
  return add_tensor(sizes, dtype, storage, layout, shared_object_idx);
}

ValueRef ComputeGraph::add_tensorref(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
Expand Down
91 changes: 89 additions & 2 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,13 @@ class ComputeGraph final {
bool execute);

/*
 * Returns the value at a particular index in the graph. If storing this
 * function's return value in a lvalue reference, it is imperative that no
 * values are added to the graph while the reference is in scope, otherwise
 * the underlying value may have been moved as part of a vector resize.
 *
 * NOTE(review): this span contained unresolved diff residue (both the old
 * `values_[idx]` and new `values_.at(idx)` lines); resolved to the bounds-
 * checked post-change code.
 */
inline Value& get_val(ValueRef idx) {
  return values_.at(idx);
}

inline const std::vector<int64_t>& get_val_sizes(ValueRef idx) {
Expand Down Expand Up @@ -127,18 +130,88 @@ class ComputeGraph final {
return execute_nodes_;
}

//
// Utility functions
//

/*
* Returns a suggested storage type (i.e. buffer or texture) that can be used
* to construct `vTensor`s. The storage type is typically determined by the
* GPU reported by the Vulkan context, unless a storage type override is
* defined in the graph configuration. Some GPU architectures work better with
buffer storage, and others with texture storage. Currently only texture
* storage is supported.
*/
api::StorageType suggested_storage_type();

/*
* Returns a suggested memory layout (i.e. channels, width, or height packed)
* that can be used to construct `vTensor`s. The memory layout impacts which
* dimension will be treated as the vectorized dimension. For texture storage,
* elements along the vectorized dimension are packed into texels. The
* suggested memory layout is determined based on the sizes of the tensor,
* unless a memory layout override is defined in the graph configuration.
*/
api::GPUMemoryLayout suggested_memory_layout(
const std::vector<int64_t>& sizes);

/*
 * Fetch the GPU memory layout of the Tensor value stored at `idx`.
 */
inline api::GPUMemoryLayout memory_layout_of(ValueRef idx) {
  const api::GPUMemoryLayout layout =
      get_val(idx).toTensor().gpu_memory_layout();
  return layout;
}

//
// Graph Building
//

/*
* Add a `vTensor` value to the graph with the specified properties. There are
* various convenience overloads of this function that may be used instead.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const api::StorageType storage_type,
const api::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx);

/*
* Add a `vTensor` value to the graph with the specified properties. The
* suggested storage type will be used to construct the `vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const api::GPUMemoryLayout memory_layout,
const int64_t shared_object_idx = -1);

/*
* Add a `vTensor` value to the graph with the specified properties. The
* suggested storage type and memory layout will be used to construct the
* `vTensor`.
*/
ValueRef add_tensor(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype = api::ScalarType::Float,
const int64_t shared_object_idx = -1);

/*
Add a `TensorRef` value to the graph with the specified properties. A
* `TensorRef` is a reference to a `vTensor` whose data is stored in an
* external CPU buffer.
*/
ValueRef add_tensorref(
const std::vector<int64_t>& sizes,
const api::ScalarType dtype,
const void* const data);

/*
* Add a staging buffer to the graph. Staging buffers are data buffers that
* use memory that is visible to both the CPU and GPU, and therefore is used
as an intermediary when transferring data between the CPU and GPU.
*/
ValueRef add_staging(const api::ScalarType dtype, const size_t numel);

ValueRef add_none();
Expand Down Expand Up @@ -176,6 +249,20 @@ class ComputeGraph final {
return {t, staging};
}

/*
 * Convenience function: create an input tensor with an explicit memory
 * layout and its associated staging buffer in a single call.
 */
inline IOValueRef add_input_tensor(
    const std::vector<int64_t>& sizes,
    const api::ScalarType dtype,
    const api::GPUMemoryLayout memory_layout,
    const int64_t shared_object_idx = -1) {
  const ValueRef tensor_ref =
      add_tensor(sizes, dtype, memory_layout, shared_object_idx);
  const ValueRef staging_ref = set_input_tensor(tensor_ref);
  return {tensor_ref, staging_ref};
}

SharedObject& get_shared_object(const int64_t idx);

//
Expand Down
20 changes: 20 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,26 @@ GraphConfig::GraphConfig() {
// Empirically selected safety factor. If descriptor pools start running out
// of memory, increase this safety factor.
descriptorPoolSafetyFactor = 1.25;

// For now, force TEXTURE_3D storage as we are still developing shader
// support for buffer storage type.
enableStorageTypeOverride = true;
storageTypeOverride = api::StorageType::TEXTURE_3D;

// For now, force TENSOR_CHANNELS_PACKED memory layout by default as we are
// still developing support for other memory layouts.
enableMemoryLayoutOverride = true;
memoryLayoutOverride = api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
}

/*
 * Force all suggested storage type queries to return `storage_type`.
 */
void GraphConfig::setStorageTypeOverride(api::StorageType storage_type) {
  storageTypeOverride = storage_type;
  enableStorageTypeOverride = true;
}

/*
 * Force all suggested memory layout queries to return `memory_layout`.
 */
void GraphConfig::setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout) {
  memoryLayoutOverride = memory_layout;
  enableMemoryLayoutOverride = true;
}

} // namespace vulkan
Expand Down
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,17 @@ struct GraphConfig final {
// risk.
float descriptorPoolSafetyFactor;

bool enableStorageTypeOverride;
api::StorageType storageTypeOverride;

bool enableMemoryLayoutOverride;
api::GPUMemoryLayout memoryLayoutOverride;

// Generate a default graph config with pre-configured settings
explicit GraphConfig();

void setStorageTypeOverride(api::StorageType storage_type);
void setMemoryLayoutOverride(api::GPUMemoryLayout memory_layout);
};

} // namespace vulkan
Expand Down
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,10 @@ void main() {
COORD_TO_POS_${PACKING}(other_coord, other_sizes.data),
0);

// Detect broadcasting
if (PACKED_DIM_${PACKING}(other_sizes.data) < PACKED_DIM_${PACKING}(in_sizes.data)) {
other_texel = other_texel.xxxx;
}

imageStore(image_out, pos, OP(in_texel, other_texel, alpha.data));
}
15 changes: 11 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,18 @@ binary_op:
DTYPE: float
PACKING: CHANNELS_PACKED
generate_variant_forall:
PACKING:
- VALUE: CHANNELS_PACKED
SUFFIX: C_packed
- VALUE: WIDTH_PACKED
SUFFIX: W_packed
- VALUE: HEIGHT_PACKED
SUFFIX: H_packed
DTYPE:
- VALUE: "half"
SUFFIX: "half"
- VALUE: "float"
SUFFIX: "float"
- VALUE: half
SUFFIX: half
- VALUE: float
SUFFIX: float
shader_variants:
- NAME: binary_add
- NAME: binary_sub
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
ivec4 out_coord_to_in_coord(const ivec4 out_coord, const ivec4 in_sizes) {
ivec4 in_coord = out_coord;
for (int i = 0; i < 4; ++i) {
if (in_sizes[i] == 1) {
if (out_coord[i] >= in_sizes[i]) {
in_coord[i] = 0;
}
}
Expand Down
9 changes: 9 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@
#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)

#define COORD_TO_POS_WIDTH_PACKED(coord, sizes) \
ivec3(coord.x / 4, coord.y, (coord.z + coord.w * sizes.z))

#define COORD_TO_POS_HEIGHT_PACKED(coord, sizes) \
ivec3(coord.x, coord.y / 4, (coord.z + coord.w * sizes.z))

#define COORD_TO_POS_CHANNELS_PACKED(coord, sizes) \
ivec3(coord.x, coord.y, (coord.z + coord.w * sizes.z) / 4)

#define COORD_TO_BUFFER_IDX(coord, sizes) \
coord.x + coord.y* sizes.x + coord.z* sizes.y* sizes.x + \
coord.w* sizes.z* sizes.y* sizes.x;
Expand Down
29 changes: 19 additions & 10 deletions backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,17 @@ namespace at {
namespace native {
namespace vulkan {

/*
 * Validate the arguments of a binary op: all three tensors must share a
 * memory layout, the inputs must be broadcast-compatible, and the output
 * must already be sized to the broadcasted shape.
 */
void check_binary_op_args(
    const vTensor& self,
    const vTensor& other,
    const vTensor& out) {
  VK_CHECK_COND(check_same_memory_layout(self, other, out));
  VK_CHECK_COND(check_broadcastable(self, other));

  const std::vector<int64_t> expected_out_sizes =
      calculate_broadcasted_output_size(self, other);
  VK_CHECK_COND(out.sizes() == expected_out_sizes);
}

void resize_binary_op_node(
ComputeGraph* graph,
const std::vector<ArgGroup>& args,
Expand All @@ -28,15 +39,8 @@ void resize_binary_op_node(
vTensor& self = graph->get_val(args[1].refs[0]).toTensor();
vTensor& other = graph->get_val(args[1].refs[1]).toTensor();

std::vector<int64_t> new_out_sizes(
std::max(self.sizes().size(), other.sizes().size()));

// Match the sizes in reverse because sizes are in NCHW order
for (int i = -1; i >= -new_out_sizes.size(); --i) {
new_out_sizes.at(new_out_sizes.size() + i) = std::max(
api::utils::val_at(i, self.sizes()),
api::utils::val_at(i, other.sizes()));
}
std::vector<int64_t> new_out_sizes =
calculate_broadcasted_output_size(self, other);

out.virtual_resize(new_out_sizes);
}
Expand All @@ -49,12 +53,16 @@ void add_binary_op_node(
const ValueRef out,
const std::string& op_name) {
ValueRef arg1 = prepack_if_tensor_ref(graph, in1);
ValueRef arg2 = prepack_if_tensor_ref(graph, in2);
ValueRef arg2 =
prepack_if_tensor_ref(graph, in2, graph.memory_layout_of(arg1));

vTensor& t_in1 = graph.get_val(arg1).toTensor();
vTensor& t_in2 = graph.get_val(arg2).toTensor();

vTensor& t_out = graph.get_val(out).toTensor();

check_binary_op_args(t_in1, t_in2, t_out);

api::utils::uvec3 global_size = t_out.virtual_extents();
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

Expand All @@ -67,6 +75,7 @@ void add_binary_op_node(

std::stringstream kernel_name;
kernel_name << "binary_" << op_name;
apply_memory_layout_suffix(kernel_name, t_out);
apply_dtype_suffix(kernel_name, t_out);

graph.execute_nodes().emplace_back(new ExecuteNode(
Expand Down
Loading