Skip to content

[ET-VK][11/n] copy_channel_offsets node #3351

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

// Destination image. It is write-only here; main() stores the final texel with
// imageStore.
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
// Sampled to obtain the destination texel's current contents, so that lanes
// outside the copied channel range keep their existing values.
// NOTE(review): presumably bound to the same tensor as image_out by the host
// code - confirm against the ExecuteNode argument list.
layout(set = 0, binding = 1) uniform PRECISION sampler3D existing_out;
// Source image that the copied channel values are fetched from.
layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;

// Parameters for the channel-offset copy.
// NOTE(review): the `unused*` ints look like explicit padding so that each
// ivec3 is followed by one int and the layout lines up with the host-side
// parameter struct - confirm against the C++ Block definition.
layout(set = 0, binding = 3) uniform PRECISION restrict CopyArgs {
// Tensor sizes of the destination; passed to to_tensor_idx in main().
// NOTE(review): presumably in (W, H, C, N) order - confirm with the host code.
ivec4 out_sizes;
// Tensor sizes of the source; passed to to_texture_elem_pos in main().
ivec4 in_sizes;
// Analogous to the range variable in copy. It defines the number of channels
// being copied.
int channel_range;
// First channel read from the source tensor.
int src_channel_offset;
// First channel written in the destination tensor.
int dst_channel_offset;
int unused;
// Operates on (x, y, z) extents. Invocations at or beyond this extent exit
// without writing.
ivec3 range;
int unused1;
// Texture-space offset added to the invocation id to get the output position.
ivec3 dst_offset;
int unused2;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

void main() {
  // Unlike most shaders, the dispatch range usually covers only a sub-region
  // of the destination texture, so bounds are checked against `range` rather
  // than against the texture extents.
  const ivec3 local_pos = ivec3(gl_GlobalInvocationID);
  if (any(greaterThanEqual(local_pos, range))) {
    return;
  }

  const ivec3 out_pos = local_pos + dst_offset;
  const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim);

  // Start from the texel currently stored at the destination so that lanes
  // which are not part of the copy keep their existing values.
  VEC4_T texel = VEC4_T(texelFetch(existing_out, out_pos, 0));

  for (int lane = 0; lane < 4; lane++) {
    // Channel index of this lane relative to the beginning of the copied
    // destination range.
    const int rel_channel = out_whcn.z - dst_channel_offset + lane;

    // Only lanes that fall inside [0, channel_range) are overwritten. This
    // handles partial updates at the beginning and end of the range without
    // clobbering neighbouring channels that share the same texel.
    if ((rel_channel >= 0) && (rel_channel < channel_range)) {
      // Shift into the source tensor's channel coordinates.
      ivec4 src_whcn = out_whcn;
      src_whcn.z = rel_channel + src_channel_offset;

      const ivec4 src_elem_pos =
          to_texture_elem_pos(src_whcn, in_sizes, packed_dim);
      const VEC4_T src_texel =
          VEC4_T(texelFetch(image_in, src_elem_pos.xyz, 0));
      texel[lane] = src_texel[src_elem_pos.w];
    }
  }

  imageStore(image_out, out_pos, texel);
}
10 changes: 10 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
copy_channel_offset:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
shader_variants:
- NAME: copy_channel_offset
16 changes: 1 addition & 15 deletions backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,12 @@

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
ivec3 out_limits;
};

layout(set = 0, binding = 3) uniform PRECISION restrict InLimits {
ivec3 in_limits;
};



layout(set = 0, binding = 4) uniform PRECISION restrict CopyArgs {
layout(set = 0, binding = 2) uniform PRECISION restrict CopyArgs {
ivec3 range;
int unused0;
ivec3 src_offset;
Expand Down
184 changes: 169 additions & 15 deletions backends/vulkan/runtime/graph/ops/impl/Copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,38 +8,39 @@

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

using api::utils::ivec3;
using api::utils::uvec3;

void add_copy_offset_node(
ComputeGraph& graph,
const ValueRef in,
const api::utils::ivec3& range,
const api::utils::ivec3& src_offset,
const api::utils::ivec3& dst_offset,
const ivec3& range,
const ivec3& src_offset,
const ivec3& dst_offset,
const ValueRef out) {
vTensorPtr t_in = graph.get_tensor(in);
vTensorPtr t_out = graph.get_tensor(out);

VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));

std::string kernel_name = "copy_offset";
kernel_name.reserve(kShaderNameReserve);
add_dtype_suffix(kernel_name, *t_out);

api::utils::uvec3 global_size = api::utils::make_uvec3(range);
api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
uvec3 global_size = api::utils::make_uvec3(range);
uvec3 local_size = adaptive_work_group_size(global_size);

const struct Block final {
api::utils::ivec3 range;
ivec3 range;
int32_t unused0;
api::utils::ivec3 src_offset;
ivec3 src_offset;
int32_t unused1;
api::utils::ivec3 dst_offset;
ivec3 dst_offset;
int32_t unused2;
} offset_params{
range,
Expand All @@ -58,13 +59,166 @@ void add_copy_offset_node(
global_size,
local_size,
// Inputs and Outputs
{{out, api::MemoryAccessType::WRITE}, {in, api::MemoryAccessType::READ}},
{
{out, api::MemoryAccessType::WRITE},
{in, api::MemoryAccessType::READ},
},
// Parameter buffers
{t_out->texture_limits_ubo(),
t_in->texture_limits_ubo(),
graph.create_params_buffer(offset_params)},
{graph.create_params_buffer(offset_params)},
// Specialization Constants
{}));
}

// Appends ExecuteNodes to `graph` that copy a contiguous range of channels
// from `in` into `out`:
//
//   out[:, dst_channel_offset : dst_channel_offset + channel_range, :, :] =
//   in [:, src_channel_offset : src_channel_offset + channel_range, :, :]
//
// One node running the `copy_channel_offset` shader is appended per batch of
// `in`. Both tensors must be channels-packed and have at least 3 dims; the
// offsets and range must be non-negative and must fit within the channel
// size of the respective tensor (enforced with VK_CHECK_COND). When
// `channel_range` is 0 no node is appended.
void add_copy_channel_offset_node(
    ComputeGraph& graph,
    const ValueRef in,
    int32_t channel_range,
    int32_t src_channel_offset,
    int32_t dst_channel_offset,
    const ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  // Likely need to prepad these numbers.
  std::vector<int64_t> in_sizes = t_in->sizes();
  std::vector<int64_t> out_sizes = t_out->sizes();

  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));

  // NOTE: This function should be able to support 1d and 2d tensors when
  // range=1, src_offset=dst_offset=1.
  VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
  VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");

  // Validate signs before the bounds checks, so that a negative argument is
  // reported as such instead of being masked by (or slipping through) the
  // sum comparisons below.
  VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
  VK_CHECK_COND(
      src_channel_offset >= 0, "Src channel offset must be non-negative");
  VK_CHECK_COND(
      dst_channel_offset >= 0, "Dst channel offset must be non-negative");

  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(in_sizes) >= src_channel_offset + channel_range,
      "Source channel plus range should be less than or equal to input tensor's channel size");
  VK_CHECK_COND(
      dim_at<Dim4D::Channel>(out_sizes) >= dst_channel_offset + channel_range,
      "Destination channel plus range should be less than or equal to output tensor's channel size");

  // Nothing to copy. Returning here also avoids enqueueing a dispatch whose
  // z-extent evaluates to zero when dst_channel_offset is a non-zero multiple
  // of 4.
  if (channel_range == 0) {
    return;
  }

  std::string kernel_name = "copy_channel_offset";
  kernel_name.reserve(kShaderNameReserve);
  add_dtype_suffix(kernel_name, *t_out);

  int32_t out_channels = dim_at<Dim4D::Channel>(out_sizes);

  // Everything below is identical for every batch, so compute it once.

  // Mapping the tensor NCHW coordinates into texture XYZ coordinates: four
  // channels share one texel along z.
  int32_t dst_first_z = dst_channel_offset / 4;
  int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;

  // Number of z-slices occupied by one batch of the output tensor.
  const auto out_z_per_batch = api::utils::div_up(out_channels, 4);

  // We copy the entire width and height dimension. For the channel dimension,
  // we use the z-dimension of the global_size to specify the texture range.
  // The shader combines the global invocation id and the dst_offset to get
  // the actual coordinate.
  uvec3 global_size{
      dim_at<Dim4D::Width>(in_sizes),
      dim_at<Dim4D::Height>(in_sizes),
      api::utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};

  uvec3 local_size = adaptive_work_group_size(global_size);

  const api::utils::ivec4 out_whcn = api::utils::make_whcn_ivec4(out_sizes);
  const api::utils::ivec4 in_whcn = api::utils::make_whcn_ivec4(in_sizes);
  const ivec3 range = api::utils::make_ivec3(global_size);

  // Copy one batch at a time.
  for (int batch_idx = 0; batch_idx < dim_at<Dim4D::Batch>(in_sizes);
       batch_idx++) {
    // Only the destination z-offset depends on the batch.
    ivec3 dst_offset{0, 0, dst_first_z + batch_idx * out_z_per_batch};

    // Field order and the explicit padding members follow the CopyArgs
    // uniform block declared in copy_channel_offset.glsl.
    const struct Block final {
      api::utils::ivec4 out_sizes;
      api::utils::ivec4 in_sizes;
      int32_t channel_range;
      int32_t src_channel_offset;
      int32_t dst_channel_offset;
      int32_t unused;
      ivec3 range;
      int32_t unused1;
      ivec3 dst_offset;
      int32_t unused2;

    } channel_offset_params{
        out_whcn,
        in_whcn,
        channel_range,
        src_channel_offset,
        dst_channel_offset,
        0,
        range,
        0,
        dst_offset,
        0,
    };

    graph.execute_nodes().emplace_back(new ExecuteNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        // Inputs and Outputs. `out` is bound twice: once for writing and once
        // for reading the existing texel values.
        {
            {out, api::MemoryAccessType::WRITE},
            {out, api::MemoryAccessType::READ},
            {in, api::MemoryAccessType::READ},
        },
        // Parameter buffers
        {graph.create_params_buffer(channel_offset_params)},
        // Specialization Constants
        {}));
  }
}

// Overload taking graph value references: reads the range, source offset and
// destination offset as int lists from the graph, converts each to an ivec3,
// and forwards to the ivec3-based add_copy_offset_node.
void add_copy_offset_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef range_ref,
    ValueRef src_offset_ref,
    ValueRef dst_offset_ref,
    ValueRef out) {
  // Helper that resolves an int-list value into an ivec3.
  const auto read_ivec3 = [&graph](const ValueRef list_ref) -> ivec3 {
    return api::utils::make_ivec3(*graph.get_int_list(list_ref));
  };

  const ivec3 copy_range = read_ivec3(range_ref);
  const ivec3 copy_src_offset = read_ivec3(src_offset_ref);
  const ivec3 copy_dst_offset = read_ivec3(dst_offset_ref);

  add_copy_offset_node(
      graph, in, copy_range, copy_src_offset, copy_dst_offset, out);
}

// Operator entry point for etvk.copy_offset.
// Argument layout: [in, range, src_offset, dst_offset, out].
void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef in = args[0];
  const ValueRef range_ref = args[1];
  const ValueRef src_offset_ref = args[2];
  const ValueRef dst_offset_ref = args[3];
  const ValueRef out = args[4];

  add_copy_offset_node(
      graph, in, range_ref, src_offset_ref, dst_offset_ref, out);
}

// Operator entry point for etvk.copy_channel_offset.
// Argument layout: [in, channel_range, src_channel_offset,
// dst_channel_offset, out]. The three scalars are extracted as int64_t and
// converted to the int32_t values that add_copy_channel_offset_node accepts.
void copy_channel_offset(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  ValueRef in = args[0];
  ValueRef channel_range_ref = args[1];
  ValueRef src_channel_offset_ref = args[2];
  ValueRef dst_channel_offset_ref = args[3];
  ValueRef out = args[4];

  // Convert explicitly with safe_downcast rather than relying on the implicit
  // int64_t -> int32_t conversion at the call site, which would silently
  // truncate out-of-range values.
  const int32_t channel_range = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(channel_range_ref));
  const int32_t src_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(src_channel_offset_ref));
  const int32_t dst_channel_offset = api::utils::safe_downcast<int32_t>(
      graph.extract_scalar<int64_t>(dst_channel_offset_ref));

  add_copy_channel_offset_node(
      graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
}

// Registers the copy operators under their etvk.* names, mapping each name to
// the entry-point function defined above.
REGISTER_OPERATORS {
VK_REGISTER_OP(etvk.copy_offset, copy_offset);
VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
}

} // namespace vkcompute
28 changes: 28 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@

namespace vkcompute {

// add_copy_offset_node resembles the vkCmdCopyImage command. It copies the
// texture extents specified by the range, src_offset, and dst_offset (all of
// which are in texture coordinates (x, y, z)) from the input image to the
// output image.
//
// It is possible for the input and output to point to the same image
// object. But when the source range and destination range overlap, the behavior
// is undefined.
void add_copy_offset_node(
ComputeGraph& graph,
const ValueRef in,
Expand All @@ -22,4 +29,25 @@ void add_copy_offset_node(
const api::utils::ivec3& dst_offset,
const ValueRef out);

// add_copy_channel_offset_node behaves similarly to add_copy_offset_node,
// except that it works on the channel dimension of the tensor (up to 4
// dimensions in NCHW). The range and offset arguments are in tensor
// coordinates. It assumes the underlying texture is channel-packed.
//
// This function is a specialized implementation for copying channel-packed
// values. The complication arises when reading / writing the channel dimension
// at indices that are not aligned to the packing: in that case we need to be
// careful about the boundaries.
//
// It achieves the following:
// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] =
// in [:, src_channel_offset:src_channel_offset + channel_range, :, :]
void add_copy_channel_offset_node(
ComputeGraph& graph,
const ValueRef in,
int32_t channel_range,
int32_t src_channel_offset,
int32_t dst_channel_offset,
const ValueRef out);

} // namespace vkcompute
Loading