Skip to content

[ET-VK][7/n] Slice, with lots of codegen improvements #3171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName

#include <optional>

#include <executorch/backends/vulkan/runtime/api/api.h>

#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>
Expand Down Expand Up @@ -184,6 +186,15 @@ class ComputeGraph final {
VK_THROW("Cannot extract scalar from Value with type ", value.type());
}

template <typename T>
std::optional<T> extract_optional_scalar(const ValueRef idx) {
  // A None value maps to an empty optional; any other value is extracted
  // as a scalar of type T.
  if (!val_is_none(idx)) {
    return extract_scalar<T>(idx);
  }
  return std::nullopt;
}

// Accessor for the graph's list of prepacking nodes (nodes executed once at
// model load time to stage weights/constants on the GPU).
inline std::vector<std::unique_ptr<PrepackNode>>& prepack_nodes() {
return prepack_nodes_;
}
Expand Down
11 changes: 11 additions & 0 deletions backends/vulkan/runtime/graph/Logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <executorch/backends/vulkan/runtime/api/Utils.h>

#include <optional>
#include <ostream>
#include <vector>

Expand All @@ -33,4 +34,14 @@ inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec4& v) {
return api::utils::operator<<(os, v);
}

// Stream an optional as "[value]" when engaged, or "[]" when empty. Requires
// T itself to be streamable.
template <typename T>
inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& opt) {
  os << "[";
  if (opt.has_value()) {
    os << *opt;
  }
  return os << "]";
}

} // namespace vkcompute
6 changes: 3 additions & 3 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

#define divup4(x) ((x + 3) / 4)

// Input: idx is a ivec4 user-level coordinate, sizes is the tensor shape
// Output: buffer_idx in the continuous nchw-buffer.
// Input: idx is an ivec4 user-level (w, h, c, n) coordinate, sizes is the
// tensor shape. Output: buffer_idx in the continuous nchw-buffer.
#define to_buffer_i(idx, sizes) \
(idx.x + idx.y * sizes.x + idx.z * sizes.y * sizes.x + \
idx.w * sizes.z * sizes.y * sizes.x)

// Inverse of to_buffer_i
// Input: buffer_idx in the continuous nchw-buffer, sizes is the tensor shape
// Output: ivec4 user-level coorindate
// Output: ivec4 user-level (w, h, c, n) coordinate
#define from_buffer_i(buf_i, sizes) \
ivec4( \
buf_i % sizes.x, \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}

layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
uvec4 data;
}
out_sizes;

// Slice parameters pushed from the host (see Slice.cpp).
layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
// GPU axis being sliced: 0 = width (x), 1 = height (y), 2 = batch (z).
int dim;
// First element of the slice along `dim`.
int offset;
// Distance between consecutive slice elements along `dim`.
int step;
// Used when dim=batch. Stride is the # of planes for each batch value
// (due to channel packing); 1 for the width/height cases.
int stride;
}
slice_arg;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

const ivec4 idx = to_tensor_idx_C_packed(out_pos, out_sizes.data);

// Skip invocations that fall outside the output tensor's extents.
if (any(greaterThanEqual(idx, out_sizes.data))) {
return;
}

ivec3 in_pos = out_pos;

// Decompose the output position along the sliced axis: `index` is which
// slice element we are producing, `within_stride` is the position inside
// one batch's span of texture planes (stride is 1 unless dim is batch).
int index = out_pos[slice_arg.dim] / slice_arg.stride;
int within_stride = out_pos[slice_arg.dim] % slice_arg.stride;

// Map back into the input: begin at `offset`, advance `index` whole steps
// (each step spans `stride` planes), and keep the intra-batch plane offset.
in_pos[slice_arg.dim] = slice_arg.offset * slice_arg.stride + index * slice_arg.step *
slice_arg.stride + within_stride;

// Whole texels can be copied directly since channel packing is unaffected
// by batch/height/width slicing.
imageStore(image_out, out_pos, texelFetch(image_in, in_pos, 0));

}


Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Codegen config for the slice_batch_height_width shader: generates one
# variant per DTYPE (half, float).
slice_batch_height_width:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
generate_variant_forall:
DTYPE:
- VALUE: half
- VALUE: float
shader_variants:
- NAME: slice_batch_height_width
85 changes: 85 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/slice_channel.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}


#define to_tensor_idx to_tensor_idx_${PACKING}
#define to_texture_pos_elem to_texture_pos_elem_${PACKING}
#define get_packed_stride get_packed_stride_${PACKING}


layout(std430) buffer;

#include "indexing_utils.h"

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;

layout(set = 0, binding = 2) uniform PRECISION restrict OutSizes {
uvec4 data;
}
out_sizes;

layout(set = 0, binding = 3) uniform PRECISION restrict OutCpuSizes {
uvec4 out_cpu_sizes;
};

layout(set = 0, binding = 4) uniform PRECISION restrict InGpuSizes {
uvec4 in_gpu_sizes;
};

layout(set = 0, binding = 5) uniform PRECISION restrict SliceArg {
int offset;
int step;
}
slice_arg;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

const ivec4 idx = to_tensor_idx_C_packed(out_pos, out_sizes.data);

// Skip invocations that fall outside the output tensor's extents.
if (any(greaterThanEqual(idx, out_sizes.data))) {
return;
}

// We map the output pos using the buffer index. For each index in the texel,
// we calculate the source whcn-coordinate amended with offset-ed channel
// value. Then we calculate the actual texture position from the
// whcn-coordinate.

const uint base_index = to_buffer_i(idx, out_cpu_sizes);
// The 4 packed channel values of this output texel are `packed_stride`
// apart in the contiguous nchw buffer.
uvec4 buf_indices =
base_index + ivec4(0, 1, 2, 3) * get_packed_stride(out_cpu_sizes);

// NOTE(review): outex and v are declared vec4 rather than VEC4_T; this is
// consistent with the yaml only generating a float variant today, but
// presumably should become VEC4_T if more dtypes are added - confirm.
vec4 outex;
for (int i=0;i<4;i++) {
ivec4 user_coor = from_buffer_i(buf_indices[i], out_cpu_sizes);

int in_channel = user_coor.z;

// Re-map the channel coordinate through the slice: offset + c * step.
ivec4 in_user_coor = user_coor;
in_user_coor.z = slice_arg.offset + in_channel * slice_arg.step;

// xyz is the input texture position; w selects the element in the texel.
ivec4 in_pow_elem = to_texture_pos_elem_C_packed(
in_user_coor,
in_gpu_sizes);

vec4 v = texelFetch(image_in, in_pow_elem.xyz, 0);

outex[i] = v[in_pow_elem.w];
}
imageStore(image_out, out_pos, outex);
}
11 changes: 11 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/slice_channel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Codegen config for the slice_channel shader: float-only for now, and only
# the channels-packed (C_packed) memory layout.
slice_channel:
parameter_names_with_default_values:
DTYPE: float
NDIM: 3
generate_variant_forall:
DTYPE:
- VALUE: float
PACKING:
- VALUE: C_packed
shader_variants:
- NAME: slice_channel
159 changes: 159 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Slice.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/Logging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

// Adds an execute node implementing aten.slice_copy.Tensor for
// channels-packed tensors.
//
// Args:
//   graph: compute graph to append the node to.
//   in: input tensor reference (must be channels-packed).
//   dim_ref: scalar dim to slice along; may be negative, normalized below.
//   opt_start_ref / opt_end_ref: optional scalar slice bounds; None defaults
//     to 0 and in_sizes[dim] respectively.
//   step_ref: scalar slice step.
//   out: output tensor reference (must be channels-packed).
//
// Dispatches to one of two shaders:
//   * slice_channel - slicing along channel requires re-gathering individual
//     elements, since four consecutive channel values share one texel.
//   * slice_batch_height_width - batch/height/width slices copy whole texels.
void add_slice_tensor_out_node(
    ComputeGraph& graph,
    ValueRef in,
    ValueRef dim_ref,
    ValueRef opt_start_ref,
    ValueRef opt_end_ref,
    ValueRef step_ref,
    ValueRef out) {
  vTensorPtr t_in = graph.get_tensor(in);
  vTensorPtr t_out = graph.get_tensor(out);

  VK_CHECK_COND(check_memory_layout_is(*t_in, api::kChannelsPacked));
  VK_CHECK_COND(check_memory_layout_is(*t_out, api::kChannelsPacked));

  // Normalize a possibly-negative dim into [0, t_in->dim()).
  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);

  VK_CHECK_COND(
      -t_in->dim() <= dim && dim < t_in->dim(),
      "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ",
      dim,
      " and self.dim() = ",
      t_in->dim());

  dim = normalize(dim, t_in->dim());

  // Express dim as if the underlying tensor were always 4-dimensional (NCHW),
  // so nchw_dim is 0=batch, 1=channel, 2=height, 3=width.
  int64_t nchw_dim = dim + (4 - t_in->dim());

  std::optional<int64_t> opt_start =
      graph.extract_optional_scalar<int64_t>(opt_start_ref);
  std::optional<int64_t> opt_end =
      graph.extract_optional_scalar<int64_t>(opt_end_ref);
  int64_t step = graph.extract_scalar<int64_t>(step_ref);

  const auto in_sizes = t_in->sizes();
  const auto out_sizes = t_out->sizes();

  int64_t start = opt_start.value_or(0);
  int64_t end = opt_end.value_or(in_sizes[dim]);

  // NOTE(review): negative start/end are not normalized here, although
  // aten.slice semantics allow them - confirm the exporter only emits
  // non-negative bounds.
  VK_CHECK_COND((0 <= start) && (start < in_sizes[dim]));
  VK_CHECK_COND((0 <= end) && (end <= in_sizes[dim]));

  if (nchw_dim == 1) {
    // Slice by channel: elements must be gathered one at a time because the
    // channel axis is packed four-to-a-texel.
    std::string kernel_name = "slice_channel";
    kernel_name.reserve(kShaderNameReserve);
    add_dtype_suffix(kernel_name, *t_out);
    add_memory_layout_suffix(kernel_name, *t_out);

    api::utils::uvec3 global_size = t_out->extents();
    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

    // Mirrors the SliceArg uniform block in slice_channel.glsl.
    const struct Block final {
      int offset;
      int step;
    } params{
        static_cast<int32_t>(start),
        static_cast<int32_t>(step),
    };

    graph.execute_nodes().emplace_back(new ExecuteNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        {{out, api::MemoryAccessType::WRITE},
         {in, api::MemoryAccessType::READ}},
        {t_out->gpu_sizes_ubo(),
         t_out->cpu_sizes_ubo(),
         t_in->gpu_sizes_ubo(),
         graph.create_params_buffer(params)}));

  } else {
    // Batch/height/width slices copy whole texels; map the nchw dim onto the
    // GPU's (x, y, z) texture coordinate.
    int64_t gpu_dim = -1;
    int64_t stride = 1;
    if (nchw_dim == 3) {
      gpu_dim = 0; // width: x dimension in gpu
      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
    } else if (nchw_dim == 2) {
      gpu_dim = 1; // height: y dimension
      VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
    } else if (nchw_dim == 0) {
      gpu_dim = 2; // batch: z dimension

      // Due to channel packing, each batch value spans `stride` texture
      // planes (one plane per group of 4 channels).
      int64_t n_channels = dim_at<Dim4D::Channel>(in_sizes);
      stride = api::utils::div_up<int64_t>(n_channels, 4ll);
    } else {
      // Fixed typo in the error message: was "ncwh_dim".
      VK_THROW("Unexpected nchw_dim!");
    }

    std::string kernel_name = "slice_batch_height_width";
    kernel_name.reserve(kShaderNameReserve);
    add_dtype_suffix(kernel_name, *t_out);

    api::utils::uvec3 global_size = t_out->extents();
    api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

    // Mirrors the SliceArg uniform block in slice_batch_height_width.glsl.
    const struct Block final {
      int dim;
      int offset;
      int step;
      int stride;
    } params{
        static_cast<int32_t>(gpu_dim),
        static_cast<int32_t>(start),
        static_cast<int32_t>(step),
        static_cast<int32_t>(stride),
    };

    graph.execute_nodes().emplace_back(new ExecuteNode(
        graph,
        VK_KERNEL_FROM_STR(kernel_name),
        global_size,
        local_size,
        {{out, api::MemoryAccessType::WRITE},
         {in, api::MemoryAccessType::READ}},
        {t_out->gpu_sizes_ubo(), graph.create_params_buffer(params)}));
  }
}

// Registry entry point for aten.slice_copy.Tensor: unpacks the positional
// argument list and forwards to the node builder.
void slice_tensor_out(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  const ValueRef self = args[0];
  const ValueRef dim = args[1];
  const ValueRef opt_start = args[2];
  const ValueRef opt_end = args[3];
  const ValueRef step = args[4];
  const ValueRef out = args[5];
  add_slice_tensor_out_node(graph, self, dim, opt_start, opt_end, step, out);
}

// Register the Vulkan delegate implementation of aten.slice_copy.Tensor.
REGISTER_OPERATORS {
VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_out);
}

} // namespace vkcompute
Loading