Skip to content

Support int8 texture tensors without requiring int8 buffers #4485

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,24 +319,20 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
return image_extents_of(idx);
}

utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
utils::uvec3 ComputeGraph::create_local_wg_size(
const utils::uvec3 global_wg_size) {
if (config_.enable_local_wg_size_override) {
return config_.local_wg_size_override;
}

if (is_buffer_storage(idx)) {
return {64u, 1u, 1u};
}

const utils::uvec3 image_extents = image_extents_of(idx);
utils::uvec3 local_group_size = {4, 4, 4};

if (image_extents.data[2u] == 1) {
if (image_extents.data[1u] == 1) {
if (global_wg_size.data[2u] == 1) {
if (global_wg_size.data[1u] == 1) {
local_group_size.data[0u] = 64;
local_group_size.data[1u] = 1;
local_group_size.data[2u] = 1;
} else if (image_extents.data[1u] < 8) {
} else if (global_wg_size.data[1u] < 8) {
local_group_size.data[0u] = 16;
local_group_size.data[1u] = 4;
local_group_size.data[2u] = 1;
Expand All @@ -349,6 +345,10 @@ utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
return local_group_size;
}

utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
  // Convenience overload: treat the tensor's image extents as the global
  // workgroup size and delegate to the uvec3 overload above.
  const utils::uvec3 global_wg_size = image_extents_of(idx);
  return create_local_wg_size(global_wg_size);
}

void ComputeGraph::copy_into_staging(
const ValueRef idx,
const void* data,
Expand Down
28 changes: 24 additions & 4 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@ class ComputeGraph final {
return values_.at(idx).type();
}

// Get Tensor Property
//
// Tensor Properties Accessors
//

std::vector<int64_t> sizes_of(const ValueRef idx) const;

Expand Down Expand Up @@ -226,7 +228,9 @@ class ComputeGraph final {
return values_.at(idx).toTensor().ntexels_ubo();
}

//
// Scalar Value Extraction
//

template <typename T>
T extract_scalar(const ValueRef idx) {
Expand Down Expand Up @@ -459,16 +463,21 @@ class ComputeGraph final {
utils::uvec3 create_global_wg_size(const ValueRef idx);

/*
* Suggest a local workgroup size for a given `api::vTensor` value, assuming
* that every shader invocation calculates one texel element of the output
* tensor.
* Suggest a local workgroup size for a given global workgroup size.
*
* The local workgroup size will be formed to try and minimize the number of
* inactive invocations.
*
* Currently, the local workgroup size is hard-coded to contain a total of 64
* shader invocations. In the future, this value can be configured.
*/
utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size);

/*
* Convenience function to suggest a local workgroup size for a given
* `api::vTensor` value, assuming that every shader invocation calculates one
* texel element of the output tensor.
*/
utils::uvec3 create_local_wg_size(const ValueRef idx);

//
Expand Down Expand Up @@ -500,6 +509,17 @@ class ComputeGraph final {
void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
void propagate_resize();

//
// Miscellaneous Utilities
//

  /*
   * Check whether the GPU supports 8 bit buffers.
   *
   * Used when selecting staging shaders: if the device lacks full int8
   * buffer support, int8 texture tensors fall back to shaders that pack /
   * unpack 8-bit values inside regular 32-bit integer buffers.
   */
  inline bool int8_buffers_enabled() const {
    return context_->adapter_ptr()->has_full_int8_buffers_support();
  }

//
// Debug support (implemented in Logging.cpp)
//
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ ivec4 from_nchw_buffer_i(int buf_i, ivec4 sizes) {
* Returns: The (x, y, z, n) texel position corresponding to the first element
* of the texel at the specified buffer index
*/
ivec4 to_texel_pos(int buf_i, ivec4 strides, int packed_dim) {
ivec4 to_tensor_idx(int buf_i, ivec4 strides, int packed_dim) {
ivec4 idx;
for (int i = 3; i >= 0; i--) {
if (i != packed_dim) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "r", "t_in", "int8", "texture3d")}
${layout_declare_buffer(1, "w", "nchw_out", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
${layout_declare_ubo(3, "int", "out_ntexels")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
 * Each invocation packs 4 consecutive int8 tensor elements into one 32-bit
 * word of the output buffer, so int8 texture data can be written to a
 * staging buffer without requiring 8-bit buffer support.
 */
void main() {
  const int out_buf_idx = int(gl_GlobalInvocationID.x);
  // One invocation per 32-bit output word; guard against extra invocations
  // dispatched due to workgroup-size rounding.
  if (out_buf_idx >= out_ntexels) {
    return;
  }

  ivec4 values;
  // Each output word consumes 4 consecutive NCHW buffer elements.
  // NOTE(review): assumes the staging buffer length is padded to a multiple
  // of 4 elements so these reads stay in range — confirm against host code.
  int in_buf_idx = 4 * out_buf_idx;

  [[unroll]] for (int i = 0; i < 4; ++i) {
    const ivec4 tensor_idx = from_nchw_buffer_i(in_buf_idx, tensor_sizes);
    const ivec4 texture_pos = to_texture_elem_pos(
        tensor_idx, tensor_sizes, packed_dim);
    // texture_pos.w selects the component within the loaded texel.
    values[i] = load_texel(t_in, texture_pos.xyz)[texture_pos.w];
    in_buf_idx++;
  }

  // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little
  // endian is assumed, since most processors use little endian. Thus the
  // "later" values are placed in most significant bytes.
  int packed = ((values[3] & 0xFF) << 24)
      | ((values[2] & 0xFF) << 16)
      | ((values[1] & 0xFF) << 8)
      | ((values[0] & 0xFF));

  nchw_out[out_buf_idx] = packed;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

#include "indexing_utils.h"

layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
${layout_declare_buffer(1, "r", "nchw_in", "int")}
${layout_declare_ubo(2, "ivec4", "tensor_sizes")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(constant_id = 3) const int packed_dim = C_DIM;

/*
 * Sign-extends an int8 value held in the low byte of a 32-bit int.
 */
int extend_sign(int x) {
  // Bit 7 is the int8 sign bit; when set, fill the upper 24 bits with ones.
  return (x >> 7 == 1) ? (x | 0xFFFFFF00) : x;
}

/*
 * Reads the texel at tensor_idx by unpacking 4x 8-bit values from the
 * 32-bit integer staging buffer nchw_in.
 */
ivec4 read_texel(ivec4 tensor_idx) {
  // NCHW buffer index (in elements) of each of the texel's 4 components.
  const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
      tensor_idx, tensor_sizes, packed_dim);

  // 0xFF: selects a single byte within a packed 32-bit word.
  int shift = (1 << 8) - 1;
  ivec4 masks;
  // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
  // little endian is assumed, as most processors use little endian. Thus the
  // most significant bytes correspond to the "latter" packed values.
  masks.x = shift << (8 * (buf_indices.x % 4));
  masks.y = shift << (8 * (buf_indices.y % 4));
  masks.z = shift << (8 * (buf_indices.z % 4));
  masks.w = shift << (8 * (buf_indices.w % 4));

  ivec4 out_tex = ivec4(0);

  [[unroll]] for (int i = 0; i < 4; ++i) {
    // Components past the tensor's extent along the packed dim are left as
    // zero padding.
    if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
      // Each int32 in nchw_in holds 4 consecutive int8 elements.
      int in_texel = nchw_in[buf_indices[i] / 4];
      // NOTE(review): GLSL >> on signed int is arithmetic, so the byte in
      // the most significant position arrives already sign-extended here;
      // extend_sign then leaves such values unchanged.
      int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
      extracted_val = extend_sign(extracted_val);
      out_tex[i] = extracted_val;
    }
  }

  return out_tex;
}

/*
 * Each invocation writes one texel of the output int8 texture, sourcing its
 * 4 components from the packed int32 staging buffer.
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  // Tensor index of the first element of the texel this invocation writes.
  const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);

  // Skip invocations that fall outside the tensor (workgroup rounding).
  if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
    return;
  }

  write_texel(t_out, pos, read_texel(tensor_idx));
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void main() {
return;
}

ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
tensor_idx[packed_dim] *= 4;
t_out[t_id] = read_texel(tensor_idx);
}
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void main() {
return;
}

const ivec4 out_pos = to_texel_pos(t_id, out_strides, 0);
const ivec4 out_pos = to_tensor_idx(t_id, out_strides, 0);

VEC4_T outtex = q_8w_linear(out_pos, mat1_sizes.x);
write_texel(t_out, t_id, outtex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ void main() {
}

const VEC4_T intex = t_in[t_id];
ivec4 tensor_idx = to_texel_pos(t_id, gpu_strides, packed_dim);
ivec4 tensor_idx = to_tensor_idx(t_id, gpu_strides, packed_dim);
tensor_idx[packed_dim] *= 4;
write_out_texel(intex, tensor_idx);
}
Expand Down
31 changes: 24 additions & 7 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ void add_staging_to_tensor_node(
const ValueRef out_tensor) {
VK_CHECK_COND(graph.val_is_staging(in_staging));

vkapi::ShaderInfo shader =
get_nchw_to_tensor_shader(*graph.get_tensor(out_tensor));
vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
*graph.get_tensor(out_tensor), graph.int8_buffers_enabled());

vkapi::ParamsBindList ubos({graph.sizes_ubo(out_tensor)});
if (graph.is_buffer_storage(out_tensor)) {
Expand Down Expand Up @@ -55,10 +55,26 @@ void add_tensor_to_staging_node(
const ValueRef out_staging) {
VK_CHECK_COND(graph.val_is_staging(out_staging));

vkapi::ShaderInfo shader =
get_tensor_to_nchw_shader(*graph.get_tensor(in_tensor));
vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
*graph.get_tensor(in_tensor), graph.int8_buffers_enabled());

utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);
vkapi::ParamsBindList ubos({graph.sizes_ubo(in_tensor)});

// Normally, the tensor_to_nchw shader is structured so that each thread reads
// one texel from the input texture and writes each component of the texel
// into the corresponding location in the output buffer. However, this shader
// is structured slightly differently in that each thread writes out a
// complete 32 bit integer (containing 4 packed 8-bit integers) into the
// output buffer. Therefore, the global work group size for this shader will
// be the number of elements in the output buffer divided by 4, as opposed to
// the extents of the input texture.
if (shader.kernel_name == "int8_tensor_to_nchw_noint8") {
uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
global_wg_size = {buffer_len, 1, 1};
ubos.append({graph.ntexels_ubo(in_tensor)});
}

if (graph.is_buffer_storage(in_tensor)) {
ubos.append({
graph.texel_strides_ubo(in_tensor),
Expand All @@ -69,8 +85,8 @@ void add_tensor_to_staging_node(
graph.execute_nodes().emplace_back(new ExecuteNode(
graph,
shader,
graph.create_global_wg_size(in_tensor),
graph.create_local_wg_size(in_tensor),
global_wg_size,
graph.create_local_wg_size(global_wg_size),
// Input and Outputs
{{in_tensor, vkapi::MemoryAccessType::READ},
{out_staging, vkapi::MemoryAccessType::WRITE}},
Expand All @@ -86,7 +102,8 @@ ValueRef prepack(
const utils::GPUMemoryLayout layout) {
ValueRef v = graph.add_tensor_like(vref, layout);

vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(*graph.get_tensor(v));
vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(
*graph.get_tensor(v), graph.int8_buffers_enabled());

vkapi::ParamsBindList ubos({graph.sizes_ubo(v)});
if (graph.is_buffer_storage(v)) {
Expand Down
18 changes: 16 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,21 +95,35 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes) {
memset(data_ptr, 0, staging.nbytes());
}

/*
 * Selects the shader used to copy NCHW staging data into `v_dst`.
 *
 * If `v_dst` is an int8 texture tensor and the GPU lacks full int8 buffer
 * support (`int8_buffer_enabled` is false), a fallback shader is returned
 * that manually unpacks 4x 8-bit values from each 32-bit buffer word.
 */
vkapi::ShaderInfo get_nchw_to_tensor_shader(
    const api::vTensor& v_dst,
    const bool int8_buffer_enabled) {
  // Fallback path checked first so the generic kernel-name string is only
  // built when it is actually needed.
  if (v_dst.dtype() == vkapi::kChar &&
      v_dst.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
    return VK_KERNEL(nchw_to_int8_tensor_noint8);
  }

  std::string kernel_name;
  kernel_name.reserve(kShaderNameReserve);
  kernel_name = "nchw_to_tensor";
  add_dtype_suffix(kernel_name, v_dst);
  add_storage_type_suffix(kernel_name, v_dst);

  return VK_KERNEL_FROM_STR(kernel_name);
}

vkapi::ShaderInfo get_tensor_to_nchw_shader(const api::vTensor& v_src) {
vkapi::ShaderInfo get_tensor_to_nchw_shader(
const api::vTensor& v_src,
bool int8_buffer_enabled) {
std::string kernel_name;
kernel_name.reserve(kShaderNameReserve);

if (v_src.dtype() == vkapi::kChar &&
v_src.storage_type() == utils::kTexture3D && !int8_buffer_enabled) {
return VK_KERNEL(int8_tensor_to_nchw_noint8);
}

kernel_name = "tensor_to_nchw";
add_dtype_suffix(kernel_name, v_src);
add_storage_type_suffix(kernel_name, v_src);
Expand Down
8 changes: 6 additions & 2 deletions backends/vulkan/runtime/graph/ops/utils/StagingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ void set_staging_zeros(api::StorageBuffer& staging, const size_t nbytes);
// Functions to get shaders
//

/*
 * Returns the shader that copies NCHW staging data into `v_dst`. When
 * `int8_buffer_enabled` is false, int8 texture tensors get a fallback shader
 * that does not require 8-bit buffer support.
 */
vkapi::ShaderInfo get_nchw_to_tensor_shader(
    const api::vTensor& v_dst,
    bool int8_buffer_enabled = true);

/*
 * Returns the shader that copies `v_src` into NCHW staging data. When
 * `int8_buffer_enabled` is false, int8 texture tensors get a fallback shader
 * that packs 4x 8-bit values into 32-bit buffer words.
 */
vkapi::ShaderInfo get_tensor_to_nchw_shader(
    const api::vTensor& v_src,
    bool int8_buffer_enabled = true);

} // namespace vkcompute
Loading
Loading