[ET-VK] Integrate axis mapping into staging <-> image transfer shaders #5093


Merged
merged 18 commits on Sep 6, 2024

Commits (18), changes from all commits
efee990  [ET-VK] Add test to track sizes of various objects (SS-JIA, Sep 3, 2024)
d4e400c  [ET-VK] Add type for symbolic integers (SS-JIA, Sep 3, 2024)
35df318  [ET-VK] Add `TmpTensorVRef` struct to recycle temporary tensor memory (SS-JIA, Sep 3, 2024)
906bf65  Update base for Update on "[ET-VK] Add `TmpTensorVRef` struct to recy… (SS-JIA, Sep 3, 2024)
2832186  Update on "[ET-VK] Add `TmpTensorVRef` struct to recycle temporary te… (SS-JIA, Sep 3, 2024)
834c4e8  Update base for Update on "[ET-VK] Add `TmpTensorVRef` struct to recy… (SS-JIA, Sep 4, 2024)
9285a18  Update on "[ET-VK] Add `TmpTensorVRef` struct to recycle temporary te… (SS-JIA, Sep 4, 2024)
34dd324  Update base for Update on "[ET-VK] Add `TmpTensorVRef` struct to recy… (SS-JIA, Sep 4, 2024)
dd87612  Update on "[ET-VK] Add `TmpTensorVRef` struct to recycle temporary te… (SS-JIA, Sep 4, 2024)
1de43d2  [ET-VK][BE][ez] Enable automatic layout slot index incrementing (SS-JIA, Sep 4, 2024)
99e5105  [ET-VK] Introduce axis mapping for no-copy permute of texture-backed … (SS-JIA, Sep 4, 2024)
7c1ff3b  [ET-VK] Integrate axis mapping into staging <-> buffer transfer shaders (SS-JIA, Sep 4, 2024)
b2ab1d7  Update base for Update on "[ET-VK] Integrate axis mapping into stagin… (SS-JIA, Sep 5, 2024)
eb6d388  Update on "[ET-VK] Integrate axis mapping into staging <-> buffer tra… (SS-JIA, Sep 5, 2024)
7535ad3  Update base for Update on "[ET-VK] Integrate axis mapping into stagin… (SS-JIA, Sep 6, 2024)
06a9fa5  Update on "[ET-VK] Integrate axis mapping into staging <-> buffer tra… (SS-JIA, Sep 6, 2024)
d8fba6e  Update base for Update on "[ET-VK] Integrate axis mapping into stagin… (SS-JIA, Sep 6, 2024)
a2ae8dd  Update on "[ET-VK] Integrate axis mapping into staging <-> image tran… (SS-JIA, Sep 6, 2024)
9 changes: 5 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

-${layout_declare_buffer(0, "w", "nchw_out", DTYPE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "sizes")}
+${layout_declare_buffer(B, "w", "nchw_out", DTYPE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -51,7 +52,7 @@ void write_out_texel(VEC4_T texel, ivec4 tensor_idx) {

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
-const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
+const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);

if (any(greaterThanEqual(tensor_idx, sizes))) {
return;
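Two things change in the image_to_nchw declarations: the hard-coded binding slots (0, 1, 2) become the `B` placeholder, which appears to rely on the "Enable automatic layout slot index incrementing" commit earlier in this stack so that inserting the new UBO does not force renumbering, and the shader now receives an `axis_mapping` ivec4 alongside `sizes`. Its first three components give the texture axis that stores the tensor's W, H and C dims; the fourth names the WHCN dim that the batch dim is folded into. A minimal host-side sketch of a plausible default mapping follows; the constant name and the {0, 1, 2, 2} values are illustrative assumptions, not values taken from this diff.

#include <array>
#include <cstdint>

// Assumed default axis mapping (illustrative, not from this PR):
//   axis_mapping[0..2]: texture axis (0 = x, 1 = y, 2 = z) that stores the
//                       tensor's W, H and C dims respectively.
//   axis_mapping[3]:    WHCN dim index that the batch dim is concatenated
//                       with (2 means batches are stacked along channels).
constexpr std::array<int32_t, 4> kDefaultAxisMapping = {0, 1, 2, 2};

static_assert(kDefaultAxisMapping[3] == 2, "batch assumed folded into channels");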
93 changes: 93 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
@@ -183,6 +183,42 @@ ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) {
return tensor_idx;
}

/*
* Derive (w,h,c,n) tensor indices from (x,y,z) texture position using axis
* mapping.
*/
ivec4 to_tensor_idx(
ivec3 pos,
ivec4 sizes,
const ivec4 axis_mapping,
const int packed_dim) {
// Align packed dim to next multiple of 4 to account for texel padding
sizes[packed_dim] = alignup4(sizes[packed_dim]);

// Packed dim contains 4 elements per texel, so moving 1 unit traverses 4
// elements in the tensor.
pos[axis_mapping[packed_dim]] *= 4;

ivec4 tensor_idx;
for (int dim = 0; dim < 3; ++dim) {
tensor_idx[dim] = pos[axis_mapping[dim]];
}

// Early return if batch is 1. Batch index will be 0.
if (sizes.w == 1) {
tensor_idx.w = 0;
return tensor_idx;
}

// Else, adjust the dim that's concatenated with batch. Note that the axis
// mapping for the batch dim indicates WHCN dim index of the dim that it is
// concatenated with, not a texture axis.
tensor_idx.w = tensor_idx[axis_mapping[3]] / sizes[axis_mapping[3]];
tensor_idx[axis_mapping[3]] %= sizes[axis_mapping[3]];

return tensor_idx;
}

/*
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim
* is packed along a texel
@@ -199,6 +235,34 @@ ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
return pos;
}

/*
* Derive (x,y,z) texture position from (w,h,c,n) tensor indices using axis
* mapping.
*/
ivec3 to_texture_pos(
const ivec4 idx,
ivec4 sizes,
const ivec4 axis_mapping,
const int packed_dim) {
// Align packed dim to next multiple of 4 to account for texel padding
sizes[packed_dim] = alignup4(sizes[packed_dim]);

ivec3 pos;
for (int dim = 0; dim < 3; ++dim) {
pos[axis_mapping[dim]] = idx[dim];
}

// Adjust batch dim if needed
if (sizes.w > 1) {
pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
}

// Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
// tensor elements in that dim.
pos[axis_mapping[packed_dim]] /= 4;
return pos;
}

/*
* Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim
* is packed along a texel
@@ -218,6 +282,35 @@ ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) {
return pos;
}

/*
* Derive (x,y,z,i) texel element position from the (w,h,c,n) tensor index using
* the axis mapping.
*/
ivec4 to_texture_elem_pos(
const ivec4 idx,
ivec4 sizes,
const ivec4 axis_mapping,
const int packed_dim) {
// Align packed dim to next multiple of 4 to account for texel padding
sizes[packed_dim] = alignup4(sizes[packed_dim]);

ivec4 pos;
for (int dim = 0; dim < 3; ++dim) {
pos[axis_mapping[dim]] = idx[dim];
}

// Adjust batch dim if needed
if (sizes.w > 1) {
pos[axis_mapping[axis_mapping[3]]] += idx.w * sizes[axis_mapping[3]];
}

// Adjust packed dim. Moving 1 texel unit along the packed dim traverses 4
// tensor elements in that dim.
pos[axis_mapping[packed_dim]] /= 4;
pos.w = idx[packed_dim] % 4;
return pos;
}

//
// Texel Access and Storage
//
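To make the new index math concrete, here is a standalone C++ re-implementation of the two directions above (texture position to tensor index, and back) over plain std::array values, with a small round-trip check. The {0, 1, 2, 2} mapping, the WIDTH-packed choice and the sizes used in main are example values, not taken from this PR.

#include <array>
#include <cassert>

using ivec3 = std::array<int, 3>;
using ivec4 = std::array<int, 4>;

int alignup4(int x) { return (x + 3) & ~3; }

// (x, y, z) texture position -> (w, h, c, n) tensor index, mirroring the
// axis-mapping overload of to_tensor_idx.
ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, ivec4 axis_mapping, int packed_dim) {
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  pos[axis_mapping[packed_dim]] *= 4;  // one texel covers 4 elements of the packed dim

  ivec4 idx{};
  for (int dim = 0; dim < 3; ++dim) {
    idx[dim] = pos[axis_mapping[dim]];
  }
  if (sizes[3] == 1) {  // batch of 1: batch index is always 0
    idx[3] = 0;
    return idx;
  }
  const int concat_dim = axis_mapping[3];  // WHCN dim the batch is folded into
  idx[3] = idx[concat_dim] / sizes[concat_dim];
  idx[concat_dim] %= sizes[concat_dim];
  return idx;
}

// Inverse direction, mirroring the axis-mapping overload of to_texture_pos.
ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, ivec4 axis_mapping, int packed_dim) {
  sizes[packed_dim] = alignup4(sizes[packed_dim]);
  ivec3 pos{};
  for (int dim = 0; dim < 3; ++dim) {
    pos[axis_mapping[dim]] = idx[dim];
  }
  if (sizes[3] > 1) {
    const int concat_dim = axis_mapping[3];
    pos[axis_mapping[concat_dim]] += idx[3] * sizes[concat_dim];
  }
  pos[axis_mapping[packed_dim]] /= 4;  // 4 elements per texel along the packed dim
  return pos;
}

int main() {
  const ivec4 sizes = {8, 6, 5, 2};         // W = 8, H = 6, C = 5, N = 2
  const ivec4 axis_mapping = {0, 1, 2, 2};  // W->x, H->y, C->z, batch with C
  const int packed_dim = 0;                 // WIDTH-packed

  // Texel (x=1, y=3, z=7): z = 7 lands in batch 7 / C = 1, channel 7 % C = 2,
  // and x = 1 starts at element w = 4 because of the width packing.
  const ivec4 idx = to_tensor_idx({1, 3, 7}, sizes, axis_mapping, packed_dim);
  assert((idx == ivec4{4, 3, 2, 1}));

  // Round-trip back to the texel position.
  const ivec3 pos = to_texture_pos(idx, sizes, axis_mapping, packed_dim);
  assert((pos == ivec3{1, 3, 7}));
  return 0;
}

Two details carry over from the shader code: when the packed dim is also the dim the batch is folded into, the division and modulo operate on the aligned-up size, since it was padded to a multiple of 4 at the top of the function; and to_texture_elem_pos only adds a fourth component, idx[packed_dim] % 4, recording which of the four texel elements the index lands in.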
backends/vulkan/runtime/graph/ops/glsl/int8_image_to_nchw_noint8.glsl
@@ -16,10 +16,11 @@ layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

-${layout_declare_buffer(0, "w", "nchw_out", "int")}
-${layout_declare_tensor(1, "r", "t_in", "int8", "texture3d")}
-${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
-${layout_declare_ubo(3, "int", "out_numel")}
+${layout_declare_buffer(B, "w", "nchw_out", "int")}
+${layout_declare_tensor(B, "r", "t_in", "int8", "texture3d")}
+${layout_declare_ubo(B, "ivec4", "tensor_sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}
+${layout_declare_ubo(B, "int", "out_numel")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

9 changes: 5 additions & 4 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -21,9 +21,10 @@ ${define_required_extensions(DTYPE)}

layout(std430) buffer;

-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_buffer(1, "r", "nchw_in", DTYPE)}
-${layout_declare_ubo(2, "ivec4", "sizes")}
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_buffer(B, "r", "nchw_in", DTYPE)}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -53,7 +54,7 @@ VEC4_T read_texel(ivec4 tensor_idx) {

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
-const ivec4 tensor_idx = to_tensor_idx(pos, sizes, packed_dim);
+const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);
if (any(greaterThanEqual(tensor_idx, sizes))) {
return;
}
backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8_image_noint8.glsl
@@ -16,9 +16,10 @@ layout(std430) buffer;

#extension GL_EXT_control_flow_attributes : require

-${layout_declare_tensor(0, "w", "t_out", "int8", "texture3d")}
-${layout_declare_buffer(1, "r", "nchw_in", "int")}
-${layout_declare_ubo(2, "ivec4", "tensor_sizes")}
+${layout_declare_tensor(B, "w", "t_out", "int8", "texture3d")}
+${layout_declare_buffer(B, "r", "nchw_in", "int")}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "ivec4", "axis_mapping")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

@@ -36,7 +37,7 @@ int extend_sign(int x) {

ivec4 read_texel(ivec4 tensor_idx) {
const ivec4 buf_indices = get_texel_nchw_buffer_ixs(
-      tensor_idx, tensor_sizes, packed_dim);
+      tensor_idx, sizes, packed_dim);

int shift = (1 << 8) - 1;
ivec4 masks;
@@ -51,7 +52,7 @@ ivec4 read_texel(ivec4 tensor_idx) {
ivec4 out_tex = ivec4(0);

[[unroll]] for (int i = 0; i < 4; ++i) {
-if (tensor_idx[packed_dim] + i < tensor_sizes[packed_dim]) {
+if (tensor_idx[packed_dim] + i < sizes[packed_dim]) {
int in_texel = nchw_in[buf_indices[i] / 4];
int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
extracted_val = extend_sign(extracted_val);
@@ -64,9 +65,9 @@ ivec4 read_texel(ivec4 tensor_idx) {

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
-const ivec4 tensor_idx = to_tensor_idx(pos, tensor_sizes, packed_dim);
+const ivec4 tensor_idx = to_tensor_idx(pos, sizes, axis_mapping, packed_dim);

-if (any(greaterThanEqual(tensor_idx, tensor_sizes))) {
+if (any(greaterThanEqual(tensor_idx, sizes))) {
return;
}

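Both of the *_noint8 shaders above are the fallback path for devices without true 8-bit storage support: four int8 values travel through the staging buffer packed into a single 32-bit int, and read_texel recovers each byte with a mask, a shift, and a manual sign extension (extend_sign). A small host-side sketch of that packing and unpacking; the function names are illustrative, not part of the runtime.

#include <cassert>
#include <cstdint>

// Pack four int8 values into one 32-bit word, lowest byte first; this is the
// layout the int staging buffer holds when int8 buffers are unavailable.
uint32_t pack_int8x4(int8_t a, int8_t b, int8_t c, int8_t d) {
  return uint32_t(uint8_t(a)) | (uint32_t(uint8_t(b)) << 8) |
      (uint32_t(uint8_t(c)) << 16) | (uint32_t(uint8_t(d)) << 24);
}

// Pull byte i (0..3) back out and sign-extend it, mirroring the shader's
// mask / shift / extend_sign sequence.
int unpack_int8(uint32_t word, int i) {
  const uint32_t byte = (word >> (8 * i)) & 0xFFu;
  return static_cast<int8_t>(byte);  // the narrowing cast performs the sign extension
}

int main() {
  const uint32_t word = pack_int8x4(5, -1, 0, -128);
  assert(unpack_int8(word, 0) == 5);
  assert(unpack_int8(word, 1) == -1);
  assert(unpack_int8(word, 2) == 0);
  assert(unpack_int8(word, 3) == -128);
  return 0;
}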
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -106,7 +106,7 @@ ValueRef prepack_biases(
graph.create_local_wg_size(v),
vref,
v,
-      {t->sizes_ubo()},
+      {t->sizes_ubo(), t->axis_mapping_ubo()},
// Specialization constants
{SV(t->packed_dim_whcn_idx())}));

8 changes: 5 additions & 3 deletions backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -31,7 +31,8 @@ void add_staging_to_tensor_node(
graph.strides_ubo(out_tensor),
graph.numel_ubo(out_tensor)});
} else {
-ubos.append(graph.sizes_ubo(out_tensor));
+ubos.append(
+    {graph.sizes_ubo(out_tensor), graph.axis_mapping_ubo(out_tensor)});
}

graph.execute_nodes().emplace_back(new ExecuteNode(
@@ -69,7 +70,8 @@ void add_tensor_to_staging_node(
graph.strides_ubo(in_tensor),
graph.numel_ubo(in_tensor)});
} else {
-ubos.append(graph.sizes_ubo(in_tensor));
+ubos.append(
+    {graph.sizes_ubo(in_tensor), graph.axis_mapping_ubo(in_tensor)});
}

// Normally, the image_to_nchw shader is structured so that each thread reads
@@ -113,7 +115,7 @@ ValueRef prepack(
if (graph.is_buffer_storage(v)) {
ubos.append({graph.sizes_ubo(v), graph.strides_ubo(v), graph.numel_ubo(v)});
} else {
-ubos.append(graph.sizes_ubo(v));
+ubos.append({graph.sizes_ubo(v), graph.axis_mapping_ubo(v)});
}

graph.prepack_nodes().emplace_back(new PrepackNode(
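The pattern in Staging.cpp is the same in all three code paths: buffer-backed tensors keep binding sizes, strides and numel, while texture-backed tensors now bind the sizes UBO together with the new axis-mapping UBO. A schematic sketch of that selection follows, with placeholder types; Tensor and staging_shader_ubos here are illustrative names, not the actual ComputeGraph API.

#include <vector>

// Placeholder for vkapi::BufferBindInfo; only the selection logic matters here.
struct BufferBindInfo {};

enum class Storage { Buffer, Texture };

// Hypothetical tensor handle exposing the same UBO getters the diff uses.
struct Tensor {
  Storage storage;
  BufferBindInfo sizes_ubo() const { return {}; }
  BufferBindInfo strides_ubo() const { return {}; }
  BufferBindInfo numel_ubo() const { return {}; }
  BufferBindInfo axis_mapping_ubo() const { return {}; }
};

// Mirrors the if/else in add_staging_to_tensor_node, add_tensor_to_staging_node
// and prepack: texture storage now gets the axis mapping alongside the sizes.
std::vector<BufferBindInfo> staging_shader_ubos(const Tensor& t) {
  if (t.storage == Storage::Buffer) {
    return {t.sizes_ubo(), t.strides_ubo(), t.numel_ubo()};
  }
  return {t.sizes_ubo(), t.axis_mapping_ubo()};
}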
7 changes: 5 additions & 2 deletions backends/vulkan/test/utils/test_utils.cpp
@@ -85,7 +85,8 @@ void record_nchw_to_image_op(
vkapi::PipelineStage::COMPUTE,
vkapi::MemoryAccessType::WRITE),
src_buffer,
-      v_dst.sizes_ubo());
+      v_dst.sizes_ubo(),
+      v_dst.axis_mapping_ubo());
}

void record_image_to_nchw_op(
@@ -106,7 +107,8 @@
0,
dst_buffer,
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
-      v_src.sizes_ubo());
+      v_src.sizes_ubo(),
+      v_src.axis_mapping_ubo());
}

void record_int8_image_to_nchw_noint8_op(
@@ -127,6 +129,7 @@
dst_buffer.buffer(),
v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
v_src.sizes_ubo(),
+      v_src.axis_mapping_ubo(),
v_src.numel_ubo());
}

15 changes: 9 additions & 6 deletions backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1233,8 +1233,8 @@ TEST(VulkanComputeGraphTest, test_simple_graph) {
GraphConfig config;
ComputeGraph graph(config);

-std::vector<int64_t> size_big = {8, 64, 124};
-std::vector<int64_t> size_small = {8, 1, 124};
+std::vector<int64_t> size_big = {1, 8, 8};
+std::vector<int64_t> size_small = {1, 1, 8};

// Build graph

@@ -1415,8 +1415,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
/*shared_object_idx = */ 4);

// +2: t.sizes_ubo() for each staging shader
+// +2: t.axis_mapping_ubo() for each staging shader
// +2: staging buffer for each input tensor
-EXPECT_TRUE(get_vma_allocation_count() == 4);
+EXPECT_TRUE(get_vma_allocation_count() == 6);

ValueRef c = graph.add_tensor(
size_big,
Expand All @@ -1433,8 +1434,9 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {

// +2: alpha UBO, broadcast UBO for arithmetic shader
// +1: t.sizes_ubo() uniform buffer for staging shader
+// +1: t.axis_mapping_ubo() uniform buffer for staging shader
// +1: staging buffer for the input tensor
-EXPECT_TRUE(get_vma_allocation_count() == 9);
+EXPECT_TRUE(get_vma_allocation_count() == 12);

ValueRef e = graph.add_tensor(
size_big,
Expand All @@ -1450,14 +1452,15 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {

// +2: alpha UBO, broadcast UBO for arithmetic shader
// +1: t.sizes_ubo() for staging shader
+// +1: t.axis_mapping_ubo() for staging shader
// +1 staging buffer for the input tensor
-EXPECT_TRUE(get_vma_allocation_count() == 13);
+EXPECT_TRUE(get_vma_allocation_count() == 17);

graph.prepare();
graph.encode_execute();

// +3: shared memory allocations for tensors
-EXPECT_TRUE(get_vma_allocation_count() == 16);
+EXPECT_TRUE(get_vma_allocation_count() == 20);

// Run graph
