Skip to content

Commit 48e05ed

Browse files
yipjustinfacebook-github-bot
authored andcommitted
aten.select.int
Summary: Port the `select.int` shaders over to ET. 1. Because tensor-shape reasoning happens ahead of time (AOT) in ET, the C++ caller code can be simplified considerably. 2. This diff also tries to use the same buffer object to pass arguments to all shaders. The performance cost is not a concern, since the difference between passing an int and an ivec4 is very minor. Differential Revision: D56082483
1 parent c095046 commit 48e05ed

18 files changed

+538
-0
lines changed

backends/vulkan/runtime/api/Tensor.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,14 @@ class vTensor final {
255255
return sizes_;
256256
}
257257

258+
inline const int64_t size(size_t dim) const {
259+
return sizes_[dim];
260+
}
261+
262+
inline const int64_t dim() const {
263+
return sizes_.size();
264+
}
265+
258266
inline const std::vector<int64_t>& strides() const {
259267
return strides_;
260268
}

backends/vulkan/runtime/graph/Logging.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <ostream>
1212
#include <vector>
13+
#include <executorch/backends/vulkan/runtime/api/Utils.h>
1314

1415
namespace vkcompute {
1516

@@ -23,4 +24,9 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& vec) {
2324
return os; // Return the ostream to allow chaining
2425
}
2526

27+
28+
// Stream-insertion overload for api::utils::uvec3, declared inside namespace
// vkcompute. It does no formatting of its own: it forwards to
// api::utils::operator<< and returns the same stream so calls can be chained.
// NOTE(review): presumably needed because the operator<< overloads declared in
// this namespace hide the api::utils one for unqualified calls - confirm.
inline std::ostream& operator<<(std::ostream& os, const api::utils::uvec3& v) {
29+
return api::utils::operator<<(os, v);
30+
}
31+
2632
} // namespace vkcompute
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#version 450 core
2+
#define PRECISION ${PRECISION}
3+
4+
layout(std430) buffer;
5+
6+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
7+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
8+
9+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
10+
uvec4 data;
11+
}
12+
out_extents;
13+
14+
// index to select
15+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
16+
// data.x: index along batch dim to select
17+
// data.y: number of batches
18+
// data.z: number of texels per batch
19+
// data.w: unused
20+
ivec4 data;
21+
}
22+
select_info;
23+
24+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
25+
26+
/*
 * Selects one batch of the input: each output texel is a copy of the input
 * texel at the same (x, y), shifted along z by `index` whole batches
 * (`num_texel_per_batch` texels each).
 */
void main() {
  const int num_texel_per_batch = select_info.data.z;
  const int index = select_info.data.x;

  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // The dispatch can be padded past the output image, so skip invocations
  // that fall outside it instead of fetching/storing out of range.
  // NOTE(review): assumes out_extents.data.xyz holds the output image
  // extents - confirm against the C++ caller.
  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
    return;
  }

  // Kept signed so it matches the ivec3 coordinate it is used in.
  const int src_pos_z = (num_texel_per_batch * index) + pos.z;
  imageStore(
      image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0));
}
37+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_batch_4d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_batch_4d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_batch_4d
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#version 450 core
2+
3+
#define PRECISION ${PRECISION}
4+
5+
layout(std430) buffer;
6+
7+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
8+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
9+
10+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
11+
uvec4 data;
12+
}
13+
out_extents;
14+
15+
// index to select
16+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
17+
int data;
18+
}
19+
index;
20+
21+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
22+
23+
/*
 * Selects a single channel of the input: for every (x, y) inside the output
 * extents, reads component `index.data % 4` of the input texel at depth
 * `index.data / 4` and writes it to the first component of the output texel
 * at z = 0.
 */
void main() {
  const ivec3 coord = ivec3(gl_GlobalInvocationID);

  // Nothing to do for invocations outside the output extents in x or y.
  if (!all(lessThan(coord.xy, out_extents.data.xy))) {
    return;
  }

  // Four channels are packed per texel: locate the texel, then the component.
  const int src_texel_z = index.data / 4;
  const int src_comp = index.data % 4;
  const vec4 src_texel =
      texelFetch(image_in, ivec3(coord.x, coord.y, src_texel_z), 0);
  const float picked = src_texel[src_comp];

  imageStore(image_out, ivec3(coord.x, coord.y, 0), vec4(picked, 0, 0, 0));
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_depth_3d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_depth_3d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_depth_3d
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#version 450 core
2+
3+
#define PRECISION ${PRECISION}
4+
5+
layout(std430) buffer;
6+
7+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
8+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
9+
10+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
11+
uvec4 data;
12+
}
13+
out_extents;
14+
15+
// index to select
16+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
17+
// data.x: index along depth (channel) dim to select
18+
// data.y: number of batches
19+
// data.z: number of texels per batch
20+
// data.w: unused
21+
ivec4 data;
22+
}
23+
select_info;
24+
25+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
26+
27+
/*
 * Selects one channel from every batch of the input. Each output texel packs
 * the selected channel of up to four consecutive batches: component k comes
 * from batch (4 * pos.z + k). Components for batches past `num_batches` stay 0.
 */
void main() {
  const int num_batches = select_info.data.y;
  const int num_texel_per_batch = select_info.data.z;
  const int index = select_info.data.x;

  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // The dispatch can be padded past the output image, so skip invocations
  // that fall outside it instead of fetching/storing out of range.
  // NOTE(review): assumes out_extents.data.xyz holds the output image
  // extents - confirm against the C++ caller.
  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
    return;
  }

  // Loop-invariant: texel (within a batch) and component holding the channel.
  const int src_texel_in_batch = index / 4;
  const int src_comp = index % 4;

  // read in the same channel from 4 separate batches
  vec4 out_texel = vec4(0, 0, 0, 0);
  for (int k = 0; k < 4; k++) {
    const int batch = 4 * pos.z + k;
    // < 4 batches for this texel, exit early
    if (batch >= num_batches) {
      break;
    }
    const int src_pos_z = (batch * num_texel_per_batch) + src_texel_in_batch;
    out_texel[k] =
        texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0)[src_comp];
  }

  imageStore(image_out, pos, out_texel);
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_depth_4d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_depth_4d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_depth_4d
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#version 450 core
2+
3+
#define PRECISION ${PRECISION}
4+
5+
layout(std430) buffer;
6+
7+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
8+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
9+
10+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
11+
uvec4 data;
12+
}
13+
out_extents;
14+
15+
// index to select
16+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
17+
int data;
18+
}
19+
index;
20+
21+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
22+
23+
/*
 * Selects one row (height index `index.data`) of the input. Each invocation
 * reads one input texel, which packs four channels, and scatters its
 * components to four consecutive output rows: component i goes to output
 * (pos.x, pos.y * 4 + i, 0).
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec2 out_size = ivec2(out_extents.data.xy);

  // Bail out before touching the input when this invocation cannot write
  // anything: either x is outside the output, or its first output row is.
  // NOTE(review): the x check assumes out_extents.data.x is the output width
  // - confirm against the C++ caller.
  if (pos.x >= out_size.x || pos.y * 4 >= out_size.y) {
    return;
  }

  // Input coordinates: x = w, y = h (the selected index), z = c (texel).
  const vec4 v = texelFetch(image_in, ivec3(pos.x, index.data, pos.y), 0);

  for (int i = 0; i < 4; i++) {
    const int out_y = pos.y * 4 + i;

    // When the C-channel exceeds original block size, exit early
    if (out_y >= out_size.y) {
      return;
    }

    imageStore(image_out, ivec3(pos.x, out_y, 0), vec4(v[i], 0, 0, 0));
  }
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_height_3d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_height_3d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_height_3d
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#version 450 core
2+
#define PRECISION ${PRECISION}
3+
4+
layout(std430) buffer;
5+
6+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
7+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
8+
9+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
10+
uvec4 data;
11+
}
12+
out_extents;
13+
14+
// index to select
15+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
16+
// data.x: index along height dim to select
17+
// data.y: number of batches
18+
// data.z: number of texels per batch
19+
// data.w: unused
20+
ivec4 data;
21+
}
22+
select_info;
23+
24+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
25+
26+
/*
 * Selects one row (height index `index`) from every batch of the input.
 * Output row pos.y corresponds to input channel pos.y. Each output texel packs
 * that value for up to four consecutive batches: component k comes from batch
 * (4 * pos.z + k). Components for batches past `num_batches` stay 0.
 */
void main() {
  const int num_batches = select_info.data.y;
  const int num_texel_per_batch = select_info.data.z;
  const int index = select_info.data.x;

  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  // The dispatch can be padded past the output image, so skip invocations
  // that fall outside it instead of fetching/storing out of range.
  // NOTE(review): assumes out_extents.data.xyz holds the output image
  // extents - confirm against the C++ caller.
  if (any(greaterThanEqual(pos, ivec3(out_extents.data.xyz)))) {
    return;
  }

  // Loop-invariant: texel (within a batch) and component holding channel pos.y.
  const int src_texel_in_batch = pos.y / 4;
  const int src_comp = pos.y % 4;

  vec4 out_texel = vec4(0, 0, 0, 0);
  // read in the same channel from 4 separate batches
  for (int k = 0; k < 4; k++) {
    const int batch = 4 * pos.z + k;
    // < 4 batches for this texel, exit early
    if (batch >= num_batches) {
      break;
    }
    const int src_pos_z = (batch * num_texel_per_batch) + src_texel_in_batch;
    out_texel[k] =
        texelFetch(image_in, ivec3(pos.x, index, src_pos_z), 0)[src_comp];
  }
  imageStore(image_out, pos, out_texel);
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_height_4d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_height_4d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_height_4d
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#version 450 core
2+
3+
#define PRECISION ${PRECISION}
4+
5+
layout(std430) buffer;
6+
7+
layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
8+
layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
9+
10+
layout(set = 0, binding = 2) uniform PRECISION restrict OutExtents {
11+
uvec4 data;
12+
}
13+
out_extents;
14+
15+
// index to select
16+
layout(set = 0, binding = 3) uniform PRECISION restrict IndexVal {
17+
int data;
18+
}
19+
index;
20+
21+
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
22+
23+
/*
 * Selects one column (width index `index.data`) of the input. Each invocation
 * reads one input texel, which packs four channels, and scatters its
 * components to four consecutive output rows: component i goes to output
 * (pos.x, pos.y * 4 + i, 0). Output x corresponds to input height.
 */
void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec2 out_size = ivec2(out_extents.data.xy);

  // Bail out before touching the input when this invocation cannot write
  // anything: either x is outside the output, or its first output row is.
  // NOTE(review): the x check assumes out_extents.data.x is the output width
  // - confirm against the C++ caller.
  if (pos.x >= out_size.x || pos.y * 4 >= out_size.y) {
    return;
  }

  // Input coordinates: x = w (the selected index), y = h, z = c (texel).
  const vec4 v = texelFetch(image_in, ivec3(index.data, pos.x, pos.y), 0);

  for (int i = 0; i < 4; i++) {
    const int out_y = pos.y * 4 + i;

    // When the C-channel exceeds original block size, exit early
    if (out_y >= out_size.y) {
      return;
    }

    imageStore(image_out, ivec3(pos.x, out_y, 0), vec4(v[i], 0, 0, 0));
  }
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Variant parameters for the select_width_3d shader template.
# Defaults: DTYPE=float, NDIM=3. DTYPE is iterated over half and float.
# NOTE(review): how SUFFIX is combined with NAME is decided by the shader
# generator, which is not shown here - confirm.
select_width_3d:
2+
parameter_names_with_default_values:
3+
DTYPE: float
4+
NDIM: 3
5+
generate_variant_forall:
6+
DTYPE:
7+
- VALUE: half
8+
SUFFIX: half
9+
- VALUE: float
10+
SUFFIX: float
11+
shader_variants:
12+
- NAME: select_width_3d

0 commit comments

Comments
 (0)