 #define op(X, A, B) ${OPERATOR}
 
-#include "indexing_utils_u16.h"
+#include "indexing_utils.h"
 
 layout(std430) buffer;
 
@@ -32,10 +32,8 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
-
 // shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-shared u16vec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
+shared ivec2 pos_shared[gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z * TILE_SIZE * TILE_SIZE];
 
 /*
  * Computes a 2D pointwise convolution of an NxN output tile. Calculating an
@@ -46,18 +44,18 @@ void main() {
   const ivec2 out_limits_scaled = (out_limits.xy + TILE_SIZE - 1) / TILE_SIZE;
   const uint shared_mem_stride = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
 
-  const u16vec3 gpos = idx_to_u16pos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
+  const ivec3 gpos = idx_to_ipos_x_wise(gl_GlobalInvocationID.x, out_limits_scaled.x, out_limits_scaled.y);
 
   // Output position for TILE_SIZE = 2
   // +--------+--------+
   // | pos[0] | pos[1] |
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  u16vec2 pos[TILE_SIZE * TILE_SIZE];
+  ivec2 pos[TILE_SIZE * TILE_SIZE];
   for (int y = 0, i = 0; y < TILE_SIZE; ++y) {
     for (int x = 0; x < TILE_SIZE; ++x) {
-      pos[i] = u16vec2(
+      pos[i] = ivec2(
           gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y);
       pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex] = pos[i];
       i++;
@@ -66,38 +64,38 @@ void main() {
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
-  if (any(greaterThanEqual(u16vec3(pos[0], gpos.z), out_limits))) {
+  if (any(greaterThanEqual(ivec3(pos[0], gpos.z), out_limits))) {
     return;
   }
 
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  u16vec2 ipos[TILE_SIZE * TILE_SIZE];
+  ivec2 ipos[TILE_SIZE * TILE_SIZE];
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    ipos[i] = pos[i] * u16vec2(stride) - u16vec2(padding);
+    ipos[i] = pos[i] * stride - padding;
   }
 
   vec4 sum[TILE_SIZE * TILE_SIZE];
-  sum[0] = texelFetch(t_bias, u16vec2(gpos.z, 0), 0);
+  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
   for (int i = 1; i < TILE_SIZE * TILE_SIZE; ++i) {
     sum[i] = sum[0];
   }
 
   int z4 = 0;
   // Since the kernel is 1x1, we only have to loop over the depth dimension.
-  for (uint16_t z = uint16_t(0); z < uint16_t(in_group_size); z += uint16_t(4), ++z4) {
+  for (int z = 0; z < in_group_size; z += 4, ++z4) {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, u16vec2(z, gpos.z), 0, u16vec2(3, 0));
+    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
+    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
+    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
+    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
 
     #pragma unroll
     for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-      const vec4 in_tex = texelFetch(t_in, u16vec3(ipos[i], z4), 0);
+      const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -139,9 +137,9 @@ void main() {
   }
 
   for (int i = 0; i < TILE_SIZE * TILE_SIZE; ++i) {
-    const u16vec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
-    if (all(lessThan(u16vec3(pos, gpos.z), out_limits))) {
-      imageStore(t_out, u16vec3(pos, gpos.z), op(sum[i], out_min, out_max));
+    const ivec2 pos = pos_shared[(shared_mem_stride * i) + gl_LocalInvocationIndex];
+    if (all(lessThan(ivec3(pos, gpos.z), out_limits))) {
+      imageStore(t_out, ivec3(pos, gpos.z), op(sum[i], out_min, out_max));
     }
   }
 }
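The accumulation that the truncated comment above describes sits in the elided hunk (old lines ~104-138) and is not shown in this diff. As a non-authoritative sketch of that pattern, reusing only the names already visible above (in_tex, ktex_0..ktex_3, sum[i]) rather than quoting the hidden lines, the per-tile update for one group of four input channels would look roughly like:

    // Hypothetical sketch, not the verbatim elided code: each component of
    // in_tex is one input channel (z .. z+3); ktex_k holds the prepacked
    // weights mapping input channel z+k to the four output channels packed
    // into this kernel texel, accumulated into the running sum for tile i.
    sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
    sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
    sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
    sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);

Note that the u16vec-to-ivec change in this diff only touches position and texel-index types; the vec4 accumulation itself is 32-bit float in both versions.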