12
12
13
13
#define VEC4_T ${texel_type(DTYPE)}
14
14
15
+ #define TILE_SIZE ${TILE_SIZE}
16
+
15
17
#define op(X, A, B) ${OPERATOR}
16
18
17
19
#include "indexing_utils.h"
@@ -65,11 +67,11 @@ void main() {
65
67
// +--------+--------+
66
68
// | pos[2] | pos[3] |
67
69
// +--------+--------+
68
- ivec3 pos[${ TILE_SIZE * TILE_SIZE} ];
69
- for (int y = 0 , i = 0 ; y < ${ TILE_SIZE} ; ++ y) {
70
- for (int x = 0 ; x < ${ TILE_SIZE} ; ++ x) {
70
+ ivec3 pos[TILE_SIZE * TILE_SIZE];
71
+ for (int y = 0 , i = 0 ; y < TILE_SIZE; ++ y) {
72
+ for (int x = 0 ; x < TILE_SIZE; ++ x) {
71
73
pos[i] = ivec3 (
72
- gpos.x * ${ TILE_SIZE} + x, gpos.y * ${ TILE_SIZE} + y, gpos.z);
74
+ gpos.x * TILE_SIZE + x, gpos.y * TILE_SIZE + y, gpos.z);
73
75
i++ ;
74
76
}
75
77
}
@@ -83,14 +85,14 @@ void main() {
83
85
// Compute the index of the input texture that needs to be loaded for each
84
86
// output position. Note that negative indices can be produced indicating that
85
87
// the top-left element is in a region added by padding.
86
- ivec2 ipos[${ TILE_SIZE * TILE_SIZE} ];
87
- for (int i = 0 ; i < ${ TILE_SIZE * TILE_SIZE} ; ++ i) {
88
+ ivec2 ipos[TILE_SIZE * TILE_SIZE];
89
+ for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
88
90
ipos[i] = pos[i].xy * stride - padding;
89
91
}
90
92
91
- vec4 sum[${ TILE_SIZE * TILE_SIZE} ];
93
+ vec4 sum[TILE_SIZE * TILE_SIZE];
92
94
sum[0 ] = texelFetch(bias_in, ivec2 (gpos.z, 0 ), 0 );
93
- for (int i = 1 ; i < ${ TILE_SIZE * TILE_SIZE} ; ++ i) {
95
+ for (int i = 1 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
94
96
sum[i] = sum[0 ];
95
97
}
96
98
@@ -99,17 +101,17 @@ void main() {
99
101
// During prepacking, the weight tensor has been permuted so that the
100
102
// channel (IC) dim is along the x-axis, and the batch (OC) dim is along
101
103
// the y-axis.
102
- vec4 in_tex[${ TILE_SIZE * TILE_SIZE} ];
104
+ vec4 in_tex[TILE_SIZE * TILE_SIZE];
103
105
const vec4 ktex_0 = texelFetch(kernel_in, ivec2 (z + 0 , gpos.z), 0 );
104
106
const vec4 ktex_1 = texelFetch(kernel_in, ivec2 (z + 1 , gpos.z), 0 );
105
107
const vec4 ktex_2 = texelFetch(kernel_in, ivec2 (z + 2 , gpos.z), 0 );
106
108
const vec4 ktex_3 = texelFetch(kernel_in, ivec2 (z + 3 , gpos.z), 0 );
107
109
108
- for (int i = 0 ; i < ${ TILE_SIZE * TILE_SIZE} ; ++ i) {
110
+ for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
109
111
in_tex[i] = texelFetch(image_in, ivec3 (ipos[i], z4), 0 );
110
112
}
111
113
112
- for (int i = 0 ; i < ${ TILE_SIZE * TILE_SIZE} ; ++ i) {
114
+ for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
113
115
// For 2x2 tile size algorithm works as follows.
114
116
// To explain the calculations below, the contents of one in_tex and the
115
117
// group of 4 texels loaded from kernel_in are shown:
@@ -150,7 +152,7 @@ void main() {
150
152
}
151
153
}
152
154
153
- for (int i = 0 ; i < ${ TILE_SIZE * TILE_SIZE} ; ++ i) {
155
+ for (int i = 0 ; i < TILE_SIZE * TILE_SIZE; ++ i) {
154
156
if (all (lessThan (pos[i], out_limits))) {
155
157
imageStore(image_out, pos[i], op(sum[i], out_min, out_max));
156
158
}
0 commit comments