@@ -43,8 +43,8 @@ layout(constant_id = 5) const int group_dim = 1;
43
43
// work group will write into its assigned element in the shared array.
44
44
#define MAX_NTHREADS 16
45
45
46
- shared vec4 shared_sum[MAX_NTHREADS];
47
- shared vec4 shared_sum_sq[MAX_NTHREADS];
46
+ shared VEC4_T shared_sum[MAX_NTHREADS];
47
+ shared VEC4_T shared_sum_sq[MAX_NTHREADS];
48
48
shared int shared_count[MAX_NTHREADS];
49
49
50
50
#include "indexing_utils.h"
@@ -53,9 +53,9 @@ int tid_to_smi(const ivec2 tid) {
53
53
return tid.x + tid.y * NWORKERS;
54
54
}
55
55
56
- vec4 calculate_variance(vec4 sum, vec4 sum_sq, int count) {
57
- vec4 mean = sum / float (count);
58
- vec4 variance = (sum_sq / float (count)) - (mean * mean);
56
+ VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) {
57
+ VEC4_T mean = sum / float (count);
58
+ VEC4_T variance = (sum_sq / float (count)) - (mean * mean);
59
59
60
60
if ((pc.unbiased != 0 ) && (count > 1 )) {
61
61
variance = variance * (float (count) / float (count - 1.0 ));
@@ -68,14 +68,14 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
68
68
// shared memory index of this thread
69
69
const int smi = tid_to_smi(tid);
70
70
71
- vec4 sum = VEC4_T(0 );
72
- vec4 sum_sq = VEC4_T(0 );
71
+ VEC4_T sum = VEC4_T(0 );
72
+ VEC4_T sum_sq = VEC4_T(0 );
73
73
int count = 0 ;
74
74
75
75
scan_pos[reduce_dim] = tid.x;
76
76
for (int i = tid.x; i < tin_sizes[reduce_dim];
77
77
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
78
- vec4 val = load_texel(tin, scan_pos);
78
+ VEC4_T val = load_texel(tin, scan_pos);
79
79
sum += val;
80
80
sum_sq += val * val;
81
81
count += 1 ;
@@ -109,7 +109,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
109
109
const bool is_last_texel =
110
110
scan_pos[packed_dim] == (tin_limits[packed_dim] - 1 );
111
111
112
- vec4 variance = calculate_variance(sum, sum_sq, count);
112
+ VEC4_T variance = calculate_variance(sum, sum_sq, count);
113
113
114
114
// Explicitly set padding elements to 0
115
115
if (is_last_texel && nspill > 0 ) {
@@ -141,16 +141,16 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
141
141
// handled specially if it has padding elements.
142
142
const int reduce_len = tin_sizes[packed_dim] - nspill;
143
143
144
- vec4 sum = VEC4_T(0 );
145
- vec4 sum_sq = VEC4_T(0 );
144
+ VEC4_T sum = VEC4_T(0 );
145
+ VEC4_T sum_sq = VEC4_T(0 );
146
146
int count = 0 ;
147
147
148
148
// Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
149
149
// the reduction row
150
150
scan_pos[reduce_dim] = tid.x;
151
151
for (int i = tid.x * 4 ; i < reduce_len;
152
152
i += NWORKERS * 4 , scan_pos[reduce_dim] += NWORKERS) {
153
- vec4 val = load_texel(tin, scan_pos);
153
+ VEC4_T val = load_texel(tin, scan_pos);
154
154
sum += val;
155
155
sum_sq += val * val;
156
156
count += 4 ;
@@ -159,7 +159,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
159
159
// element of the texel needs to be processed individually such that the
160
160
// padding elements are ignored
161
161
if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0 ) {
162
- const vec4 val = load_texel(tin, scan_pos);
162
+ const VEC4_T val = load_texel(tin, scan_pos);
163
163
for (int i = 0 ; i < nspill; i++ ) {
164
164
sum.x += val[i];
165
165
sum_sq.x += val[i] * val[i];
@@ -198,7 +198,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
198
198
}
199
199
200
200
scan_pos[reduce_dim] = tid.x;
201
- write_texel(tout, scan_pos, vec4 (variance, 0 , 0 , 0 ));
201
+ write_texel(tout, scan_pos, VEC4_T (variance, 0 , 0 , 0 ));
202
202
}
203
203
}
204
204
0 commit comments