Skip to content

Commit 957d9d2

Browse files
authored
[CUDA][LIBCLC] Add bitwise reductions for CUDA (#5416)
Bitwise reductions are switched on for both SM80 and pre-SM80 (SMXX < SM80) architectures. This means all SM80 redux.sync instructions are now wired up. Tested by intel/llvm-test-suite#782. Signed-off-by: jack.kirk <[email protected]>
1 parent ec0385d commit 957d9d2

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ __clc__SubgroupBitwiseAny(uint op, bool predicate, bool *carry) {
151151
#define __CLC_MIN(x, y) ((x < y) ? (x) : (y))
152152
#define __CLC_MAX(x, y) ((x > y) ? (x) : (y))
153153
#define __CLC_OR(x, y) (x | y)
154+
#define __CLC_XOR(x, y) (x ^ y)
154155
#define __CLC_AND(x, y) (x & y)
155156
#define __CLC_MUL(x, y) (x * y)
156157

@@ -248,6 +249,13 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, half, -HALF_MAX)
248249
__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, float, -FLT_MAX)
249250
__CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX)
250251

252+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseAnd, __CLC_AND, and, uint, ~0)
253+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseOr, __CLC_OR, or, uint, 0)
254+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseXor, __CLC_XOR, xor, uint, 0)
255+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseAnd, __CLC_AND, and, int, ~0)
256+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseOr, __CLC_OR, or, int, 0)
257+
__CLC_SUBGROUP_COLLECTIVE_REDUX(NonUniformBitwiseXor, __CLC_XOR, xor, int, 0)
258+
251259
#undef __CLC_SUBGROUP_COLLECTIVE_BODY
252260
#undef __CLC_SUBGROUP_COLLECTIVE
253261
#undef __CLC_SUBGROUP_COLLECTIVE_REDUX
@@ -369,6 +377,13 @@ __CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, half, -HALF_MAX)
369377
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, float, -FLT_MAX)
370378
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX)
371379

380+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseAnd, __CLC_AND, uint, ~0)
381+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseOr, __CLC_OR, uint, 0)
382+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseXor, __CLC_XOR, uint, 0)
383+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseAnd, __CLC_AND, int, ~0)
384+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseOr, __CLC_OR, int, 0)
385+
__CLC_GROUP_COLLECTIVE(NonUniformBitwiseXor, __CLC_XOR, int, 0)
386+
372387
// half requires additional mangled entry points
373388
_CLC_DEF _CLC_CONVERGENT half _Z17__spirv_GroupFAddjjDF16_(uint scope, uint op,
374389
half x) {

0 commit comments

Comments
 (0)