Skip to content

Commit bd0edfe

Browse files
[SYCL][CUDA] Improve group scan/reduce algorithms performance (#2902)
The proposed changes should improve the general performance of group inclusive/exclusive scans and reductions on the NVIDIA backend. The main idea is to replace local memory usage and barriers with some additional computation.
1 parent c55eb20 commit bd0edfe

File tree

1 file changed

+9
-13
lines changed

1 file changed

+9
-13
lines changed

libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -239,29 +239,25 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX)
239239
} \
240240
__spirv_ControlBarrier(Workgroup, 0, 0); \
241241
/* Perform InclusiveScan over sub-group results */ \
242-
/* FIXME: Ideally, use an alternative algorithm that doesn't require two \
243-
* calls to __syncthreads() */ \
244-
for (int o = 1; o < num_sg; o *= 2) { \
245-
TYPE contribution = IDENTITY; \
246-
if (sg_id >= o && sg_lid == 0) { \
247-
contribution = scratch[sg_id - o]; \
242+
TYPE sg_prefix; \
243+
TYPE sg_aggregate = scratch[0]; \
244+
_Pragma("unroll") for (int s = 1; s < num_sg; ++s) { \
245+
if (sg_id == s) { \
246+
sg_prefix = sg_aggregate; \
248247
} \
249-
__spirv_ControlBarrier(Workgroup, 0, 0); \
250-
if (sg_id >= o && sg_lid == 0) { \
251-
scratch[sg_id] = OP(scratch[sg_id], contribution); \
252-
} \
253-
__spirv_ControlBarrier(Workgroup, 0, 0); \
248+
TYPE addend = scratch[s]; \
249+
sg_aggregate = OP(sg_aggregate, addend); \
254250
} \
255251
/* For Reduce, broadcast result from final sub-group */ \
256252
/* For Scan, combine results from previous sub-groups */ \
257253
TYPE result; \
258254
if (op == Reduce) { \
259-
result = scratch[num_sg - 1]; \
255+
result = sg_aggregate; \
260256
} else if (op == InclusiveScan || op == ExclusiveScan) { \
261257
if (sg_id == 0) { \
262258
result = sg_x; \
263259
} else { \
264-
result = OP(sg_x, scratch[sg_id - 1]); \
260+
result = OP(sg_x, sg_prefix); \
265261
} \
266262
} \
267263
return result; \

0 commit comments

Comments
 (0)