Skip to content

Commit 4b5ad3c

Browse files
[NFCI][SYCL] Remove Reduction::getOutPointer (#7184)
Both USM pointers and accessors support operator[] directly, so the Reduction::getOutPointer helper is unnecessary.
1 parent f74664a commit 4b5ad3c

File tree

1 file changed

+50
-71
lines changed

1 file changed

+50
-71
lines changed

sycl/include/sycl/reduction.hpp

Lines changed: 50 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -676,14 +676,6 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
676676

677677
RedOutVar &getUserRedVar() { return MRedOut; }
678678

679-
static inline result_type *getOutPointer(result_type *OutPtr) {
680-
return OutPtr;
681-
}
682-
template <class AccessorType>
683-
static inline result_type *getOutPointer(const AccessorType &OutAcc) {
684-
return OutAcc.get_pointer().get();
685-
}
686-
687679
private:
688680
// Array reduction is performed element-wise to avoid stack growth, hence
689681
// 1-dimensional always.
@@ -885,7 +877,7 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
885877
for (size_t E = 0; E < NElements; ++E) {
886878
Reducer.getElement(E) = GroupSum[E];
887879
}
888-
Reducer.template atomic_combine(Reduction::getOutPointer(Out));
880+
Reducer.template atomic_combine(&Out[0]);
889881
}
890882
});
891883
return Reduction::is_usm || Redu.initializeToIdentity();
@@ -937,12 +929,11 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
937929
RedElem = reduce_over_group(Group, RedElem, BOp);
938930
if (LID == 0) {
939931
if (NWorkGroups == 1) {
940-
auto &OutElem = Reduction::getOutPointer(Out)[E];
941932
// Can avoid using partial sum and write the final result
942933
// immediately.
943934
if (IsUpdateOfUserVar)
944-
RedElem = BOp(RedElem, OutElem);
945-
OutElem = RedElem;
935+
RedElem = BOp(RedElem, Out[E]);
936+
Out[E] = RedElem;
946937
} else {
947938
PartialSums[NDId.get_group_linear_id() * NElements + E] =
948939
Reducer.getElement(E);
@@ -968,16 +959,15 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
968959
// Reduce each result separately
969960
// TODO: Opportunity to parallelize across elements.
970961
for (int E = 0; E < NElements; ++E) {
971-
auto &OutElem = Reduction::getOutPointer(Out)[E];
972962
auto LocalSum = Reducer.getIdentity();
973963
for (size_t I = LID; I < NWorkGroups; I += WGSize)
974964
LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
975965
auto Result = reduce_over_group(Group, LocalSum, BOp);
976966

977967
if (LID == 0) {
978968
if (IsUpdateOfUserVar)
979-
Result = BOp(Result, OutElem);
980-
OutElem = Result;
969+
Result = BOp(Result, Out[E]);
970+
Out[E] = Result;
981971
}
982972
}
983973
}
@@ -1061,10 +1051,9 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10611051
if (LID == 0) {
10621052
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
10631053
if (NWorkGroups == 1 && IsUpdateOfUserVar)
1064-
V = BOp(V, Reduction::getOutPointer(Out)[E]);
1054+
V = BOp(V, Out[E]);
10651055
// If NWorkGroups == 1, then PartialSums and Out point to the same memory.
1066-
Reduction::getOutPointer(
1067-
PartialSums)[NDId.get_group_linear_id() * NElements + E] = V;
1056+
PartialSums[NDId.get_group_linear_id() * NElements + E] = V;
10681057
}
10691058
}
10701059

@@ -1085,9 +1074,7 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10851074
for (int E = 0; E < NElements; ++E) {
10861075
auto LocalSum = Identity;
10871076
for (size_t I = LID; I < NWorkGroups; I += WGSize)
1088-
LocalSum =
1089-
BOp(LocalSum,
1090-
Reduction::getOutPointer(PartialSums)[I * NElements + E]);
1077+
LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
10911078

10921079
LocalReds[LID] = LocalSum;
10931080
if (LID == 0)
@@ -1106,8 +1093,8 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
11061093
if (LID == 0) {
11071094
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
11081095
if (IsUpdateOfUserVar)
1109-
V = BOp(V, Reduction::getOutPointer(Out)[E]);
1110-
Reduction::getOutPointer(Out)[E] = V;
1096+
V = BOp(V, Out[E]);
1097+
Out[E] = V;
11111098
}
11121099
}
11131100
}
@@ -1179,7 +1166,7 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
11791166
reduce_over_group(NDIt.get_group(), Reducer.getElement(E), BOp);
11801167
}
11811168
if (NDIt.get_local_linear_id() == 0)
1182-
Reducer.atomic_combine(Reduction::getOutPointer(Out));
1169+
Reducer.atomic_combine(&Out[0]);
11831170
});
11841171
}
11851172

@@ -1260,7 +1247,7 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, bool IsPow2WG,
12601247
}
12611248

12621249
if (LID == 0) {
1263-
Reducer.atomic_combine(Reduction::getOutPointer(Out));
1250+
Reducer.atomic_combine(&Out[0]);
12641251
}
12651252
});
12661253
}
@@ -1306,8 +1293,8 @@ void reduCGFuncForNDRangeFastReduceOnly(handler &CGH, KernelType KernelFunc,
13061293
PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
13071294
if (NDIt.get_local_linear_id() == 0) {
13081295
if (IsUpdateOfUserVar)
1309-
PSum = BOp(Reduction::getOutPointer(Out)[E], PSum);
1310-
Reduction::getOutPointer(Out)[WGID * NElements + E] = PSum;
1296+
PSum = BOp(Out[E], PSum);
1297+
Out[WGID * NElements + E] = PSum;
13111298
}
13121299
}
13131300
});
@@ -1387,8 +1374,8 @@ void reduCGFuncForNDRangeBasic(handler &CGH, bool IsPow2WG,
13871374
typename Reduction::result_type PSum =
13881375
IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
13891376
if (IsUpdateOfUserVar)
1390-
PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1391-
Reduction::getOutPointer(Out)[GrID * NElements + E] = PSum;
1377+
PSum = BOp(Out[0], PSum);
1378+
Out[GrID * NElements + E] = PSum;
13921379
}
13931380

13941381
// Ensure item 0 is finished with LocalReds before next iteration
@@ -1438,8 +1425,8 @@ void reduAuxCGFuncFastReduceImpl(handler &CGH, bool UniformWG,
14381425
PSum = reduce_over_group(NDIt.get_group(), PSum, BOp);
14391426
if (NDIt.get_local_linear_id() == 0) {
14401427
if (IsUpdateOfUserVar)
1441-
PSum = BOp(Reduction::getOutPointer(Out)[E], PSum);
1442-
Reduction::getOutPointer(Out)[WGID * NElements + E] = PSum;
1428+
PSum = BOp(Out[E], PSum);
1429+
Out[WGID * NElements + E] = PSum;
14431430
}
14441431
}
14451432
});
@@ -1515,8 +1502,8 @@ void reduAuxCGFuncNoFastReduceNorAtomicImpl(handler &CGH, bool UniformPow2WG,
15151502
typename Reduction::result_type PSum =
15161503
UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
15171504
if (IsUpdateOfUserVar)
1518-
PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
1519-
Reduction::getOutPointer(Out)[GrID * NElements + E] = PSum;
1505+
PSum = BOp(Out[0], PSum);
1506+
Out[GrID * NElements + E] = PSum;
15201507
}
15211508

15221509
// Ensure item 0 is finished with LocalReds before next iteration
@@ -1738,24 +1725,20 @@ void writeReduSumsToOutAccs(
17381725
// Add the initial value of user's variable to the final result.
17391726
if (IsOneWG)
17401727
std::tie(std::get<Is>(LocalAccs)[0]...) = std::make_tuple(std::get<Is>(
1741-
BOPs)(std::get<Is>(LocalAccs)[0],
1742-
IsInitializeToIdentity[Is]
1743-
? std::get<Is>(IdentityVals)
1744-
: std::tuple_element_t<Is, std::tuple<Reductions...>>::
1745-
getOutPointer(std::get<Is>(OutAccs))[0])...);
1728+
BOPs)(std::get<Is>(LocalAccs)[0], IsInitializeToIdentity[Is]
1729+
? std::get<Is>(IdentityVals)
1730+
: std::get<Is>(OutAccs)[0])...);
17461731

17471732
if (Pow2WG) {
17481733
// The partial sums for the work-group are stored in 0-th elements of local
17491734
// accessors. Simply write those sums to output accessors.
1750-
std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1751-
std::get<Is>(OutAccs))[OutAccIndex]...) =
1735+
std::tie(std::get<Is>(OutAccs)[OutAccIndex]...) =
17521736
std::make_tuple(std::get<Is>(LocalAccs)[0]...);
17531737
} else {
17541738
// Each of the local accessors keeps two partial sums: in 0-th and WGSize-th
17551739
// elements. Combine them into final partial sums and write to output
17561740
// accessors.
1757-
std::tie(std::tuple_element_t<Is, std::tuple<Reductions...>>::getOutPointer(
1758-
std::get<Is>(OutAccs))[OutAccIndex]...) =
1741+
std::tie(std::get<Is>(OutAccs)[OutAccIndex]...) =
17591742
std::make_tuple(std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0],
17601743
std::get<Is>(LocalAccs)[WGSize])...);
17611744
}
@@ -1922,23 +1905,21 @@ void reduCGFuncImplArrayHelper(bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
19221905
if (LID == 0) {
19231906
if (IsOneWG) {
19241907
LocalReds[0] =
1925-
BOp(LocalReds[0], IsInitializeToIdentity
1926-
? Identity
1927-
: Reduction::getOutPointer(Out)[E]);
1908+
BOp(LocalReds[0], IsInitializeToIdentity ? Identity : Out[E]);
19281909
}
19291910

19301911
size_t GrID = NDIt.get_group_linear_id();
1931-
if (Pow2WG) {
1932-
// The partial sums for the work-group are stored in 0-th elements of
1933-
// local accessors. Simply write those sums to output accessors.
1934-
Reduction::getOutPointer(Out)[GrID * NElements + E] = LocalReds[0];
1935-
} else {
1936-
// Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1937-
// elements. Combine them into final partial sums and write to output
1938-
// accessors.
1939-
Reduction::getOutPointer(Out)[GrID * NElements + E] =
1940-
BOp(LocalReds[0], LocalReds[WGSize]);
1941-
}
1912+
Out[GrID * NElements + E] =
1913+
Pow2WG ?
1914+
// The partial sums for the work-group are stored in 0-th
1915+
// elements of local accessors. Simply write those sums to
1916+
// output accessors.
1917+
LocalReds[0]
1918+
:
1919+
// Each of the local accessors keeps two partial sums: in 0-th
1920+
// and WGSize-th elements. Combine them into final partial
1921+
// sums and write to output accessors.
1922+
BOp(LocalReds[0], LocalReds[WGSize]);
19421923
}
19431924

19441925
// Ensure item 0 is finished with LocalReds before next iteration
@@ -2080,7 +2061,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
20802061
}
20812062

20822063
if (NDIt.get_local_linear_id() == 0) {
2083-
Reducer.atomic_combine(Reduction::getOutPointer(Out));
2064+
Reducer.atomic_combine(&Out[0]);
20842065
}
20852066
});
20862067
}
@@ -2189,23 +2170,21 @@ void reduAuxCGFuncImplArrayHelper(bool UniformPow2WG, bool IsOneWG,
21892170
if (LID == 0) {
21902171
if (IsOneWG) {
21912172
LocalReds[0] =
2192-
BOp(LocalReds[0], IsInitializeToIdentity
2193-
? Identity
2194-
: Reduction::getOutPointer(Out)[E]);
2173+
BOp(LocalReds[0], IsInitializeToIdentity ? Identity : Out[E]);
21952174
}
21962175

21972176
size_t GrID = NDIt.get_group_linear_id();
2198-
if (UniformPow2WG) {
2199-
// The partial sums for the work-group are stored in 0-th elements of
2200-
// local accessors. Simply write those sums to output accessors.
2201-
Reduction::getOutPointer(Out)[GrID * NElements + E] = LocalReds[0];
2202-
} else {
2203-
// Each of local accessors keeps two partial sums: in 0-th and WGsize-th
2204-
// elements. Combine them into final partial sums and write to output
2205-
// accessors.
2206-
Reduction::getOutPointer(Out)[GrID * NElements + E] =
2207-
BOp(LocalReds[0], LocalReds[WGSize]);
2208-
}
2177+
Out[GrID * NElements + E] =
2178+
UniformPow2WG ?
2179+
// The partial sums for the work-group are stored in
2180+
// 0-th elements of local accessors. Simply write those
2181+
// sums to output accessors.
2182+
LocalReds[0]
2183+
:
2184+
// Each of the local accessors keeps two partial sums: in
2185+
// 0-th and WGSize-th elements. Combine them into final
2186+
// partial sums and write to output accessors.
2187+
BOp(LocalReds[0], LocalReds[WGSize]);
22092188
}
22102189

22112190
// Ensure item 0 is finished with LocalReds before next iteration

0 commit comments

Comments
 (0)