@@ -676,14 +676,6 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
676
676
677
677
RedOutVar &getUserRedVar () { return MRedOut; }
678
678
679
- static inline result_type *getOutPointer (result_type *OutPtr) {
680
- return OutPtr;
681
- }
682
- template <class AccessorType >
683
- static inline result_type *getOutPointer (const AccessorType &OutAcc) {
684
- return OutAcc.get_pointer ().get ();
685
- }
686
-
687
679
private:
688
680
// Array reduction is performed element-wise to avoid stack growth, hence
689
681
// 1-dimensional always.
@@ -885,7 +877,7 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
885
877
for (size_t E = 0 ; E < NElements; ++E) {
886
878
Reducer.getElement (E) = GroupSum[E];
887
879
}
888
- Reducer.template atomic_combine (Reduction::getOutPointer ( Out) );
880
+ Reducer.template atomic_combine (& Out[ 0 ] );
889
881
}
890
882
});
891
883
return Reduction::is_usm || Redu.initializeToIdentity ();
@@ -937,12 +929,11 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
937
929
RedElem = reduce_over_group (Group, RedElem, BOp);
938
930
if (LID == 0 ) {
939
931
if (NWorkGroups == 1 ) {
940
- auto &OutElem = Reduction::getOutPointer (Out)[E];
941
932
// Can avoid using partial sum and write the final result
942
933
// immediately.
943
934
if (IsUpdateOfUserVar)
944
- RedElem = BOp (RedElem, OutElem );
945
- OutElem = RedElem;
935
+ RedElem = BOp (RedElem, Out[E] );
936
+ Out[E] = RedElem;
946
937
} else {
947
938
PartialSums[NDId.get_group_linear_id () * NElements + E] =
948
939
Reducer.getElement (E);
@@ -968,16 +959,15 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
968
959
// Reduce each result separately
969
960
// TODO: Opportunity to parallelize across elements.
970
961
for (int E = 0 ; E < NElements; ++E) {
971
- auto &OutElem = Reduction::getOutPointer (Out)[E];
972
962
auto LocalSum = Reducer.getIdentity ();
973
963
for (size_t I = LID; I < NWorkGroups; I += WGSize)
974
964
LocalSum = BOp (LocalSum, PartialSums[I * NElements + E]);
975
965
auto Result = reduce_over_group (Group, LocalSum, BOp);
976
966
977
967
if (LID == 0 ) {
978
968
if (IsUpdateOfUserVar)
979
- Result = BOp (Result, OutElem );
980
- OutElem = Result;
969
+ Result = BOp (Result, Out[E] );
970
+ Out[E] = Result;
981
971
}
982
972
}
983
973
}
@@ -1061,10 +1051,9 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1061
1051
if (LID == 0 ) {
1062
1052
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
1063
1053
if (NWorkGroups == 1 && IsUpdateOfUserVar)
1064
- V = BOp (V, Reduction::getOutPointer ( Out) [E]);
1054
+ V = BOp (V, Out[E]);
1065
1055
// if NWorkGroups == 1, then PartialSums and Out point to same memory.
1066
- Reduction::getOutPointer (
1067
- PartialSums)[NDId.get_group_linear_id () * NElements + E] = V;
1056
+ PartialSums[NDId.get_group_linear_id () * NElements + E] = V;
1068
1057
}
1069
1058
}
1070
1059
@@ -1085,9 +1074,7 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1085
1074
for (int E = 0 ; E < NElements; ++E) {
1086
1075
auto LocalSum = Identity;
1087
1076
for (size_t I = LID; I < NWorkGroups; I += WGSize)
1088
- LocalSum =
1089
- BOp (LocalSum,
1090
- Reduction::getOutPointer (PartialSums)[I * NElements + E]);
1077
+ LocalSum = BOp (LocalSum, PartialSums[I * NElements + E]);
1091
1078
1092
1079
LocalReds[LID] = LocalSum;
1093
1080
if (LID == 0 )
@@ -1106,8 +1093,8 @@ bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1106
1093
if (LID == 0 ) {
1107
1094
auto V = BOp (LocalReds[0 ], LocalReds[WGSize]);
1108
1095
if (IsUpdateOfUserVar)
1109
- V = BOp (V, Reduction::getOutPointer ( Out) [E]);
1110
- Reduction::getOutPointer ( Out) [E] = V;
1096
+ V = BOp (V, Out[E]);
1097
+ Out[E] = V;
1111
1098
}
1112
1099
}
1113
1100
}
@@ -1179,7 +1166,7 @@ void reduCGFuncForNDRangeBothFastReduceAndAtomics(handler &CGH,
1179
1166
reduce_over_group (NDIt.get_group (), Reducer.getElement (E), BOp);
1180
1167
}
1181
1168
if (NDIt.get_local_linear_id () == 0 )
1182
- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
1169
+ Reducer.atomic_combine (& Out[ 0 ] );
1183
1170
});
1184
1171
}
1185
1172
@@ -1260,7 +1247,7 @@ void reduCGFuncForNDRangeFastAtomicsOnly(handler &CGH, bool IsPow2WG,
1260
1247
}
1261
1248
1262
1249
if (LID == 0 ) {
1263
- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
1250
+ Reducer.atomic_combine (& Out[ 0 ] );
1264
1251
}
1265
1252
});
1266
1253
}
@@ -1306,8 +1293,8 @@ void reduCGFuncForNDRangeFastReduceOnly(handler &CGH, KernelType KernelFunc,
1306
1293
PSum = reduce_over_group (NDIt.get_group (), PSum, BOp);
1307
1294
if (NDIt.get_local_linear_id () == 0 ) {
1308
1295
if (IsUpdateOfUserVar)
1309
- PSum = BOp (Reduction::getOutPointer ( Out) [E], PSum);
1310
- Reduction::getOutPointer ( Out) [WGID * NElements + E] = PSum;
1296
+ PSum = BOp (Out[E], PSum);
1297
+ Out[WGID * NElements + E] = PSum;
1311
1298
}
1312
1299
}
1313
1300
});
@@ -1387,8 +1374,8 @@ void reduCGFuncForNDRangeBasic(handler &CGH, bool IsPow2WG,
1387
1374
typename Reduction::result_type PSum =
1388
1375
IsPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
1389
1376
if (IsUpdateOfUserVar)
1390
- PSum = BOp (*( Reduction::getOutPointer ( Out)) , PSum);
1391
- Reduction::getOutPointer ( Out) [GrID * NElements + E] = PSum;
1377
+ PSum = BOp (Out[ 0 ] , PSum);
1378
+ Out[GrID * NElements + E] = PSum;
1392
1379
}
1393
1380
1394
1381
// Ensure item 0 is finished with LocalReds before next iteration
@@ -1438,8 +1425,8 @@ void reduAuxCGFuncFastReduceImpl(handler &CGH, bool UniformWG,
1438
1425
PSum = reduce_over_group (NDIt.get_group (), PSum, BOp);
1439
1426
if (NDIt.get_local_linear_id () == 0 ) {
1440
1427
if (IsUpdateOfUserVar)
1441
- PSum = BOp (Reduction::getOutPointer ( Out) [E], PSum);
1442
- Reduction::getOutPointer ( Out) [WGID * NElements + E] = PSum;
1428
+ PSum = BOp (Out[E], PSum);
1429
+ Out[WGID * NElements + E] = PSum;
1443
1430
}
1444
1431
}
1445
1432
});
@@ -1515,8 +1502,8 @@ void reduAuxCGFuncNoFastReduceNorAtomicImpl(handler &CGH, bool UniformPow2WG,
1515
1502
typename Reduction::result_type PSum =
1516
1503
UniformPow2WG ? LocalReds[0 ] : BOp (LocalReds[0 ], LocalReds[WGSize]);
1517
1504
if (IsUpdateOfUserVar)
1518
- PSum = BOp (*( Reduction::getOutPointer ( Out)) , PSum);
1519
- Reduction::getOutPointer ( Out) [GrID * NElements + E] = PSum;
1505
+ PSum = BOp (Out[ 0 ] , PSum);
1506
+ Out[GrID * NElements + E] = PSum;
1520
1507
}
1521
1508
1522
1509
// Ensure item 0 is finished with LocalReds before next iteration
@@ -1738,24 +1725,20 @@ void writeReduSumsToOutAccs(
1738
1725
// Add the initial value of user's variable to the final result.
1739
1726
if (IsOneWG)
1740
1727
std::tie (std::get<Is>(LocalAccs)[0 ]...) = std::make_tuple (std::get<Is>(
1741
- BOPs)(std::get<Is>(LocalAccs)[0 ],
1742
- IsInitializeToIdentity[Is]
1743
- ? std::get<Is>(IdentityVals)
1744
- : std::tuple_element_t <Is, std::tuple<Reductions...>>::
1745
- getOutPointer (std::get<Is>(OutAccs))[0 ])...);
1728
+ BOPs)(std::get<Is>(LocalAccs)[0 ], IsInitializeToIdentity[Is]
1729
+ ? std::get<Is>(IdentityVals)
1730
+ : std::get<Is>(OutAccs)[0 ])...);
1746
1731
1747
1732
if (Pow2WG) {
1748
1733
// The partial sums for the work-group are stored in 0-th elements of local
1749
1734
// accessors. Simply write those sums to output accessors.
1750
- std::tie (std::tuple_element_t <Is, std::tuple<Reductions...>>::getOutPointer (
1751
- std::get<Is>(OutAccs))[OutAccIndex]...) =
1735
+ std::tie (std::get<Is>(OutAccs)[OutAccIndex]...) =
1752
1736
std::make_tuple (std::get<Is>(LocalAccs)[0 ]...);
1753
1737
} else {
1754
1738
// Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1755
1739
// elements. Combine them into final partial sums and write to output
1756
1740
// accessors.
1757
- std::tie (std::tuple_element_t <Is, std::tuple<Reductions...>>::getOutPointer (
1758
- std::get<Is>(OutAccs))[OutAccIndex]...) =
1741
+ std::tie (std::get<Is>(OutAccs)[OutAccIndex]...) =
1759
1742
std::make_tuple (std::get<Is>(BOPs)(std::get<Is>(LocalAccs)[0 ],
1760
1743
std::get<Is>(LocalAccs)[WGSize])...);
1761
1744
}
@@ -1922,23 +1905,21 @@ void reduCGFuncImplArrayHelper(bool Pow2WG, bool IsOneWG, nd_item<Dims> NDIt,
1922
1905
if (LID == 0 ) {
1923
1906
if (IsOneWG) {
1924
1907
LocalReds[0 ] =
1925
- BOp (LocalReds[0 ], IsInitializeToIdentity
1926
- ? Identity
1927
- : Reduction::getOutPointer (Out)[E]);
1908
+ BOp (LocalReds[0 ], IsInitializeToIdentity ? Identity : Out[E]);
1928
1909
}
1929
1910
1930
1911
size_t GrID = NDIt.get_group_linear_id ();
1931
- if (Pow2WG) {
1932
- // The partial sums for the work-group are stored in 0-th elements of
1933
- // local accessors. Simply write those sums to output accessors.
1934
- Reduction::getOutPointer (Out)[GrID * NElements + E] = LocalReds[ 0 ];
1935
- } else {
1936
- // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
1937
- // elements. Combine them into final partial sums and write to output
1938
- // accessors.
1939
- Reduction::getOutPointer (Out)[GrID * NElements + E] =
1940
- BOp (LocalReds[ 0 ], LocalReds[WGSize]);
1941
- }
1912
+ Out[GrID * NElements + E] =
1913
+ Pow2WG ?
1914
+ // The partial sums for the work-group are stored in 0-th
1915
+ // elements of local accessors. Simply write those sums to
1916
+ // output accessors.
1917
+ LocalReds[ 0 ]
1918
+ :
1919
+ // Each of local accessors keeps two partial sums: in 0-th
1920
+ // and WGsize-th elements. Combine them into final partial
1921
+ // sums and write to output accessors.
1922
+ BOp (LocalReds[ 0 ], LocalReds[WGSize]);
1942
1923
}
1943
1924
1944
1925
// Ensure item 0 is finished with LocalReds before next iteration
@@ -2080,7 +2061,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
2080
2061
}
2081
2062
2082
2063
if (NDIt.get_local_linear_id () == 0 ) {
2083
- Reducer.atomic_combine (Reduction::getOutPointer ( Out) );
2064
+ Reducer.atomic_combine (& Out[ 0 ] );
2084
2065
}
2085
2066
});
2086
2067
}
@@ -2189,23 +2170,21 @@ void reduAuxCGFuncImplArrayHelper(bool UniformPow2WG, bool IsOneWG,
2189
2170
if (LID == 0 ) {
2190
2171
if (IsOneWG) {
2191
2172
LocalReds[0 ] =
2192
- BOp (LocalReds[0 ], IsInitializeToIdentity
2193
- ? Identity
2194
- : Reduction::getOutPointer (Out)[E]);
2173
+ BOp (LocalReds[0 ], IsInitializeToIdentity ? Identity : Out[E]);
2195
2174
}
2196
2175
2197
2176
size_t GrID = NDIt.get_group_linear_id ();
2198
- if (UniformPow2WG) {
2199
- // The partial sums for the work-group are stored in 0-th elements of
2200
- // local accessors. Simply write those sums to output accessors.
2201
- Reduction::getOutPointer (Out)[GrID * NElements + E] = LocalReds[ 0 ];
2202
- } else {
2203
- // Each of local accessors keeps two partial sums: in 0-th and WGsize-th
2204
- // elements. Combine them into final partial sums and write to output
2205
- // accessors.
2206
- Reduction::getOutPointer (Out)[GrID * NElements + E] =
2207
- BOp (LocalReds[ 0 ], LocalReds[WGSize]);
2208
- }
2177
+ Out[GrID * NElements + E] =
2178
+ UniformPow2WG ?
2179
+ // The partial sums for the work-group are stored in
2180
+ // 0-th elements of local accessors. Simply write those
2181
+ // sums to output accessors.
2182
+ LocalReds[ 0 ]
2183
+ :
2184
+ // Each of local accessors keeps two partial sums: in
2185
+ // 0-th and WGsize-th elements. Combine them into final
2186
+ // partial sums and write to output accessors.
2187
+ BOp (LocalReds[ 0 ], LocalReds[WGSize]);
2209
2188
}
2210
2189
2211
2190
// Ensure item 0 is finished with LocalReds before next iteration
0 commit comments