Skip to content

Commit 2e6b7d6

Browse files
committed
removed all calls into PSTL internal logic. This is what was causing fails between beta08 and beta09.
Signed-off-by: todd.erdner <[email protected]>
1 parent 51a0543 commit 2e6b7d6

File tree

1 file changed

+127
-12
lines changed
  • DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src

1 file changed

+127
-12
lines changed

DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp

Lines changed: 127 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,126 @@ struct slice_area {
162162
};
163163
};
164164

165+
166+
// a way to get value_type from both accessors and USM that is needed for transform_init
167+
template <typename Unknown>
168+
struct accessor_traits
169+
{
170+
};
171+
172+
template <typename T, int Dim, sycl::access::mode AccMode, sycl::access::target AccTarget,
173+
sycl::access::placeholder Placeholder>
174+
struct accessor_traits<sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>>
175+
{
176+
using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
177+
};
178+
179+
template <typename RawArrayValueType>
180+
struct accessor_traits<RawArrayValueType*>
181+
{
182+
using value_type = RawArrayValueType;
183+
};
184+
185+
// calculate shift where we should start processing on current item
186+
template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
187+
SizeN
188+
calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n)
189+
{
190+
auto global_range_size = item_id.get_global_range().size();
191+
192+
auto start = n_iter * global_idx;
193+
auto global_shift = global_idx + n_iter * global_range_size;
194+
if (n_iter > 0 && global_shift > n)
195+
{
196+
start += n % global_range_size - global_idx;
197+
}
198+
else if (global_shift < n)
199+
{
200+
n_iter++;
201+
}
202+
return start;
203+
}
204+
205+
206+
template <typename ExecutionPolicy, typename Operation1, typename Operation2>
207+
struct transform_init
208+
{
209+
Operation1 binary_op;
210+
Operation2 unary_op;
211+
212+
template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal, typename... Acc>
213+
void
214+
operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem,
215+
const Acc&... acc)
216+
{
217+
auto local_idx = item_id.get_local_id(0);
218+
auto global_range_size = item_id.get_global_range().size();
219+
auto n_iter = n / global_range_size;
220+
auto start = calc_shift(item_id, global_idx, n_iter, n);
221+
auto shifted_global_idx = global_idx + start;
222+
223+
typename accessor_traits<AccLocal>::value_type res;
224+
if (global_idx < n)
225+
{
226+
res = unary_op(shifted_global_idx, acc...);
227+
}
228+
// Add neighbour to the current local_mem
229+
for (decltype(n_iter) i = 1; i < n_iter; ++i)
230+
{
231+
res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
232+
}
233+
if (global_idx < n)
234+
{
235+
local_mem[local_idx] = res;
236+
}
237+
}
238+
};
239+
240+
241+
// Reduce on local memory
242+
template <typename ExecutionPolicy, typename BinaryOperation1, typename Tp>
243+
struct reduce
244+
{
245+
BinaryOperation1 bin_op1;
246+
247+
template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal>
248+
Tp
249+
operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem)
250+
{
251+
auto local_idx = item_id.get_local_id(0);
252+
auto group_size = item_id.get_local_range().size();
253+
254+
auto k = 1;
255+
do
256+
{
257+
item_id.barrier(sycl::access::fence_space::local_space);
258+
if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n &&
259+
global_idx + k < n)
260+
{
261+
local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
262+
}
263+
k *= 2;
264+
} while (k < group_size);
265+
return local_mem[local_idx];
266+
}
267+
};
268+
269+
270+
// walk through the data
271+
template <typename ExecutionPolicy, typename F>
272+
struct walk_n
273+
{
274+
F f;
275+
276+
template <typename ItemId, typename... Ranges>
277+
auto
278+
operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...))
279+
{
280+
return f(rngs[idx]...);
281+
}
282+
};
283+
284+
165285
// This option uses a parallel for to fill the buffer and then
166286
// uses a tranform_init with plus/no_op and then
167287
// a local reduction then global reduction.
@@ -187,21 +307,18 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
187307
auto calc_begin = oneapi::dpl::begin(buf);
188308
auto calc_end = oneapi::dpl::end(buf);
189309

190-
using Functor = oneapi::dpl::unseq_backend::walk_n<Policy, my_no_op>;
310+
using Functor = walk_n<Policy, my_no_op>;
191311
float result;
192312

193313
// Functor will do nothing for tranform_init and will use plus for reduce.
194314
// In this example we have done the calculation and filled the buffer above
195315
// The way transform_init works is that you need to have the value already
196316
// populated in the buffer.
197-
auto tf_init =
198-
oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
199-
Functor>{std::plus<float>(),
200-
Functor{my_no_op()}};
317+
auto tf_init = transform_init<Policy, std::plus<float>,
318+
Functor>{std::plus<float>(), Functor{my_no_op()}};
201319

202320
auto combine = std::plus<float>();
203-
auto brick_reduce =
204-
oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
321+
auto brick_reduce = reduce<Policy, std::plus<float>, float>{
205322
std::plus<float>()};
206323
auto workgroup_size =
207324
policy.queue()
@@ -293,19 +410,17 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
293410
auto calc_begin = oneapi::dpl::begin(buf2);
294411
auto calc_end = oneapi::dpl::end(buf2);
295412

296-
using Functor2 = oneapi::dpl::unseq_backend::walk_n<Policy, slice_area>;
413+
using Functor2 = walk_n<Policy, slice_area>;
297414

298415
// The buffer has 1...num it at and now we will use that as an input
299416
// to the slice structue which will calculate the area of each
300417
// rectangle.
301-
auto tf_init =
302-
oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
418+
auto tf_init = transform_init<Policy, std::plus<float>,
303419
Functor2>{
304420
std::plus<float>(), Functor2{slice_area(num_steps)}};
305421

306422
auto combine = std::plus<float>();
307-
auto brick_reduce =
308-
oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
423+
auto brick_reduce = reduce<Policy, std::plus<float>, float>{
309424
std::plus<float>()};
310425

311426
// get workgroup_size from the device

0 commit comments

Comments
 (0)