@@ -264,15 +264,15 @@ using dpctl::tensor::sycl_utils::choose_workgroup_size;
264
264
265
265
template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
266
266
sycl::event
267
- boolean_reduction_contig_impl (sycl::queue exec_q,
268
- size_t iter_nelems,
269
- size_t reduction_nelems,
270
- const char *arg_cp,
271
- char *res_cp,
272
- py::ssize_t iter_arg_offset,
273
- py::ssize_t iter_res_offset,
274
- py::ssize_t red_arg_offset,
275
- const std::vector<sycl::event> &depends)
267
+ boolean_reduction_axis1_contig_impl (sycl::queue exec_q,
268
+ size_t iter_nelems,
269
+ size_t reduction_nelems,
270
+ const char *arg_cp,
271
+ char *res_cp,
272
+ py::ssize_t iter_arg_offset,
273
+ py::ssize_t iter_res_offset,
274
+ py::ssize_t red_arg_offset,
275
+ const std::vector<sycl::event> &depends)
276
276
{
277
277
const argTy *arg_tp = reinterpret_cast <const argTy *>(arg_cp) +
278
278
iter_arg_offset + red_arg_offset;
@@ -315,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
315
315
});
316
316
}
317
317
else {
318
- sycl::event init_ev = exec_q.submit ([&](sycl::handler &cgh) {
319
- using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
320
-
321
- IndexerT res_indexer{};
322
-
323
- cgh.depends_on (depends);
324
-
325
- cgh.parallel_for (sycl::range<1 >(iter_nelems), [=](sycl::id<1 > id) {
326
- auto res_offset = res_indexer (id[0 ]);
327
- res_tp[res_offset] = identity_val;
328
- });
329
- });
318
+ sycl::event init_ev = exec_q.fill <resTy>(res_tp, resTy (identity_val),
319
+ iter_nelems, depends);
330
320
red_ev = exec_q.submit ([&](sycl::handler &cgh) {
331
321
cgh.depends_on (init_ev);
332
322
@@ -356,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
356
346
return red_ev;
357
347
}
358
348
359
- template <typename fnT, typename srcTy> struct AllContigFactory
349
+ template <typename fnT, typename srcTy> struct AllAxis1ContigFactory
360
350
{
361
351
fnT get () const
362
352
{
@@ -365,12 +355,12 @@ template <typename fnT, typename srcTy> struct AllContigFactory
365
355
using GroupOpT =
366
356
all_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
367
357
368
- return dpctl::tensor::kernels::boolean_reduction_contig_impl <
358
+ return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl <
369
359
srcTy, resTy, RedOpT, GroupOpT>;
370
360
}
371
361
};
372
362
373
- template <typename fnT, typename srcTy> struct AnyContigFactory
363
+ template <typename fnT, typename srcTy> struct AnyAxis1ContigFactory
374
364
{
375
365
fnT get () const
376
366
{
@@ -379,7 +369,7 @@ template <typename fnT, typename srcTy> struct AnyContigFactory
379
369
using GroupOpT =
380
370
any_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
381
371
382
- return dpctl::tensor::kernels::boolean_reduction_contig_impl <
372
+ return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl <
383
373
srcTy, resTy, RedOpT, GroupOpT>;
384
374
}
385
375
};
@@ -463,6 +453,113 @@ struct StridedBooleanReduction
463
453
}
464
454
};
465
455
456
// Kernel-name tag for the axis-0 contiguous boolean reduction. It is only
// ever named (never defined); the six parameters mirror the template
// arguments of the functor submitted under this name.
template <typename ArgT,
          typename ResT,
          typename ReductionOpT,
          typename WorkGroupOpT,
          typename IterIndexerT,
          typename RedIndexerT>
class boolean_reduction_axis0_contig_krn;
463
+
464
+ template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
465
+ sycl::event
466
+ boolean_reduction_axis0_contig_impl (sycl::queue exec_q,
467
+ size_t iter_nelems,
468
+ size_t reduction_nelems,
469
+ const char *arg_cp,
470
+ char *res_cp,
471
+ py::ssize_t iter_arg_offset,
472
+ py::ssize_t iter_res_offset,
473
+ py::ssize_t red_arg_offset,
474
+ const std::vector<sycl::event> &depends)
475
+ {
476
+ const argTy *arg_tp = reinterpret_cast <const argTy *>(arg_cp) +
477
+ iter_arg_offset + red_arg_offset;
478
+ resTy *res_tp = reinterpret_cast <resTy *>(res_cp) + iter_res_offset;
479
+
480
+ constexpr resTy identity_val = sycl::known_identity<RedOpT, resTy>::value;
481
+
482
+ const sycl::device &d = exec_q.get_device ();
483
+ const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
484
+ size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
485
+
486
+ {
487
+ sycl::event init_ev = exec_q.fill <resTy>(res_tp, resTy (identity_val),
488
+ iter_nelems, depends);
489
+ sycl::event red_ev = exec_q.submit ([&](sycl::handler &cgh) {
490
+ cgh.depends_on (init_ev);
491
+
492
+ constexpr std::uint8_t dim = 1 ;
493
+
494
+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
495
+ using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
496
+ using InputOutputIterIndexerT =
497
+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
498
+ NoOpIndexerT, NoOpIndexerT>;
499
+ using ReductionIndexerT = ColsIndexerT;
500
+
501
+ NoOpIndexerT columns_indexer{};
502
+ NoOpIndexerT result_indexer{};
503
+ InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
504
+ result_indexer};
505
+ ReductionIndexerT reduction_indexer{
506
+ 0 , static_cast <py::ssize_t >(reduction_nelems),
507
+ static_cast <py::ssize_t >(iter_nelems)};
508
+
509
+ constexpr size_t preferred_reductions_per_wi = 4 ;
510
+ size_t reductions_per_wi =
511
+ (reduction_nelems < preferred_reductions_per_wi * wg)
512
+ ? ((reduction_nelems + wg - 1 ) / wg)
513
+ : preferred_reductions_per_wi;
514
+
515
+ size_t reduction_groups =
516
+ (reduction_nelems + reductions_per_wi * wg - 1 ) /
517
+ (reductions_per_wi * wg);
518
+
519
+ auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
520
+ auto lws = sycl::range<dim>{wg};
521
+
522
+ cgh.parallel_for <class boolean_reduction_axis0_contig_krn <
523
+ argTy, resTy, RedOpT, GroupOpT, InputOutputIterIndexerT,
524
+ ReductionIndexerT>>(
525
+ sycl::nd_range<dim>(gws, lws),
526
+ StridedBooleanReduction<argTy, resTy, RedOpT, GroupOpT,
527
+ InputOutputIterIndexerT,
528
+ ReductionIndexerT>(
529
+ arg_tp, res_tp, RedOpT (), GroupOpT (), identity_val,
530
+ in_out_iter_indexer, reduction_indexer, reduction_nelems,
531
+ iter_nelems, reductions_per_wi));
532
+ });
533
+ return red_ev;
534
+ }
535
+ }
536
+
537
+ template <typename fnT, typename srcTy> struct AllAxis0ContigFactory
538
+ {
539
+ fnT get () const
540
+ {
541
+ using resTy = std::int32_t ;
542
+ using RedOpT = sycl::logical_and<resTy>;
543
+ using GroupOpT = all_reduce_wg_strided<resTy>;
544
+
545
+ return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
546
+ srcTy, resTy, RedOpT, GroupOpT>;
547
+ }
548
+ };
549
+
550
+ template <typename fnT, typename srcTy> struct AnyAxis0ContigFactory
551
+ {
552
+ fnT get () const
553
+ {
554
+ using resTy = std::int32_t ;
555
+ using RedOpT = sycl::logical_or<resTy>;
556
+ using GroupOpT = any_reduce_wg_strided<resTy>;
557
+
558
+ return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
559
+ srcTy, resTy, RedOpT, GroupOpT>;
560
+ }
561
+ };
562
+
466
563
template <typename T1,
467
564
typename T2,
468
565
typename T3,
@@ -542,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
542
639
});
543
640
}
544
641
else {
545
- sycl::event res_init_ev = exec_q.submit ([&](sycl::handler &cgh) {
642
+ sycl::event init_ev = exec_q.submit ([&](sycl::handler &cgh) {
546
643
using IndexerT =
547
644
dpctl::tensor::offset_utils::UnpackedStridedIndexer;
548
645
@@ -560,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
560
657
});
561
658
});
562
659
red_ev = exec_q.submit ([&](sycl::handler &cgh) {
563
- cgh.depends_on (res_init_ev );
660
+ cgh.depends_on (init_ev );
564
661
565
662
constexpr std::uint8_t dim = 1 ;
566
663
0 commit comments