@@ -486,14 +486,18 @@ void dpnp_rng_negative_binomial_c(void* result, const double a, const double p,
486
486
event_out.wait ();
487
487
}
488
488
489
+ template <typename _KernelNameSpecialization> // type parameter lets each instantiation act as a distinct kernel name
490
+ class dpnp_rng_noncentral_chisquare_c_kernel1 ; // forward declaration only: kernel-name tag passed to parallel_for for the kernel that fills idx[i] = i
491
+ template <typename _KernelNameSpecialization> // type parameter lets each instantiation act as a distinct kernel name
492
+ class dpnp_rng_noncentral_chisquare_c_kernel2 ; // forward declaration only: kernel-name tag passed to parallel_for for the kernel that writes tmp[] into result1[] through idx[]
489
493
template <typename _DataType>
490
494
void dpnp_rng_noncentral_chisquare_c (void * result, const _DataType df, const _DataType nonc, const size_t size)
491
495
{
492
496
if (!size || !result)
493
497
{
494
498
return ;
495
499
}
496
- DPNPC_ptr_adapter<_DataType> result1_ptr (result, size, true , true );
500
+ DPNPC_ptr_adapter<_DataType> result1_ptr (result, size, false , true );
497
501
_DataType* result1 = result1_ptr.get_ptr ();
498
502
499
503
const _DataType d_zero = _DataType (0.0 );
@@ -540,14 +544,23 @@ void dpnp_rng_noncentral_chisquare_c(void* result, const _DataType df, const _Da
540
544
event_out.wait ();
541
545
542
546
shape = 0.5 * df;
543
-
544
547
if (0.125 * size > sqrt (lambda))
545
548
{
546
549
size_t * idx = nullptr ;
547
550
_DataType* tmp = nullptr ;
548
551
idx = reinterpret_cast <size_t *>(dpnp_memory_alloc_c (size * sizeof (size_t )));
549
- for (i = 0 ; i < size; i++)
552
+
553
+ cl::sycl::range<1 > gws1 (size);
554
+ auto kernel_parallel_for_func1 = [=](cl::sycl::id<1 > global_id) {
555
+ size_t i = global_id[0 ];
550
556
idx[i] = i;
557
+ };
558
+ auto kernel_func1 = [&](cl::sycl::handler& cgh) {
559
+ cgh.parallel_for <class dpnp_rng_noncentral_chisquare_c_kernel1 <_DataType>>(gws1,
560
+ kernel_parallel_for_func1);
561
+ };
562
+ event_out = DPNP_QUEUE.submit (kernel_func1);
563
+ event_out.wait ();
551
564
552
565
std::sort (idx, idx + size, [pvec](size_t i1, size_t i2) { return pvec[i1] < pvec[i2]; });
553
566
/* idx now contains original indexes of ordered Poisson outputs */
@@ -556,14 +569,13 @@ void dpnp_rng_noncentral_chisquare_c(void* result, const _DataType df, const _Da
556
569
tmp = reinterpret_cast <_DataType*>(dpnp_memory_alloc_c (size * sizeof (_DataType)));
557
570
for (i = 0 ; i < size;)
558
571
{
559
- size_t k, j;
572
+ size_t j;
560
573
int cv = pvec[idx[i]];
561
-
562
574
// TODO vectorize
563
575
for (j = i + 1 ; (j < size) && (pvec[idx[j]] == cv); j++)
564
576
{
565
577
}
566
- // assert(j > i);
578
+
567
579
if (j <= i)
568
580
{
569
581
throw std::runtime_error (" DPNP RNG Error: dpnp_rng_noncentral_chisquare_c() failed." );
@@ -572,13 +584,20 @@ void dpnp_rng_noncentral_chisquare_c(void* result, const _DataType df, const _Da
572
584
event_out = mkl_rng::generate (gamma_distribution, DPNP_RNG_ENGINE, j - i, tmp);
573
585
event_out.wait ();
574
586
575
- // TODO vectorize
576
- for (k = i; k < j; k++)
577
- result1[idx[k]] = tmp[k - i];
587
+ cl::sycl::range<1 > gws2 (j - i);
588
+ auto kernel_parallel_for_func2 = [=](cl::sycl::id<1 > global_id) {
589
+ size_t index = global_id[0 ];
590
+ result1[idx[index + i]] = tmp[index];
591
+ };
592
+ auto kernel_func2 = [&](cl::sycl::handler& cgh) {
593
+ cgh.parallel_for <class dpnp_rng_noncentral_chisquare_c_kernel2 <_DataType>>(gws2,
594
+ kernel_parallel_for_func2);
595
+ };
596
+ event_out = DPNP_QUEUE.submit (kernel_func2);
597
+ event_out.wait ();
578
598
579
599
i = j;
580
600
}
581
-
582
601
dpnp_memory_free_c (tmp);
583
602
dpnp_memory_free_c (idx);
584
603
}
0 commit comments