@@ -468,6 +468,244 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
468
468
return Passed;
469
469
}
470
470
471
+ template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
472
+ bool UseProperties, typename PropertiesT>
473
+ bool testSLM (queue Q, uint32_t MaskStride, PropertiesT) {
474
+
475
+ static_assert (VS > 0 && N % VS == 0 ,
476
+ " Incorrect VS parameter. N must be divisible by VS." );
477
+ constexpr int NOffsets = N / VS;
478
+ static_assert (!UsePassThru || UseMask,
479
+ " PassThru cannot be used without using mask" );
480
+
481
+ constexpr uint32_t Groups = 8 ;
482
+ constexpr uint32_t Threads = 16 ;
483
+
484
+ std::cout << " Running slm_gather case: T=" << esimd_test::type_name<T>()
485
+ << " , N=" << N << " , VS=" << VS << " , MaskStride=" << MaskStride
486
+ << " , Groups=" << Groups << " , Threads=" << Threads
487
+ << " , use_mask=" << UseMask << " , use_pass_thru=" << UsePassThru
488
+ << " , use_properties=" << UseProperties << std::endl;
489
+
490
+ constexpr uint16_t Size = Groups * Threads * N;
491
+ using Tuint = esimd_test::uint_type_t <sizeof (T)>;
492
+
493
+ sycl::range<1 > GlobalRange{Groups};
494
+ sycl::range<1 > LocalRange{Threads};
495
+ sycl::nd_range<1 > Range{GlobalRange * LocalRange, LocalRange};
496
+
497
+ T *Out = sycl::malloc_shared<T>(Size, Q);
498
+ std::memset (Out, 0 , Size * sizeof (T));
499
+
500
+ T *In = sycl::malloc_shared<T>(Size * 2 , Q);
501
+ for (int I = 0 ; I < Size; I++)
502
+ In[I] = esimd_test::getRandomValue<T>();
503
+
504
+ try {
505
+ Q.parallel_for (Range, [=](sycl::nd_item<1 > NDI) SYCL_ESIMD_KERNEL {
506
+ uint16_t GlobalID = NDI.get_global_id (0 );
507
+ uint16_t LocalID = NDI.get_local_id (0 );
508
+ uint32_t GlobalElemOffset = GlobalID * N;
509
+ uint32_t LocalElemOffset = LocalID * N;
510
+
511
+ // Allocate a bit more to safely initialize it with 4-element chunks.
512
+ constexpr uint32_t SLMSize = (Threads * N + 8 ) * sizeof (T);
513
+ slm_init<SLMSize>();
514
+
515
+ if (LocalID == 0 ) {
516
+ for (int I = 0 ; I < Threads * N; I += 8 ) {
517
+ simd<T, 8 > InVec (In + GlobalElemOffset + I);
518
+ simd<uint32_t , 8 > offsets (I * sizeof (T), sizeof (T));
519
+ slm_scatter<T>(offsets, InVec);
520
+ }
521
+ }
522
+ barrier ();
523
+
524
+ PropertiesT Props{};
525
+
526
+ simd<uint32_t , NOffsets> ByteOffsets (LocalElemOffset * sizeof (T),
527
+ VS * sizeof (T));
528
+ simd_view ByteOffsetsView = ByteOffsets.template select <NOffsets, 1 >();
529
+
530
+ simd_mask<NOffsets> Pred;
531
+ for (int I = 0 ; I < NOffsets; I++)
532
+ Pred[I] = (I % MaskStride == 0 ) ? 1 : 0 ;
533
+
534
+ using Tuint = esimd_test::uint_type_t <sizeof (T)>;
535
+ simd<Tuint, N> PassThruInt (GlobalElemOffset, 1 );
536
+ simd<T, N> PassThru = PassThruInt.template bit_cast_view <T>();
537
+ auto PassThruView = PassThru.template select <N, 1 >(0 );
538
+
539
+ simd<T, N> Vals;
540
+ if constexpr (VS > 1 ) { // VS > 1 requires specifying <T, N, VS>
541
+ if constexpr (UsePassThru) {
542
+ if constexpr (UseProperties) {
543
+ if (GlobalID % 4 == 0 ) // ByteOffset - simd, PassThru - simd
544
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru, Props);
545
+ else if (GlobalID % 4 == 1 ) // ByteOffset - simd, PassThru - view
546
+ Vals =
547
+ slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView, Props);
548
+ else if (GlobalID % 4 == 2 ) // ByteOffset - view, PassThru - simd
549
+ Vals =
550
+ slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru, Props);
551
+ else // ByteOffset - view, PassThru - view
552
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView,
553
+ Props);
554
+ } else { // UseProperties is false
555
+ if (GlobalID % 4 == 0 ) // ByteOffset - simd, PassThru - simd
556
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru);
557
+ else if (GlobalID % 4 == 1 ) // ByteOffset - simd, PassThru - view
558
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView);
559
+ else if (GlobalID % 4 == 2 ) // ByteOffset - view, PassThru - simd
560
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru);
561
+ else // ByteOffset - view, PassThru - view
562
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView);
563
+ }
564
+ } else if constexpr (UseMask) { // UsePassThru is false
565
+ if constexpr (UseProperties) {
566
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
567
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, Props);
568
+ else // ByteOffset - simd_view
569
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, Props);
570
+ } else { // UseProperties is false
571
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
572
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Pred);
573
+ else // ByteOffset - simd_view
574
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred);
575
+ }
576
+ } else { // UseMask is false, UsePassThru is false
577
+ if constexpr (UseProperties) {
578
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
579
+ Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
580
+ else // ByteOffset - simd_view
581
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView, Props);
582
+ } else { // UseProperties is false
583
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
584
+ Vals = slm_gather<T, N, VS>(ByteOffsets);
585
+ else // ByteOffset - simd_view
586
+ Vals = slm_gather<T, N, VS>(ByteOffsetsView);
587
+ }
588
+ }
589
+ } else {
590
+ // if (VS == 1) then <T, N, VS> can often be omitted - test it here.
591
+ // The variants accepting simd_view for 'PassThru' operand though
592
+ // still require <T, N> to be specified explicitly to help
593
+ // C++ FE do simd to simd_view matching.
594
+ if constexpr (UsePassThru) {
595
+ if constexpr (UseProperties) {
596
+ if (GlobalID % 4 == 0 ) // ByteOffset - simd, PassThru - simd
597
+ Vals = slm_gather<T>(ByteOffsets, Pred, PassThru, Props);
598
+ else if (GlobalID % 4 == 1 ) // ByteOffset - simd, PassThru - view
599
+ Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView, Props);
600
+ else if (GlobalID % 4 == 2 ) // ByteOffset - view, PassThru - simd
601
+ Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru, Props);
602
+ else // ByteOffset - view, PassThru - view
603
+ Vals =
604
+ slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView, Props);
605
+ } else { // UseProperties is false
606
+ if (GlobalID % 4 == 0 ) // ByteOffset - simd, PassThru - simd
607
+ Vals = slm_gather<T>(ByteOffsets, Pred, PassThru);
608
+ else if (GlobalID % 4 == 1 ) // ByteOffset - simd, PassThru - view
609
+ Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView);
610
+ else if (GlobalID % 4 == 2 ) // ByteOffset - view, PassThru - simd
611
+ Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru);
612
+ else // ByteOffset - view, PassThru - view
613
+ Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView);
614
+ }
615
+ } else if constexpr (UseMask) { // UsePassThru is false
616
+ if constexpr (UseProperties) {
617
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
618
+ Vals = slm_gather<T>(ByteOffsets, Pred, Props);
619
+ else // ByteOffset - simd_view
620
+ Vals = slm_gather<T, N>(ByteOffsetsView, Pred, Props);
621
+ } else { // UseProperties is false
622
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
623
+ Vals = slm_gather<T>(ByteOffsets, Pred);
624
+ else // ByteOffset - simd_view
625
+ Vals = slm_gather<T, N>(ByteOffsetsView, Pred);
626
+ }
627
+ } else { // UsePassThru is false, UseMask is false
628
+ if constexpr (UseProperties) {
629
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
630
+ Vals = slm_gather<T>(ByteOffsets, Props);
631
+ else // ByteOffset - simd_view
632
+ Vals = slm_gather<T, N>(ByteOffsetsView, Props);
633
+ } else {
634
+ if (GlobalID % 2 == 0 ) // ByteOffset - simd
635
+ Vals = slm_gather<T>(ByteOffsets);
636
+ else // ByteOffset - simd_view
637
+ Vals = slm_gather<T, N>(ByteOffsetsView);
638
+ }
639
+ }
640
+ } // end if (VS == 1)
641
+ Vals.copy_to (Out + GlobalElemOffset);
642
+ }).wait ();
643
+ } catch (sycl::exception const &e) {
644
+ std::cout << " SYCL exception caught: " << e.what () << ' \n ' ;
645
+ sycl::free (In, Q);
646
+ sycl::free (Out, Q);
647
+ return false ;
648
+ }
649
+
650
+ bool Passed = verify (In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
651
+ if (!Passed)
652
+ std::cout << " Case FAILED" << std::endl;
653
+
654
+ sycl::free (In, Q);
655
+ sycl::free (Out, Q);
656
+ return Passed;
657
+ }
658
+
659
+ template <typename T, TestFeatures Features> bool testSLM (queue Q) {
660
+ constexpr bool UseMask = true ;
661
+ constexpr bool UsePassThru = true ;
662
+ constexpr bool UseProperties = true ;
663
+
664
+ properties AlignElemProps{alignment<sizeof (T)>};
665
+
666
+ bool Passed = true ;
667
+ Passed &= testSLM<T, 1 , 1 , !UseMask, !UsePassThru, !UseProperties>(
668
+ Q, 2 , AlignElemProps);
669
+ Passed &= testSLM<T, 2 , 1 , UseMask, !UsePassThru, !UseProperties>(
670
+ Q, 2 , AlignElemProps);
671
+ Passed &= testSLM<T, 4 , 1 , UseMask, !UsePassThru, !UseProperties>(
672
+ Q, 2 , AlignElemProps);
673
+ Passed &= testSLM<T, 8 , 1 , UseMask, !UsePassThru, UseProperties>(
674
+ Q, 3 , AlignElemProps);
675
+ // UsePassThru requires either DG2/PVC or support of llvm.masked.gather LLVM
676
+ // IR.
677
+ #ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
678
+ Passed &= testSLM<T, 16 , 1 , UseMask, UsePassThru, UseProperties>(
679
+ Q, 2 , AlignElemProps);
680
+ Passed &= testSLM<T, 32 , 1 , UseMask, UsePassThru, !UseProperties>(
681
+ Q, 3 , AlignElemProps);
682
+ #endif
683
+
684
+ // TODO: test non-power-of-2 N
685
+ // Such cases were promised to be supported, but in fact they fail.
686
+ // Create some test cases here after the issue in GPU driver is resolved.
687
+
688
+ if constexpr (Features == TestFeatures::PVC ||
689
+ Features == TestFeatures::DG2) {
690
+
691
+ // Check VS > 1. GPU supports only dwords and qwords in this mode.
692
+ if constexpr (sizeof (T) >= 4 ) {
693
+ // TODO: This test case causes flaky fail. Enable it after the issue
694
+ // in GPU driver is fixed.
695
+ // Passed &= testUSM<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
696
+ // Q, 3, AlignElemProps);
697
+
698
+ Passed &= testSLM<T, 32 , 2 , !UseMask, !UsePassThru, UseProperties>(
699
+ Q, 3 , AlignElemProps);
700
+ Passed &= testSLM<T, 32 , 2 , UseMask, !UsePassThru, UseProperties>(
701
+ Q, 3 , AlignElemProps);
702
+ Passed &= testSLM<T, 32 , 2 , UseMask, UsePassThru, UseProperties>(
703
+ Q, 3 , AlignElemProps);
704
+ }
705
+ }
706
+ return Passed;
707
+ }
708
+
471
709
template <typename T, TestFeatures Features> bool testACC (queue Q) {
472
710
constexpr bool UseMask = true ;
473
711
constexpr bool UsePassThru = true ;
@@ -478,12 +716,10 @@ template <typename T, TestFeatures Features> bool testACC(queue Q) {
478
716
bool Passed = true ;
479
717
Passed &= testACC<T, 1 , 1 , !UseMask, !UsePassThru, !UseProperties>(
480
718
Q, 2 , AlignElemProps);
481
- #ifdef __ESIMD_FORCE_STATELESS_MEM
482
719
Passed &= testACC<T, 2 , 1 , UseMask, !UsePassThru, !UseProperties>(
483
720
Q, 2 , AlignElemProps);
484
721
Passed &= testACC<T, 4 , 1 , UseMask, !UsePassThru, !UseProperties>(
485
722
Q, 2 , AlignElemProps);
486
- #endif // __ESIMD_FORCE_STATELESS_MEM
487
723
Passed &= testACC<T, 8 , 1 , UseMask, !UsePassThru, !UseProperties>(
488
724
Q, 3 , AlignElemProps);
489
725
Passed &= testACC<T, 16 , 1 , UseMask, !UsePassThru, UseProperties>(
0 commit comments