@@ -649,25 +649,73 @@ inline ESIMD_NODEBUG void esimd_sbarrier(split_barrier_action flag) {
649
649
// / Declare per-work-group slm size.
650
650
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void slm_init (uint32_t size);
651
651
652
- // / SLM gather.
653
- // /
654
- // / Only allow simd-16 and simd-32.
655
- template <typename T, int n>
652
+ // / SLM gather (version for 4-byte block size).
653
+ // / \tparam T element type of the input vector.
654
+ // / \tparam N size of the \p offsets , \p pred and returned vectors. Must be 16
655
+ // / or 32.
656
+ // / @param offsets byte-offsets within the SLM.
657
+ // / @param pred predication control used for masking lanes.
658
+ // / @return vector of read values of type \p T.
659
+ // / \ingroup sycl_esimd
660
+ template <typename T, int N>
656
661
ESIMD_INLINE ESIMD_NODEBUG
657
- typename sycl::detail::enable_if_t <(n == 16 || n == 32 ) && (sizeof (T) < = 4 ),
658
- simd<T, n >>
659
- slm_load (simd<uint32_t , n > offsets, simd<uint16_t , n > pred = 1 ) {
660
- return __esimd_slm_read<T, n >(offsets.data (), pred.data ());
662
+ typename sycl::detail::enable_if_t <(N == 16 || N == 32 ) && (sizeof (T) = = 4 ),
663
+ simd<T, N >>
664
+ slm_load (simd<uint32_t , N > offsets, simd<uint16_t , N > pred = 1 ) {
665
+ return __esimd_slm_read<T, N >(offsets.data (), pred.data ());
661
666
}
662
667
663
- // / SLM scatter.
664
- template <typename T, int n>
668
+ // / SLM gather (version for 1- and 2-byte block size).
669
+ // / \tparam T element type of the input vector.
670
+ // / \tparam N size of the \p offsets , \p pred and returned vectors. Must be 16
671
+ // / or 32.
672
+ // / @param offsets byte-offsets within the SLM.
673
+ // / @param pred predication control used for masking lanes.
674
+ // / @return vector of read values of type \p T.
675
+ // / \ingroup sycl_esimd
676
+ template <typename T, int N>
677
+ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t <
678
+ (N == 16 || N == 32 ) && (sizeof (T) == 1 || sizeof (T) == 2 ), simd<T, N>>
679
+ slm_load (simd<uint32_t , N> offsets, simd<uint16_t , N> pred = 1 ) {
680
+ typedef typename detail::dword_type<T>::type T1;
681
+ simd<T1, N> temp = __esimd_slm_read<T1, N>(offsets.data (), pred.data ());
682
+ simd<T, N> res = temp;
683
+ return res;
684
+ }
685
+
686
+ // / SLM scatter (version for 4-byte block size).
687
+ // / \tparam T element type of the input vector.
688
+ // / \tparam N size of the \p offsets , \p pred and \p vals vectors. Must be 16
689
+ // / or 32.
690
+ // / @param vals values to be written.
691
+ // / @param offsets byte-offsets within the SLM.
692
+ // / @param pred predication control used for masking lanes.
693
+ // / \ingroup sycl_esimd
694
+ template <typename T, int N>
665
695
ESIMD_INLINE ESIMD_NODEBUG
666
- typename sycl::detail::enable_if_t <(n == 16 || n == 32 ) && (sizeof (T) < = 4 ),
696
+ typename sycl::detail::enable_if_t <(N == 16 || N == 32 ) && (sizeof (T) = = 4 ),
667
697
void >
668
- slm_store (simd<T, n> vals, simd<uint32_t , n> offsets,
669
- simd<uint16_t , n> pred = 1 ) {
670
- __esimd_slm_write<T, n>(offsets.data (), vals.data (), pred.data ());
698
+ slm_store (simd<T, N> vals, simd<uint32_t , N> offsets,
699
+ simd<uint16_t , N> pred = 1 ) {
700
+ __esimd_slm_write<T, N>(offsets.data (), vals.data (), pred.data ());
701
+ }
702
+
703
+ // / SLM scatter (version for 1- and 2-byte block size).
704
+ // / \tparam T element type of the input vector.
705
+ // / \tparam N size of the \p offsets , \p pred and \p vals vectors. Must be 16
706
+ // / or 32.
707
+ // / @param vals values to be written.
708
+ // / @param offsets byte-offsets within the SLM.
709
+ // / @param pred predication control used for masking lanes.
710
+ // / \ingroup sycl_esimd
711
+ template <typename T, int N>
712
+ ESIMD_INLINE ESIMD_NODEBUG typename sycl::detail::enable_if_t <
713
+ (N == 16 || N == 32 ) && (sizeof (T) == 1 || sizeof (T) == 2 ), void >
714
+ slm_store (simd<T, N> vals, simd<uint32_t , N> offsets,
715
+ simd<uint16_t , N> pred = 1 ) {
716
+ typedef typename detail::dword_type<T>::type T1;
717
+ simd<T1, N> temp = vals;
718
+ __esimd_slm_write<T1, N>(offsets.data (), temp.data (), pred.data ());
671
719
}
672
720
673
721
// / SLM gather4.
0 commit comments