@@ -341,16 +341,17 @@ constexpr unsigned loadstoreAlignMask() {
341
341
template <typename Ty, uint16_t AddressScale, int ImmOffset,
342
342
__ESIMD_ENS::lsc_data_size DS, __ESIMD_EDNS::lsc_vector_size VS,
343
343
__ESIMD_EDNS::lsc_data_order _Transposed, int N, uint32_t MASK>
344
- auto __esimd_emu_lsc_offset_read (
344
+ auto __esimd_emu_lsc_offset_read_merge (
345
345
__ESIMD_DNS::simd_mask_storage_t <N> Pred,
346
346
__ESIMD_DNS::vector_type_t <uint32_t , N> Offsets, char *ReadBase,
347
+ __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues,
347
348
int BufByteWidth = INT_MAX) {
348
349
// TODO : Support AddressScale, ImmOffset
349
350
static_assert (AddressScale == 1 );
350
351
static_assert (ImmOffset == 0 );
351
352
static_assert (DS != __ESIMD_ENS::lsc_data_size::u16u32h);
352
353
353
- __ESIMD_DNS:: vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> Output = 0 ;
354
+ auto Output = OldValues ;
354
355
355
356
constexpr int ChanlCount = __ESIMD_EDNS::to_int<VS>();
356
357
@@ -372,6 +373,10 @@ auto __esimd_emu_lsc_offset_read(
372
373
373
374
if ((ByteDistance >= 0 ) && (ByteDistance < BufByteWidth)) {
374
375
Output[VecIdx] = *((Ty *)(ReadBase + ByteDistance));
376
+ if constexpr (DS == __ESIMD_ENS::lsc_data_size::u8u32)
377
+ Output[VecIdx] &= 0xff ;
378
+ else if constexpr (DS == __ESIMD_ENS::lsc_data_size::u16u32)
379
+ Output[VecIdx] &= 0xffff ;
375
380
}
376
381
}
377
382
}
@@ -788,25 +793,49 @@ auto __esimd_emu_lsc_xatomic_offset_access_2(
788
793
// / @tparam N is the SIMD size of operation (the number of addresses to access)
789
794
// / @param pred is predicates.
790
795
// / @param offsets is the zero-based offsets for SLM buffer in bytes.
796
+ // / @param OldValues contains the vector whose elements are copied
797
+ // / to the returned result when the corresponding element of \p pred is 0.
791
798
// / @return is a vector of type T and size N * to_int<VS>()
792
799
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
793
800
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
794
801
__ESIMD_EDNS::lsc_vector_size VS,
795
802
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
796
803
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()>
797
- __esimd_lsc_load_slm (__ESIMD_DNS::simd_mask_storage_t <N> pred,
798
- __ESIMD_DNS::vector_type_t <uint32_t , N> offsets)
804
+ __esimd_lsc_load_merge_slm (
805
+ __ESIMD_DNS::simd_mask_storage_t <N> pred,
806
+ __ESIMD_DNS::vector_type_t <uint32_t , N> offsets,
807
+ __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
808
+ 0)
799
809
#ifdef __SYCL_DEVICE_ONLY__
800
810
;
801
811
#else // __SYCL_DEVICE_ONLY__
802
812
{
803
813
sycl::detail::ESIMDDeviceInterface *I =
804
814
sycl::detail::getESIMDDeviceInterface ();
805
815
806
- return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
807
- _Transposed, N,
808
- loadstoreAlignMask<Ty, VS, DS, N>()>(
809
- pred, offsets, I->__cm_emu_get_slm_ptr ());
816
+ return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
817
+ _Transposed, N,
818
+ loadstoreAlignMask<Ty, VS, DS, N>()>(
819
+ pred, offsets, I->__cm_emu_get_slm_ptr (), OldValues);
820
+ }
821
+ #endif // __SYCL_DEVICE_ONLY__
822
+
823
+ // / Similar to __esimd_lsc_load_merge_slm(), but the argument OldValues is not
824
+ // / explicitly specified, which results in random values in those elements of
825
+ // / the returned result for which the corresponding element in \p pred is 0.
826
+ template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
827
+ uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
828
+ __ESIMD_EDNS::lsc_vector_size VS,
829
+ __ESIMD_EDNS::lsc_data_order _Transposed, int N>
830
+ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()>
831
+ __esimd_lsc_load_slm (__ESIMD_DNS::simd_mask_storage_t <N> pred,
832
+ __ESIMD_DNS::vector_type_t <uint32_t , N> offsets)
833
+ #ifdef __SYCL_DEVICE_ONLY__
834
+ ;
835
+ #else // __SYCL_DEVICE_ONLY__
836
+ {
837
+ return __esimd_lsc_load_merge_slm<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
838
+ VS, _Transposed, N>(pred, offsets);
810
839
}
811
840
#endif // __SYCL_DEVICE_ONLY__
812
841
@@ -829,16 +858,20 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
829
858
// / @param pred is predicates.
830
859
// / @param offsets is the zero-based offsets in bytes.
831
860
// / @param surf_ind is the surface index.
861
+ // / @param OldValues contains the vector whose elements are copied
862
+ // / to the returned result when the corresponding element of \p pred is 0.
832
863
// / @return is a vector of type T and size N * to_int<VS>()
833
864
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
834
865
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
835
866
__ESIMD_EDNS::lsc_vector_size VS,
836
867
__ESIMD_EDNS::lsc_data_order _Transposed, int N,
837
868
typename SurfIndAliasTy>
838
869
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()>
839
- __esimd_lsc_load_bti (__ESIMD_DNS::simd_mask_storage_t <N> pred,
840
- __ESIMD_DNS::vector_type_t <uint32_t , N> offsets,
841
- SurfIndAliasTy surf_ind)
870
+ __esimd_lsc_load_merge_bti (
871
+ __ESIMD_DNS::simd_mask_storage_t <N> pred,
872
+ __ESIMD_DNS::vector_type_t <uint32_t , N> offsets, SurfIndAliasTy surf_ind,
873
+ __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
874
+ 0)
842
875
#ifdef __SYCL_DEVICE_ONLY__
843
876
;
844
877
#else // __SYCL_DEVICE_ONLY__
@@ -854,10 +887,32 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
854
887
855
888
std::lock_guard<std::mutex> lock (*mutexLock);
856
889
857
- return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
858
- _Transposed, N,
859
- loadstoreAlignMask<Ty, VS, DS, N>()>(
860
- pred, offsets, readBase, width);
890
+ return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
891
+ _Transposed, N,
892
+ loadstoreAlignMask<Ty, VS, DS, N>()>(
893
+ pred, offsets, readBase, OldValues, width);
894
+ }
895
+ #endif // __SYCL_DEVICE_ONLY__
896
+
897
+ // / Similar to __esimd_lsc_load_merge_bti(), but the argument OldValues is not
898
+ // / explicitly specified, which results in random values in those elements of
899
+ // / the returned result for which the corresponding element in \p pred is 0.
900
+ template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
901
+ uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
902
+ __ESIMD_EDNS::lsc_vector_size VS,
903
+ __ESIMD_EDNS::lsc_data_order _Transposed, int N,
904
+ typename SurfIndAliasTy>
905
+ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()>
906
+ __esimd_lsc_load_bti (__ESIMD_DNS::simd_mask_storage_t <N> pred,
907
+ __ESIMD_DNS::vector_type_t <uint32_t , N> offsets,
908
+ SurfIndAliasTy surf_ind)
909
+ #ifdef __SYCL_DEVICE_ONLY__
910
+ ;
911
+ #else // __SYCL_DEVICE_ONLY__
912
+ {
913
+ return __esimd_lsc_load_merge_bti<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
914
+ VS, _Transposed, N, SurfIndAliasTy>(
915
+ pred, offsets, surf_ind);
861
916
}
862
917
#endif // __SYCL_DEVICE_ONLY__
863
918
@@ -889,7 +944,8 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
889
944
__esimd_lsc_load_merge_stateless (
890
945
__ESIMD_DNS::simd_mask_storage_t <N> pred,
891
946
__ESIMD_DNS::vector_type_t <uintptr_t , N> addrs,
892
- __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values)
947
+ __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values =
948
+ 0)
893
949
#ifdef __SYCL_DEVICE_ONLY__
894
950
;
895
951
#else // __SYCL_DEVICE_ONLY__
@@ -957,10 +1013,8 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
957
1013
;
958
1014
#else // __SYCL_DEVICE_ONLY__
959
1015
{
960
- __ESIMD_DNS::vector_type_t <Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues = 0 ;
961
1016
return __esimd_lsc_load_merge_stateless<Ty, L1H, L3H, AddressScale, ImmOffset,
962
- DS, VS, _Transposed, N>(pred, addrs,
963
- OldValues);
1017
+ DS, VS, _Transposed, N>(pred, addrs);
964
1018
}
965
1019
#endif // __SYCL_DEVICE_ONLY__
966
1020
0 commit comments