/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_load.ugm
///
/// Accesses contiguous block of memory of `NElts * S` bytes starting from
/// given address, where S is a byte size of an "element" defined by the \c DS
/// template parameter. The maximum size of accessed block is 512 bytes for PVC
/// and 256 bytes for ACM (DG2).
/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
/// otherwise - 4-bytes aligned. Allowed values for the data size are
/// \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
/// 1, 2, 3, 4, 8, 16, 32, 64.
/// Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
/// must be 64.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
492
500
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
493
501
__ESIMD_API __ESIMD_NS::simd<T, NElts>
494
502
lsc_block_load (const T *p, __ESIMD_NS::simd_mask<1 > pred = 1 ) {
495
- detail::check_lsc_vector_size<NElts>();
496
503
detail::check_lsc_data_size<T, DS>();
497
504
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
498
505
constexpr uint16_t _AddressScale = 1 ;
499
506
constexpr int _ImmOffset = 0 ;
500
507
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
501
- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
502
- " Transposed load is supported only for data size u32 or u64" );
503
- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
504
508
constexpr detail::lsc_data_order _Transposed =
505
509
detail::lsc_data_order::transpose;
506
510
constexpr int N = 1 ;
507
511
__ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
508
- return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
509
- _VS, _Transposed, N>(pred.data (),
510
- addrs.data ());
512
+ constexpr int SmallIntFactor =
513
+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
514
+ static_assert (NElts % SmallIntFactor == 0 ,
515
+ " Number of elements is not supported by Transposed load" );
516
+
517
+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
518
+ constexpr detail::lsc_vector_size _VS =
519
+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
520
+ if constexpr (SmallIntFactor == 1 ) {
521
+ return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
522
+ _DS, _VS, _Transposed, N>(pred.data (),
523
+ addrs.data ());
524
+ } else {
525
+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
526
+ __esimd_lsc_load_stateless<uint32_t , L1H, L3H, _AddressScale,
527
+ _ImmOffset, lsc_data_size::u32 , _VS,
528
+ _Transposed, N>(pred.data (), addrs.data ());
529
+ return result.template bit_cast_view <T>();
530
+ }
511
531
}
512
532
513
533
/// Accessor-based transposed gather with 1 channel.
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_load.ugm
///
/// Collects elements located at surface and returns them
/// as a single \ref simd object.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
541
563
return lsc_block_load<T, NElts, DS, L1H, L3H>(
542
564
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
543
565
#else
544
- detail::check_lsc_vector_size<NElts>();
545
566
detail::check_lsc_data_size<T, DS>();
546
567
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
547
568
constexpr uint16_t _AddressScale = 1 ;
548
569
constexpr int _ImmOffset = 0 ;
549
570
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
550
- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
551
- " Transposed load is supported only for data size u32 or u64" );
552
- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
553
571
constexpr detail::lsc_data_order _Transposed =
554
572
detail::lsc_data_order::transpose;
555
573
constexpr int N = 1 ;
556
574
__ESIMD_NS::simd<uint32_t , N> offsets = offset;
557
575
auto si = __ESIMD_NS::get_surface_index (acc);
558
- return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
559
- _Transposed, N>(pred.data (), offsets.data (), si);
576
+ constexpr int SmallIntFactor =
577
+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
578
+ static_assert (NElts % SmallIntFactor == 0 ,
579
+ " Number of elements is not supported by Transposed load" );
580
+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
581
+ constexpr detail::lsc_vector_size _VS =
582
+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
583
+
584
+ if constexpr (SmallIntFactor == 1 ) {
585
+ return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
586
+ _VS, _Transposed, N>(pred.data (),
587
+ offsets.data (), si);
588
+ } else {
589
+
590
+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> result =
591
+ __esimd_lsc_load_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
592
+ lsc_data_size::u32 , _VS, _Transposed, N>(
593
+ pred.data (), offsets.data (), si);
594
+ return result.template bit_cast_view <T>();
595
+ }
560
596
#endif
561
597
}
562
598
@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
622
658
constexpr uint16_t _AddressScale = 1 ;
623
659
constexpr int _ImmOffset = 0 ;
624
660
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
661
+
625
662
static_assert (
626
663
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
627
664
" Transposed prefetch is supported only for data size u32 or u64" );
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
630
667
detail::lsc_data_order::transpose;
631
668
constexpr int N = 1 ;
632
669
__ESIMD_NS::simd_mask<N> pred = 1 ;
670
+
633
671
__ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
634
672
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
635
673
_VS, _Transposed, N>(pred.data (),
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to specific address.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -910,29 +950,44 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
910
950
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
911
951
__ESIMD_API void lsc_block_store (T *p, __ESIMD_NS::simd<T, NElts> vals,
912
952
__ESIMD_NS::simd_mask<1 > pred = 1 ) {
913
- detail::check_lsc_vector_size<NElts>();
914
953
detail::check_lsc_data_size<T, DS>();
915
954
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
916
955
constexpr uint16_t _AddressScale = 1 ;
917
956
constexpr int _ImmOffset = 0 ;
918
957
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
919
- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
920
- " Transposed store is supported only for data size u32 or u64" );
921
- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
922
958
constexpr detail::lsc_data_order _Transposed =
923
959
detail::lsc_data_order::transpose;
924
960
constexpr int N = 1 ;
925
961
__ESIMD_NS::simd<uintptr_t , N> addrs = reinterpret_cast <uintptr_t >(p);
926
- __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
927
- _Transposed, N>(pred.data (), addrs.data (),
928
- vals.data ());
962
+ constexpr int SmallIntFactor =
963
+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
964
+ static_assert (NElts % SmallIntFactor == 0 ,
965
+ " Number of elements is not supported by Transposed store" );
966
+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
967
+ constexpr detail::lsc_vector_size _VS =
968
+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
969
+ if constexpr (SmallIntFactor == 1 ) {
970
+
971
+ __esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
972
+ _VS, _Transposed, N>(pred.data (), addrs.data (),
973
+ vals.data ());
974
+ } else {
975
+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> tmp =
976
+ vals.template bit_cast_view <uint32_t >();
977
+
978
+ __esimd_lsc_store_stateless<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
979
+ lsc_data_size::u32 , _VS, _Transposed, N>(
980
+ pred.data (), addrs.data (), tmp.data ());
981
+ }
929
982
}
930
983
931
984
/// Accessor-based transposed scatter with 1 channel.
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to surface.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
958
1013
lsc_block_store<T, NElts, DS, L1H>(
959
1014
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
960
1015
#else
961
- detail::check_lsc_vector_size<NElts>();
962
1016
detail::check_lsc_data_size<T, DS>();
963
1017
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
964
1018
constexpr uint16_t _AddressScale = 1 ;
965
1019
constexpr int _ImmOffset = 0 ;
966
1020
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
967
- static_assert (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64 ,
968
- " Transposed store is supported only for data size u32 or u64" );
969
- constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
970
1021
constexpr detail::lsc_data_order _Transposed =
971
1022
detail::lsc_data_order::transpose;
972
1023
constexpr int N = 1 ;
1024
+
973
1025
__ESIMD_NS::simd<uint32_t , N> offsets = offset;
974
1026
auto si = __ESIMD_NS::get_surface_index (acc);
975
- __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
976
- _Transposed, N>(pred.data (), offsets.data (),
977
- vals.data (), si);
1027
+ constexpr int SmallIntFactor =
1028
+ (_DS == lsc_data_size::u16 ) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1 );
1029
+
1030
+ detail::check_lsc_vector_size<NElts / SmallIntFactor>();
1031
+ static_assert (NElts % SmallIntFactor == 0 ,
1032
+ " Number of elements is not supported by Transposed store" );
1033
+ constexpr detail::lsc_vector_size _VS =
1034
+ detail::to_lsc_vector_size<NElts / SmallIntFactor>();
1035
+ if constexpr (SmallIntFactor > 1 ) {
1036
+ __ESIMD_NS::simd<uint32_t , NElts / SmallIntFactor> Tmp =
1037
+ vals.template bit_cast_view <uint32_t >();
1038
+ __esimd_lsc_store_bti<uint32_t , L1H, L3H, _AddressScale, _ImmOffset,
1039
+ lsc_data_size::u32 , _VS, _Transposed, N>(
1040
+ pred.data (), offsets.data (), Tmp.data (), si);
1041
+ } else {
1042
+ __esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1043
+ _Transposed, N>(pred.data (), offsets.data (),
1044
+ vals.data (), si);
1045
+ }
978
1046
#endif
979
1047
}
980
1048
0 commit comments