@@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
471
471
}
472
472
}
473
473
474
/// Returns the numeric data-size code for element type \p T, selected purely
/// by sizeof(T):
///   1 byte  -> 0
///   2 bytes -> 1
///   4 bytes -> 2
///   8 bytes -> 3
/// Callers in this file shift the result left by 9 and OR it into the message
/// descriptor (the `dataSizeMask` constants).
///
/// Any other element size is rejected at compile time. The previous version
/// used `static_assert(true, ...)`, which can never fire, and then fell off
/// the end of the function without returning a value.
template <typename T> constexpr uint32_t get_lsc_data_size() {
  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 ||
                    sizeof(T) == 8,
                " Unsupported data type.");
  switch (sizeof(T)) {
  case 1:
    return 0;
  case 2:
    return 1;
  case 4:
    return 2;
  default:
    // Only sizeof(T) == 8 can reach here thanks to the static_assert above.
    return 3;
  }
}
488
+
474
489
template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
475
490
constexpr uint32_t get_lsc_load_cache_mask () {
476
491
if constexpr (L1H == cache_hint::read_invalidate &&
@@ -1992,16 +2007,17 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
1992
2007
__ESIMD_API __ESIMD_NS::simd<T, N>
1993
2008
lsc_load_2d (const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
1994
2009
unsigned SurfacePitch, int X, int Y) {
2010
+ using RawT = __ESIMD_DNS::__raw_t <T>;
1995
2011
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
1996
- detail::check_lsc_block_2d_restrictions<T , BlockWidth, BlockHeight, NBlocks ,
1997
- Transposed, Transformed,
2012
+ detail::check_lsc_block_2d_restrictions<RawT , BlockWidth, BlockHeight,
2013
+ NBlocks, Transposed, Transformed,
1998
2014
detail::block_2d_op::load>();
1999
2015
// For Load BlockWidth is padded up to the next power-of-two value.
2000
2016
// For Load with Transpose the pre-operation BlockHeight is padded up
2001
2017
// to the next power-of-two value.
2002
2018
// For Load with Transform pre-operation BlockHeight is padded up to
2003
2019
// multiple of K, where K = 4B / sizeof(T).
2004
- constexpr int ElemsPerDword = 4 / sizeof (T );
2020
+ constexpr int ElemsPerDword = 4 / sizeof (RawT );
2005
2021
constexpr int GRFRowSize = Transposed ? BlockHeight
2006
2022
: Transformed ? BlockWidth * ElemsPerDword
2007
2023
: BlockWidth;
@@ -2013,7 +2029,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
2013
2029
: BlockHeight);
2014
2030
constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
2015
2031
constexpr int GRFBlockPitch =
2016
- detail::roundUpNextMultiple<64 / sizeof (T ), GRFBlockSize>();
2032
+ detail::roundUpNextMultiple<64 / sizeof (RawT ), GRFBlockSize>();
2017
2033
constexpr int ActualN = NBlocks * GRFBlockPitch;
2018
2034
2019
2035
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
@@ -2022,14 +2038,14 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
2022
2038
static_assert (N == ActualN || N == DstElements, " Incorrect element count" );
2023
2039
2024
2040
constexpr lsc_data_size DS =
2025
- detail::finalize_data_size<T , lsc_data_size::default_size>();
2041
+ detail::finalize_data_size<RawT , lsc_data_size::default_size>();
2026
2042
__ESIMD_NS::simd_mask<ActualN> pred = 1 ;
2027
2043
uintptr_t surf_addr = reinterpret_cast <uintptr_t >(Ptr);
2028
2044
constexpr detail::lsc_data_order _Transposed =
2029
2045
Transposed ? detail::lsc_data_order::transpose
2030
2046
: detail::lsc_data_order::nontranspose;
2031
- __ESIMD_NS::simd<T , ActualN> Raw =
2032
- __esimd_lsc_load2d_stateless<T , L1H, L3H, DS, _Transposed, NBlocks,
2047
+ __ESIMD_NS::simd<RawT , ActualN> Raw =
2048
+ __esimd_lsc_load2d_stateless<RawT , L1H, L3H, DS, _Transposed, NBlocks,
2033
2049
BlockWidth, BlockHeight, Transformed,
2034
2050
ActualN>(pred.data (), surf_addr,
2035
2051
SurfaceWidth, SurfaceHeight,
@@ -2055,16 +2071,17 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
2055
2071
// +----+----+----+----+----+----+-----+-----+
2056
2072
// * signifies the padded element.
2057
2073
2058
- __ESIMD_NS::simd<T , DstElements> Dst;
2074
+ __ESIMD_NS::simd<RawT , DstElements> Dst;
2059
2075
2060
2076
for (auto i = 0 ; i < NBlocks; i++) {
2061
2077
auto DstBlock =
2062
2078
Dst.template select <DstBlockElements, 1 >(i * DstBlockElements);
2063
2079
2064
2080
auto RawBlock = Raw.template select <GRFBlockSize, 1 >(i * GRFBlockPitch);
2065
- DstBlock = RawBlock.template bit_cast_view <T, GRFColSize, GRFRowPitch>()
2066
- .template select <GRFColSize, 1 , GRFRowSize, 1 >(0 , 0 )
2067
- .template bit_cast_view <T>();
2081
+ DstBlock =
2082
+ RawBlock.template bit_cast_view <RawT, GRFColSize, GRFRowPitch>()
2083
+ .template select <GRFColSize, 1 , GRFRowSize, 1 >(0 , 0 )
2084
+ .template bit_cast_view <RawT>();
2068
2085
}
2069
2086
2070
2087
return Dst;
@@ -2146,30 +2163,32 @@ template <typename T, int BlockWidth, int BlockHeight = 1,
2146
2163
__ESIMD_API void lsc_store_2d (T *Ptr, unsigned SurfaceWidth,
2147
2164
unsigned SurfaceHeight, unsigned SurfacePitch,
2148
2165
int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
2166
+ using RawT = __ESIMD_DNS::__raw_t <T>;
2149
2167
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2150
- detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1 , false ,
2151
- false , detail::block_2d_op::store>();
2168
+ detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1 ,
2169
+ false , false ,
2170
+ detail::block_2d_op::store>();
2152
2171
constexpr lsc_data_size DS =
2153
- detail::finalize_data_size<T , lsc_data_size::default_size>();
2172
+ detail::finalize_data_size<RawT , lsc_data_size::default_size>();
2154
2173
uintptr_t surf_addr = reinterpret_cast <uintptr_t >(Ptr);
2155
2174
constexpr detail::lsc_data_order _Transposed =
2156
2175
detail::lsc_data_order::nontranspose;
2157
2176
2158
2177
constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
2159
- __ESIMD_NS::simd<T , BlockHeight * Pitch> Raw;
2178
+ __ESIMD_NS::simd<RawT , BlockHeight * Pitch> Raw;
2160
2179
2161
2180
if constexpr (BlockHeight * Pitch == N) {
2162
2181
Raw = Vals;
2163
2182
} else {
2164
2183
// For store with padding, allocate the block with padding, and place
2165
2184
// original data there.
2166
- auto Data2D = Vals.template bit_cast_view <T , BlockHeight, BlockWidth>();
2167
- auto Raw2D = Raw.template bit_cast_view <T , BlockHeight, Pitch>();
2185
+ auto Data2D = Vals.template bit_cast_view <RawT , BlockHeight, BlockWidth>();
2186
+ auto Raw2D = Raw.template bit_cast_view <RawT , BlockHeight, Pitch>();
2168
2187
Raw2D.template select <BlockHeight, 1 , BlockWidth, 1 >(0 , 0 ) = Data2D;
2169
2188
}
2170
2189
2171
2190
__ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1 ;
2172
- __esimd_lsc_store2d_stateless<T , L1H, L3H, DS, _Transposed, 1u , BlockWidth,
2191
+ __esimd_lsc_store2d_stateless<RawT , L1H, L3H, DS, _Transposed, 1u , BlockWidth,
2173
2192
BlockHeight, false , BlockHeight * Pitch>(
2174
2193
pred.data (), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
2175
2194
Raw.data ());
@@ -2428,17 +2447,25 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
2428
2447
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
2429
2448
constexpr int DstElements = DstBlockElements * NBlocks;
2430
2449
2450
+ constexpr uint32_t GrfBytes = 64 ;
2451
+ constexpr uint32_t DstBlockSize =
2452
+ detail::roundUpNextMultiple<DstElements * sizeof (T), GrfBytes>();
2453
+ constexpr uint32_t DstLength =
2454
+ (DstBlockSize / GrfBytes) > 31 ? 31 : (DstBlockSize / GrfBytes);
2455
+ constexpr uint32_t DstLengthMask = DstLength << 20 ;
2456
+
2431
2457
static_assert (N == ActualN || N == DstElements, " Incorrect element count" );
2432
2458
2433
2459
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2434
2460
<< 17 ;
2435
- constexpr uint32_t base_desc = 0x2800403 ;
2461
+ constexpr uint32_t base_desc = 0x2000003 ;
2436
2462
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0 ;
2437
2463
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0 ;
2464
+ constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9 ;
2438
2465
__ESIMD_NS::simd<T, N> oldDst;
2439
2466
constexpr uint32_t exDesc = 0x0 ;
2440
- constexpr uint32_t desc =
2441
- base_desc | cache_mask | transformMask | transposeMask ;
2467
+ constexpr uint32_t desc = base_desc | cache_mask | transformMask |
2468
+ transposeMask | dataSizeMask | DstLengthMask ;
2442
2469
constexpr uint8_t execSize = 1 ;
2443
2470
constexpr uint8_t sfid = 0xF ;
2444
2471
constexpr uint8_t numSrc0 = 0x1 ;
@@ -2500,12 +2527,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
2500
2527
" Transposed and transformed is not supported" );
2501
2528
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
2502
2529
<< 17 ;
2503
- constexpr uint32_t base_desc = 0x2000403 ;
2530
+ constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9 ;
2531
+ constexpr uint32_t base_desc = 0x2000003 ;
2504
2532
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0 ;
2505
2533
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0 ;
2506
2534
constexpr uint32_t exDesc = 0x0 ;
2507
2535
constexpr uint32_t desc =
2508
- base_desc | cache_mask | transformMask | transposeMask;
2536
+ base_desc | cache_mask | transformMask | transposeMask | dataSizeMask ;
2509
2537
constexpr uint8_t execSize = 1 ;
2510
2538
constexpr uint8_t sfid = 0xF ;
2511
2539
constexpr uint8_t numDst = (N * sizeof (T)) / 64 ;
@@ -2542,10 +2570,11 @@ lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,
2542
2570
2543
2571
constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
2544
2572
<< 17 ;
2545
- constexpr uint32_t base_desc = 0x2000407 ;
2573
+ constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9 ;
2574
+ constexpr uint32_t base_desc = 0x2000007 ;
2546
2575
2547
2576
constexpr uint32_t exDesc = 0x0 ;
2548
- constexpr uint32_t desc = base_desc | cache_mask;
2577
+ constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask ;
2549
2578
constexpr uint8_t execSize = 1 ;
2550
2579
constexpr uint8_t sfid = 0xF ;
2551
2580
constexpr uint8_t numSrc0 = 0x1 ;
0 commit comments