Skip to content

Commit 0913045

Browse files
authored
[SYCL][ESIMD] Fix lsc_load_2d API issue that prevented usage for different types (#12244)
1 parent 865092f commit 0913045

File tree

3 files changed

+180
-57
lines changed

3 files changed

+180
-57
lines changed

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,21 @@ lsc_format_ret(__ESIMD_NS::simd<T1, N> Vals) {
471471
}
472472
}
473473

474+
template <typename T> constexpr uint32_t get_lsc_data_size() {
475+
switch (sizeof(T)) {
476+
case 1:
477+
return 0;
478+
case 2:
479+
return 1;
480+
case 4:
481+
return 2;
482+
case 8:
483+
return 3;
484+
default:
485+
static_assert(true, "Unsupported data type.");
486+
}
487+
}
488+
474489
template <cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
475490
constexpr uint32_t get_lsc_load_cache_mask() {
476491
if constexpr (L1H == cache_hint::read_invalidate &&
@@ -1992,16 +2007,17 @@ template <typename T, int BlockWidth, int BlockHeight = 1, int NBlocks = 1,
19922007
__ESIMD_API __ESIMD_NS::simd<T, N>
19932008
lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
19942009
unsigned SurfacePitch, int X, int Y) {
2010+
using RawT = __ESIMD_DNS::__raw_t<T>;
19952011
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
1996-
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, NBlocks,
1997-
Transposed, Transformed,
2012+
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight,
2013+
NBlocks, Transposed, Transformed,
19982014
detail::block_2d_op::load>();
19992015
// For Load BlockWidth is padded up to the next power-of-two value.
20002016
// For Load with Transpose the pre-operation BlockHeight is padded up
20012017
// to the next power-of-two value.
20022018
// For Load with Transform pre-operation BlockHeight is padded up to
20032019
// multiple of K, where K = 4B / sizeof(T).
2004-
constexpr int ElemsPerDword = 4 / sizeof(T);
2020+
constexpr int ElemsPerDword = 4 / sizeof(RawT);
20052021
constexpr int GRFRowSize = Transposed ? BlockHeight
20062022
: Transformed ? BlockWidth * ElemsPerDword
20072023
: BlockWidth;
@@ -2013,7 +2029,7 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
20132029
: BlockHeight);
20142030
constexpr int GRFBlockSize = GRFRowPitch * GRFColSize;
20152031
constexpr int GRFBlockPitch =
2016-
detail::roundUpNextMultiple<64 / sizeof(T), GRFBlockSize>();
2032+
detail::roundUpNextMultiple<64 / sizeof(RawT), GRFBlockSize>();
20172033
constexpr int ActualN = NBlocks * GRFBlockPitch;
20182034

20192035
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
@@ -2022,14 +2038,14 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
20222038
static_assert(N == ActualN || N == DstElements, "Incorrect element count");
20232039

20242040
constexpr lsc_data_size DS =
2025-
detail::finalize_data_size<T, lsc_data_size::default_size>();
2041+
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
20262042
__ESIMD_NS::simd_mask<ActualN> pred = 1;
20272043
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
20282044
constexpr detail::lsc_data_order _Transposed =
20292045
Transposed ? detail::lsc_data_order::transpose
20302046
: detail::lsc_data_order::nontranspose;
2031-
__ESIMD_NS::simd<T, ActualN> Raw =
2032-
__esimd_lsc_load2d_stateless<T, L1H, L3H, DS, _Transposed, NBlocks,
2047+
__ESIMD_NS::simd<RawT, ActualN> Raw =
2048+
__esimd_lsc_load2d_stateless<RawT, L1H, L3H, DS, _Transposed, NBlocks,
20332049
BlockWidth, BlockHeight, Transformed,
20342050
ActualN>(pred.data(), surf_addr,
20352051
SurfaceWidth, SurfaceHeight,
@@ -2055,16 +2071,17 @@ lsc_load_2d(const T *Ptr, unsigned SurfaceWidth, unsigned SurfaceHeight,
20552071
// +----+----+----+----+----+----+-----+-----+
20562072
// * signifies the padded element.
20572073

2058-
__ESIMD_NS::simd<T, DstElements> Dst;
2074+
__ESIMD_NS::simd<RawT, DstElements> Dst;
20592075

20602076
for (auto i = 0; i < NBlocks; i++) {
20612077
auto DstBlock =
20622078
Dst.template select<DstBlockElements, 1>(i * DstBlockElements);
20632079

20642080
auto RawBlock = Raw.template select<GRFBlockSize, 1>(i * GRFBlockPitch);
2065-
DstBlock = RawBlock.template bit_cast_view<T, GRFColSize, GRFRowPitch>()
2066-
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2067-
.template bit_cast_view<T>();
2081+
DstBlock =
2082+
RawBlock.template bit_cast_view<RawT, GRFColSize, GRFRowPitch>()
2083+
.template select<GRFColSize, 1, GRFRowSize, 1>(0, 0)
2084+
.template bit_cast_view<RawT>();
20682085
}
20692086

20702087
return Dst;
@@ -2146,30 +2163,32 @@ template <typename T, int BlockWidth, int BlockHeight = 1,
21462163
__ESIMD_API void lsc_store_2d(T *Ptr, unsigned SurfaceWidth,
21472164
unsigned SurfaceHeight, unsigned SurfacePitch,
21482165
int X, int Y, __ESIMD_NS::simd<T, N> Vals) {
2166+
using RawT = __ESIMD_DNS::__raw_t<T>;
21492167
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
2150-
detail::check_lsc_block_2d_restrictions<T, BlockWidth, BlockHeight, 1, false,
2151-
false, detail::block_2d_op::store>();
2168+
detail::check_lsc_block_2d_restrictions<RawT, BlockWidth, BlockHeight, 1,
2169+
false, false,
2170+
detail::block_2d_op::store>();
21522171
constexpr lsc_data_size DS =
2153-
detail::finalize_data_size<T, lsc_data_size::default_size>();
2172+
detail::finalize_data_size<RawT, lsc_data_size::default_size>();
21542173
uintptr_t surf_addr = reinterpret_cast<uintptr_t>(Ptr);
21552174
constexpr detail::lsc_data_order _Transposed =
21562175
detail::lsc_data_order::nontranspose;
21572176

21582177
constexpr int Pitch = __ESIMD_DNS::getNextPowerOf2<BlockWidth>();
2159-
__ESIMD_NS::simd<T, BlockHeight * Pitch> Raw;
2178+
__ESIMD_NS::simd<RawT, BlockHeight * Pitch> Raw;
21602179

21612180
if constexpr (BlockHeight * Pitch == N) {
21622181
Raw = Vals;
21632182
} else {
21642183
// For store with padding, allocate the block with padding, and place
21652184
// original data there.
2166-
auto Data2D = Vals.template bit_cast_view<T, BlockHeight, BlockWidth>();
2167-
auto Raw2D = Raw.template bit_cast_view<T, BlockHeight, Pitch>();
2185+
auto Data2D = Vals.template bit_cast_view<RawT, BlockHeight, BlockWidth>();
2186+
auto Raw2D = Raw.template bit_cast_view<RawT, BlockHeight, Pitch>();
21682187
Raw2D.template select<BlockHeight, 1, BlockWidth, 1>(0, 0) = Data2D;
21692188
}
21702189

21712190
__ESIMD_NS::simd_mask<BlockHeight * Pitch> pred = 1;
2172-
__esimd_lsc_store2d_stateless<T, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
2191+
__esimd_lsc_store2d_stateless<RawT, L1H, L3H, DS, _Transposed, 1u, BlockWidth,
21732192
BlockHeight, false, BlockHeight * Pitch>(
21742193
pred.data(), surf_addr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y,
21752194
Raw.data());
@@ -2428,17 +2447,25 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION __ESIMD_NS::simd<T, N> lsc_load_2d(
24282447
constexpr int DstBlockElements = GRFColSize * GRFRowSize;
24292448
constexpr int DstElements = DstBlockElements * NBlocks;
24302449

2450+
constexpr uint32_t GrfBytes = 64;
2451+
constexpr uint32_t DstBlockSize =
2452+
detail::roundUpNextMultiple<DstElements * sizeof(T), GrfBytes>();
2453+
constexpr uint32_t DstLength =
2454+
(DstBlockSize / GrfBytes) > 31 ? 31 : (DstBlockSize / GrfBytes);
2455+
constexpr uint32_t DstLengthMask = DstLength << 20;
2456+
24312457
static_assert(N == ActualN || N == DstElements, "Incorrect element count");
24322458

24332459
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
24342460
<< 17;
2435-
constexpr uint32_t base_desc = 0x2800403;
2461+
constexpr uint32_t base_desc = 0x2000003;
24362462
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
24372463
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
2464+
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
24382465
__ESIMD_NS::simd<T, N> oldDst;
24392466
constexpr uint32_t exDesc = 0x0;
2440-
constexpr uint32_t desc =
2441-
base_desc | cache_mask | transformMask | transposeMask;
2467+
constexpr uint32_t desc = base_desc | cache_mask | transformMask |
2468+
transposeMask | dataSizeMask | DstLengthMask;
24422469
constexpr uint8_t execSize = 1;
24432470
constexpr uint8_t sfid = 0xF;
24442471
constexpr uint8_t numSrc0 = 0x1;
@@ -2500,12 +2527,13 @@ ESIMD_INLINE SYCL_ESIMD_FUNCTION void lsc_prefetch_2d(
25002527
"Transposed and transformed is not supported");
25012528
constexpr uint32_t cache_mask = detail::get_lsc_load_cache_mask<L1H, L3H>()
25022529
<< 17;
2503-
constexpr uint32_t base_desc = 0x2000403;
2530+
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
2531+
constexpr uint32_t base_desc = 0x2000003;
25042532
constexpr uint32_t transformMask = Transformed ? 1 << 7 : 0;
25052533
constexpr uint32_t transposeMask = Transposed ? 1 << 15 : 0;
25062534
constexpr uint32_t exDesc = 0x0;
25072535
constexpr uint32_t desc =
2508-
base_desc | cache_mask | transformMask | transposeMask;
2536+
base_desc | cache_mask | transformMask | transposeMask | dataSizeMask;
25092537
constexpr uint8_t execSize = 1;
25102538
constexpr uint8_t sfid = 0xF;
25112539
constexpr uint8_t numDst = (N * sizeof(T)) / 64;
@@ -2542,10 +2570,11 @@ lsc_store_2d(config_2d_mem_access<T, BlockWidth, BlockHeight, NBlocks> &payload,
25422570

25432571
constexpr uint32_t cache_mask = detail::get_lsc_store_cache_mask<L1H, L3H>()
25442572
<< 17;
2545-
constexpr uint32_t base_desc = 0x2000407;
2573+
constexpr uint32_t dataSizeMask = detail::get_lsc_data_size<T>() << 9;
2574+
constexpr uint32_t base_desc = 0x2000007;
25462575

25472576
constexpr uint32_t exDesc = 0x0;
2548-
constexpr uint32_t desc = base_desc | cache_mask;
2577+
constexpr uint32_t desc = base_desc | cache_mask | dataSizeMask;
25492578
constexpr uint8_t execSize = 1;
25502579
constexpr uint8_t sfid = 0xF;
25512580
constexpr uint8_t numSrc0 = 0x1;
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//==----- lsc_load_2d_compare.cpp - DPC++ ESIMD on-device test ------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc
9+
// RUN: %{build} -o %t.out
10+
// RUN: %{run} %t.out
11+
12+
// The tests makes sure old and new load_2d API produce identical
13+
// results.
14+
#include <iostream>
15+
#include <sycl/ext/intel/esimd.hpp>
16+
#include <sycl/sycl.hpp>
17+
18+
using bf16 = sycl::ext::oneapi::bfloat16;
19+
using namespace sycl;
20+
using namespace sycl::ext::intel::esimd;
21+
using namespace sycl::ext::intel::experimental::esimd;
22+
template <typename T> bool test() {
23+
sycl::queue Q(sycl::gpu_selector_v);
24+
auto dev = Q.get_device();
25+
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
26+
<< "\n";
27+
28+
constexpr int TM = 8;
29+
constexpr int TN = 8;
30+
constexpr int NBLOCKS = 2;
31+
constexpr int WIDTH = 2 * TN;
32+
constexpr int HEIGHT = TM;
33+
constexpr int PITCH = WIDTH;
34+
constexpr int SIZE = WIDTH * HEIGHT;
35+
36+
auto *A = malloc_shared<T>(SIZE, Q);
37+
auto *B = malloc_shared<T>(SIZE, Q);
38+
auto *C = malloc_shared<T>(SIZE, Q);
39+
auto *C1 = malloc_shared<T>(SIZE, Q);
40+
41+
for (int i = 0; i < SIZE; i++) {
42+
A[i] = static_cast<T>(i);
43+
}
44+
45+
Q.parallel_for(sycl::nd_range<1>(1, 1), [=](sycl::nd_item<1>
46+
item) SYCL_ESIMD_KERNEL {
47+
config_2d_mem_access<T, TN, TM, NBLOCKS> my_config(
48+
A, WIDTH * sizeof(T) - 1, HEIGHT - 1, PITCH * sizeof(T) - 1, 0, 0);
49+
50+
simd<T, NBLOCKS * TM * TN> tmp =
51+
lsc_load_2d<T, TN, TM, NBLOCKS, false, false>(my_config);
52+
simd<T, NBLOCKS * TM * TN> tmp1 = lsc_load_2d<T, TN, TM, NBLOCKS>(
53+
my_config.get_data_pointer(), my_config.get_surface_width(),
54+
my_config.get_surface_height(), my_config.get_surface_pitch(),
55+
my_config.get_x(), my_config.get_y());
56+
57+
tmp.copy_to(C);
58+
tmp1.copy_to(C1);
59+
}).wait();
60+
61+
bool error = false;
62+
for (auto i = 0; i < SIZE; ++i)
63+
error |= C[i] != C1[i];
64+
65+
free(A, Q);
66+
free(C, Q);
67+
free(C1, Q);
68+
return error;
69+
}
70+
71+
int main() {
72+
bool result = false;
73+
result |= test<float>();
74+
result |= test<uint32_t>();
75+
result |= test<uint16_t>();
76+
result |= test<uint8_t>();
77+
result |= test<sycl::half>();
78+
79+
std::cout << (result ? "FAILED" : "passed") << std::endl;
80+
return 0;
81+
}

0 commit comments

Comments
 (0)