Skip to content

[ABI-Break][SYCL] Restrict nd_range parallel_for to nd_item #13198

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 0 additions & 11 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1326,8 +1326,6 @@ class __SYCL_EXPORT handler {
"Kernel argument cannot have a sycl::nd_item type in "
"sycl::parallel_for with sycl::range");

#if defined(SYCL2020_CONFORMANT_APIS) || \
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
std::is_convertible_v<item<Dims, false>, LambdaArgType>,
"sycl::parallel_for(sycl::range) kernel must have the "
Expand All @@ -1340,7 +1338,6 @@ class __SYCL_EXPORT handler {
std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
"SYCL kernel lambda/functor has an unexpected signature, it should be "
"invocable with sycl::item and optionally sycl::kernel_handler");
#endif

// TODO: Properties may change the kernel function, so in order to avoid
// conflicts they should be included in the name.
Expand Down Expand Up @@ -1432,19 +1429,11 @@ class __SYCL_EXPORT handler {
verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
using LambdaArgType =
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
#if defined(SYCL2020_CONFORMANT_APIS) || \
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
static_assert(
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;
#else
// If user type is convertible from sycl::item/sycl::nd_item, use
// sycl::item/sycl::nd_item to transport item information
using TransformedArgType =
typename TransformUserItemType<Dims, LambdaArgType>::type;
#endif

(void)ExecutionRange;
(void)Props;
Expand Down
7 changes: 4 additions & 3 deletions sycl/test-e2e/Assert/assert_in_multiple_tus.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,12 @@ void enqueueKernel_1_fromFile1(queue *Q) {

CGH.parallel_for<class Kernel_1>(
sycl::nd_range(Buf.get_range(), sycl::range<1>(4)),
[=](sycl::id<1> wiID) {
[=](sycl::nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
int X = 0;
if (wiID == 5)
if (gid == 5)
X = checkFunction();
Acc[wiID] = X;
Acc[gid] = X;
});
});
}
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/DiscardEvents/discard_events_accessors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ int main(int Argc, const char *Argv[]) {
sycl::local_accessor<int, 1> LocalAcc(LocalMemSize, CGH);

CGH.parallel_for<class kernel_using_local_memory>(
NDRange, [=](sycl::item<1> itemID) {
size_t i = itemID.get_id(0);
NDRange, [=](sycl::nd_item<1> ndi) {
size_t i = ndi.get_global_id(0);
int *Ptr = LocalAcc.get_pointer();
Ptr[i] = i + 5;
Harray[i] = Ptr[i] + 5;
Expand Down
7 changes: 4 additions & 3 deletions sycl/test-e2e/ESIMD/bfn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,17 @@ struct DeviceFunc {
DeviceFunc(const T *In0, const T *In1, const T *In2, T *Out)
: In0(In0), In1(In1), In2(In2), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N;
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N;
esimd::simd<T, N> V0;
esimd::simd<T, N> V1;
esimd::simd<T, N> V2;
V0.copy_from(In0 + Offset);
V1.copy_from(In1 + Offset);
V2.copy_from(In2 + Offset);

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
for (int J = 0; J < N; J++) {
Kernel<T, N, Op, AllSca> DevF{};
T Val0 = V0[J];
Expand Down
14 changes: 8 additions & 6 deletions sycl/test-e2e/ESIMD/ext_math.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,12 +238,13 @@ struct UnaryDeviceFunc {

UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N * sizeof(T);
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N * sizeof(T);
esimd::simd<T, N> Vx;
Vx.copy_from(In, Offset);

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
for (int J = 0; J < N; J++) {
Kernel<T, N, Op, AllSca> DevF{};
T Val = Vx[J];
Expand All @@ -269,13 +270,14 @@ struct BinaryDeviceFunc {
BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out)
: In1(In1), In2(In2), Out(Out) {}

void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
unsigned int Offset = I * N * sizeof(T);
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
auto gid = ndi.get_global_id(0);
unsigned int Offset = gid * N * sizeof(T);
esimd::simd<T, N> V1(In1, Offset);
esimd::simd<T, N> V2(In2, Offset);
esimd::simd<T, N> V;

if (I.get(0) % 2 == 0) {
if (gid % 2 == 0) {
int Ind = 0;
{
Kernel<T, N, Op, AllSca> DevF{};
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/ESIMD/local_accessor_gather_scatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
q.submit([&](handler &cgh) {
auto acc = buf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
simd<T, VL> valsIn;
valsIn.copy_from(acc, 0);
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/ESIMD/local_accessor_gather_scatter_rgba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
auto OutAcc = OutBuf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(VL * NUM_RGBA_CHANNELS, cgh);

cgh.parallel_for(Range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
constexpr int numChannels = get_num_channels_enabled(CH_MASK);

Expand Down Expand Up @@ -135,7 +135,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
-1;
}

uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
uint32_t global_offset = ndi.get_global_id(0) * VL * NUM_RGBA_CHANNELS;
valsOut.copy_to(OutAcc, global_offset);
});
}).wait();
Expand Down
8 changes: 4 additions & 4 deletions sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ bool test(queue q, const Config &cfg) {
try {
auto e = q.submit([&](handler &cgh) {
cgh.parallel_for<TestID<T, N, ImplF>>(
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
#ifndef USE_SCALAR_OFFSET
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
Expand Down Expand Up @@ -332,8 +332,8 @@ bool test(queue q, const Config &cfg) {
auto e = q.submit([&](handler &cgh) {
auto accessor = buf.template get_access<access::mode::read_write>(cgh);
cgh.parallel_for<TestID<T, N, ImplF>>(
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
rng, [=](nd_item<1> gid) SYCL_ESIMD_KERNEL {
int i = gid.get_global_id(0);
#ifndef USE_SCALAR_OFFSET
simd<Toffset, N> offsets(start * sizeof(T), stride * sizeof(T));
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
q.submit([&](handler &cgh) {
auto acc = buf.template get_access<access::mode::read_write>(cgh);
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
using namespace sycl::ext::intel::esimd;
simd<T, VL> valsIn;
valsIn.copy_from(acc, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ bool test_usm(queue q, const Config &cfg) {

try {
auto e = q.submit([&](handler &cgh) {
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
simd_mask<N> m = 1;
Expand Down Expand Up @@ -287,8 +287,8 @@ bool test_acc(queue q, const Config &cfg) {
auto e = q.submit([&](handler &cgh) {
auto arr_acc =
arr_buf.template get_access<access::mode::read_write>(cgh);
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
int i = ii;
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
int i = ndi.get_global_id(0);
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
cfg.stride * sizeof(T));
simd_mask<N> m = 1;
Expand Down
18 changes: 11 additions & 7 deletions sycl/test-e2e/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ template <int Dimensions> size_t getSize(nd_range<Dimensions> r) {
return r.get_global_range().size();
}

template <int N> auto global_linear_id(sycl::nd_item<N> ndi) {
return ndi.get_global_linear_id();
}
template <int N> auto global_linear_id(sycl::item<N> i) {
return i.get_linear_id();
}

template <typename Kernel1Name, typename Kernel2Name, typename Range1,
typename Range2>
void performFusion(queue &q, Range1 R1, Range2 R2) {
using IndexTy1 = item<Range1::dimensions>;
using IndexTy2 = item<Range2::dimensions>;

int in[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
Expand All @@ -51,17 +55,17 @@ void performFusion(queue &q, Range1 R1, Range2 R2) {
q.submit([&](handler &cgh) {
auto accIn = bIn.get_access(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<Kernel1Name>(R1, [=](IndexTy1 i) {
size_t j = i.get_linear_id();
cgh.parallel_for<Kernel1Name>(R1, [=](auto i) {
size_t j = global_linear_id(i);
accTmp[j] = accIn[j] + 5;
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<Kernel2Name>(R2, [=](IndexTy2 i) {
size_t j = i.get_linear_id();
cgh.parallel_for<Kernel2Name>(R2, [=](auto i) {
size_t j = global_linear_id(i);
accOut[j] = accTmp[j] * 2;
});
});
Expand Down
12 changes: 8 additions & 4 deletions sycl/test-e2e/KernelFusion/abort_internalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ void performFusion(queue &q, Internalization intKernel1,

if (localSizeKernel1 > 0) {
cgh.parallel_for<class Kernel1>(
nd_range<1>{{dataSize}, {localSizeKernel1}},
[=](id<1> i) { accTmp[i] = accIn[i] + 5; });
nd_range<1>{{dataSize}, {localSizeKernel1}}, [=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp[i] = accIn[i] + 5;
});
} else {
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
Expand All @@ -70,8 +72,10 @@ void performFusion(queue &q, Internalization intKernel1,
auto accOut = bOut.get_access(cgh);
if (localSizeKernel2 > 0) {
cgh.parallel_for<class Kernel2>(
nd_range<1>{{dataSize}, {localSizeKernel2}},
[=](id<1> i) { accOut[i] = accTmp[i] * 2; });
nd_range<1>{{dataSize}, {localSizeKernel2}}, [=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * 2;
});
} else {
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });
Expand Down
21 changes: 14 additions & 7 deletions sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ struct AddKernel {
accessor<int, 1> accIn2;
accessor<int, 1> accOut;

void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
void operator()(nd_item<1> ndi) const {
auto i = ndi.get_global_id(0);
accOut[i] = accIn1[i] + accIn2[i];
}
};

int main() {
Expand Down Expand Up @@ -71,17 +74,21 @@ int main() {
auto accTmp1 = bTmp1.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accTmp2 = bTmp2.get_access(cgh);
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp2[i] = accTmp1[i] * accIn3[i];
});
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(cgh);
auto accTmp3 = bTmp3.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp3[i] = accTmp1[i] * 5;
});
});

q.submit([&](handler &cgh) {
Expand Down
8 changes: 5 additions & 3 deletions sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ int main() {
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * accIn3[i];
});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ int main() {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
auto id = ndi.get_global_id();
const auto &accIn1Wrapp = accIn1[id];
const auto &accIn2Wrapp = accIn2[id];
auto &accTmpWrapp = accTmp[id];
Expand All @@ -105,7 +106,8 @@ int main() {
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
auto id = ndi.get_global_id();
const auto &tmpWrapp = accTmp[id];
const auto &accIn3Wrapp = accIn3[id];
auto &accOutWrapp = accOut[id];
Expand Down
16 changes: 10 additions & 6 deletions sycl/test-e2e/KernelFusion/local_internalization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,23 @@ int main() {
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accTmp[i] = accIn1[i] + accIn2[i];
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
[=](nd_item<1> ndi) {
auto i = ndi.get_global_id(0);
accOut[i] = accTmp[i] * accIn3[i];
});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
Expand Down
8 changes: 4 additions & 4 deletions sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ int main() {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
cgh.parallel_for<class KernelOne>(
nd_range<1>{{128}, {8}}, [=](item<1> i) {
auto baseOffset = i.get_linear_id() * 4;
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
auto baseOffset = ndi.get_global_linear_id() * 4;
for (size_t j = 0; j < 4; ++j) {
accTmp[baseOffset + j] =
accIn1[baseOffset + j] + accIn2[baseOffset + j];
Expand All @@ -57,8 +57,8 @@ int main() {
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{128}, {8}}, [=](item<1> i) {
auto baseOffset = i.get_linear_id() * 4;
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
auto baseOffset = ndi.get_global_linear_id() * 4;
for (size_t j = 0; j < 4; ++j) {
accOut[baseOffset + j] =
accTmp[baseOffset + j] * accIn3[baseOffset + j];
Expand Down
Loading