Skip to content

Commit 3bf8a8b

Browse files
[ABI-Break][SYCL] Restrict nd_range parallel_for to nd_item
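Enables the change from #11067 by default: the kernel-argument static_asserts in handler.hpp are no longer guarded by SYCL2020_CONFORMANT_APIS / __INTEL_PREVIEW_BREAKING_CHANGES, and the e2e tests that passed sycl::id or sycl::item kernels to an nd_range parallel_for now take sycl::nd_item.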
1 parent ff9e48a commit 3bf8a8b

23 files changed

+105
-84
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,8 +1326,6 @@ class __SYCL_EXPORT handler {
13261326
"Kernel argument cannot have a sycl::nd_item type in "
13271327
"sycl::parallel_for with sycl::range");
13281328

1329-
#if defined(SYCL2020_CONFORMANT_APIS) || \
1330-
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
13311329
static_assert(std::is_convertible_v<item<Dims>, LambdaArgType> ||
13321330
std::is_convertible_v<item<Dims, false>, LambdaArgType>,
13331331
"sycl::parallel_for(sycl::range) kernel must have the "
@@ -1340,7 +1338,6 @@ class __SYCL_EXPORT handler {
13401338
std::is_invocable_v<KernelType, RefLambdaArgType, kernel_handler>),
13411339
"SYCL kernel lambda/functor has an unexpected signature, it should be "
13421340
"invocable with sycl::item and optionally sycl::kernel_handler");
1343-
#endif
13441341

13451342
// TODO: Properties may change the kernel function, so in order to avoid
13461343
// conflicts they should be included in the name.
@@ -1432,19 +1429,11 @@ class __SYCL_EXPORT handler {
14321429
verifyUsedKernelBundle(detail::KernelInfo<NameT>::getName());
14331430
using LambdaArgType =
14341431
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
1435-
#if defined(SYCL2020_CONFORMANT_APIS) || \
1436-
defined(__INTEL_PREVIEW_BREAKING_CHANGES)
14371432
static_assert(
14381433
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
14391434
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
14401435
"must be either sycl::nd_item or be convertible from sycl::nd_item");
14411436
using TransformedArgType = sycl::nd_item<Dims>;
1442-
#else
1443-
// If user type is convertible from sycl::item/sycl::nd_item, use
1444-
// sycl::item/sycl::nd_item to transport item information
1445-
using TransformedArgType =
1446-
typename TransformUserItemType<Dims, LambdaArgType>::type;
1447-
#endif
14481437

14491438
(void)ExecutionRange;
14501439
(void)Props;

sycl/test-e2e/Assert/assert_in_multiple_tus.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@ void enqueueKernel_1_fromFile1(queue *Q) {
3232

3333
CGH.parallel_for<class Kernel_1>(
3434
sycl::nd_range(Buf.get_range(), sycl::range<1>(4)),
35-
[=](sycl::id<1> wiID) {
35+
[=](sycl::nd_item<1> ndi) {
36+
auto gid = ndi.get_global_id(0);
3637
int X = 0;
37-
if (wiID == 5)
38+
if (gid == 5)
3839
X = checkFunction();
39-
Acc[wiID] = X;
40+
Acc[gid] = X;
4041
});
4142
});
4243
}

sycl/test-e2e/DiscardEvents/discard_events_accessors.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ int main(int Argc, const char *Argv[]) {
5959
sycl::local_accessor<int, 1> LocalAcc(LocalMemSize, CGH);
6060

6161
CGH.parallel_for<class kernel_using_local_memory>(
62-
NDRange, [=](sycl::item<1> itemID) {
63-
size_t i = itemID.get_id(0);
62+
NDRange, [=](sycl::nd_item<1> ndi) {
63+
size_t i = ndi.get_global_id(0);
6464
int *Ptr = LocalAcc.get_pointer();
6565
Ptr[i] = i + 5;
6666
Harray[i] = Ptr[i] + 5;

sycl/test-e2e/ESIMD/bfn.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,16 +107,17 @@ struct DeviceFunc {
107107
DeviceFunc(const T *In0, const T *In1, const T *In2, T *Out)
108108
: In0(In0), In1(In1), In2(In2), Out(Out) {}
109109

110-
void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
111-
unsigned int Offset = I * N;
110+
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
111+
auto gid = ndi.get_global_id(0);
112+
unsigned int Offset = gid * N;
112113
esimd::simd<T, N> V0;
113114
esimd::simd<T, N> V1;
114115
esimd::simd<T, N> V2;
115116
V0.copy_from(In0 + Offset);
116117
V1.copy_from(In1 + Offset);
117118
V2.copy_from(In2 + Offset);
118119

119-
if (I.get(0) % 2 == 0) {
120+
if (gid % 2 == 0) {
120121
for (int J = 0; J < N; J++) {
121122
Kernel<T, N, Op, AllSca> DevF{};
122123
T Val0 = V0[J];

sycl/test-e2e/ESIMD/ext_math.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -238,12 +238,13 @@ struct UnaryDeviceFunc {
238238

239239
UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {}
240240

241-
void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
242-
unsigned int Offset = I * N * sizeof(T);
241+
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
242+
auto gid = ndi.get_global_id(0);
243+
unsigned int Offset = gid * N * sizeof(T);
243244
esimd::simd<T, N> Vx;
244245
Vx.copy_from(In, Offset);
245246

246-
if (I.get(0) % 2 == 0) {
247+
if (gid % 2 == 0) {
247248
for (int J = 0; J < N; J++) {
248249
Kernel<T, N, Op, AllSca> DevF{};
249250
T Val = Vx[J];
@@ -269,13 +270,14 @@ struct BinaryDeviceFunc {
269270
BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out)
270271
: In1(In1), In2(In2), Out(Out) {}
271272

272-
void operator()(id<1> I) const SYCL_ESIMD_KERNEL {
273-
unsigned int Offset = I * N * sizeof(T);
273+
void operator()(nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
274+
auto gid = ndi.get_global_id(0);
275+
unsigned int Offset = gid * N * sizeof(T);
274276
esimd::simd<T, N> V1(In1, Offset);
275277
esimd::simd<T, N> V2(In2, Offset);
276278
esimd::simd<T, N> V;
277279

278-
if (I.get(0) % 2 == 0) {
280+
if (gid % 2 == 0) {
279281
int Ind = 0;
280282
{
281283
Kernel<T, N, Op, AllSca> DevF{};

sycl/test-e2e/ESIMD/local_accessor_gather_scatter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
5050
q.submit([&](handler &cgh) {
5151
auto acc = buf.template get_access<access::mode::read_write>(cgh);
5252
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
53-
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
53+
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
5454
using namespace sycl::ext::intel::esimd;
5555
simd<T, VL> valsIn;
5656
valsIn.copy_from(acc, 0);

sycl/test-e2e/ESIMD/local_accessor_gather_scatter_rgba.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
8989
auto OutAcc = OutBuf.template get_access<access::mode::read_write>(cgh);
9090
auto LocalAcc = local_accessor<T, 1>(VL * NUM_RGBA_CHANNELS, cgh);
9191

92-
cgh.parallel_for(Range, [=](id<1> i) SYCL_ESIMD_KERNEL {
92+
cgh.parallel_for(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
9393
using namespace sycl::ext::intel::esimd;
9494
constexpr int numChannels = get_num_channels_enabled(CH_MASK);
9595

@@ -135,7 +135,7 @@ template <typename T, unsigned VL, auto CH_MASK> bool test(queue q) {
135135
-1;
136136
}
137137

138-
uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
138+
uint32_t global_offset = ndi.get_global_id(0) * VL * NUM_RGBA_CHANNELS;
139139
valsOut.copy_to(OutAcc, global_offset);
140140
});
141141
}).wait();

sycl/test-e2e/ESIMD/lsc/atomic_smoke.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,8 @@ bool test(queue q, const Config &cfg) {
212212
try {
213213
auto e = q.submit([&](handler &cgh) {
214214
cgh.parallel_for<TestID<T, N, ImplF>>(
215-
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
216-
int i = ii;
215+
rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
216+
int i = ndi.get_global_id(0);
217217
#ifndef USE_SCALAR_OFFSET
218218
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
219219
cfg.stride * sizeof(T));
@@ -332,8 +332,8 @@ bool test(queue q, const Config &cfg) {
332332
auto e = q.submit([&](handler &cgh) {
333333
auto accessor = buf.template get_access<access::mode::read_write>(cgh);
334334
cgh.parallel_for<TestID<T, N, ImplF>>(
335-
rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
336-
int i = ii;
335+
rng, [=](nd_item<1> gid) SYCL_ESIMD_KERNEL {
336+
int i = gid.get_global_id(0);
337337
#ifndef USE_SCALAR_OFFSET
338338
simd<Toffset, N> offsets(start * sizeof(T), stride * sizeof(T));
339339
#else

sycl/test-e2e/ESIMD/lsc/lsc_local_accessor_gather_scatter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
5252
q.submit([&](handler &cgh) {
5353
auto acc = buf.template get_access<access::mode::read_write>(cgh);
5454
auto LocalAcc = local_accessor<T, 1>(size * STRIDE, cgh);
55-
cgh.parallel_for(glob_range, [=](id<1> i) SYCL_ESIMD_KERNEL {
55+
cgh.parallel_for(glob_range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
5656
using namespace sycl::ext::intel::esimd;
5757
simd<T, VL> valsIn;
5858
valsIn.copy_from(acc, 0);

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ bool test_usm(queue q, const Config &cfg) {
153153

154154
try {
155155
auto e = q.submit([&](handler &cgh) {
156-
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
157-
int i = ii;
156+
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
157+
int i = ndi.get_global_id(0);
158158
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
159159
cfg.stride * sizeof(T));
160160
simd_mask<N> m = 1;
@@ -287,8 +287,8 @@ bool test_acc(queue q, const Config &cfg) {
287287
auto e = q.submit([&](handler &cgh) {
288288
auto arr_acc =
289289
arr_buf.template get_access<access::mode::read_write>(cgh);
290-
cgh.parallel_for(rng, [=](id<1> ii) SYCL_ESIMD_KERNEL {
291-
int i = ii;
290+
cgh.parallel_for(rng, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
291+
int i = ndi.get_global_id(0);
292292
simd<Toffset, N> offsets(cfg.start_ind * sizeof(T),
293293
cfg.stride * sizeof(T));
294294
simd_mask<N> m = 1;

sycl/test-e2e/KernelFusion/abort_fusion.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,16 @@ template <int Dimensions> size_t getSize(nd_range<Dimensions> r) {
2525
return r.get_global_range().size();
2626
}
2727

28+
template <int N> auto global_linear_id(sycl::nd_item<N> ndi) {
29+
return ndi.get_global_linear_id();
30+
}
31+
template <int N> auto global_linear_id(sycl::item<N> i) {
32+
return i.get_linear_id();
33+
}
34+
2835
template <typename Kernel1Name, typename Kernel2Name, typename Range1,
2936
typename Range2>
3037
void performFusion(queue &q, Range1 R1, Range2 R2) {
31-
using IndexTy1 = item<Range1::dimensions>;
32-
using IndexTy2 = item<Range2::dimensions>;
33-
3438
int in[dataSize], tmp[dataSize], out[dataSize];
3539

3640
for (size_t i = 0; i < dataSize; ++i) {
@@ -51,17 +55,17 @@ void performFusion(queue &q, Range1 R1, Range2 R2) {
5155
q.submit([&](handler &cgh) {
5256
auto accIn = bIn.get_access(cgh);
5357
auto accTmp = bTmp.get_access(cgh);
54-
cgh.parallel_for<Kernel1Name>(R1, [=](IndexTy1 i) {
55-
size_t j = i.get_linear_id();
58+
cgh.parallel_for<Kernel1Name>(R1, [=](auto i) {
59+
size_t j = global_linear_id(i);
5660
accTmp[j] = accIn[j] + 5;
5761
});
5862
});
5963

6064
q.submit([&](handler &cgh) {
6165
auto accTmp = bTmp.get_access(cgh);
6266
auto accOut = bOut.get_access(cgh);
63-
cgh.parallel_for<Kernel2Name>(R2, [=](IndexTy2 i) {
64-
size_t j = i.get_linear_id();
67+
cgh.parallel_for<Kernel2Name>(R2, [=](auto i) {
68+
size_t j = global_linear_id(i);
6569
accOut[j] = accTmp[j] * 2;
6670
});
6771
});

sycl/test-e2e/KernelFusion/abort_internalization.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@ void performFusion(queue &q, Internalization intKernel1,
4949

5050
if (localSizeKernel1 > 0) {
5151
cgh.parallel_for<class Kernel1>(
52-
nd_range<1>{{dataSize}, {localSizeKernel1}},
53-
[=](id<1> i) { accTmp[i] = accIn[i] + 5; });
52+
nd_range<1>{{dataSize}, {localSizeKernel1}}, [=](nd_item<1> ndi) {
53+
auto i = ndi.get_global_id(0);
54+
accTmp[i] = accIn[i] + 5;
55+
});
5456
} else {
5557
cgh.parallel_for<class KernelOne>(
5658
dataSize, [=](id<1> i) { accTmp[i] = accIn[i] + 5; });
@@ -70,8 +72,10 @@ void performFusion(queue &q, Internalization intKernel1,
7072
auto accOut = bOut.get_access(cgh);
7173
if (localSizeKernel2 > 0) {
7274
cgh.parallel_for<class Kernel2>(
73-
nd_range<1>{{dataSize}, {localSizeKernel2}},
74-
[=](id<1> i) { accOut[i] = accTmp[i] * 2; });
75+
nd_range<1>{{dataSize}, {localSizeKernel2}}, [=](nd_item<1> ndi) {
76+
auto i = ndi.get_global_id(0);
77+
accOut[i] = accTmp[i] * 2;
78+
});
7579
} else {
7680
cgh.parallel_for<class KernelTwo>(
7781
dataSize, [=](id<1> i) { accOut[i] = accTmp[i] * 2; });

sycl/test-e2e/KernelFusion/diamond_shape_local.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ struct AddKernel {
1616
accessor<int, 1> accIn2;
1717
accessor<int, 1> accOut;
1818

19-
void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
19+
void operator()(nd_item<1> ndi) const {
20+
auto i = ndi.get_global_id(0);
21+
accOut[i] = accIn1[i] + accIn2[i];
22+
}
2023
};
2124

2225
int main() {
@@ -71,17 +74,21 @@ int main() {
7174
auto accTmp1 = bTmp1.get_access(cgh);
7275
auto accIn3 = bIn3.get_access(cgh);
7376
auto accTmp2 = bTmp2.get_access(cgh);
74-
cgh.parallel_for<class KernelOne>(
75-
nd_range<1>{{dataSize}, {16}},
76-
[=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
77+
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
78+
[=](nd_item<1> ndi) {
79+
auto i = ndi.get_global_id(0);
80+
accTmp2[i] = accTmp1[i] * accIn3[i];
81+
});
7782
});
7883

7984
q.submit([&](handler &cgh) {
8085
auto accTmp1 = bTmp1.get_access(cgh);
8186
auto accTmp3 = bTmp3.get_access(cgh);
82-
cgh.parallel_for<class KernelTwo>(
83-
nd_range<1>{{dataSize}, {16}},
84-
[=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
87+
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
88+
[=](nd_item<1> ndi) {
89+
auto i = ndi.get_global_id(0);
90+
accTmp3[i] = accTmp1[i] * 5;
91+
});
8592
});
8693

8794
q.submit([&](handler &cgh) {

sycl/test-e2e/KernelFusion/existing_local_accessor.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,11 @@ int main() {
5656
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
5757
auto accIn3 = bIn3.get_access(cgh);
5858
auto accOut = bOut.get_access(cgh);
59-
cgh.parallel_for<class KernelTwo>(
60-
nd_range<1>{{dataSize}, {16}},
61-
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
59+
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
60+
[=](nd_item<1> ndi) {
61+
auto i = ndi.get_global_id(0);
62+
accOut[i] = accTmp[i] * accIn3[i];
63+
});
6264
});
6365

6466
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,8 @@ int main() {
8484
auto accTmp = bTmp.get_access(
8585
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
8686
cgh.parallel_for<class KernelOne>(
87-
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
87+
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
88+
auto id = ndi.get_global_id();
8889
const auto &accIn1Wrapp = accIn1[id];
8990
const auto &accIn2Wrapp = accIn2[id];
9091
auto &accTmpWrapp = accTmp[id];
@@ -105,7 +106,8 @@ int main() {
105106
auto accIn3 = bIn3.get_access(cgh);
106107
auto accOut = bOut.get_access(cgh);
107108
cgh.parallel_for<class KernelTwo>(
108-
nd_range<1>{{dataSize}, {4}}, [=](id<1> id) {
109+
nd_range<1>{{dataSize}, {4}}, [=](nd_item<1> ndi) {
110+
auto id = ndi.get_global_id();
109111
const auto &tmpWrapp = accTmp[id];
110112
const auto &accIn3Wrapp = accIn3[id];
111113
auto &accOutWrapp = accOut[id];

sycl/test-e2e/KernelFusion/local_internalization.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,19 +41,23 @@ int main() {
4141
auto accIn2 = bIn2.get_access(cgh);
4242
auto accTmp = bTmp.get_access(
4343
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
44-
cgh.parallel_for<class KernelOne>(
45-
nd_range<1>{{dataSize}, {16}},
46-
[=](id<1> i) { accTmp[i] = accIn1[i] + accIn2[i]; });
44+
cgh.parallel_for<class KernelOne>(nd_range<1>{{dataSize}, {16}},
45+
[=](nd_item<1> ndi) {
46+
auto i = ndi.get_global_id(0);
47+
accTmp[i] = accIn1[i] + accIn2[i];
48+
});
4749
});
4850

4951
q.submit([&](handler &cgh) {
5052
auto accTmp = bTmp.get_access(
5153
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
5254
auto accIn3 = bIn3.get_access(cgh);
5355
auto accOut = bOut.get_access(cgh);
54-
cgh.parallel_for<class KernelTwo>(
55-
nd_range<1>{{dataSize}, {16}},
56-
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
56+
cgh.parallel_for<class KernelTwo>(nd_range<1>{{dataSize}, {16}},
57+
[=](nd_item<1> ndi) {
58+
auto i = ndi.get_global_id(0);
59+
accOut[i] = accTmp[i] * accIn3[i];
60+
});
5761
});
5862

5963
fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

sycl/test-e2e/KernelFusion/non_unit_local_size.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ int main() {
4242
auto accTmp = bTmp.get_access(
4343
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
4444
cgh.parallel_for<class KernelOne>(
45-
nd_range<1>{{128}, {8}}, [=](item<1> i) {
46-
auto baseOffset = i.get_linear_id() * 4;
45+
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
46+
auto baseOffset = ndi.get_global_linear_id() * 4;
4747
for (size_t j = 0; j < 4; ++j) {
4848
accTmp[baseOffset + j] =
4949
accIn1[baseOffset + j] + accIn2[baseOffset + j];
@@ -57,8 +57,8 @@ int main() {
5757
auto accIn3 = bIn3.get_access(cgh);
5858
auto accOut = bOut.get_access(cgh);
5959
cgh.parallel_for<class KernelTwo>(
60-
nd_range<1>{{128}, {8}}, [=](item<1> i) {
61-
auto baseOffset = i.get_linear_id() * 4;
60+
nd_range<1>{{128}, {8}}, [=](nd_item<1> ndi) {
61+
auto baseOffset = ndi.get_global_linear_id() * 4;
6262
for (size_t j = 0; j < 4; ++j) {
6363
accOut[baseOffset + j] =
6464
accTmp[baseOffset + j] * accIn3[baseOffset + j];

0 commit comments

Comments
 (0)