Skip to content

Commit 1d46bfe

Browse files
authored
Merge pull request #720 from ilyastepykin/adjust-map
[SYCL] Adjust mapping to OpenCL to simplify tuning.
2 parents 16505bd + d7cb44e commit 1d46bfe

File tree

9 files changed

+230
-53
lines changed

9 files changed

+230
-53
lines changed

sycl/include/CL/__spirv/spirv_vars.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ extern "C" const __attribute__((ocl_constant)) uint32_t __spirv_BuiltInSubgroupL
5555
}; \
5656
\
5757
template <class DstT> struct InitSizesST##POSTFIX<2, DstT> { \
58-
static DstT initSize() { return {get##POSTFIX<0>(), get##POSTFIX<1>()}; } \
58+
static DstT initSize() { return {get##POSTFIX<1>(), get##POSTFIX<0>()}; } \
5959
}; \
6060
\
6161
template <class DstT> struct InitSizesST##POSTFIX<3, DstT> { \
6262
static DstT initSize() { \
63-
return {get##POSTFIX<0>(), get##POSTFIX<1>(), get##POSTFIX<2>()}; \
63+
return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()}; \
6464
} \
6565
}; \
6666
\

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 18 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -185,19 +185,12 @@ class HostKernel : public HostKernelBase {
185185
template <class ArgT = KernelArgType>
186186
typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value>::type
187187
runOnHost(const NDRDescT &NDRDesc) {
188-
size_t XYZ[3] = {0};
189-
sycl::id<Dims> ID;
190-
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
191-
XYZ[1] = 0;
192-
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
193-
XYZ[0] = 0;
194-
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
195-
for (int I = 0; I < Dims; ++I)
196-
ID[I] = XYZ[I];
197-
MKernel(ID);
198-
}
199-
}
200-
}
188+
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
189+
for (int I = 0; I < Dims; ++I)
190+
Range[I] = NDRDesc.GlobalSize[I];
191+
192+
detail::NDLoop<Dims>::iterate(
193+
Range, [&](const sycl::id<Dims> &ID) { MKernel(ID); });
201194
}
202195

203196
template <class ArgT = KernelArgType>
@@ -210,20 +203,11 @@ class HostKernel : public HostKernelBase {
210203
for (int I = 0; I < Dims; ++I)
211204
Range[I] = NDRDesc.GlobalSize[I];
212205

213-
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
214-
XYZ[1] = 0;
215-
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
216-
XYZ[0] = 0;
217-
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
218-
for (int I = 0; I < Dims; ++I)
219-
ID[I] = XYZ[I];
220-
221-
sycl::item<Dims, /*Offset=*/false> Item =
222-
IDBuilder::createItem<Dims, false>(Range, ID);
223-
MKernel(Item);
224-
}
225-
}
226-
}
206+
detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
207+
sycl::item<Dims, /*Offset=*/false> Item =
208+
IDBuilder::createItem<Dims, false>(Range, ID);
209+
MKernel(Item);
210+
});
227211
}
228212

229213
template <class ArgT = KernelArgType>
@@ -236,22 +220,13 @@ class HostKernel : public HostKernelBase {
236220
Range[I] = NDRDesc.GlobalSize[I];
237221
Offset[I] = NDRDesc.GlobalOffset[I];
238222
}
239-
size_t XYZ[3] = {0};
240-
sycl::id<Dims> ID;
241-
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
242-
XYZ[1] = 0;
243-
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
244-
XYZ[0] = 0;
245-
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
246-
for (int I = 0; I < Dims; ++I)
247-
ID[I] = XYZ[I] + Offset[I];
248-
249-
sycl::item<Dims, /*Offset=*/true> Item =
250-
IDBuilder::createItem<Dims, true>(Range, ID, Offset);
251-
MKernel(Item);
252-
}
253-
}
254-
}
223+
224+
detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> &ID) {
225+
sycl::id<Dims> OffsetID = ID + Offset;
226+
sycl::item<Dims, /*Offset=*/true> Item =
227+
IDBuilder::createItem<Dims, true>(Range, OffsetID, Offset);
228+
MKernel(Item);
229+
});
255230
}
256231

257232
template <class ArgT = KernelArgType>

sycl/include/CL/sycl/detail/common.hpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,9 @@ struct NDLoopIterateImpl {
158158
const LoopBoundTy<NDIMS> &Stride,
159159
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
160160
LoopIndexTy<NDIMS> &Index) {
161-
162-
for (Index[DIM] = LowerBound[DIM]; Index[DIM] < UpperBound[DIM];
163-
Index[DIM] += Stride[DIM]) {
161+
constexpr size_t AdjIdx = NDIMS - 1 - DIM;
162+
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
163+
Index[AdjIdx] += Stride[AdjIdx]) {
164164

165165
NDLoopIterateImpl<NDIMS, DIM - 1, LoopBoundTy, FuncTy, LoopIndexTy>{
166166
LowerBound, Stride, UpperBound, f, Index};
@@ -177,8 +177,9 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
177177
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
178178
LoopIndexTy<NDIMS> &Index) {
179179

180-
for (Index[0] = LowerBound[0]; Index[0] < UpperBound[0];
181-
Index[0] += Stride[0]) {
180+
constexpr size_t AdjIdx = NDIMS - 1;
181+
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
182+
Index[AdjIdx] += Stride[AdjIdx]) {
182183

183184
f(Index);
184185
}
@@ -190,6 +191,7 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
190191
/// over a multi-dimensional space - it allows to avoid generating unnecessary
191192
/// outer loops like 'for (int z=0; z<1; z++)' in case of 1D and 2D iteration
192193
/// spaces or writing specializations of the algorithms for 1D, 2D and 3D cases.
194+
/// Loop is unrolled in a reverse directions, i.e. ID = 0 is the inner-most one.
193195
template <int NDIMS> struct NDLoop {
194196
/// Generates ND loop nest with {0,..0} .. \c UpperBound bounds with unit
195197
/// stride. Applies \c f at each iteration, passing current index of

sycl/include/CL/sycl/id.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
2121
private:
2222
using base = detail::array<dimensions>;
2323
static_assert(dimensions >= 1 && dimensions <= 3,
24-
"id can only be 1, 2, or 3 dimentional.");
24+
"id can only be 1, 2, or 3 dimensional.");
2525
template <int N, int val, typename T>
2626
using ParamTy = detail::enable_if_t<(N == val), T>;
2727

sycl/include/CL/sycl/range.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ namespace sycl {
1717
template <int dimensions> class id;
1818
template <int dimensions = 1> class range : public detail::array<dimensions> {
1919
static_assert(dimensions >= 1 && dimensions <= 3,
20-
"range can only be 1, 2, or 3 dimentional.");
20+
"range can only be 1, 2, or 3 dimensional.");
2121
using base = detail::array<dimensions>;
2222

2323
public:

sycl/source/detail/scheduler/commands.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,23 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
578578
NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
579579
}
580580

581+
// We have the following mapping between dimensions with SPIRV builtins:
582+
// 1D: id[0] -> x
583+
// 2D: id[0] -> y, id[1] -> x
584+
// 3D: id[0] -> z, id[1] -> y, id[2] -> x
585+
// So in order to ensure the correctness we update all the kernel
586+
// parameters accordingly.
587+
// Initially we keep the order of NDRDescT as it provided by the user, this
588+
// simplifies overall handling and do the reverse only when
589+
// the kernel is enqueued.
590+
static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
591+
if (NDR.Dims > 1) {
592+
std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
593+
std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
594+
std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
595+
}
596+
}
597+
581598
// The function initialize accessors and calls lambda.
582599
// The function is used as argument to piEnqueueNativeKernel which requires
583600
// that the passed function takes one void* argument.
@@ -803,10 +820,15 @@ cl_int ExecCGCommand::enqueueImp() {
803820
getSyclObjImpl(Context)->getUSMDispatch();
804821
USMDispatch->setKernelIndirectAccess(Kernel, MQueue->getHandleRef());
805822

823+
// Remember this information before the range dimensions are reversed
824+
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);
825+
826+
ReverseRangeDimensionsForKernel(NDRDesc);
827+
806828
PI_CALL(RT::piEnqueueKernelLaunch(
807829
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
808830
&NDRDesc.GlobalSize[0],
809-
NDRDesc.LocalSize[0] ? &NDRDesc.LocalSize[0] : nullptr,
831+
HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
810832
RawEvents.size(),
811833
RawEvents.empty() ? nullptr : &RawEvents[0], &Event));
812834

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// RUN: %clangxx -fsycl %s -o %t.out
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s
3+
//==--------------- linear-host-dev.cpp - SYCL linear id test -------------==//
4+
//
5+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6+
// See https://llvm.org/LICENSE.txt for license information.
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <CL/sycl.hpp>
12+
#include <algorithm>
13+
#include <cstdio>
14+
#include <cstdlib>
15+
16+
// Check that linear id is monotincally increased on host device.
17+
// Only there we can reliable check that. Since the kernel has a restriction
18+
// regarding usage of global variables, use stream to log the linear id
19+
// and ensure that they're monotonically increased.
20+
//
21+
// Note: This test heavily relies on the current implementation of
22+
// host device(single-threaded ordered executio). So if the implementation
23+
// is somehow changed so it's no longer possible to run this test reliable
24+
// it can be removed.
25+
26+
namespace s = cl::sycl;
27+
28+
int main(int argc, char *argv[]) {
29+
s::queue q;
30+
31+
const size_t outer = 3;
32+
const size_t inner = 2;
33+
const s::range<2> rng = {outer, inner};
34+
35+
q.submit([&](s::handler &h) {
36+
s::stream out(1024, 80, h);
37+
38+
h.parallel_for<class linear_id>(s::range<2>(rng), [=](s::item<2> item) {
39+
// CHECK: 0
40+
// CHECK-NEXT: 1
41+
// CHECK-NEXT: 2
42+
// CHECK-NEXT: 3
43+
// CHECK-NEXT: 4
44+
// CHECK-NEXT: 5
45+
out << item.get_linear_id() << "\n";
46+
});
47+
});
48+
49+
return 0;
50+
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// RUN: %clangxx -fsycl %s -o %t.out
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
3+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6+
//==--------------- linear-sub_group.cpp - SYCL linear id test -------------==//
7+
//
8+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9+
// See https://llvm.org/LICENSE.txt for license information.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "../sub_group/helper.hpp"
15+
#include <CL/sycl.hpp>
16+
#include <algorithm>
17+
#include <cstdio>
18+
#include <cstdlib>
19+
20+
using namespace cl::sycl;
21+
22+
int main(int argc, char *argv[]) {
23+
queue q;
24+
if (!core_sg_supported(q.get_device())) {
25+
std::cout << "Skipping test\n";
26+
return 0;
27+
}
28+
29+
// Fill output array with sub-group IDs
30+
const uint32_t outer = 2;
31+
const uint32_t inner = 8;
32+
std::vector<int> output(outer * inner, 0);
33+
{
34+
buffer<int, 2> output_buf(output.data(), range<2>(outer, inner));
35+
q.submit([&](handler &cgh) {
36+
auto output = output_buf.get_access<access::mode::read_write>(cgh);
37+
cgh.parallel_for<class linear_id>(
38+
nd_range<2>(range<2>(outer, inner), range<2>(outer, inner)),
39+
[=](nd_item<2> it) {
40+
id<2> idx = it.get_global_id();
41+
intel::sub_group sg = it.get_sub_group();
42+
output[idx] = sg.get_group_id()[0] * sg.get_local_range()[0] +
43+
sg.get_local_id()[0];
44+
});
45+
});
46+
}
47+
48+
// Compare with expected result
49+
for (int idx = 0; idx < outer * inner; ++idx) {
50+
assert(output[idx] == idx);
51+
}
52+
std::cout << "Test passed.\n";
53+
return 0;
54+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// RUN: %clangxx -fsycl %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//==---------------- opencl-interop.cpp - SYCL linear id test --------------==//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
// See https://llvm.org/LICENSE.txt for license information.
9+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include <CL/sycl.hpp>
14+
#include <algorithm>
15+
#include <cstdio>
16+
#include <cstdlib>
17+
#include <numeric>
18+
19+
using namespace cl::sycl;
20+
21+
int main(int argc, char *argv[]) {
22+
queue q;
23+
if (q.is_host()) {
24+
std::cout << "Skipping test\n";
25+
return 0;
26+
}
27+
28+
// Compute expected answer.
29+
const uint32_t dimA = 2;
30+
const uint32_t dimB = 8;
31+
std::vector<int> input(dimA * dimB), output(dimA), expected(dimA);
32+
std::iota(input.begin(), input.end(), 0);
33+
for (int j = 0; j < dimA; ++j) {
34+
int sum = 0;
35+
for (int i = 0; i < dimB; ++i) {
36+
sum += input[j * dimB + i];
37+
}
38+
expected[j] = sum;
39+
}
40+
41+
// Compute sum using one work-group per element of dimA
42+
program prog(q.get_context(), {q.get_device()});
43+
prog.build_with_source("__kernel void sum(__global const int* input, "
44+
"__global int* output, const int dimA, const int dimB)"
45+
"{"
46+
" int j = get_global_id(1);"
47+
" int i = get_global_id(0);"
48+
" int sum = work_group_reduce_add(input[j*dimB+i]);"
49+
" if (get_local_id(0) == 0)"
50+
" {"
51+
" output[j] = sum;"
52+
" }"
53+
"}",
54+
"-cl-std=CL2.0");
55+
kernel sum = prog.get_kernel("sum");
56+
{
57+
buffer<int, 2> input_buf(input.data(), range<2>(dimA, dimB)),
58+
output_buf(output.data(), range<2>(dimA, dimB));
59+
q.submit([&](handler &cgh) {
60+
auto input = input_buf.get_access<access::mode::read>(cgh);
61+
auto output = output_buf.get_access<access::mode::discard_write>(cgh);
62+
cgh.set_args(input, output, dimA, dimB);
63+
cgh.parallel_for(nd_range<2>(range<2>(dimA, dimB), range<2>(1, dimB)),
64+
sum);
65+
});
66+
}
67+
68+
// Compare with expected result
69+
for (int j = 0; j < dimA; ++j) {
70+
assert(output[j] == expected[j]);
71+
}
72+
std::cout << "Test passed.\n";
73+
return 0;
74+
}

0 commit comments

Comments
 (0)