Skip to content

[SYCL] Adjust mapping to OpenCL to simplify tuning. #720

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sycl/include/CL/__spirv/spirv_vars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ extern "C" const __attribute__((ocl_constant)) uint32_t __spirv_BuiltInSubgroupL
}; \
\
template <class DstT> struct InitSizesST##POSTFIX<2, DstT> { \
static DstT initSize() { return {get##POSTFIX<0>(), get##POSTFIX<1>()}; } \
static DstT initSize() { return {get##POSTFIX<1>(), get##POSTFIX<0>()}; } \
}; \
\
template <class DstT> struct InitSizesST##POSTFIX<3, DstT> { \
static DstT initSize() { \
return {get##POSTFIX<0>(), get##POSTFIX<1>(), get##POSTFIX<2>()}; \
return {get##POSTFIX<2>(), get##POSTFIX<1>(), get##POSTFIX<0>()}; \
} \
}; \
\
Expand Down
61 changes: 18 additions & 43 deletions sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,19 +185,12 @@ class HostKernel : public HostKernelBase {
template <class ArgT = KernelArgType>
typename std::enable_if<std::is_same<ArgT, sycl::id<Dims>>::value>::type
runOnHost(const NDRDescT &NDRDesc) {
size_t XYZ[3] = {0};
sycl::id<Dims> ID;
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I];
MKernel(ID);
}
}
}
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

detail::NDLoop<Dims>::iterate(
Range, [&](const sycl::id<Dims> &ID) { MKernel(ID); });
}

template <class ArgT = KernelArgType>
Expand All @@ -210,20 +203,11 @@ class HostKernel : public HostKernelBase {
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I];

sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
MKernel(Item);
}
}
}
detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
MKernel(Item);
});
}

template <class ArgT = KernelArgType>
Expand All @@ -236,22 +220,13 @@ class HostKernel : public HostKernelBase {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
}
size_t XYZ[3] = {0};
sycl::id<Dims> ID;
for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) {
XYZ[1] = 0;
for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) {
XYZ[0] = 0;
for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) {
for (int I = 0; I < Dims; ++I)
ID[I] = XYZ[I] + Offset[I];

sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);
MKernel(Item);
}
}
}

detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> &ID) {
sycl::id<Dims> OffsetID = ID + Offset;
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, OffsetID, Offset);
MKernel(Item);
});
}

template <class ArgT = KernelArgType>
Expand Down
12 changes: 7 additions & 5 deletions sycl/include/CL/sycl/detail/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ struct NDLoopIterateImpl {
const LoopBoundTy<NDIMS> &Stride,
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
LoopIndexTy<NDIMS> &Index) {

for (Index[DIM] = LowerBound[DIM]; Index[DIM] < UpperBound[DIM];
Index[DIM] += Stride[DIM]) {
constexpr size_t AdjIdx = NDIMS - 1 - DIM;
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
Index[AdjIdx] += Stride[AdjIdx]) {

NDLoopIterateImpl<NDIMS, DIM - 1, LoopBoundTy, FuncTy, LoopIndexTy>{
LowerBound, Stride, UpperBound, f, Index};
Expand All @@ -177,8 +177,9 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
const LoopBoundTy<NDIMS> &UpperBound, FuncTy f,
LoopIndexTy<NDIMS> &Index) {

for (Index[0] = LowerBound[0]; Index[0] < UpperBound[0];
Index[0] += Stride[0]) {
constexpr size_t AdjIdx = NDIMS - 1;
for (Index[AdjIdx] = LowerBound[AdjIdx]; Index[AdjIdx] < UpperBound[AdjIdx];
Index[AdjIdx] += Stride[AdjIdx]) {

f(Index);
}
Expand All @@ -190,6 +191,7 @@ struct NDLoopIterateImpl<NDIMS, 0, LoopBoundTy, FuncTy, LoopIndexTy> {
/// over a multi-dimensional space - it allows to avoid generating unnecessary
/// outer loops like 'for (int z=0; z<1; z++)' in case of 1D and 2D iteration
/// spaces or writing specializations of the algorithms for 1D, 2D and 3D cases.
/// Loops are unrolled in reverse order, i.e. the loop over dimension 0 is the inner-most one.
template <int NDIMS> struct NDLoop {
/// Generates ND loop nest with {0,..0} .. \c UpperBound bounds with unit
/// stride. Applies \c f at each iteration, passing current index of
Expand Down
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/id.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ template <int dimensions = 1> class id : public detail::array<dimensions> {
private:
using base = detail::array<dimensions>;
static_assert(dimensions >= 1 && dimensions <= 3,
"id can only be 1, 2, or 3 dimentional.");
"id can only be 1, 2, or 3 dimensional.");
template <int N, int val, typename T>
using ParamTy = detail::enable_if_t<(N == val), T>;

Expand Down
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/range.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace sycl {
template <int dimensions> class id;
template <int dimensions = 1> class range : public detail::array<dimensions> {
static_assert(dimensions >= 1 && dimensions <= 3,
"range can only be 1, 2, or 3 dimentional.");
"range can only be 1, 2, or 3 dimensional.");
using base = detail::array<dimensions>;

public:
Expand Down
24 changes: 23 additions & 1 deletion sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,23 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
}

// We have the following mapping between dimensions with SPIRV builtins:
// 1D: id[0] -> x
// 2D: id[0] -> y, id[1] -> x
// 3D: id[0] -> z, id[1] -> y, id[2] -> x
// So in order to ensure correctness we update all the kernel
// parameters accordingly.
// Initially we keep the order of NDRDescT as provided by the user; this
// simplifies overall handling, and we do the reverse only when
// the kernel is enqueued.
static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
if (NDR.Dims > 1) {
std::swap(NDR.GlobalSize[0], NDR.GlobalSize[NDR.Dims - 1]);
std::swap(NDR.LocalSize[0], NDR.LocalSize[NDR.Dims - 1]);
std::swap(NDR.GlobalOffset[0], NDR.GlobalOffset[NDR.Dims - 1]);
}
}

// The function initialize accessors and calls lambda.
// The function is used as argument to piEnqueueNativeKernel which requires
// that the passed function takes one void* argument.
Expand Down Expand Up @@ -803,10 +820,15 @@ cl_int ExecCGCommand::enqueueImp() {
getSyclObjImpl(Context)->getUSMDispatch();
USMDispatch->setKernelIndirectAccess(Kernel, MQueue->getHandleRef());

// Remember this information before the range dimensions are reversed
const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0);

ReverseRangeDimensionsForKernel(NDRDesc);

PI_CALL(RT::piEnqueueKernelLaunch(
MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0],
&NDRDesc.GlobalSize[0],
NDRDesc.LocalSize[0] ? &NDRDesc.LocalSize[0] : nullptr,
HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr,
RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0], &Event));

Expand Down
50 changes: 50 additions & 0 deletions sycl/test/linear_id/linear-host-dev.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s
//==--------------- linear-host-dev.cpp - SYCL linear id test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Check that the linear id is monotonically increasing on the host device.
// Only there can we reliably check that. Since the kernel has a restriction
// regarding usage of global variables, use a stream to log the linear id
// and ensure that the logged values are monotonically increasing.
//
// Note: This test heavily relies on the current implementation of the
// host device (single-threaded, ordered execution). So if the implementation
// is somehow changed so that it's no longer possible to run this test
// reliably, it can be removed.

namespace s = cl::sycl;

int main(int argc, char *argv[]) {
  s::queue q;

  // 3x2 iteration space: linear ids are expected to be 0..5, in order.
  const size_t outer = 3;
  const size_t inner = 2;
  const s::range<2> rng = {outer, inner};

  q.submit([&](s::handler &h) {
    // Stream with a 1024-byte buffer and an 80-byte max statement size.
    s::stream out(1024, 80, h);

    h.parallel_for<class linear_id>(s::range<2>(rng), [=](s::item<2> item) {
      // CHECK: 0
      // CHECK-NEXT: 1
      // CHECK-NEXT: 2
      // CHECK-NEXT: 3
      // CHECK-NEXT: 4
      // CHECK-NEXT: 5
      out << item.get_linear_id() << "\n";
    });
  });

  return 0;
}
54 changes: 54 additions & 0 deletions sycl/test/linear_id/linear-sub_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//==--------------- linear-sub_group.cpp - SYCL linear id test -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "../sub_group/helper.hpp"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cstdio>
#include <cstdlib>

using namespace cl::sycl;

int main(int argc, char *argv[]) {
  queue Queue;
  if (!core_sg_supported(Queue.get_device())) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Launch a single Rows x Cols work-group where every work-item records its
  // linearized sub-group position.
  const uint32_t Rows = 2;
  const uint32_t Cols = 8;
  std::vector<int> Result(Rows * Cols, 0);
  {
    buffer<int, 2> ResultBuf(Result.data(), range<2>(Rows, Cols));
    Queue.submit([&](handler &CGH) {
      auto Acc = ResultBuf.get_access<access::mode::read_write>(CGH);
      CGH.parallel_for<class linear_id>(
          nd_range<2>(range<2>(Rows, Cols), range<2>(Rows, Cols)),
          [=](nd_item<2> It) {
            const id<2> GlobalId = It.get_global_id();
            intel::sub_group SG = It.get_sub_group();
            Acc[GlobalId] = SG.get_group_id()[0] * SG.get_local_range()[0] +
                            SG.get_local_id()[0];
          });
    });
  }

  // If the dimension mapping is correct, every element equals its own
  // linear index.
  for (size_t Idx = 0; Idx < Rows * Cols; ++Idx) {
    assert(Result[Idx] == static_cast<int>(Idx));
  }
  std::cout << "Test passed.\n";
  return 0;
}
74 changes: 74 additions & 0 deletions sycl/test/linear_id/opencl-interop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//==---------------- opencl-interop.cpp - SYCL linear id test --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>

using namespace cl::sycl;

int main(int argc, char *argv[]) {
  queue q;
  if (q.is_host()) {
    std::cout << "Skipping test\n";
    return 0;
  }

  // Compute the expected answer on the host: one sum per row of dimA,
  // each over dimB consecutive input values.
  const uint32_t dimA = 2;
  const uint32_t dimB = 8;
  std::vector<int> input(dimA * dimB), output(dimA), expected(dimA);
  std::iota(input.begin(), input.end(), 0);
  for (uint32_t j = 0; j < dimA; ++j) {
    int sum = 0;
    for (uint32_t i = 0; i < dimB; ++i) {
      sum += input[j * dimB + i];
    }
    expected[j] = sum;
  }

  // Compute the sums on the device using one work-group per element of dimA.
  // The OpenCL kernel reads the row from get_global_id(1) and the column from
  // get_global_id(0), which matches the SYCL-to-OpenCL dimension reversal
  // performed at enqueue time.
  program prog(q.get_context(), {q.get_device()});
  prog.build_with_source("__kernel void sum(__global const int* input, "
                         "__global int* output, const int dimA, const int dimB)"
                         "{"
                         "  int j = get_global_id(1);"
                         "  int i = get_global_id(0);"
                         "  int sum = work_group_reduce_add(input[j*dimB+i]);"
                         "  if (get_local_id(0) == 0)"
                         "  {"
                         "    output[j] = sum;"
                         "  }"
                         "}",
                         "-cl-std=CL2.0");
  kernel sum = prog.get_kernel("sum");
  {
    buffer<int, 2> input_buf(input.data(), range<2>(dimA, dimB));
    // `output` holds only dimA elements, so the buffer over it must be a
    // 1-D buffer of dimA elements. A range<2>(dimA, dimB) buffer would copy
    // dimA*dimB ints back into the dimA-element host vector on destruction,
    // writing out of bounds. The kernel only sees a __global int* either way.
    buffer<int, 1> output_buf(output.data(), range<1>(dimA));
    q.submit([&](handler &cgh) {
      auto input = input_buf.get_access<access::mode::read>(cgh);
      auto output = output_buf.get_access<access::mode::discard_write>(cgh);
      cgh.set_args(input, output, dimA, dimB);
      cgh.parallel_for(nd_range<2>(range<2>(dimA, dimB), range<2>(1, dimB)),
                       sum);
    });
  }

  // Compare with the expected result.
  for (uint32_t j = 0; j < dimA; ++j) {
    assert(output[j] == expected[j]);
  }
  std::cout << "Test passed.\n";
  return 0;
}