Skip to content

Commit 13e8dae

Browse files
authored
[SYCL] Avoid using intrinsics to get global id in stream (#2318)
Avoid calling intrinsics to get global id when calculating the offset of the work item in the flush buffer. When stream is used in a single_task kernel this could cause an overhead on targets like FPGA. That is why use global atomic variable to calculate offsets. Signed-off-by: Artur Gainullin <[email protected]>
1 parent 8953bfd commit 13e8dae

File tree

3 files changed

+43
-13
lines changed

3 files changed

+43
-13
lines changed

sycl/include/CL/sycl/stream.hpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -811,13 +811,12 @@ class __SYCL_EXPORT stream {
811811

812812
#ifdef __SYCL_DEVICE_ONLY__
813813
void __init() {
814-
// Calculate work item's global id, this should be done once, that
815-
// is why this is done in _init method, call to __init method is generated
816-
// by frontend. As a result each work item will write to its own section
817-
// in the flush buffer
818-
819-
id<1> GlobalId = __spirv::initGlobalInvocationId<1, id<1>>();
820-
WIOffset = (unsigned int)GlobalId[0] * FlushBufferSize;
814+
// Calculate offset in the flush buffer for each work item in the global
815+
// work space. We need to avoid calling intrinsics to get global id because
816+
// when stream is used in a single_task kernel this could cause some
817+
// overhead on FPGA target. That is why use global atomic variable to
818+
// calculate offsets.
819+
WIOffset = GlobalOffset[1].fetch_add(FlushBufferSize);
821820
}
822821

823822
void __finalize() {

sycl/source/detail/stream_impl.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,12 @@ class __SYCL_EXPORT stream_impl {
3838
}
3939

4040
// Method to provide an atomic access to the offset in the global stream
41-
// buffer
41+
// buffer and offset in the flush buffer
4242
GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) {
4343
auto OffsetSubBuf = buffer<char, 1>(Buf, id<1>(0), range<1>(OffsetSize));
44-
auto ReinterpretedBuf = OffsetSubBuf.reinterpret<unsigned, 1>(range<1>(1));
44+
auto ReinterpretedBuf = OffsetSubBuf.reinterpret<unsigned, 1>(range<1>(2));
4545
return ReinterpretedBuf.get_access<cl::sycl::access::mode::atomic>(
46-
CGH, range<1>(1), id<1>(0));
46+
CGH, range<1>(2), id<1>(0));
4747
}
4848

4949
// Copy stream buffer to the host and print the contents
@@ -61,10 +61,9 @@ class __SYCL_EXPORT stream_impl {
6161
// statement till the semicolon
6262
unsigned MaxStatementSize_;
6363

64-
// Size of the variable which is used as an offset in the stream buffer.
6564
// Additinonal memory is allocated in the beginning of the stream buffer for
66-
// this variable.
67-
static const size_t OffsetSize = sizeof(unsigned);
65+
// 2 variables: offset in the stream buffer and offset in the flush buffer.
66+
static const size_t OffsetSize = 2 * sizeof(unsigned);
6867

6968
// Vector on the host side which is used to initialize the stream buffer
7069
std::vector<char> Data;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
// RUN: %clangxx -S -emit-llvm -fsycl-device-only %s -o - -Xclang -disable-llvm-passes | FileCheck %s
2+
3+
//==------------------ no_intrinsics.cpp - SYCL stream test ----------------==//
4+
//
5+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
6+
// See https://llvm.org/LICENSE.txt for license information.
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
// Test to check that intrinsics to get a global id are not generated for the
12+
// stream.
13+
14+
// CHECK-NOT: call spir_func void @{{.*}}__spirvL22initGlobalInvocationId{{.*}}
15+
16+
#include <CL/sycl.hpp>
17+
18+
using namespace cl::sycl;
19+
20+
int main() {
21+
{
22+
queue Queue;
23+
24+
Queue.submit([&](handler &CGH) {
25+
stream Out(1024, 80, CGH);
26+
CGH.single_task<class integral>([=]() { Out << "Hello, World!\n"; });
27+
});
28+
Queue.wait();
29+
}
30+
31+
return 0;
32+
}

0 commit comments

Comments
 (0)