Skip to content

Commit 52e30f7

Browse files
committed
Merge branch 'sycl' into oneapins
Signed-off-by: James Brodman <[email protected]>
2 parents 4d30921 + 68ec253 commit 52e30f7

File tree

5 files changed

+131
-2
lines changed

5 files changed

+131
-2
lines changed

sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ John Pennycook, Intel (john 'dot' pennycook 'at' intel 'dot' com)
5151

5252
== Dependencies
5353

54-
This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6.
54+
This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6 and the following extensions:
55+
56+
- +SYCL_INTEL_extended_atomics+
5557

5658
== Overview
5759

@@ -67,6 +69,10 @@ The extension introduces the following functions:
6769
- +reduce+
6870
- +exclusive_scan+
6971
- +inclusive_scan+
72+
- +barrier+
73+
74+
The definitions and behavior of the following functions are based on equivalents in the SYCL 2020 provisional specification:
75+
- +barrier+
7076

7177
=== Alignment with OpenCL vs C++
7278

@@ -252,6 +258,21 @@ The return types of the collective functions in {cpp}17 are not deduced from the
252258
|Perform an inclusive scan over the values in the range [_first_, _last_) using the operator _binary_op_, which must be one of the group algorithms library function objects. The value written to +result + i+ is the inclusive scan of the first +i+ values in the range and an initial value specified by _init_. Returns a pointer to the end of the output range. _first_, _last_, _result_, _binary_op_ and _init_ must be the same for all work-items in the group. _binary_op(init, *first)_ must return a value of type _T_.
253259
|===
254260

261+
==== Synchronization
262+
263+
The behavior of memory fences in this section is aligned with the single happens-before relationship defined by the +SYCL_INTEL_extended_atomics+ extension.
264+
265+
|===
266+
|Function|Description
267+
268+
|+template <typename Group> void barrier(Group g);+
269+
|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items in the group after the barrier. The scope of the group memory fences implied by this barrier is the narrowest scope including all work-items in the group.
270+
271+
|+template <typename Group> void barrier(Group g, memory_scope scope);+
272+
|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items specified by _scope_ after the barrier. The scope of the group memory fences implied by this barrier is controlled by _scope_ and must be broader than the narrowest scope including all work-items in the group. If the specified _scope_ is narrower than the narrowest scope including all work-items in the group, the _scope_ argument is ignored.
273+
274+
|===
275+
255276
== Issues
256277

257278
None.
@@ -270,6 +291,7 @@ None.
270291
|========================================
271292
|Rev|Date|Author|Changes
272293
|1|2020-01-30|John Pennycook|*Initial public working draft*
294+
|2|2020-07-28|John Pennycook|*Add group barrier*
273295
|========================================
274296
275297
//************************************************************************

sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ This extension adds sub-group support to all of the functions from +SYCL_INTEL_g
7070
- +reduce+
7171
- +exclusive_scan+
7272
- +inclusive_scan+
73+
- +barrier+
7374

7475
It additionally introduces a number of functions that are currently specific to sub-groups:
7576

@@ -165,6 +166,7 @@ None.
165166
|========================================
166167
|Rev|Date|Author|Changes
167168
|1|2020-03-16|John Pennycook|*Initial public working draft*
169+
|2|2020-07-28|John Pennycook|*Add group barrier*
168170
|========================================
169171
170172
//************************************************************************

sycl/include/CL/sycl/ONEAPI/group_algorithm.hpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <CL/__spirv/spirv_ops.hpp>
1111
#include <CL/__spirv/spirv_types.hpp>
1212
#include <CL/__spirv/spirv_vars.hpp>
13+
#include <CL/sycl/ONEAPI/atomic.hpp>
1314
#include <CL/sycl/ONEAPI/functional.hpp>
1415
#include <CL/sycl/ONEAPI/sub_group.hpp>
1516
#include <CL/sycl/detail/spirv.hpp>
@@ -78,6 +79,15 @@ template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) {
7879
return result;
7980
}
8081

82+
// TODO: Replace with Group::fence_scope from SYCL 2020 provisional
83+
template <typename Group> struct FenceScope {
84+
static constexpr intel::memory_scope value = intel::memory_scope::work_group;
85+
};
86+
87+
template <> struct FenceScope<intel::sub_group> {
88+
static constexpr intel::memory_scope value = intel::memory_scope::sub_group;
89+
};
90+
8191
template <typename T, class BinaryOperation> struct identity {};
8292

8393
template <typename T, typename V> struct identity<T, ONEAPI::plus<V>> {
@@ -897,6 +907,42 @@ template <typename Group> bool leader(Group g) {
897907
#endif
898908
}
899909

910+
template <typename Group> void barrier(Group, memory_scope scope) {
911+
static_assert(sycl::detail::is_generic_group<Group>::value,
912+
"Group algorithms only support the sycl::group and "
913+
"intel::sub_group class.");
914+
#ifdef __SYCL_DEVICE_ONLY__
915+
// MemoryScope must be broader than Group scope for correctness
916+
auto GroupScope = detail::FenceScope<Group>::value;
917+
auto BroadestScope = (scope > GroupScope) ? scope : GroupScope;
918+
auto MemoryScope = sycl::detail::spirv::getScope(BroadestScope);
919+
auto ExecutionScope = sycl::detail::spirv::group_scope<Group>::value;
920+
__spirv_ControlBarrier(ExecutionScope, MemoryScope,
921+
__spv::MemorySemanticsMask::AcquireRelease |
922+
__spv::MemorySemanticsMask::SubgroupMemory |
923+
__spv::MemorySemanticsMask::WorkgroupMemory |
924+
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
925+
#else
926+
(void)scope;
927+
throw runtime_error("Group algorithms are not supported on host device.",
928+
PI_INVALID_DEVICE);
929+
#endif
930+
}
931+
932+
template <typename Group> void barrier(Group g) {
933+
static_assert(sycl::detail::is_generic_group<Group>::value,
934+
"Group algorithms only support the sycl::group and "
935+
"intel::sub_group class.");
936+
#ifdef __SYCL_DEVICE_ONLY__
937+
auto MemoryScope = detail::FenceScope<Group>::value;
938+
barrier(g, MemoryScope);
939+
#else
940+
(void)g;
941+
throw runtime_error("Group algorithms are not supported on host device.",
942+
PI_INVALID_DEVICE);
943+
#endif
944+
}
945+
900946
} // namespace ONEAPI
901947
} // namespace sycl
902948
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/include/CL/sycl/backend/level_zero.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
#pragma once
1010

1111
#include <CL/sycl.hpp>
12-
#include <level_zero/ze_api.h>
12+
// This header should be included by users.
13+
//#include <level_zero/ze_api.h>
1314

1415
__SYCL_INLINE_NAMESPACE(cl) {
1516
namespace sycl {

sycl/test/group-algorithm/barrier.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// UNSUPPORTED: cuda
2+
//
3+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
7+
8+
#include <CL/sycl.hpp>
9+
#include <cassert>
10+
using namespace sycl;
11+
using namespace sycl::intel;
12+
13+
class barrier_kernel;
14+
15+
void test(queue q) {
16+
17+
constexpr size_t N = 32;
18+
constexpr size_t L = 16;
19+
std::array<int, N> out;
20+
std::fill(out.begin(), out.end(), 0);
21+
{
22+
buffer<int> out_buf(out.data(), range<1>{N});
23+
q.submit([&](handler &cgh) {
24+
auto tmp =
25+
accessor<int, 1, access::mode::read_write, access::target::local>(
26+
L, cgh);
27+
auto out = out_buf.get_access<access::mode::read_write>(cgh);
28+
cgh.parallel_for<class barrier_kernel>(
29+
nd_range<1>(N, L), [=](nd_item<1> it) {
30+
group<1> g = it.get_group();
31+
tmp[it.get_local_linear_id()] = it.get_global_linear_id() + 1;
32+
barrier(g);
33+
int result = 0;
34+
for (int i = 0; i < L; ++i) {
35+
result += tmp[i];
36+
}
37+
out[it.get_global_linear_id()] = result;
38+
});
39+
});
40+
}
41+
42+
// Each work-item should see writes from all other work-items in its group
43+
for (int g = 0; g < N / L; ++g) {
44+
int sum = 0;
45+
for (int wi = 0; wi < L; ++wi) {
46+
sum += g * L + wi + 1;
47+
}
48+
for (int wi = 0; wi < L; ++wi) {
49+
assert(out[g * L + wi] == sum);
50+
}
51+
}
52+
}
53+
54+
int main() {
55+
queue q;
56+
test(q);
57+
std::cout << "Test passed." << std::endl;
58+
}

0 commit comments

Comments
 (0)