intel · pvchupin · May 17, 2022 · Dec 9, 2021 · Jan 6, 2022 · Jan 7, 2022
@@ -71,6 +71,7 @@ relational/isfinite.cl
 relational/isinf.cl
 relational/isnan.cl
 synchronization/barrier.cl
+synchronization/aw_barrier.cl
 workitem/get_global_id.cl
 workitem/get_global_offset.cl
 workitem/get_global_size.cl

@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <spirv/spirv.h>
+#include <spirv/spirv_types.h>
+
+// Thin wrapper: hands `state` and `expected_count` straight to the
+// __nvvm_mbarrier_init builtin.
+_CLC_OVERLOAD _CLC_DEF void
+__clc_BarrierInitialize(long *state, int expected_count) {
+  __nvvm_mbarrier_init(state, expected_count);
+}
+
+// Thin wrapper: passes `state` to the __nvvm_mbarrier_inval builtin.
+_CLC_OVERLOAD _CLC_DEF void __clc_BarrierInvalidate(long *state) {
+  __nvvm_mbarrier_inval(state);
+}
+
+// Calls the __nvvm_mbarrier_arrive builtin on `state` and returns its result
+// unchanged.
+_CLC_OVERLOAD _CLC_DEF long
+__clc_BarrierArrive(long *state) {
+  long token = __nvvm_mbarrier_arrive(state);
+  return token;
+}
+
+// Calls the __nvvm_mbarrier_arrive_drop builtin on `state` and returns its
+// result unchanged.
+_CLC_OVERLOAD _CLC_DEF long
+__clc_BarrierArriveAndDrop(long *state) {
+  long token = __nvvm_mbarrier_arrive_drop(state);
+  return token;
+}
+
+// Forwards `state` and `count` to the __nvvm_mbarrier_arrive_noComplete
+// builtin and returns its result unchanged.
+_CLC_OVERLOAD _CLC_DEF long
+__clc_BarrierArriveNoComplete(long *state, int count) {
+  long token = __nvvm_mbarrier_arrive_noComplete(state, count);
+  return token;
+}
+
+// Forwards `state` and `count` to the __nvvm_mbarrier_arrive_drop_noComplete
+// builtin and returns its result unchanged.
+_CLC_OVERLOAD _CLC_DEF long
+__clc_BarrierArriveAndDropNoComplete(long *state, int count) {
+  long token = __nvvm_mbarrier_arrive_drop_noComplete(state, count);
+  return token;
+}
+
+// Passes `state` to the __nvvm_cp_async_mbarrier_arrive builtin.
+// The builtin is invoked as a plain statement: this function returns void, and
+// a `return` with an expression is not permitted in a void function in C.
+_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArrive(long *state) {
+  __nvvm_cp_async_mbarrier_arrive(state);
+}
+
+// Passes `state` to the __nvvm_cp_async_mbarrier_arrive_noinc builtin.
+// The builtin is invoked as a plain statement: this function returns void, and
+// a `return` with an expression is not permitted in a void function in C.
+_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArriveNoInc(long *state) {
+  __nvvm_cp_async_mbarrier_arrive_noinc(state);
+}
+
+// Busy-waits: keeps polling __nvvm_mbarrier_test_wait(state, arrival) and
+// returns as soon as the builtin yields a true value.
+_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
+__clc_BarrierWait(long *state, long arrival) {
+  for (;;) {
+    if (__nvvm_mbarrier_test_wait(state, arrival)) {
+      return;
+    }
+  }
+}
+
+// Single, non-looping query: returns whatever
+// __nvvm_mbarrier_test_wait(state, arrival) reports.
+_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT bool
+__clc_BarrierTestWait(long *state, long arrival) {
+  bool done = __nvvm_mbarrier_test_wait(state, arrival);
+  return done;
+}
+
+// Arrives on the barrier at `state`, then waits on that same barrier using the
+// value __clc_BarrierArrive returned.
+_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
+__clc_BarrierArriveAndWait(long *state) {
+  long token = __clc_BarrierArrive(state);
+  __clc_BarrierWait(state, token);
+}
diff --git a/sycl/doc/extensions/Barrier/Barrier.asciidoc b/sycl/doc/extensions/Barrier/Barrier.asciidoc
@@ -0,0 +1,172 @@
+= SYCL_EXT_ONEAPI_BARRIER
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+== Introduction
+IMPORTANT: This specification is a draft.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos.
+
+This document describes an extension that adds `barrier`, which acts similarly to https://en.cppreference.com/w/cpp/thread/barrier[`std::barrier`].
+
+NOTE: CUDA devices can hardware-accelerate this functionality starting with compute capability 8.
+
+== Notice
+
+Copyright (c) 2021 Intel Corporation.  All rights reserved.
+
+== Status
+
+Working Draft
+
+This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products.
+
+== Version
+
+Revision: 1
+
+== Contact
+Tadej Ciglarič, Codeplay (tadej 'dot' ciglaric 'at' codeplay 'dot' com)
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 4.
+
+== Feature Test Macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros".  Therefore, an
+implementation supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_CUDA_ASYNC_BARRIER` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension version.  Base features are supported.
+|===
+
+== Overview
+
+This extension introduces an asynchronous barrier for CUDA devices. It extends `group_barrier` by splitting it into two calls - arrive and wait. The wait call blocks until the predetermined number of work items in the same work group have called arrive. This is also very similar to https://en.cppreference.com/w/cpp/thread/barrier[`std::barrier`], introduced in C++20.
+
+Implementing this requires some space in local memory, where the state of the barrier is kept (this is true even for CUDA devices of compute capability 8, which support this functionality in hardware).
+
+This extension introduces the `barrier` class. It must be used only in local memory. Once a `barrier` is initialized, it can be reused for multiple cycles of arrivals and waits.
+
+This extension can only be used on devices that support independent forward progress.
+
+=== Cycle of arrivals and waits
+
+When the predetermined number of arrivals has happened, the barrier moves into the next cycle. That unblocks the waits for the current cycle. That is, any wait that was previously called with an arrival token from this cycle stops blocking, and future calls to wait with an arrival token from this cycle will not block. The pending count is also reset and any future arrivals happen in the next cycle. However, at least one wait (or test wait returning `true`) must happen with an arrival token from the current cycle before any arrivals can happen in the next cycle. Another arrival happening before the wait is undefined behavior.
+
+Wait and test wait can only be called with an arrival token from the current cycle (in which case wait will block until the barrier moves into the next cycle) or from the previous cycle (in which case wait will not block). Calling wait or test wait with an arrival token from any other cycle is undefined behavior.
+
+=== Interface
+
+The `barrier` class has the following member functions:
+
+`void initialize(uint32_t expected_count)` : Initializes the barrier with given count of arrivals before wait unblocks. It only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`)needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object.
-`void initialize(uint32_t expected_count)` : Initializes the barrier with given count of arrivals before wait unblocks. It only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`)needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object.
+`void initialize(uint32_t expected_count)` : Initializes the barrier with an expected number of arrivals, representing the number of arrivals required to unblock calls to `wait`. This function only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object. If `expected_count` is greater than the value returned by `max()`, behavior is undefined.
-`void initialize(uint32_t expected_count)` : Initializes the barrier with given count of arrivals before wait unblocks. It only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`)needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object.
+`void initialize(uint32_t expected_count)` : Initializes the barrier with an expected number of arrivals, representing the number of arrivals required to unblock calls to `wait`. This function only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object. If `expected_count` is greater than the value returned by `max()`, behavior is undefined.
+
+`void invalidate()` : This must be called on an initialized barrier before the memory it is using can be repurposed for any other use. It only needs to be called by one work-item in the work group. Before the invalidation, a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object, after the last call on the `barrier`. After the invalidation, a barrier operation (such as `group_barrier()`) needs to be executed by any work-items reusing the memory before the memory is reused. Calling any member function except `initialize()` on an invalidated barrier is undefined behavior.
+
+`arrival_token arrive()` : Executes arrival operation and returns a token that is needed for the wait call corresponding to this arrival.
+
+`arrival_token arrive_and_drop()` : Reduces expected arrival count for future cycles by one, executes arrival operation and returns a token that is needed for the wait call corresponding to this arrival.
+
+`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival that causes the cycle to complete - it would be undefined behavior.
-`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival that causes the cycle to complete - it would be undefined behavior.
+`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the `wait` call corresponding to this arrival. If this is the last arrival that causes the cycle to complete, behavior is undefined.
-`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival that causes the cycle to complete - it would be undefined behavior.
+`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the `wait` call corresponding to this arrival. If this is the last arrival that causes the cycle to complete, behavior is undefined.
+
+`arrival_token arrive_and_drop_no_complete(int32_t count)` : Reduces expected arrival count for future cycles by one, executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival that causes the cycle to complete - it would be undefined behavior.
+
+`void arrive_copy_async()` : Schedules arrive operation to be triggered asynchronously when all previous asynchronous memory copies initiated by the calling work item complete. Before the arrive operation is triggered, the pending count on the barrier is increased by 1, so after the arrival there is no change to the pending count.
+
+`void arrive_copy_async_no_inc()` : Schedules arrive operation to be triggered asynchronously when all previous asynchronous memory copies initiated by the calling work item complete.
+
+`void wait(arrival_token arrival)` : Executes wait operation, blocking until the predetermined number of work items reach arrive.
+
+`bool test_wait(arrival_token arrival)` : Checks whether all the arrivals have already happened for the current cycle, returning `true` if they did and `false` if `wait(arrival)` would block.
+
+`void arrive_and_wait()` : Equivalent to calling `this->wait(this->arrive())`.
+
+`static constexpr uint64_t max()` : Returns the maximum value of the expected and pending counts supported by the implementation. Exceeding this in the internal state of the barrier is undefined behavior.
+
+==== Sample Header
+
+[source, c++]
+----
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace cuda {
+
+class barrier {
+  [implementation defined internal state]
+
+public:
+  using arrival_token = [implementation defined];
+
+  // barriers cannot be moved or copied
+  barrier(const barrier &other) = delete;
+  barrier(barrier &&other) noexcept = delete;
+  barrier &operator=(const barrier &other) = delete;
+  barrier &operator=(barrier &&other) noexcept = delete;
+
+  void initialize(uint32_t expected_count);
+  void invalidate();
+  arrival_token arrive();
+  arrival_token arrive_and_drop();
+  arrival_token arrive_no_complete(int32_t count);
+  arrival_token arrive_and_drop_no_complete(int32_t count);
+  void arrive_copy_async();
+  void arrive_copy_async_no_inc();
+  void wait(arrival_token arrival);
+  bool test_wait(arrival_token arrival);
+  void arrive_and_wait();
+  static constexpr uint64_t max();
+};
+
+} // namespace cuda
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+----
+
+== Issues
+
+. Is `barrier` the best name? Reasons for that name are that it is mostly in line with c+\+20 `std::barrier` and CUDA has the same name for this functionality. However, it might be confused with `group_barrier`, which is not present in c++20 and has a different name in CUDA - `__syncthreads`. An earlier version of the CUDA docs called this `awbarrier`. Now that name is deprecated and they call it asynchronous barrier in text and `barrier` in code. Related PTX instructions use `mbarrier`. Other ideas for the name: "non-blocking barrier" and "split barrier".
+--
+*RESOLUTION*: Not resolved.
+--
+
+== Revision History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|========================================
+|Rev|Date|Author|Changes
+|1|2022-01-07|Tadej Ciglarič|*Initial public working draft*
+|========================================
+
+== Resources
+* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#aw-barrier
+* https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier
+* https://nvidia.github.io/libcudacxx/extended_api/synchronization_primitives/barrier.html
@@ -742,6 +742,39 @@ extern SYCL_EXTERNAL float __spirv_ConvertBF16ToFINTEL(uint16_t) noexcept;
 __SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __ocl_vec_t<uint32_t, 4>
 __spirv_GroupNonUniformBallot(uint32_t Execution, bool Predicate) noexcept;
 
+// Declarations of the barrier set-up and arrival entry points. Each takes a
+// pointer to the barrier's 64-bit state word; the arrive variants return a
+// 64-bit value that the wait entry points accept as their `arrival` argument.
+extern SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInitialize(
+    int64_t *state, int32_t expected_count) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInvalidate(
+    int64_t *state) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT int64_t __clc_BarrierArrive(
+    int64_t *state) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT int64_t __clc_BarrierArriveAndDrop(
+    int64_t *state) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT int64_t __clc_BarrierArriveNoComplete(
+    int64_t *state, int32_t count) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT int64_t __clc_BarrierArriveAndDropNoComplete(
+    int64_t *state, int32_t count) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierCopyAsyncArrive(
+    int64_t *state) noexcept;
+
+extern SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierCopyAsyncArriveNoInc(
+    int64_t *state) noexcept;
+
+// Declarations of the barrier wait entry points. `arrival` is a 64-bit value
+// of the kind returned by the arrive entry points.
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierWait(
+    int64_t *state, int64_t arrival) noexcept;
+
+// NOTE(review): the libclc definition of __clc_BarrierTestWait is marked
+// _CLC_CONVERGENT, while this declaration is not __SYCL_CONVERGENT__ - confirm
+// which of the two is intended.
+extern SYCL_EXTERNAL __SYCL_EXPORT bool __clc_BarrierTestWait(
+    int64_t *state, int64_t arrival) noexcept;
+
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT void
+__clc_BarrierArriveAndWait(int64_t *state) noexcept;
+
 #ifdef __SYCL_USE_NON_VARIADIC_SPIRV_OCL_PRINTF__
 template <typename... Args>
 extern SYCL_EXTERNAL int

@@ -60,6 +60,7 @@
 #if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
 #endif
+#include <sycl/ext/oneapi/barrier.hpp>
 #include <sycl/ext/oneapi/experimental/builtins.hpp>
 #include <sycl/ext/oneapi/filter_selector.hpp>
 #include <sycl/ext/oneapi/group_algorithm.hpp>