Skip to content

[SYCL][CUDA][libclc] Add asynchronous barrier #5303

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
8f5e03e
WIP: started trying to impl with atomics
FMarno Dec 9, 2021
4eed959
Merge branch 'sycl' into finlay/async_barrier_proposal
t4c1 Jan 6, 2022
4721cf6
proposal and untested async barrier implementation
t4c1 Jan 7, 2022
38f1d76
added advanced functionality (still untested)
t4c1 Jan 10, 2022
d34a206
fixed max
t4c1 Jan 10, 2022
41c66ec
clarified the cycle of arrivals and waits
t4c1 Jan 10, 2022
bc4f04a
bugfixes
t4c1 Jan 13, 2022
8dff0f7
removed pending_count, which is deprecated in CUDA
t4c1 Jan 13, 2022
f68d80b
format
t4c1 Jan 13, 2022
62ada41
addressed first review comments and clarified how it works without in…
t4c1 Jan 20, 2022
c662d70
format
t4c1 Jan 20, 2022
1e2d99b
clarified that the extension is only for CUDA and fixed some minor is…
t4c1 Jan 25, 2022
a804562
change the name of libclc functions to __clc
t4c1 Jan 26, 2022
9f2f636
Apply suggestions from code review
t4c1 Feb 9, 2022
f33ddf0
addressed review comments
t4c1 Feb 9, 2022
f63b973
added examples
t4c1 Feb 10, 2022
bb08a1b
Merge branch 'sycl' into async_barrier
t4c1 Mar 2, 2022
fa086bd
addressed review comments
t4c1 Mar 7, 2022
bccc461
format
t4c1 Mar 7, 2022
99596cd
moved to experimental namespace
t4c1 Mar 8, 2022
42afcf0
format
t4c1 Mar 8, 2022
62d731e
added explanation of limitations and fixed namespace in spec
t4c1 Mar 8, 2022
d420f88
changed limitations to a note
t4c1 Mar 9, 2022
1adecca
changed to a single note
t4c1 Mar 9, 2022
f110d25
Merge branch 'sycl' into async_barrier
t4c1 Mar 14, 2022
77cafed
Merge branch 'sycl' into async_barrier
t4c1 Mar 28, 2022
5c8e030
format
t4c1 Mar 28, 2022
32f75aa
fix merge
t4c1 Mar 28, 2022
3ccd520
another fix for bad merge
t4c1 Mar 28, 2022
6092e61
format
t4c1 Mar 28, 2022
52e540e
addressed review comments
t4c1 Mar 30, 2022
8e4f969
format
t4c1 Mar 30, 2022
9478857
fixed namespace in examples
t4c1 May 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 86 additions & 24 deletions libclc/ptx-nvidiacl/libspirv/synchronization/aw_barrier.cl
Original file line number Diff line number Diff line change
Expand Up @@ -9,48 +9,110 @@
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

_CLC_OVERLOAD _CLC_DEF void __clc_BarrierInitialize(long* state,
int expected_count) {
extern int __clc_nvvm_reflect_arch();

// Initializes the barrier state at `state` by forwarding `state` and
// `expected_count` to the `__nvvm_mbarrier_init` builtin.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
// NOTE(review): 800 presumably corresponds to compute capability 8.0 --
// confirm against the definition of __clc_nvvm_reflect_arch.
_CLC_OVERLOAD _CLC_DEF void __clc_BarrierInitialize(long *state,
                                                    int expected_count) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  __nvvm_mbarrier_init(state, expected_count);
}

_CLC_OVERLOAD _CLC_DEF void
__clc_BarrierInvalidate(long* state) {
__nvvm_mbarrier_inval(state);
// Invalidates the barrier state at `state` by forwarding it to the
// `__nvvm_mbarrier_inval` builtin.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF void __clc_BarrierInvalidate(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  __nvvm_mbarrier_inval(state);
}

_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArrive(long* state) {
return __nvvm_mbarrier_arrive(state);
// Performs an arrival on the barrier at `state` through the
// `__nvvm_mbarrier_arrive` builtin and returns the builtin's result
// unchanged (the value later passed as `arrival` to the wait functions).
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArrive(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  return __nvvm_mbarrier_arrive(state);
}

_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveAndDrop(long* state) {
return __nvvm_mbarrier_arrive_drop(state);
// Forwards `state` to the `__nvvm_mbarrier_arrive_drop` builtin and returns
// the builtin's result unchanged.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveAndDrop(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  return __nvvm_mbarrier_arrive_drop(state);
}

_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveNoComplete(long* state, int count) {
return __nvvm_mbarrier_arrive_noComplete(state, count);
// Forwards `state` and `count` to the `__nvvm_mbarrier_arrive_noComplete`
// builtin and returns the builtin's result unchanged.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveNoComplete(long *state,
                                                          int count) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  return __nvvm_mbarrier_arrive_noComplete(state, count);
}

_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveAndDropNoComplete(long* state, int count) {
return __nvvm_mbarrier_arrive_drop_noComplete(state, count);
// Forwards `state` and `count` to the
// `__nvvm_mbarrier_arrive_drop_noComplete` builtin and returns the builtin's
// result unchanged.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF long __clc_BarrierArriveAndDropNoComplete(long *state,
                                                                 int count) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  return __nvvm_mbarrier_arrive_drop_noComplete(state, count);
}

_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArrive(long* state) {
return __nvvm_cp_async_mbarrier_arrive(state);
// Forwards `state` to the `__nvvm_cp_async_mbarrier_arrive` builtin.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArrive(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  // Plain call rather than `return <expr>;`: this function returns void, and
  // a return statement with an expression is not permitted in a void
  // function in C.
  __nvvm_cp_async_mbarrier_arrive(state);
}

_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArriveNoInc(long* state) {
return __nvvm_cp_async_mbarrier_arrive_noinc(state);
// Forwards `state` to the `__nvvm_cp_async_mbarrier_arrive_noinc` builtin.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF void __clc_BarrierCopyAsyncArriveNoInc(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  // Plain call rather than `return <expr>;`: this function returns void, and
  // a return statement with an expression is not permitted in a void
  // function in C.
  __nvvm_cp_async_mbarrier_arrive_noinc(state);
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void __clc_BarrierWait(long* state, long arrival) {
while(!__nvvm_mbarrier_test_wait(state, arrival)){}
// Busy-waits until `__nvvm_mbarrier_test_wait(state, arrival)` returns a
// non-zero (true) value. The builtin is polled repeatedly with the same
// arguments; nothing else happens between polls.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void __clc_BarrierWait(long *state,
                                                              long arrival) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  for (;;) {
    if (__nvvm_mbarrier_test_wait(state, arrival)) {
      return;
    }
  }
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT bool __clc_BarrierTestWait(long* state, long arrival) {
return __nvvm_mbarrier_test_wait(state, arrival);
// Single, non-blocking poll: returns the result of
// `__nvvm_mbarrier_test_wait(state, arrival)` without looping.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT bool
__clc_BarrierTestWait(long *state, long arrival) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  return __nvvm_mbarrier_test_wait(state, arrival);
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void __clc_BarrierArriveAndWait(long* state) {
__clc_BarrierWait(state, __clc_BarrierArrive(state));
}
// Arrives on the barrier at `state` and then waits on it: the value returned
// by `__clc_BarrierArrive(state)` is passed as the `arrival` argument of
// `__clc_BarrierWait`.
//
// Traps when `__clc_nvvm_reflect_arch()` reports a value below 800.
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
__clc_BarrierArriveAndWait(long *state) {
  if (__clc_nvvm_reflect_arch() < 800) {
    __builtin_trap();
    __builtin_unreachable();
  }
  long token = __clc_BarrierArrive(state);
  __clc_BarrierWait(state, token);
}
34 changes: 19 additions & 15 deletions sycl/doc/extensions/Barrier/Barrier.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -70,43 +70,47 @@ This extension introduces asynchronous barrier for CUDA devices. This extends `g

Implementing this requires some space in local memory, where the state of the barrier is kept (this is true even for devices with CUDA compute capability 8, which support this functionality in hardware).

This extension introduces `barrier` class. It must be used only in local memory. Once a `barrier` is initialized it can be reused for multiple cycles of arrivals and waits.
This extension introduces a `barrier` class, encapsulating the state of an asynchronous barrier. It must be used only in local memory, and reserves space in local memory even on hardware with native support (e.g. devices with CUDA Compute Capability 8.0). Once a barrier is initialized, it can be reused for multiple cycles of arrivals and waits.

This extension can only be used on devices that support independent forward progress.
This extension can only be used on devices that support Independent Thread Scheduling (i.e. devices with CUDA Compute Capability 7.0 or higher) when using the CUDA backend. On other backends the interface of `barrier` is not defined or declared. Attempting to call any function defined in this extension with the CUDA backend on a device with a lower compute capability will cause the kernel launch to fail.

=== Cycle of arrivals and waits

When predetermined number of arrivals happen, the barrier moves into the next cycle. That unblocks the waits for current cycle. That is any wait that was previously called with the arrival token from this cycle stops blocking and future calls to wait with arrival token from this cycle will not block. The pending count is also reset and any future arrivals happen in the next cycle. However, at least one wait (or test wait returning `true`) must happen with arrival token from current cycle before any arrivals can happen in the next cycle. Another arrival happening before the wait is undefined behavior.
After a predetermined number of arrivals, the barrier moves into the next cycle. Any `wait` that was previously called with the arrival token from this cycle is unblocked, and future calls to `wait` with the arrival token from this cycle will not block. The pending count is reset and any future arrivals happen in the next cycle. However, at least one `wait` (or `test_wait` returning `true`) must happen with the arrival token from the current cycle before any arrivals can happen in the next cycle. Another arrival happening before the `wait` is undefined behavior.

Wait and test wait can only be called with an arrival token from current cycle (in which case wait will block until the barrier moves into next cycle) or the previous cycle (in which case wait will not block). Calling wait or test wait with an arrival token from any other cycle is undefined behavior.
`wait` and `test_wait` can only be called with an arrival token from the current cycle (in which case `wait` will block until the barrier moves into next cycle) or the previous cycle (in which case `wait` will not block). Calling `wait` or `test_wait` with an arrival token from any other cycle results in undefined behavior.

=== Interface

`barrier` class has the follwing member functions:
`barrier` class has the following member functions:

`void initialize(uint32_t expected_count)` : Initializes the barrier with given count of arrivals before wait unblocks. It only needs to be called by one work-item in work group. After the initialization a barrier operation (such as `group_barrier()`)needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object.
`void initialize(uint32_t expected_count)` : Initializes the barrier with an expected number of arrivals, representing the number of arrivals required to unblock calls to `wait`. This function only needs to be called by one work-item in the work group. After the initialization a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object before they can use the newly initialized `barrier` object. If `expected_count` is greater than the value returned by `max()`, behavior is undefined.

`void invalidate()` : This must be called on an initialized barrier before the memory it is using can be repurposed for any other use. It only needs to be called by one work-item in work group. Before the invalidation a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object, after the last call on the `barrier`. After the invalidation a barrier operation (such as `group_barrier()`) needs to be executed by any work-items reusing the memory before the memory is reused. Calling any member function, except `initialize()` on an invalidated barrier is undefined behavior.
`void invalidate()` : Invalidates a previously initialized barrier, enabling its associated memory to be repurposed. This function only needs to be called by one work-item in the work group. Before the invalidation a barrier operation (such as `group_barrier()`) needs to be executed by all work-items using the `barrier` object, after the last call on the `barrier`. After the invalidation a barrier operation (such as `group_barrier()`) needs to be executed by any work-items reusing the memory before the memory is reused. Calling any member function except `initialize()` on an invalidated barrier results in undefined behavior.

`arrival_token arrive()` : Executes arrival operation and returns a token that is needed for the wait call corresponding to this arrival.
`arrival_token arrive()` : Executes arrival operation and returns a token that is needed for the `wait` call corresponding to this arrival.

`arrival_token arrive_and_drop()` : Reduces expected arrival count for future cycles by one, executes arrival operation and returns a token that is needed for the wait call corresponding to this arrival.
`arrival_token arrive_and_drop()` : Reduces expected arrival count for future cycles by one, executes arrival operation and returns a token that is needed for the `wait` call corresponding to this arrival.

`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival that causes the cycle to complete - it would be undefined behavior.
`arrival_token arrive_no_complete(int32_t count)` : Executes arrival operation that counts as `count` arrivals and returns a token that is needed for the `wait` call corresponding to this arrival. If this is the last arrival that causes the cycle to complete, behavior is undefined. That means `count` must be strictly lower than the remaining number of arrivals required to complete this cycle.

`arrival_token arrive_and_drop_no_complete(int32_t count)` : Reduces expected arrival count for future cycles by one, executes arrival operation that counts as `count` arrivals and returns a token that is needed for the wait call corresponding to this arrival. This must not be the last arrival thet causes the cycle to complete - it would be undefined behavior.
This can be used to signal many arrivals by one function call. However, it should not be used on its own, as it cannot be the last arrival in a cycle. So it should be followed either by a call to `arrive` or by a barrier operation (such as `group_barrier()`) after which a different work-item is guaranteed to call `arrive` within the same cycle.

`void arrive_copy_async()` : Schedules arrive operation to be triggered asynchronously when all previous asynchronous memory copies initiated by the calling work item complete. Before the arrive operation is triggered, the pending count on the barrier is increased by 1, so after the arrival there is no change to the pending count.
`arrival_token arrive_and_drop_no_complete(int32_t count)` : Reduces expected arrival count for future cycles by `count`, executes arrival operation that counts as `count` arrivals and returns a token that is needed for the `wait` call corresponding to this arrival. If this is the last arrival that causes the cycle to complete, behavior is undefined. That means `count` must be strictly lower than the remaining number of arrivals required to complete this cycle.

This can be used to signal many arrivals by one function call. However, it should not be used on its own, as it cannot be the last arrival in a cycle. So it should be followed either by a call to `arrive` or by a barrier operation (such as `group_barrier()`) after which a different work-item is guaranteed to call `arrive` within the same cycle.

`void arrive_copy_async()` : Schedules arrive operation to be triggered asynchronously when all previous asynchronous memory copies initiated by the calling work-item complete. Before the arrive operation is triggered, the pending count on the barrier is increased by 1, so after the arrival there is no change to the pending count. The pending count, including the increase made by this call, must not exceed the value returned by `max()`; otherwise, behavior is undefined.

`void arrive_copy_async_no_inc()` : Schedules arrive operation to be triggered asynchronously when all previous asynchronous memory copies initiated by the calling work item complete.

`void wait(arrival_token arrival)` : Executes wait operation, blocking until the predetermined number of work items reach arrive.
`void wait(arrival_token arrival)` : Executes wait operation, blocking until the predetermined number of work items have called `arrive`.

`bool test_wait(arrival_token arrival)` : Checks whether all the arrivals have already happened for the current cycle, returning `true` if they did and `false` if `wait(arrival)` would block.

`void arrive_and_wait()` : Equivalent to calling `this->wait(this->arrive())`.
`void arrive_and_wait()` : Equivalent to calling `wait(arrive())`.

`static constexpr uint64_t max()` : Returns the maximum value of the expected and pending counts supported by the implementation. Exceeding this in the internal state of the barrier is undefined behavior.
`static constexpr uint64_t max()` : Returns the maximum value of the expected and pending counts supported by the implementation.

==== Sample Header

Expand Down