Skip to content

Commit 122b9ca

Browse files
committed
[libc] Implement simple lock-free stack data structure
Summary: This patch implements a `FixedStack` which represents a lock-free stack implemented using a fixed-size memory buffer. The utility for this class is to implement a data structure that the GPU implementation can use in cases where a mutex is normally used. We cannot implement a general-purpose mutex on the GPU due to the lack of a guaranteed fair thread scheduler. This lock-free stack is implemented as a pair of 'pointers' to a used and free stack. The pointers here are simple 32-bit indexes into an underlying fixed-size memory buffer. The free stack is initialized to point to the entire buffer, i.e. `1, 2, 3, ..., N` with N being used as a sentinel value. To perform a push operation, we pop a node off of the free stack and then push it into the used stack and vice-versa for a pop operation. The underlying pop implementation relies on continually trying to update the head pointer to the next node using atomic CAS. The CAS loop will repeat until it reads the next pointer successfully and the head pointer has not changed. The underlying push implementation relies on continually trying to update the head pointer to the new node. The CAS loop will repeat until we write to the head pointer and it has not changed. Both of these implementations rely on detecting whether or not the head pointer has changed. Simply using indexes we are susceptible to the ABA problem as the stack could have been pushed and popped until we are back to the same index and we have not noticed. For this reason, the 'head' pointer is augmented with a 32-bit ABA tag that increments each time it is updated. This allows the head update to be done in a single 64-bit atomic update which is supported by hardware.
1 parent 7789fb6 commit 122b9ca

File tree

7 files changed

+272
-0
lines changed

7 files changed

+272
-0
lines changed

libc/src/__support/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,15 @@ add_header_library(
177177
libc.src.__support.CPP.array
178178
)
179179

180+
# Header-only lock-free fixed-capacity stack. fixedstack.h includes the array,
# atomic and threads/sleep headers, so each of them is declared as a dependency.
add_header_library(
  fixedstack
  HDRS
    fixedstack.h
  DEPENDS
    libc.src.__support.CPP.array
    libc.src.__support.CPP.atomic
    libc.src.__support.threads.sleep
)
188+
180189
add_header_library(
181190
char_vector
182191
HDRS

libc/src/__support/fixedstack.h

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
//===-- A lock-free data structure for a fixed capacity stack ---*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
10+
#define LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
11+
12+
#include "src/__support/CPP/array.h"
13+
#include "src/__support/CPP/atomic.h"
14+
#include "src/__support/threads/sleep.h"
15+
16+
#include <stdint.h>
17+
18+
namespace LIBC_NAMESPACE {
19+
20+
// A lock-free fixed size stack backed by an underlying cpp::array data
21+
// structure. It supports push and pop operations in a thread safe manner.
22+
template <typename T, uint32_t CAPACITY> class alignas(16) FixedStack {
23+
static_assert(CAPACITY < 1024 * 1024, "Invalid buffer size");
24+
25+
// The head of the free and used stacks. Represented as a 20-bit index combined
26+
// with a 44-bit ABA tag that is updated in a single atomic operation.
27+
uint64_t free;
28+
uint64_t used;
29+
30+
// The stack is a linked list of indices into the underlying data
31+
cpp::array<uint32_t, CAPACITY> next;
32+
cpp::array<T, CAPACITY> data;
33+
34+
// Get the 20-bit index into the underlying array from the head. The bits above
// the index hold the ABA tag (make_new_head shifts the tag by 20), so the mask
// has to cover all 20 index bits; a narrower mask would truncate both node
// indices and the CAPACITY sentinel once CAPACITY exceeds 0xffff, which the
// static_assert on CAPACITY allows.
static constexpr uint32_t get_node(uint64_t head) {
  constexpr uint64_t INDEX_MASK = (static_cast<uint64_t>(1) << 20) - 1;
  return static_cast<uint32_t>(head & INDEX_MASK);
}
38+
39+
// Build the replacement head value: take the ABA tag stored above bit 20 of
// `orig`, bump it by one, and place `node` in the low bits as the new index.
static constexpr uint64_t make_new_head(uint64_t orig, uint32_t node) {
  const uint64_t bumped_tag = (orig >> 20ul) + 1ul;
  return (bumped_tag << 20ul) | static_cast<uint64_t>(node);
}
43+
44+
// Helper macros for the atomic operations. We cannot use the standard
// cpp::atomic helpers because the initializer will no longer be constexpr and
// the NVPTX backend cannot currently support all of the atomics. They forward
// to the compiler's __atomic builtins. Every argument is parenthesized so that
// arbitrary expressions expand as a single operand, and the memory order is
// converted to the int the builtins take. atomic_cas issues a weak
// compare-exchange, so callers must retry in a loop.
#define atomic_load(val, mem_order)                                            \
  __atomic_load_n((val), static_cast<int>(mem_order))
#define atomic_cas(val, expected, desired, success_order, failure_order)       \
  __atomic_compare_exchange_n((val), (expected), (desired), /*weak=*/true,     \
                              static_cast<int>(success_order),                 \
                              static_cast<int>(failure_order))
51+
52+
// Attempts to pop data from the given stack by making it point to the next
53+
// node. We repeatedly attempt to write to the head using compare-and-swap,
54+
// expecting that it has not been changed by any other thread.
55+
uint32_t pop_impl(uint64_t *head) {
56+
uint64_t orig = atomic_load(head, cpp::MemoryOrder::RELAXED);
57+
58+
for (;;) {
59+
if (get_node(orig) == CAPACITY)
60+
return CAPACITY;
61+
62+
uint32_t node =
63+
atomic_load(&next[get_node(orig)], cpp::MemoryOrder::RELAXED);
64+
if (atomic_cas(head, &orig, make_new_head(orig, node),
65+
cpp::MemoryOrder::ACQUIRE, cpp::MemoryOrder::RELAXED))
66+
break;
67+
sleep_briefly();
68+
}
69+
return get_node(orig);
70+
}
71+
72+
// Attempts to push data to the given stack by making it point to the new
73+
// node. We repeatedly attempt to write to the head using compare-and-swap,
74+
// expecting that it has not been changed by any other thread.
75+
uint32_t push_impl(uint64_t *head, uint32_t node) {
76+
uint64_t orig = atomic_load(head, cpp::MemoryOrder::RELAXED);
77+
for (;;) {
78+
next[node] = get_node(orig);
79+
if (atomic_cas(head, &orig, make_new_head(orig, node),
80+
cpp::MemoryOrder::RELEASE, cpp::MemoryOrder::RELAXED))
81+
break;
82+
sleep_briefly();
83+
}
84+
return get_node(*head);
85+
}
86+
87+
public:
88+
// Initialize the free stack to be full and the used stack to be empty. We use
89+
// the capacity of the stack as a sentinel value.
90+
constexpr FixedStack() : free(0), used(CAPACITY), data{} {
91+
for (uint32_t i = 0; i < CAPACITY; ++i)
92+
next[i] = i + 1;
93+
}
94+
95+
bool push(const T &val) {
96+
uint32_t node = pop_impl(&free);
97+
if (node == CAPACITY)
98+
return false;
99+
100+
data[node] = val;
101+
push_impl(&used, node);
102+
return true;
103+
}
104+
105+
// Removes the most recently linked value from the stack and copies it into
// `val`. Returns false and leaves `val` untouched when the used stack is empty;
// otherwise the node is handed back to the free stack and true is returned.
bool pop(T &val) {
  const uint32_t slot = pop_impl(&used);
  const bool found = slot != CAPACITY;
  if (found) {
    val = data[slot];
    push_impl(&free, slot);
  }
  return found;
}
114+
115+
bool empty() const {
116+
return get_node(atomic_load(&used, cpp::MemoryOrder::RELAXED)) == CAPACITY;
117+
}
118+
119+
bool full() const {
120+
return get_node(atomic_load(&free, cpp::MemoryOrder::RELAXED)) == CAPACITY;
121+
}
122+
123+
#undef atomic_load
124+
#undef atomic_cas
125+
};
126+
127+
} // namespace LIBC_NAMESPACE
128+
129+
#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDSTACK_H
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1+
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
2+
add_subdirectory(${LIBC_TARGET_OS})
3+
endif()
4+
15
add_subdirectory(threads)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
add_custom_target(support-gpu-integration-tests)
2+
add_dependencies(libc-integration-tests support-gpu-integration-tests)
3+
4+
add_integration_test(
5+
support_fixed_stack_test
6+
SUITE support-gpu-integration-tests
7+
SRCS
8+
fixed_stack_test.cpp
9+
DEPENDS
10+
libc.src.__support.GPU.utils
11+
libc.src.__support.fixedstack
12+
LOADER_ARGS
13+
--blocks-x 2
14+
--blocks-y 2
15+
--blocks-z 2
16+
--threads-x 4
17+
--threads-y 4
18+
--threads-z 4
19+
)
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
//===-- Integration test for the lock-free stack --------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/GPU/utils.h"
10+
#include "src/__support/fixedstack.h"
11+
#include "test/IntegrationTest/test.h"
12+
13+
using namespace LIBC_NAMESPACE;
14+
15+
// Exercises a function-local stack: fill it to capacity, confirm it reports
// full, then drain it and check that the values come back in reverse insertion
// order and that it ends up empty. Compiled only for AMDGPU targets.
void single_thread() {
  // FIXME: The NVPTX backend cannot handle atomic CAS on a local address space.
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
  FixedStack<int, 16> local_stack;

  int pushed = 0;
  while (pushed < 16) {
    EXPECT_TRUE(local_stack.push(pushed));
    ++pushed;
  }
  ASSERT_TRUE(local_stack.full());

  int val;
  for (int expected = 15; expected >= 0; --expected) {
    EXPECT_TRUE(local_stack.pop(val));
    EXPECT_EQ(val, expected);
  }
  ASSERT_TRUE(local_stack.empty());
#endif
}
32+
33+
static FixedStack<uint32_t, 2048> global_stack;
34+
// Stresses the shared `global_stack`. The caller first churns the stack with
// 256 push/pop pairs of a placeholder value, then pushes its own thread id
// twice and pops one value, and finally keeps pushing the placeholder until a
// push succeeds. Every popped value must be either the placeholder or a valid
// thread id.
void multiple_threads() {
  // We need enough space in the stack as threads in flight can temporarily
  // consume memory before they finish committing it back to the stack.
  ASSERT_EQ(gpu::get_num_blocks() * gpu::get_num_threads(), 512);

  uint32_t val;
  uint32_t num_threads = static_cast<uint32_t>(gpu::get_num_threads());
  for (int i = 0; i < 256; ++i) {
    EXPECT_TRUE(global_stack.push(UINT32_MAX));
    EXPECT_TRUE(global_stack.pop(val));
    ASSERT_TRUE(val < num_threads || val == UINT32_MAX);
  }

  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.push(static_cast<uint32_t>(gpu::get_thread_id())));
  EXPECT_TRUE(global_stack.pop(val));
  ASSERT_TRUE(val < num_threads || val == UINT32_MAX);

  // Fill the rest of the stack with the default value.
  while (!global_stack.push(UINT32_MAX))
    ;
}
56+
57+
// Once all the threads have finished executing check the final state of the
58+
// stack. Destructors are always run with a single thread on the GPU.
59+
[[gnu::destructor]] void check_stack() {
60+
ASSERT_FALSE(global_stack.empty());
61+
62+
while (!global_stack.empty()) {
63+
uint32_t val;
64+
ASSERT_TRUE(global_stack.pop(val));
65+
ASSERT_TRUE(val < 64 || val == UINT32_MAX);
66+
}
67+
}
68+
69+
// Test entry point: run the function-local stack checks first, then the
// stress test on the shared global stack. Always reports success; failures are
// raised by the assertion macros inside the helpers.
TEST_MAIN(int argc, char **argv, char **envp) {
  // Function-local stack exercised by the calling thread alone.
  single_thread();

  // Shared global stack exercised by the caller alongside other threads.
  multiple_threads();

  return 0;
}

libc/test/src/__support/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,16 @@ add_libc_test(
122122
libc.src.__support.fixedvector
123123
)
124124

125+
add_libc_test(
126+
fixedstack_test
127+
SUITE
128+
libc-support-tests
129+
SRCS
130+
fixedstack_test.cpp
131+
DEPENDS
132+
libc.src.__support.fixedstack
133+
)
134+
125135
add_libc_test(
126136
char_vector_test
127137
SUITE
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//===-- Unittests for FixedStack ------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/fixedstack.h"
10+
#include "test/UnitTest/Test.h"
11+
12+
// Fills a 20-element FixedStack to capacity, checks that a further push is
// rejected, then drains it and verifies last-in-first-out order. The empty()
// and full() queries are checked at each state transition.
TEST(LlvmLibcFixedStackTest, PushAndPop) {
  static LIBC_NAMESPACE::FixedStack<int, 20> fixed_stack;
  ASSERT_TRUE(fixed_stack.empty());
  ASSERT_FALSE(fixed_stack.full());
  for (int i = 0; i < 20; i++)
    ASSERT_TRUE(fixed_stack.push(i));
  ASSERT_FALSE(fixed_stack.empty());
  ASSERT_TRUE(fixed_stack.full());
  // A full stack must refuse new values.
  ASSERT_FALSE(fixed_stack.push(123));
  int val;
  for (int i = 20; i > 0; --i) {
    ASSERT_TRUE(fixed_stack.pop(val));
    ASSERT_EQ(val, i - 1);
  }
  // A drained stack must refuse further pops.
  ASSERT_FALSE(fixed_stack.pop(val));
  ASSERT_TRUE(fixed_stack.empty());
  ASSERT_FALSE(fixed_stack.full());
}

0 commit comments

Comments
 (0)