
[libc] Perform bitfield zero initialization wave-parallel #143607

Merged · merged 2 commits on Jun 11, 2025

Changes from all commits
46 changes: 35 additions & 11 deletions libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
return (x + N) & ~(N - 1);
}

// Perform a lane parallel memset on a uint32_t pointer.
void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
uint64_t mask = gpu::get_lane_mask();
uint32_t workers = cpp::popcount(uniform);
for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
s[i] = c;
}

} // namespace impl
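
For illustration only, here is a host-side model of the lane-strided loop above, written as a plain C++ sketch rather than with the gpu:: and cpp:: support helpers. It assumes impl::lane_count(mask & uniform) yields the calling lane's rank among the active lanes, so each worker starts at its rank and strides by the worker count, touching every word exactly once.

#include <bit>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical scalar stand-in for the device-side loop: 'uniform' marks the
// participating lanes; each one writes indices rank, rank + workers, ....
static void simulated_uniform_memset(uint32_t *s, uint32_t c, uint32_t n,
                                     uint64_t uniform) {
  uint32_t workers = std::popcount(uniform);
  for (uint32_t lane = 0; lane < 64; ++lane) {
    if (!(uniform >> lane & 1))
      continue; // this lane is not participating
    // The lane's rank among the active lanes, i.e. the value lane_count() is
    // assumed to return on the device.
    uint32_t rank = std::popcount(uniform & ((uint64_t(1) << lane) - 1));
    for (uint32_t i = rank; i < n; i += workers)
      s[i] = c;
  }
}

int main() {
  std::vector<uint32_t> words(37, 0xffffffffu);
  simulated_uniform_memset(words.data(), 0,
                           static_cast<uint32_t>(words.size()), 0b10110101);
  for (uint32_t w : words)
    assert(w == 0); // every word was cleared exactly once
}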

/// A slab allocator used to hand out identically sized slabs of memory.
@@ -157,10 +165,15 @@ struct Slab {
Header *header = reinterpret_cast<Header *>(memory);
header->chunk_size = chunk_size;
header->global_index = global_index;
}

// This memset is expensive and likely not necessary for the current 'kfd'
// driver. Until zeroed pages are exposed by the API we must be careful.
__builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
// Set the necessary bitfield bytes to zero in parallel using many lanes. This
// must be called before the bitfield can be accessed safely; memory is not
// guaranteed to be zero initialized in the current implementation.
void initialize(uint64_t uniform) {
uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
sizeof(uint32_t);
impl::uniform_memset(get_bitfield(), 0, size, uniform);
}
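
A side note on the word count: it is the bitfield byte count rounded up to whole uint32_t words so the lane-parallel memset can clear it word by word. A minimal standalone sketch of that arithmetic (words_for_bytes is a hypothetical helper, not part of the allocator):

#include <cstdint>

// Round a byte count up to the number of uint32_t words that cover it.
static constexpr uint32_t words_for_bytes(uint32_t bytes) {
  return (bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t);
}

static_assert(words_for_bytes(0) == 0);
static_assert(words_for_bytes(1) == 1); // a partial trailing word still gets cleared
static_assert(words_for_bytes(4) == 1);
static_assert(words_for_bytes(5) == 2);
static_assert(words_for_bytes(64) == 16);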

// Get the number of chunks that can theoretically fit inside this slab.
@@ -354,14 +367,7 @@ struct GuardPtr {
void *raw = impl::rpc_allocate(sizeof(Slab));
if (!raw)
return nullptr;
Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);

cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
ptr.store(mem, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
if (!ref.acquire(n, count))
ref.reset(n, count);
return mem;
return new (raw) Slab(cpp::forward<Args>(args)...);
}

if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +380,16 @@
return ptr.load(cpp::MemoryOrder::RELAXED);
}

// Finalize the associated memory and signal that it is ready to use by
// resetting the counter.
void finalize(Slab *mem, uint32_t n, uint64_t &count) {
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
ptr.store(mem, cpp::MemoryOrder::RELAXED);
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
if (!ref.acquire(n, count))
ref.reset(n, count);
}
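
The ordering finalize() depends on is the usual fence-based publication idiom: complete the slab's initialization, then make the pointer visible so readers that observe it also observe the initialized memory. A rough sketch of that idiom with standard <atomic> primitives instead of the cpp:: wrappers (Payload, publisher, and consumer are illustrative names only):

#include <atomic>

struct Payload {
  int value = 0;
};

std::atomic<Payload *> published{nullptr};

void publisher(Payload *p) {
  p->value = 42;                                        // initialize first
  std::atomic_thread_fence(std::memory_order_release);  // order init before the store
  published.store(p, std::memory_order_relaxed);        // then publish the pointer
}

int consumer() {
  Payload *p = published.load(std::memory_order_acquire); // pairs with the release fence
  return p ? p->value : -1;                              // sees 42 once the pointer is visible
}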

public:
// Attempt to lock access to the pointer, potentially creating it if empty.
// The uniform mask represents which lanes share the same pointer. For each
@@ -392,6 +408,14 @@
if (!result)
return nullptr;

// We defer storing the newly allocated slab until now so that we can use
// multiple lanes to initialize it and release it for use.
if (count == cpp::numeric_limits<uint64_t>::max()) {
result->initialize(uniform);
if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
finalize(result, cpp::popcount(uniform), count);
}

if (count != cpp::numeric_limits<uint64_t>::max())
count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
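
The single-writer election above relies on countr_zero: the lowest set bit of the uniform mask identifies exactly one of the lanes that just created the slab, and only that lane calls finalize() after all of them have helped with initialize(). A scalar illustration with simulated lane ids, using std::countr_zero in place of cpp::countr_zero:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t uniform = 0b10110000; // lanes 4, 5, and 7 want the same slab
  uint32_t elected = std::countr_zero(uniform);
  assert(elected == 4); // the lowest active lane

  // Exactly one simulated lane satisfies the get_lane_id() == elected check.
  uint32_t finalizers = 0;
  for (uint32_t lane_id = 0; lane_id < 64; ++lane_id)
    if ((uniform >> lane_id & 1) && lane_id == elected)
      ++finalizers;
  assert(finalizers == 1);
}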
