@@ -157,10 +157,19 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint64_t mask = gpu::get_lane_mask();
+    uint32_t *bitfield = get_bitfield();
+    uint32_t workers = cpp::popcount(uniform);
+    uint32_t words = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                     sizeof(uint32_t);
+    for (uint32_t i = impl::lane_count(mask & uniform); i < words; i += workers)
+      bitfield[i] = 0;
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
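
Not part of the patch: a minimal host-side sketch of the strided zeroing pattern that the new `initialize` uses, with plain threads standing in for GPU lanes. The helper and variable names here are illustrative only.

#include <cstdint>
#include <thread>
#include <vector>

// Worker 'rank' clears every 'workers'-th 32-bit word starting at its own
// rank, so the whole bitfield is covered exactly once with no overlap.
static void clear_words(uint32_t *bitfield, uint32_t words, uint32_t rank,
                        uint32_t workers) {
  for (uint32_t i = rank; i < words; i += workers)
    bitfield[i] = 0;
}

int main() {
  constexpr uint32_t words = 1024;
  constexpr uint32_t workers = 8; // stands in for the active lanes
  std::vector<uint32_t> bitfield(words, 0xffffffffu);

  std::vector<std::thread> pool;
  for (uint32_t rank = 0; rank < workers; ++rank)
    pool.emplace_back(clear_words, bitfield.data(), words, rank, workers);
  for (std::thread &t : pool)
    t.join();
  return 0;
}
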
@@ -283,7 +292,7 @@ struct Slab {
 
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,32 +348,25 @@ template <typename T> struct GuardPtr {
     cpp::Atomic<uint64_t> counter{0};
   };
 
-  cpp::Atomic<T *> ptr{nullptr};
+  cpp::Atomic<Slab *> ptr{nullptr};
   RefCounter ref{};
 
   // Should be called by a single lane for each different pointer.
   template <typename... Args>
-  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
-    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
-                                    cpp::MemoryOrder::RELAXED,
-                                    cpp::MemoryOrder::RELAXED)) {
+        ptr.compare_exchange_strong(
+            expected, reinterpret_cast<Slab *>(SENTINEL),
+            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      void *raw = impl::rpc_allocate(sizeof(T));
+      void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      T *mem = new (raw) T(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
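
Not part of the patch: the sentinel compare-exchange in `try_lock_impl` above can be sketched with standard atomics; `std::atomic` stands in for `cpp::Atomic` and `malloc` for `impl::rpc_allocate`, so this illustrates the idea rather than the allocator's API.

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <new>

struct Widget { int value = 0; };

static std::atomic<Widget *> ptr{nullptr};
static Widget *const SENTINEL = reinterpret_cast<Widget *>(~uintptr_t(0));

// Exactly one caller wins the compare-exchange, installs the sentinel, and
// constructs the object; everyone else sees nullptr or the sentinel and
// backs off until the finished object is published.
Widget *try_create() {
  Widget *expected = ptr.load(std::memory_order_relaxed);
  if (!expected && ptr.compare_exchange_strong(expected, SENTINEL,
                                               std::memory_order_relaxed)) {
    void *raw = std::malloc(sizeof(Widget)); // stands in for rpc_allocate
    if (!raw)
      return nullptr;
    Widget *mem = new (raw) Widget();
    ptr.store(mem, std::memory_order_release); // publish the finished object
    return mem;
  }
  if (!expected || expected == SENTINEL)
    return nullptr;
  return expected;
}
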
@@ -374,15 +376,25 @@ template <typename T> struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
-              Args &&...args) {
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+                 Args &&...args) {
     count = 0;
-    T *result = nullptr;
+    Slab *result = nullptr;
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
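
Not part of the patch: the leader election in `try_lock` above takes the lowest set bit of the uniform mask as the leader lane and the population count as the number of participants. A small C++20 <bit> illustration with a made-up mask:

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // Suppose lanes 2, 3, 5, and 9 all asked for the same chunk size.
  uint64_t uniform = 0b1000101100;

  uint32_t leader = uint32_t(std::countr_zero(uniform)); // lowest active lane
  uint32_t workers = uint32_t(std::popcount(uniform));   // participating lanes

  std::printf("leader = %u, workers = %u\n", leader, workers); // 2 and 4
  return 0;
}
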
@@ -392,6 +404,14 @@ template <typename T> struct GuardPtr {
     if (!result)
       return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
 
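
Not part of the patch: the deferral above relies on the release publication that `finalize` performs, i.e. the initialization writes must become visible before the pointer store does. A sketch of that handshake with standard atomics (`Buffer`, `publish`, and `consume` are illustrative names):

#include <atomic>
#include <cstdint>

struct Buffer { uint32_t words[16]; };

static std::atomic<Buffer *> slot{nullptr};

void publish(Buffer *b) {
  for (uint32_t &w : b->words)
    w = 0;                                    // initialization writes
  std::atomic_thread_fence(std::memory_order_release);
  slot.store(b, std::memory_order_relaxed);   // now visible to other threads
}

Buffer *consume() {
  // A reader that observes a non-null pointer is also guaranteed to observe
  // the zeroed words written before the release fence above.
  return slot.load(std::memory_order_acquire);
}
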
@@ -403,8 +423,8 @@ template <typename T> struct GuardPtr {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
-      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~T();
+      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~Slab();
       impl::rpc_free(p);
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
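
Not part of the patch: the release path above, where the last lane to drop its reference destroys the slab and clears the slot, can be sketched with a plain atomic reference count (the `Widget` and `refs` names are illustrative):

#include <atomic>
#include <cstdint>
#include <cstdlib>

struct Widget { int value = 0; };

static std::atomic<Widget *> ptr{nullptr};
static std::atomic<uint64_t> refs{0};

// Drop one reference; whoever drops the last one destroys the object and
// clears the slot so it can be recreated later.
void release_one() {
  if (refs.fetch_sub(1, std::memory_order_acq_rel) == 1) {
    Widget *p = ptr.load(std::memory_order_relaxed);
    p->~Widget();
    std::free(p);                              // stands in for rpc_free
    ptr.store(nullptr, std::memory_order_release);
  }
}
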
@@ -417,7 +437,7 @@ template <typename T> struct GuardPtr {
 };
 
 // The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};
 
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {