Commit ccbe5fc

Switch MetadataCache to use a global slab allocator.
This seems to more than fix a performance regression that we detected on a metadata-allocation microbenchmark.

A few months ago, I improved the metadata cache representation and changed the metadata allocation scheme to primarily use malloc. Previously, we'd been using malloc in the concurrent tree data structure but a per-cache slab allocator for the metadata itself. At the time, I was concerned about the overhead of per-cache allocators, since many metadata patterns see only a small number of instantiations.

That's still an important factor, so the new scheme keeps a global allocator; but instead of using malloc for individual allocations, it uses a slab allocator, which should have better peak single-thread performance at the cost of not easily supporting deallocation. Deallocation is only used for metadata when there's contention on the cache, and specifically only when there's contention for the same key, so leaking a little isn't the worst thing in the world.

The initial slab is a 64K globally-allocated buffer. Successive slabs are 16K and allocated with malloc.

rdar://28189496
1 parent: 400f338
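
The deallocate-on-contention pattern the message describes isn't itself visible in this diff, so here is a hedged, self-contained sketch of it (toy names, not the runtime's actual identifiers): two threads race to publish a node for the same key, and only the loser's freshly built node goes back to the allocator.

// Toy illustration of the only deallocating path: a lost same-key race.
#include <atomic>
#include <cstddef>
#include <new>

struct ToyNode { int Key; explicit ToyNode(int K) : Key(K) {} };

struct ToySlabAllocator {
  void *Allocate(std::size_t size, std::size_t alignment);
  void Deallocate(const void *ptr, std::size_t size);  // best-effort; may leak
};

static ToyNode *getOrCreate(std::atomic<ToyNode *> &slot, int key,
                            ToySlabAllocator &allocator) {
  auto *candidate =
      new (allocator.Allocate(sizeof(ToyNode), alignof(ToyNode))) ToyNode(key);
  ToyNode *expected = nullptr;
  if (slot.compare_exchange_strong(expected, candidate))
    return candidate;                   // we won; cached metadata lives forever
  candidate->~ToyNode();                // we lost: same key already published
  allocator.Deallocate(candidate, sizeof(ToyNode));  // reclaimed only if newest
  return expected;
}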

7 files changed (+117, -8 lines)

cmake/modules/AddSwift.cmake

Lines changed: 1 addition & 1 deletion
@@ -329,7 +329,7 @@ function(_add_variant_link_flags)
     RESULT_VAR_NAME result)
 
   if("${LFLAGS_SDK}" STREQUAL "LINUX")
-    list(APPEND result "-lpthread" "-ldl")
+    list(APPEND result "-lpthread" "-latomic" "-ldl")
   elseif("${LFLAGS_SDK}" STREQUAL "FREEBSD")
     list(APPEND result "-lpthread")
   elseif("${LFLAGS_SDK}" STREQUAL "CYGWIN")

cmake/modules/AddSwiftUnittests.cmake

Lines changed: 3 additions & 0 deletions
@@ -42,6 +42,9 @@ function(add_swift_unittest test_dirname)
   if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
     set_property(TARGET "${test_dirname}" APPEND_STRING PROPERTY
                  LINK_FLAGS " -Xlinker -rpath -Xlinker ${SWIFT_LIBRARY_OUTPUT_INTDIR}/swift/macosx")
+  elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
+    set_property(TARGET "${test_dirname}" APPEND_STRING PROPERTY
+                 LINK_FLAGS " -latomic")
   endif()
 
   if(SWIFT_ENABLE_GOLD_LINKER AND

include/swift/Runtime/Concurrent.h

Lines changed: 3 additions & 2 deletions
@@ -189,8 +189,9 @@ class ConcurrentMapBase<EntryTy, false, Allocator> : protected Allocator {
     // Destroy the node's payload.
     node->~Node();
 
-    // Deallocate the node.
-    this->Deallocate(node, allocSize);
+    // Deallocate the node. The static_cast here is required
+    // because LLVM's allocator API is insane.
+    this->Deallocate(static_cast<void*>(node), allocSize);
   }
 };
 
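
For context on that comment: llvm::AllocatorBase exposes both a raw Deallocate whose second parameter is a byte count and a template overload whose second parameter is an object count. Paraphrased and simplified from llvm/Support/Allocator.h (the real declarations carry enable_if guards):

#include <cstddef>

// Simplified sketch of the two Deallocate overloads on llvm::AllocatorBase.
struct SketchAllocatorBase {
  // Raw overload: Size is a count of bytes.
  void Deallocate(const void *Ptr, std::size_t Size);
  // Typed overload: Num counts objects, so it frees Num * sizeof(T) bytes.
  template <typename T> void Deallocate(T *Ptr, std::size_t Num = 1);
};

// A call like Deallocate(node, allocSize) with a typed Node* prefers the
// template, silently treating the byte count as an element count; the
// static_cast<void*> in the change above forces the raw overload instead.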

stdlib/public/runtime/Metadata.cpp

Lines changed: 96 additions & 0 deletions
@@ -2809,3 +2809,99 @@ swift::swift_getGenericWitnessTable(GenericWitnessTable *genericTable,
 }
 
 uint64_t swift::RelativeDirectPointerNullPtr = 0;
+
+/***************************************************************************/
+/*** Allocator implementation **********************************************/
+/***************************************************************************/
+
+namespace {
+  struct PoolRange {
+    static constexpr uintptr_t PageSize = 16 * 1024;
+    static constexpr uintptr_t MaxPoolAllocationSize = PageSize / 2;
+
+    /// The start of the allocation.
+    char *Begin;
+
+    /// The number of bytes remaining.
+    size_t Remaining;
+  };
+}
+
+// A statically-allocated pool. It's zero-initialized, so this
+// doesn't cost us anything in binary size.
+LLVM_ALIGNAS(alignof(void*)) static char InitialAllocationPool[64*1024];
+static std::atomic<PoolRange>
+AllocationPool{PoolRange{InitialAllocationPool,
+                         sizeof(InitialAllocationPool)}};
+
+void *MetadataAllocator::Allocate(size_t size, size_t alignment) {
+  assert(alignment <= alignof(void*));
+  assert(size % alignof(void*) == 0);
+
+  // If the size is larger than the maximum, just use malloc.
+  if (size > PoolRange::MaxPoolAllocationSize)
+    return malloc(size);
+
+  // Allocate out of the pool.
+  PoolRange curState = AllocationPool.load(std::memory_order_relaxed);
+  while (true) {
+    char *allocation;
+    PoolRange newState;
+    bool allocatedNewPage;
+
+    // Try to allocate out of the current page.
+    if (size <= curState.Remaining) {
+      allocatedNewPage = false;
+      allocation = curState.Begin;
+      newState = PoolRange{curState.Begin + size, curState.Remaining - size};
+    } else {
+      allocatedNewPage = true;
+      allocation = new char[PoolRange::PageSize];
+      newState = PoolRange{allocation + size, PoolRange::PageSize - size};
+      __asan_poison_memory_region(allocation, PoolRange::PageSize);
+    }
+
+    // Swap in the new state.
+    if (std::atomic_compare_exchange_weak_explicit(&AllocationPool,
+                                                   &curState, newState,
+                                                   std::memory_order_relaxed,
+                                                   std::memory_order_relaxed)) {
+      // If that succeeded, we've successfully allocated.
+      __msan_allocated_memory(allocation, size);
+      __asan_poison_memory_region(allocation, size);
+      return allocation;
+    }
+
+    // If it failed, go back to a neutral state and try again.
+    if (allocatedNewPage) {
+      delete[] allocation;
+    }
+  }
+}
+
+void MetadataAllocator::Deallocate(const void *allocation, size_t size) {
+  __asan_poison_memory_region(allocation, size);
+
+  if (size > PoolRange::MaxPoolAllocationSize) {
+    free(const_cast<void*>(allocation));
+    return;
+  }
+
+  // Check whether the allocation pool is still in the state it was in
+  // immediately after the given allocation.
+  PoolRange curState = AllocationPool.load(std::memory_order_relaxed);
+  if (reinterpret_cast<const char*>(allocation) + size != curState.Begin) {
+    return;
+  }
+
+  // Try to swap back to the pre-allocation state. If this fails,
+  // don't bother trying again; we'll just leak the allocation.
+  PoolRange newState = { reinterpret_cast<char*>(const_cast<void*>(allocation)),
+                         curState.Remaining + size };
+  (void)
+    std::atomic_compare_exchange_strong_explicit(&AllocationPool,
+                                                 &curState, newState,
+                                                 std::memory_order_relaxed,
+                                                 std::memory_order_relaxed);
+}
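
A hedged usage sketch of the new allocator (my example, not commit code). MetadataAllocator carries no per-instance state, so every instance hands out memory from the single global pool:

#include "MetadataCache.h"  // declares swift::MetadataAllocator (this commit)

static void exerciseAllocator() {
  swift::MetadataAllocator allocator;
  // Per the asserts above: size must be a pointer-aligned multiple and the
  // requested alignment may not exceed alignof(void*).
  void *node = allocator.Allocate(48, alignof(void *));
  // Deallocate is best-effort: it rolls the bump pointer back only if these
  // bytes are still the pool's most recent allocation; anything else leaks,
  // which the commit message argues is acceptable for the lost-race case.
  allocator.Deallocate(node, 48);
}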

stdlib/public/runtime/MetadataCache.h

Lines changed: 12 additions & 5 deletions
@@ -26,11 +26,18 @@
 
 namespace swift {
 
-// For now, use malloc and free as our standard allocator for
-// metadata caches. It might make sense in the future to take
-// advantage of the fact that we know that most allocations here
-// won't ever be deallocated.
-using MetadataAllocator = llvm::MallocAllocator;
+class MetadataAllocator : public llvm::AllocatorBase<MetadataAllocator> {
+public:
+  void Reset() {}
+
+  LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t size, size_t alignment);
+  using AllocatorBase<MetadataAllocator>::Allocate;
+
+  void Deallocate(const void *Ptr, size_t size);
+  using AllocatorBase<MetadataAllocator>::Deallocate;
+
+  void PrintStats() const {}
+};
 
 /// A typedef for simple global caches.
 template <class EntryTy>
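
Deriving from llvm::AllocatorBase is what keeps the typed convenience API available after the switch away from llvm::MallocAllocator: the CRTP base synthesizes Allocate<T>() and friends on top of the two raw methods defined in Metadata.cpp, and the empty Reset() and PrintStats() fill out the allocator interface. A hedged sketch of what the base provides (ExampleEntry is an invented type, not a runtime identifier):

#include "MetadataCache.h"  // declares swift::MetadataAllocator (this commit)

struct ExampleEntry { void *Payload; };  // hypothetical cache entry

static ExampleEntry *allocateEntry(swift::MetadataAllocator &allocator) {
  // The CRTP base's typed overload forwards to the raw
  // Allocate(sizeof(ExampleEntry), alignof(ExampleEntry)) from Metadata.cpp.
  return allocator.Allocate<ExampleEntry>();
}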

utils/gen-static-stdlib-link-args

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ function write_linkfile {
 -ldl
 -lpthread
 -lswiftCore
+-latomic
 -lswiftImageInspectionShared
 $ICU_LIBS
 -Xlinker

utils/static-executable-args.lnk

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 -Xlinker
 --defsym=__import_pthread_key_create=pthread_key_create
 -lpthread
+-latomic
 -licui18n
 -licuuc
 -licudata
