Commit 1b376cd

Merge pull request #34463 from mikeash/concurrentreadablehashmap-inline-indices
[Runtime] Have ConcurrentReadableHashMap store indices inline when the table is sufficiently small.
2 parents b402827 + 9f53d4a commit 1b376cd

1 file changed

include/swift/Runtime/Concurrent.h

Lines changed: 141 additions & 70 deletions

@@ -624,25 +624,76 @@ struct ConcurrentReadableHashMap {
   /// is stored inline. We work around this contradiction by considering the
   /// first index to always be occupied with a value that never matches any key.
   struct IndexStorage {
+    using RawType = uintptr_t;
+
+    RawType Value;
+
+    static constexpr uintptr_t log2(uintptr_t x) {
+      return x <= 1 ? 0 : log2(x >> 1) + 1;
+    }
+
+    static constexpr uintptr_t InlineIndexBits = 4;
+    static constexpr uintptr_t InlineIndexMask = 0xF;
+    static constexpr uintptr_t InlineCapacity =
+        sizeof(RawType) * CHAR_BIT / InlineIndexBits;
+    static constexpr uintptr_t InlineCapacityLog2 = log2(InlineCapacity);
+
+    // Indices can be stored in different ways, depending on how big they need
+    // to be. The index mode is stored in the bottom two bits of Value. The
+    // meaning of the rest of Value depends on the mode.
+    enum class IndexMode {
+      // Value is treated as an array of four-bit integers, storing the indices.
+      // The first element overlaps with the mode, and is never used.
+      Inline,
+
+      // The rest of Value holds a pointer to storage. The first byte of this
+      // storage holds the log2 of the storage capacity. The storage is treated
+      // as an array of 8, 16, or 32-bit integers. The first element overlaps
+      // with the capacity, and is never used.
+      Array8,
+      Array16,
+      Array32,
+    };
+
+    IndexStorage() : Value(0) {}
+    IndexStorage(RawType value) : Value(value) {}
+    IndexStorage(void *ptr, unsigned indexSize, uint8_t capacityLog2) {
+      assert(capacityLog2 > InlineCapacityLog2);
+      IndexMode mode;
+      switch (indexSize) {
+      case sizeof(uint8_t):
+        mode = IndexMode::Array8;
+        break;
+      case sizeof(uint16_t):
+        mode = IndexMode::Array16;
+        break;
+      case sizeof(uint32_t):
+        mode = IndexMode::Array32;
+        break;
+      default:
+        swift_unreachable("unknown index size");
+      }
+      Value = reinterpret_cast<uintptr_t>(ptr) | static_cast<uintptr_t>(mode);
+      *reinterpret_cast<uint8_t *>(ptr) = capacityLog2;
+    }
+
+    bool valueIsPointer() { return Value & 3; }
+
+    void *pointer() {
+      if (valueIsPointer())
+        return (void *)(Value & (RawType)~3);
+      return nullptr;
+    }
+
+    IndexMode indexMode() { return IndexMode(Value & 3); }
+
     // Index size is variable based on capacity, either 8, 16, or 32 bits.
     //
     // This is somewhat conservative. We could have, for example, a capacity of
     // 512 but a maximum index of only 200, which would still allow for 8-bit
     // indices. However, taking advantage of this would require reallocating
     // the index storage when the element count crossed a threshold, which is
     // more complex, and the advantages are minimal. This keeps it simple.
-    //
-    // The first byte of the storage is the log 2 of the capacity. The remaining
-    // storage is then an array of 8, 16, or 32 bit integers, depending on the
-    // capacity number. This union allows us to access the capacity, and then
-    // access the rest of the storage by taking the address of one of the
-    // IndexZero members and indexing into it (always avoiding index 0).
-    union {
-      uint8_t CapacityLog2;
-      std::atomic<uint8_t> IndexZero8;
-      std::atomic<uint16_t> IndexZero16;
-      std::atomic<uint32_t> IndexZero32;
-    };
 
     // Get the size, in bytes, of the index needed for the given capacity.
     static unsigned indexSize(uint8_t capacityLog2) {
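
The inline mode above packs an entire small index table into a single word. A minimal standalone sketch of that encoding (illustrative code, not the runtime's API): the bottom two bits of a uintptr_t hold the mode tag, the rest of the word is read as 4-bit slots (sixteen of them on a 64-bit target), and slot 0 is sacrificed because it overlaps the tag.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uintptr_t InlineIndexBits = 4;
      const uintptr_t InlineIndexMask = 0xF;

      // Mode bits 00 == Inline; every 4-bit slot starts out empty.
      uintptr_t word = 0;

      // Store element index 5 in slot 3 (slot 0 is never used).
      size_t slot = 3;
      unsigned index = 5;
      assert(index == (index & InlineIndexMask) && "must fit in 4 bits");
      word |= (uintptr_t)index << (slot * InlineIndexBits);

      // Load it back: shift the slot down and mask off the rest.
      unsigned loaded = (word >> (slot * InlineIndexBits)) & InlineIndexMask;
      printf("slot %zu holds %u\n", slot, loaded); // slot 3 holds 5
      return 0;
    }
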
@@ -653,46 +704,66 @@ struct ConcurrentReadableHashMap {
       return sizeof(uint32_t);
     }
 
-    unsigned indexSize() { return indexSize(CapacityLog2); }
+    uint8_t getCapacityLog2() {
+      if (auto *ptr = pointer())
+        return *reinterpret_cast<uint8_t *>(ptr);
+      return InlineCapacityLog2;
+    }
 
-    static IndexStorage *allocate(size_t capacityLog2) {
+    static IndexStorage allocate(size_t capacityLog2) {
       assert(capacityLog2 > 0);
       size_t capacity = 1UL << capacityLog2;
-      auto *ptr = reinterpret_cast<IndexStorage *>(
-          calloc(capacity, indexSize(capacityLog2)));
+      unsigned size = indexSize(capacityLog2);
+      auto *ptr = calloc(capacity, size);
       if (!ptr)
         swift::crash("Could not allocate memory.");
-      ptr->CapacityLog2 = capacityLog2;
-      return ptr;
+      return IndexStorage(ptr, size, capacityLog2);
     }
 
     unsigned loadIndexAt(size_t i, std::memory_order order) {
       assert(i > 0 && "index zero is off-limits, used to store capacity");
-
-      switch (indexSize()) {
-      case sizeof(uint8_t):
-        return (&IndexZero8)[i].load(order);
-      case sizeof(uint16_t):
-        return (&IndexZero16)[i].load(order);
-      case sizeof(uint32_t):
-        return (&IndexZero32)[i].load(order);
-      default:
-        swift_unreachable("unknown index size");
+      assert(i < (1 << getCapacityLog2()) &&
+             "index is off the end of the indices");
+
+      switch (indexMode()) {
+      case IndexMode::Inline:
+        return (Value >> (i * InlineIndexBits)) & InlineIndexMask;
+      case IndexMode::Array8:
+        return ((std::atomic<uint8_t> *)pointer())[i].load(order);
+      case IndexMode::Array16:
+        return ((std::atomic<uint16_t> *)pointer())[i].load(order);
+      case IndexMode::Array32:
+        return ((std::atomic<uint32_t> *)pointer())[i].load(order);
       }
     }
 
-    void storeIndexAt(unsigned value, size_t i, std::memory_order order) {
+    void storeIndexAt(std::atomic<RawType> *inlineStorage, unsigned value,
+                      size_t i, std::memory_order order) {
       assert(i > 0 && "index zero is off-limits, used to store capacity");
-
-      switch (indexSize()) {
-      case sizeof(uint8_t):
-        return (&IndexZero8)[i].store(value, order);
-      case sizeof(uint16_t):
-        return (&IndexZero16)[i].store(value, order);
-      case sizeof(uint32_t):
-        return (&IndexZero32)[i].store(value, order);
-      default:
-        swift_unreachable("unknown index size");
+      assert(i < (1 << getCapacityLog2()) &&
+             "index is off the end of the indices");
+
+      switch (indexMode()) {
+      case IndexMode::Inline: {
+        assert(value == (value & InlineIndexMask) && "value is too big to fit");
+        auto shift = i * InlineIndexBits;
+        assert((Value & (InlineIndexMask << shift)) == 0 &&
+               "can't overwrite an existing index");
+        assert(Value == inlineStorage->load(std::memory_order_relaxed) &&
+               "writing with a stale IndexStorage");
+        auto newStorage = Value | ((RawType)value << shift);
+        inlineStorage->store(newStorage, order);
+        break;
+      }
+      case IndexMode::Array8:
+        ((std::atomic<uint8_t> *)pointer())[i].store(value, order);
+        break;
+      case IndexMode::Array16:
+        ((std::atomic<uint16_t> *)pointer())[i].store(value, order);
+        break;
+      case IndexMode::Array32:
+        ((std::atomic<uint32_t> *)pointer())[i].store(value, order);
+        break;
       }
     }
   };
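
Note how the three array modes store a nonzero tag in the low bits of the storage pointer itself. This works because calloc results are aligned for any fundamental type, so the low two bits of the address are always zero. A sketch of the tag/untag round trip under that alignment assumption (names are illustrative):

    #include <cassert>
    #include <cstdint>
    #include <cstdlib>

    int main() {
      // Out-of-line index storage; the runtime stores capacityLog2 in byte 0.
      void *storage = calloc(256, sizeof(uint8_t));
      assert((reinterpret_cast<uintptr_t>(storage) & 3) == 0 && "alignment assumption");

      uintptr_t mode = 1; // any nonzero 2-bit tag, e.g. the Array8 case
      uintptr_t value = reinterpret_cast<uintptr_t>(storage) | mode;

      assert((value & 3) == mode);                      // the tag is recoverable
      void *untagged = (void *)(value & ~(uintptr_t)3); // mask the tag off
      assert(untagged == storage);                      // ...and so is the pointer

      free(storage);
      return 0;
    }
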
@@ -753,7 +824,11 @@ struct ConcurrentReadableHashMap {
   std::atomic<ElementStorage *> Elements{nullptr};
 
   /// The array of indices.
-  std::atomic<IndexStorage *> Indices{nullptr};
+  ///
+  /// This has to be stored as a IndexStorage::RawType instead of a IndexStorage
+  /// because some of our targets don't support interesting structs as atomic
+  /// types. See also MetadataCache::TrackingInfo which uses the same technique.
+  std::atomic<typename IndexStorage::RawType> Indices{0};
 
   /// The writer lock, which must be taken before any mutation of the table.
   MutexTy WriterLock;
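
The new doc comment explains why Indices is an atomic integer rather than an atomic struct: a one-word std::atomic<uintptr_t> is lock-free on every supported target, while std::atomic of a user-defined struct may not be. A minimal sketch of the same technique with hypothetical names, wrapping and unwrapping the struct at the boundary:

    #include <atomic>
    #include <cstdint>

    // Hypothetical one-word stand-in for IndexStorage.
    struct Tagged {
      uintptr_t Value;
    };

    // Stand-in for the Indices member: the atomic holds only the raw word.
    std::atomic<uintptr_t> Word{0};

    Tagged loadTagged() {
      return Tagged{Word.load(std::memory_order_acquire)};
    }

    void storeTagged(Tagged t) {
      Word.store(t.Value, std::memory_order_release);
    }
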
@@ -798,18 +873,17 @@ struct ConcurrentReadableHashMap {
   /// returning the new array with all existing indices copied into it. This
   /// operation performs a rehash, so that the indices are in the correct
   /// location in the new array.
-  IndexStorage *resize(IndexStorage *indices, uint8_t indicesCapacityLog2,
-                       ElemTy *elements) {
-    // Double the size. Start with 16 (fits into 16-byte malloc
-    // bucket), which is 2^4.
-    size_t newCapacityLog2 = indices ? indicesCapacityLog2 + 1 : 4;
+  IndexStorage resize(IndexStorage indices, uint8_t indicesCapacityLog2,
+                      ElemTy *elements) {
+    // Double the size.
+    size_t newCapacityLog2 = indicesCapacityLog2 + 1;
     size_t newMask = (1UL << newCapacityLog2) - 1;
 
-    IndexStorage *newIndices = IndexStorage::allocate(newCapacityLog2);
+    IndexStorage newIndices = IndexStorage::allocate(newCapacityLog2);
 
     size_t indicesCount = 1UL << indicesCapacityLog2;
     for (size_t i = 1; i < indicesCount; i++) {
-      unsigned index = indices->loadIndexAt(i, std::memory_order_relaxed);
+      unsigned index = indices.loadIndexAt(i, std::memory_order_relaxed);
       if (index == 0)
         continue;
 
@@ -819,15 +893,16 @@ struct ConcurrentReadableHashMap {
       size_t newI = hash & newMask;
       // Index 0 is unusable (occupied by the capacity), so always skip it.
       while (newI == 0 ||
-             newIndices->loadIndexAt(newI, std::memory_order_relaxed) != 0) {
+             newIndices.loadIndexAt(newI, std::memory_order_relaxed) != 0) {
        newI = (newI + 1) & newMask;
       }
-      newIndices->storeIndexAt(index, newI, std::memory_order_relaxed);
+      newIndices.storeIndexAt(nullptr, index, newI, std::memory_order_relaxed);
     }
 
-    Indices.store(newIndices, std::memory_order_release);
+    Indices.store(newIndices.Value, std::memory_order_release);
 
-    FreeListNode::add(&FreeList, indices);
+    if (auto *ptr = indices.pointer())
+      FreeListNode::add(&FreeList, ptr);
 
     return newIndices;
   }
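
resize() is a standard open-addressing rehash: every surviving index is re-derived from its element's hash and linearly probed into the doubled table, always skipping the reserved slot 0. A simplified sketch of that loop with plain (non-atomic) vectors standing in for IndexStorage; hashes[index] is assumed to hold the hash of the 1-based element index:

    #include <cstddef>
    #include <vector>

    void rehash(const std::vector<unsigned> &oldIndices,
                std::vector<unsigned> &newIndices,
                const std::vector<size_t> &hashes) {
      size_t newMask = newIndices.size() - 1; // sizes are powers of two
      for (size_t i = 1; i < oldIndices.size(); i++) {
        unsigned index = oldIndices[i];
        if (index == 0)
          continue; // empty slot, nothing to move
        size_t newI = hashes[index] & newMask;
        while (newI == 0 || newIndices[newI] != 0) // skip slot 0 and collisions
          newI = (newI + 1) & newMask;
        newIndices[newI] = index;
      }
    }
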
@@ -838,20 +913,18 @@ struct ConcurrentReadableHashMap {
   /// of the new element would be stored.
   template <class KeyTy>
   static std::pair<ElemTy *, unsigned>
-  find(const KeyTy &key, IndexStorage *indices, size_t elementCount,
+  find(const KeyTy &key, IndexStorage indices, size_t elementCount,
        ElemTy *elements) {
-    if (!indices)
-      return {nullptr, 0};
     auto hash = hash_value(key);
-    auto indicesMask = (1UL << indices->CapacityLog2) - 1;
+    auto indicesMask = (1UL << indices.getCapacityLog2()) - 1;
 
     auto i = hash & indicesMask;
     while (true) {
       // Index 0 is used for the mask and is not actually an index.
       if (i == 0)
         i++;
 
-      auto index = indices->loadIndexAt(i, std::memory_order_acquire);
+      auto index = indices.loadIndexAt(i, std::memory_order_acquire);
       // Element indices are 1-based, 0 means no entry.
       if (index == 0)
         return {nullptr, i};
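
The reader-side find() walks the same probe sequence: start at hash & mask, treat a stored 0 as "empty, stop", and compare keys for anything else. A sketch of that loop, with matchesKey as a hypothetical stand-in for the element's real key comparison:

    #include <cstddef>
    #include <vector>

    template <class ElemTy, class KeyTy>
    const ElemTy *probeFind(const KeyTy &key, size_t hash,
                            const std::vector<unsigned> &indices,
                            const std::vector<ElemTy> &elements) {
      size_t mask = indices.size() - 1;
      size_t i = hash & mask;
      while (true) {
        if (i == 0)
          i++; // slot 0 is reserved, never a real index
        unsigned index = indices[i];
        if (index == 0)
          return nullptr; // empty slot ends the probe: key absent
        const ElemTy &elem = elements[index - 1]; // stored indices are 1-based
        if (elem.matchesKey(key)) // hypothetical key comparison
          return &elem;
        i = (i + 1) & mask;
      }
    }
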
@@ -884,12 +957,12 @@ struct ConcurrentReadableHashMap {
   /// Readers take a snapshot of the hash map, then work with the snapshot.
   class Snapshot {
     ConcurrentReadableHashMap *Map;
-    IndexStorage *Indices;
+    IndexStorage Indices;
     ElemTy *Elements;
     size_t ElementCount;
 
   public:
-    Snapshot(ConcurrentReadableHashMap *map, IndexStorage *indices,
+    Snapshot(ConcurrentReadableHashMap *map, IndexStorage indices,
              ElemTy *elements, size_t elementCount)
         : Map(map), Indices(indices), Elements(elements),
           ElementCount(elementCount) {}
@@ -905,7 +978,7 @@ struct ConcurrentReadableHashMap {
     /// Search for an element matching the given key. Returns a pointer to the
     /// found element, or nullptr if no matching element exists.
    template <class KeyTy> const ElemTy *find(const KeyTy &key) {
-      if (!Indices || !ElementCount || !Elements)
+      if (!Indices.Value || !ElementCount || !Elements)
        return nullptr;
      return ConcurrentReadableHashMap::find(key, Indices, ElementCount,
                                             Elements)
@@ -937,7 +1010,7 @@ struct ConcurrentReadableHashMap {
    // pointer can just mean a concurrent insert that triggered a resize of the
    // elements array. This is harmless aside from a small performance hit, and
    // should not happen often.
-    IndexStorage *indices;
+    IndexStorage indices;
    size_t elementCount;
    ElementStorage *elements;
    ElementStorage *elements2;
@@ -972,11 +1045,8 @@ struct ConcurrentReadableHashMap {
   void getOrInsert(KeyTy key, const Call &call) {
     ScopedLockTy guard(WriterLock);
 
-    auto *indices = Indices.load(std::memory_order_relaxed);
-    if (!indices)
-      indices = resize(indices, 0, nullptr);
-
-    auto indicesCapacityLog2 = indices->CapacityLog2;
+    auto indices = IndexStorage{Indices.load(std::memory_order_relaxed)};
+    auto indicesCapacityLog2 = indices.getCapacityLog2();
     auto elementCount = ElementCount.load(std::memory_order_relaxed);
     auto *elements = Elements.load(std::memory_order_relaxed);
     auto *elementsPtr = elements ? elements->data() : nullptr;
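
This hunk is the payoff of the inline representation: the old lazy-allocation special case (if (!indices) resize(...)) disappears, because a raw value of 0 already decodes as a valid, empty inline table. A sketch of that decoding, reusing the constants introduced above:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uintptr_t InlineIndexBits = 4;
      const uintptr_t InlineIndexMask = 0xF;
      const uintptr_t InlineCapacity = sizeof(uintptr_t) * 8 / InlineIndexBits;

      uintptr_t value = 0;      // Indices of a freshly constructed map
      assert((value & 3) == 0); // mode tag decodes as Inline
      for (uintptr_t i = 1; i < InlineCapacity; i++)
        assert(((value >> (i * InlineIndexBits)) & InlineIndexMask) == 0);
      // Every slot reads as 0 ("no entry"), so small tables never allocate.
      return 0;
    }
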
@@ -1012,8 +1082,8 @@ struct ConcurrentReadableHashMap {
      assert(hash_value(key) == hash_value(*element) &&
             "Element must have the same hash code as its key.");
      ElementCount.store(elementCount + 1, std::memory_order_release);
-      indices->storeIndexAt(elementCount + 1, found.second,
-                            std::memory_order_release);
+      indices.storeIndexAt(&Indices, elementCount + 1, found.second,
+                           std::memory_order_release);
    }
 
    deallocateFreeListIfSafe();
@@ -1024,16 +1094,17 @@ struct ConcurrentReadableHashMap {
  void clear() {
    ScopedLockTy guard(WriterLock);
 
-    auto *indices = Indices.load(std::memory_order_relaxed);
+    IndexStorage indices = Indices.load(std::memory_order_relaxed);
    auto *elements = Elements.load(std::memory_order_relaxed);
 
    // Order doesn't matter here, snapshots will gracefully handle any field
    // being NULL/0 while the others are not.
-    Indices.store(nullptr, std::memory_order_relaxed);
+    Indices.store(0, std::memory_order_relaxed);
    ElementCount.store(0, std::memory_order_relaxed);
    Elements.store(nullptr, std::memory_order_relaxed);
 
-    FreeListNode::add(&FreeList, indices);
+    if (auto *ptr = indices.pointer())
+      FreeListNode::add(&FreeList, ptr);
    FreeListNode::add(&FreeList, elements);
 
    deallocateFreeListIfSafe();
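
clear() also shows the reclamation discipline: concurrent readers may still hold a snapshot of the old table, so out-of-line index storage is parked on the free list rather than freed immediately, while an inline table has nothing to reclaim at all. A sketch of that conditional handoff; addToFreeList is a hypothetical stand-in for FreeListNode::add, which in the runtime defers the free until it is provably safe:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    static void addToFreeList(void *ptr) {
      // A real free list would hold this until no reader can observe it.
      printf("deferring free of %p\n", ptr);
      free(ptr);
    }

    static void discardIndices(uintptr_t value) {
      // Only the out-of-line modes (nonzero mode bits) own heap memory;
      // an inline table lives entirely in the word itself.
      if (value & 3)
        addToFreeList((void *)(value & ~(uintptr_t)3));
    }

    int main() {
      discardIndices(0); // inline table: nothing to free
      void *heap = calloc(256, 1);
      discardIndices((uintptr_t)heap | 1); // tagged Array8-style table
      return 0;
    }
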
