Skip to content

Commit 934baae

Browse files
[ADT] Add TrieRawHashMap
Implement TrieRawHashMap can be used to store object with its associated hash. User needs to supply a strong hashing function to guarantee the uniqueness of the hash of the objects to be inserted. A hash collision is not supported and will lead to error or failed to insert. TrieRawHashMap is thread-safe and lock-free and can be used as foundation data structure to implement a content addressible storage. TrieRawHashMap owns the data stored in it and is designed to be: * Fast to lookup. * Fast to "insert" if the data has already been inserted. * Can be used without lock and doesn't require any knowledge of the participating threads or extra coordination between threads. It is not currently designed to be used to insert unique new data with high contention, due to the limitation on the memory allocator.
1 parent e33aec8 commit 934baae

File tree

6 files changed

+1362
-0
lines changed

6 files changed

+1362
-0
lines changed
Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_ADT_TRIERAWHASHMAP_H
10+
#define LLVM_ADT_TRIERAWHASHMAP_H
11+
12+
#include "llvm/ADT/ArrayRef.h"
13+
#include <atomic>
14+
#include <optional>
15+
16+
namespace llvm {
17+
18+
class raw_ostream;
19+
20+
/// TrieRawHashMap - is a lock-free thread-safe trie that is can be used to
21+
/// store/index data based on a hash value. It can be customized to work with
22+
/// any hash algorithm or store any data.
23+
///
24+
/// Data structure:
25+
/// Data node stored in the Trie contains both hash and data:
26+
/// struct {
27+
/// HashT Hash;
28+
/// DataT Data;
29+
/// };
30+
///
31+
/// Data is stored/indexed via a prefix tree, where each node in the tree can be
32+
/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two
33+
/// data objects {0001, A} and {0100, B}, it can be stored in a trie
34+
/// (assuming Root has 2 bits, SubTrie has 1 bit):
35+
/// +--------+
36+
/// |Root[00]| -> {0001, A}
37+
/// | [01]| -> {0100, B}
38+
/// | [10]| (empty)
39+
/// | [11]| (empty)
40+
/// +--------+
41+
///
42+
/// Inserting a new object {0010, C} will result in:
43+
/// +--------+ +----------+
44+
/// |Root[00]| -> |SubTrie[0]| -> {0001, A}
45+
/// | | | [1]| -> {0010, C}
46+
/// | | +----------+
47+
/// | [01]| -> {0100, B}
48+
/// | [10]| (empty)
49+
/// | [11]| (empty)
50+
/// +--------+
51+
/// Note object A is sunk down to a sub-trie during the insertion. All the
52+
/// nodes are inserted through compare-exchange to ensure thread-safe and
53+
/// lock-free.
54+
///
55+
/// To find an object in the trie, walk the tree with prefix of the hash until
56+
/// the data node is found. Then the hash is compared with the hash stored in
57+
/// the data node to see if the is the same object.
58+
///
59+
/// Hash collision is not allowed so it is recommended to use trie with a
60+
/// "strong" hashing algorithm. A well-distributed hash can also result in
61+
/// better performance and memory usage.
62+
///
63+
/// It currently does not support iteration and deletion.
64+
65+
/// Base class for a lock-free thread-safe hash-mapped trie.
66+
class ThreadSafeTrieRawHashMapBase {
67+
public:
68+
static constexpr size_t TrieContentBaseSize = 4;
69+
static constexpr size_t DefaultNumRootBits = 6;
70+
static constexpr size_t DefaultNumSubtrieBits = 4;
71+
72+
private:
73+
template <class T> struct AllocValueType {
74+
char Base[TrieContentBaseSize];
75+
std::aligned_union_t<sizeof(T), T> Content;
76+
};
77+
78+
protected:
79+
template <class T>
80+
static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType<T>);
81+
82+
template <class T>
83+
static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType<T>);
84+
85+
template <class T>
86+
static constexpr size_t DefaultContentOffset =
87+
offsetof(AllocValueType<T>, Content);
88+
89+
public:
90+
static void *operator new(size_t Size) { return ::operator new(Size); }
91+
void operator delete(void *Ptr) { ::operator delete(Ptr); }
92+
93+
LLVM_DUMP_METHOD void dump() const;
94+
void print(raw_ostream &OS) const;
95+
96+
protected:
97+
/// Result of a lookup. Suitable for an insertion hint. Maybe could be
98+
/// expanded into an iterator of sorts, but likely not useful (visiting
99+
/// everything in the trie should probably be done some way other than
100+
/// through an iterator pattern).
101+
class PointerBase {
102+
protected:
103+
void *get() const { return I == -2u ? P : nullptr; }
104+
105+
public:
106+
PointerBase() noexcept = default;
107+
108+
private:
109+
friend class ThreadSafeTrieRawHashMapBase;
110+
explicit PointerBase(void *Content) : P(Content), I(-2u) {}
111+
PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {}
112+
113+
bool isHint() const { return I != -1u && I != -2u; }
114+
115+
void *P = nullptr;
116+
unsigned I = -1u;
117+
unsigned B = 0;
118+
};
119+
120+
/// Find the stored content with hash.
121+
PointerBase find(ArrayRef<uint8_t> Hash) const;
122+
123+
/// Insert and return the stored content.
124+
PointerBase
125+
insert(PointerBase Hint, ArrayRef<uint8_t> Hash,
126+
function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
127+
Constructor);
128+
129+
ThreadSafeTrieRawHashMapBase() = delete;
130+
131+
ThreadSafeTrieRawHashMapBase(
132+
size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
133+
std::optional<size_t> NumRootBits = std::nullopt,
134+
std::optional<size_t> NumSubtrieBits = std::nullopt);
135+
136+
/// Destructor, which asserts if there's anything to do. Subclasses should
137+
/// call \a destroyImpl().
138+
///
139+
/// \pre \a destroyImpl() was already called.
140+
~ThreadSafeTrieRawHashMapBase();
141+
void destroyImpl(function_ref<void(void *ValueMem)> Destructor);
142+
143+
ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS);
144+
145+
// Move assignment is not supported as it is not thread-safe.
146+
ThreadSafeTrieRawHashMapBase &
147+
operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete;
148+
149+
// No copy.
150+
ThreadSafeTrieRawHashMapBase(const ThreadSafeTrieRawHashMapBase &) = delete;
151+
ThreadSafeTrieRawHashMapBase &
152+
operator=(const ThreadSafeTrieRawHashMapBase &) = delete;
153+
154+
// Debug functions. Implementation details and not guaranteed to be
155+
// thread-safe.
156+
PointerBase getRoot() const;
157+
unsigned getStartBit(PointerBase P) const;
158+
unsigned getNumBits(PointerBase P) const;
159+
unsigned getNumSlotUsed(PointerBase P) const;
160+
std::string getTriePrefixAsString(PointerBase P) const;
161+
unsigned getNumTries() const;
162+
// Visit next trie in the allocation chain.
163+
PointerBase getNextTrie(PointerBase P) const;
164+
165+
private:
166+
friend class TrieRawHashMapTestHelper;
167+
const unsigned short ContentAllocSize;
168+
const unsigned short ContentAllocAlign;
169+
const unsigned short ContentOffset;
170+
unsigned short NumRootBits;
171+
unsigned short NumSubtrieBits;
172+
class ImplType;
173+
// ImplPtr is owned by ThreadSafeTrieRawHashMapBase and needs to be freed in
174+
// destroyImpl.
175+
std::atomic<ImplType *> ImplPtr;
176+
ImplType &getOrCreateImpl();
177+
ImplType *getImpl() const;
178+
};
179+
180+
/// Lock-free thread-safe hash-mapped trie.
181+
template <class T, size_t NumHashBytes>
182+
class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase {
183+
public:
184+
using HashT = std::array<uint8_t, NumHashBytes>;
185+
186+
class LazyValueConstructor;
187+
struct value_type {
188+
const HashT Hash;
189+
T Data;
190+
191+
value_type(value_type &&) = default;
192+
value_type(const value_type &) = default;
193+
194+
value_type(ArrayRef<uint8_t> Hash, const T &Data)
195+
: Hash(makeHash(Hash)), Data(Data) {}
196+
value_type(ArrayRef<uint8_t> Hash, T &&Data)
197+
: Hash(makeHash(Hash)), Data(std::move(Data)) {}
198+
199+
private:
200+
friend class LazyValueConstructor;
201+
202+
struct EmplaceTag {};
203+
template <class... ArgsT>
204+
value_type(ArrayRef<uint8_t> Hash, EmplaceTag, ArgsT &&...Args)
205+
: Hash(makeHash(Hash)), Data(std::forward<ArgsT>(Args)...) {}
206+
207+
static HashT makeHash(ArrayRef<uint8_t> HashRef) {
208+
HashT Hash;
209+
std::copy(HashRef.begin(), HashRef.end(), Hash.data());
210+
return Hash;
211+
}
212+
};
213+
214+
using ThreadSafeTrieRawHashMapBase::operator delete;
215+
using HashType = HashT;
216+
217+
using ThreadSafeTrieRawHashMapBase::dump;
218+
using ThreadSafeTrieRawHashMapBase::print;
219+
220+
private:
221+
template <class ValueT> class PointerImpl : PointerBase {
222+
friend class ThreadSafeTrieRawHashMap;
223+
224+
ValueT *get() const {
225+
return reinterpret_cast<ValueT *>(PointerBase::get());
226+
}
227+
228+
public:
229+
ValueT &operator*() const {
230+
assert(get());
231+
return *get();
232+
}
233+
ValueT *operator->() const {
234+
assert(get());
235+
return get();
236+
}
237+
explicit operator bool() const { return get(); }
238+
239+
PointerImpl() = default;
240+
241+
protected:
242+
PointerImpl(PointerBase Result) : PointerBase(Result) {}
243+
};
244+
245+
public:
246+
class pointer;
247+
class const_pointer;
248+
class pointer : public PointerImpl<value_type> {
249+
friend class ThreadSafeTrieRawHashMap;
250+
friend class const_pointer;
251+
252+
public:
253+
pointer() = default;
254+
255+
private:
256+
pointer(PointerBase Result) : pointer::PointerImpl(Result) {}
257+
};
258+
259+
class const_pointer : public PointerImpl<const value_type> {
260+
friend class ThreadSafeTrieRawHashMap;
261+
262+
public:
263+
const_pointer() = default;
264+
const_pointer(const pointer &P) : const_pointer::PointerImpl(P) {}
265+
266+
private:
267+
const_pointer(PointerBase Result) : const_pointer::PointerImpl(Result) {}
268+
};
269+
270+
class LazyValueConstructor {
271+
public:
272+
value_type &operator()(T &&RHS) {
273+
assert(Mem && "Constructor already called, or moved away");
274+
return assign(::new (Mem) value_type(Hash, std::move(RHS)));
275+
}
276+
value_type &operator()(const T &RHS) {
277+
assert(Mem && "Constructor already called, or moved away");
278+
return assign(::new (Mem) value_type(Hash, RHS));
279+
}
280+
template <class... ArgsT> value_type &emplace(ArgsT &&...Args) {
281+
assert(Mem && "Constructor already called, or moved away");
282+
return assign(::new (Mem)
283+
value_type(Hash, typename value_type::EmplaceTag{},
284+
std::forward<ArgsT>(Args)...));
285+
}
286+
287+
LazyValueConstructor(LazyValueConstructor &&RHS)
288+
: Mem(RHS.Mem), Result(RHS.Result), Hash(RHS.Hash) {
289+
RHS.Mem = nullptr; // Moved away, cannot call.
290+
}
291+
~LazyValueConstructor() { assert(!Mem && "Constructor never called!"); }
292+
293+
private:
294+
value_type &assign(value_type *V) {
295+
Mem = nullptr;
296+
Result = V;
297+
return *V;
298+
}
299+
friend class ThreadSafeTrieRawHashMap;
300+
LazyValueConstructor() = delete;
301+
LazyValueConstructor(void *Mem, value_type *&Result, ArrayRef<uint8_t> Hash)
302+
: Mem(Mem), Result(Result), Hash(Hash) {
303+
assert(Hash.size() == sizeof(HashT) && "Invalid hash");
304+
assert(Mem && "Invalid memory for construction");
305+
}
306+
void *Mem;
307+
value_type *&Result;
308+
ArrayRef<uint8_t> Hash;
309+
};
310+
311+
/// Insert with a hint. Default-constructed hint will work, but it's
312+
/// recommended to start with a lookup to avoid overhead in object creation
313+
/// if it already exists.
314+
pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
315+
function_ref<void(LazyValueConstructor)> OnConstruct) {
316+
return pointer(ThreadSafeTrieRawHashMapBase::insert(
317+
Hint, Hash, [&](void *Mem, ArrayRef<uint8_t> Hash) {
318+
value_type *Result = nullptr;
319+
OnConstruct(LazyValueConstructor(Mem, Result, Hash));
320+
return Result->Hash.data();
321+
}));
322+
}
323+
324+
pointer insertLazy(ArrayRef<uint8_t> Hash,
325+
function_ref<void(LazyValueConstructor)> OnConstruct) {
326+
return insertLazy(const_pointer(), Hash, OnConstruct);
327+
}
328+
329+
pointer insert(const_pointer Hint, value_type &&HashedData) {
330+
return insertLazy(Hint, HashedData.Hash, [&](LazyValueConstructor C) {
331+
C(std::move(HashedData.Data));
332+
});
333+
}
334+
335+
pointer insert(const_pointer Hint, const value_type &HashedData) {
336+
return insertLazy(Hint, HashedData.Hash,
337+
[&](LazyValueConstructor C) { C(HashedData.Data); });
338+
}
339+
340+
pointer find(ArrayRef<uint8_t> Hash) {
341+
assert(Hash.size() == std::tuple_size<HashT>::value);
342+
return ThreadSafeTrieRawHashMapBase::find(Hash);
343+
}
344+
345+
const_pointer find(ArrayRef<uint8_t> Hash) const {
346+
assert(Hash.size() == std::tuple_size<HashT>::value);
347+
return ThreadSafeTrieRawHashMapBase::find(Hash);
348+
}
349+
350+
ThreadSafeTrieRawHashMap(std::optional<size_t> NumRootBits = std::nullopt,
351+
std::optional<size_t> NumSubtrieBits = std::nullopt)
352+
: ThreadSafeTrieRawHashMapBase(DefaultContentAllocSize<value_type>,
353+
DefaultContentAllocAlign<value_type>,
354+
DefaultContentOffset<value_type>,
355+
NumRootBits, NumSubtrieBits) {}
356+
357+
~ThreadSafeTrieRawHashMap() {
358+
if constexpr (std::is_trivially_destructible<value_type>::value)
359+
this->destroyImpl(nullptr);
360+
else
361+
this->destroyImpl(
362+
[](void *P) { static_cast<value_type *>(P)->~value_type(); });
363+
}
364+
365+
// Move constructor okay.
366+
ThreadSafeTrieRawHashMap(ThreadSafeTrieRawHashMap &&) = default;
367+
368+
// No move assignment or any copy.
369+
ThreadSafeTrieRawHashMap &operator=(ThreadSafeTrieRawHashMap &&) = delete;
370+
ThreadSafeTrieRawHashMap(const ThreadSafeTrieRawHashMap &) = delete;
371+
ThreadSafeTrieRawHashMap &
372+
operator=(const ThreadSafeTrieRawHashMap &) = delete;
373+
};
374+
375+
} // namespace llvm
376+
377+
#endif // LLVM_ADT_TRIERAWHASHMAP_H

llvm/lib/Support/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ add_llvm_component_library(LLVMSupport
256256
TimeProfiler.cpp
257257
Timer.cpp
258258
ToolOutputFile.cpp
259+
TrieRawHashMap.cpp
259260
Twine.cpp
260261
TypeSize.cpp
261262
Unicode.cpp

0 commit comments

Comments
 (0)