Skip to content

Commit e2f0157

Browse files
[ADT] Add TrieRawHashMap
Implement TrieRawHashMap which stores objects into a Trie based on the hash of the object. User needs to supply the hashing function and guarantees the uniqueness of the hash for the objects to be inserted. Hash collision is not supported
1 parent 849f963 commit e2f0157

File tree

6 files changed

+1310
-0
lines changed

6 files changed

+1310
-0
lines changed
Lines changed: 398 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,398 @@
1+
//===- TrieRawHashMap.h -----------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_ADT_TRIERAWHASHMAP_H
10+
#define LLVM_ADT_TRIERAWHASHMAP_H
11+
12+
#include "llvm/ADT/ArrayRef.h"
13+
#include "llvm/ADT/StringRef.h"
14+
#include "llvm/Support/Casting.h"
15+
#include <atomic>
16+
#include <optional>
17+
18+
namespace llvm {
19+
20+
class raw_ostream;
21+
22+
/// TrieRawHashMap - is a lock-free thread-safe trie that can be used to
23+
/// store/index data based on a hash value. It can be customized to work with
24+
/// any hash algorithm or store any data.
25+
///
26+
/// Data structure:
27+
/// Data node stored in the Trie contains both hash and data:
28+
/// struct {
29+
/// HashT Hash;
30+
/// DataT Data;
31+
/// };
32+
///
33+
/// Data is stored/indexed via a prefix tree, where each node in the tree can be
34+
/// either the root, a sub-trie or a data node. Assuming a 4-bit hash and two
35+
/// data objects {0001, A} and {0100, B}, it can be stored in a trie
36+
/// (assuming Root has 2 bits, SubTrie has 1 bit):
37+
///   +--------+
///   |Root[00]| -> {0001, A}
///   |    [01]| -> {0100, B}
///   |    [10]| (empty)
///   |    [11]| (empty)
///   +--------+
43+
///
44+
/// Inserting a new object {0010, C} will result in:
45+
///   +--------+    +----------+
///   |Root[00]| -> |SubTrie[0]| -> {0001, A}
///   |        |    |       [1]| -> {0010, C}
///   |        |    +----------+
///   |    [01]| -> {0100, B}
///   |    [10]| (empty)
///   |    [11]| (empty)
///   +--------+
53+
/// Note object A is sunk down to a sub-trie during the insertion. All the
54+
/// nodes are inserted through compare-exchange to ensure thread-safe and
55+
/// lock-free.
56+
///
57+
/// To find an object in the trie, walk the tree with prefix of the hash until
58+
/// the data node is found. Then the hash is compared with the hash stored in
59+
/// the data node to see if it is the same object.
60+
///
61+
/// Hash collision is not allowed so it is recommended to use trie with a
62+
/// "strong" hashing algorithm. A well-distributed hash can also result in
63+
/// better performance and memory usage.
64+
///
65+
/// It currently does not support iteration and deletion.
66+
67+
/// Base class for a lock-free thread-safe hash-mapped trie.
68+
class ThreadSafeTrieRawHashMapBase {
69+
public:
70+
static constexpr size_t TrieContentBaseSize = 4;
71+
static constexpr size_t DefaultNumRootBits = 6;
72+
static constexpr size_t DefaultNumSubtrieBits = 4;
73+
74+
private:
75+
template <class T> struct AllocValueType {
76+
char Base[TrieContentBaseSize];
77+
std::aligned_union_t<sizeof(T), T> Content;
78+
};
79+
80+
protected:
81+
template <class T>
82+
static constexpr size_t DefaultContentAllocSize = sizeof(AllocValueType<T>);
83+
84+
template <class T>
85+
static constexpr size_t DefaultContentAllocAlign = alignof(AllocValueType<T>);
86+
87+
template <class T>
88+
static constexpr size_t DefaultContentOffset =
89+
offsetof(AllocValueType<T>, Content);
90+
91+
public:
92+
void operator delete(void *Ptr) { ::free(Ptr); }
93+
94+
LLVM_DUMP_METHOD void dump() const;
95+
void print(raw_ostream &OS) const;
96+
97+
protected:
98+
/// Result of a lookup. Suitable for an insertion hint. Maybe could be
99+
/// expanded into an iterator of sorts, but likely not useful (visiting
100+
/// everything in the trie should probably be done some way other than
101+
/// through an iterator pattern).
102+
class PointerBase {
103+
protected:
104+
void *get() const { return I == -2u ? P : nullptr; }
105+
106+
public:
107+
PointerBase() noexcept = default;
108+
PointerBase(PointerBase &&) = default;
109+
PointerBase(const PointerBase &) = default;
110+
PointerBase &operator=(PointerBase &&) = default;
111+
PointerBase &operator=(const PointerBase &) = default;
112+
113+
private:
114+
friend class ThreadSafeTrieRawHashMapBase;
115+
explicit PointerBase(void *Content) : P(Content), I(-2u) {}
116+
PointerBase(void *P, unsigned I, unsigned B) : P(P), I(I), B(B) {}
117+
118+
bool isHint() const { return I != -1u && I != -2u; }
119+
120+
void *P = nullptr;
121+
unsigned I = -1u;
122+
unsigned B = 0;
123+
};
124+
125+
/// Find the stored content with hash.
126+
PointerBase find(ArrayRef<uint8_t> Hash) const;
127+
128+
/// Insert and return the stored content.
129+
PointerBase
130+
insert(PointerBase Hint, ArrayRef<uint8_t> Hash,
131+
function_ref<const uint8_t *(void *Mem, ArrayRef<uint8_t> Hash)>
132+
Constructor);
133+
134+
ThreadSafeTrieRawHashMapBase() = delete;
135+
136+
ThreadSafeTrieRawHashMapBase(
137+
size_t ContentAllocSize, size_t ContentAllocAlign, size_t ContentOffset,
138+
std::optional<size_t> NumRootBits = std::nullopt,
139+
std::optional<size_t> NumSubtrieBits = std::nullopt);
140+
141+
/// Destructor, which asserts if there's anything to do. Subclasses should
142+
/// call \a destroyImpl().
143+
///
144+
/// \pre \a destroyImpl() was already called.
145+
~ThreadSafeTrieRawHashMapBase();
146+
void destroyImpl(function_ref<void(void *ValueMem)> Destructor);
147+
148+
ThreadSafeTrieRawHashMapBase(ThreadSafeTrieRawHashMapBase &&RHS);
149+
150+
// Move assignment can be implemented in a thread-safe way if NumRootBits and
151+
// NumSubtrieBits are stored inside the Root.
152+
ThreadSafeTrieRawHashMapBase &
153+
operator=(ThreadSafeTrieRawHashMapBase &&RHS) = delete;
154+
155+
// No copy.
156+
ThreadSafeTrieRawHashMapBase(const ThreadSafeTrieRawHashMapBase &) = delete;
157+
ThreadSafeTrieRawHashMapBase &
158+
operator=(const ThreadSafeTrieRawHashMapBase &) = delete;
159+
160+
// Debug functions. Implementation details and not guaranteed to be
161+
// thread-safe.
162+
PointerBase getRoot() const;
163+
unsigned getStartBit(PointerBase P) const;
164+
unsigned getNumBits(PointerBase P) const;
165+
unsigned getNumSlotUsed(PointerBase P) const;
166+
std::string getTriePrefixAsString(PointerBase P) const;
167+
unsigned getNumTries() const;
168+
// Visit next trie in the allocation chain.
169+
PointerBase getNextTrie(PointerBase P) const;
170+
171+
private:
172+
friend class TrieRawHashMapTestHelper;
173+
const unsigned short ContentAllocSize;
174+
const unsigned short ContentAllocAlign;
175+
const unsigned short ContentOffset;
176+
unsigned short NumRootBits;
177+
unsigned short NumSubtrieBits;
178+
struct ImplType;
179+
// ImplPtr is owned by ThreadSafeTrieRawHashMapBase and needs to be freed in
180+
// destoryImpl.
181+
std::atomic<ImplType *> ImplPtr;
182+
ImplType &getOrCreateImpl();
183+
ImplType *getImpl() const;
184+
};
185+
186+
/// Lock-free thread-safe hash-mapped trie.
187+
template <class T, size_t NumHashBytes>
188+
class ThreadSafeTrieRawHashMap : public ThreadSafeTrieRawHashMapBase {
189+
public:
190+
using HashT = std::array<uint8_t, NumHashBytes>;
191+
192+
class LazyValueConstructor;
193+
struct value_type {
194+
const HashT Hash;
195+
T Data;
196+
197+
value_type(value_type &&) = default;
198+
value_type(const value_type &) = default;
199+
200+
value_type(ArrayRef<uint8_t> Hash, const T &Data)
201+
: Hash(makeHash(Hash)), Data(Data) {}
202+
value_type(ArrayRef<uint8_t> Hash, T &&Data)
203+
: Hash(makeHash(Hash)), Data(std::move(Data)) {}
204+
205+
private:
206+
friend class LazyValueConstructor;
207+
208+
struct EmplaceTag {};
209+
template <class... ArgsT>
210+
value_type(ArrayRef<uint8_t> Hash, EmplaceTag, ArgsT &&...Args)
211+
: Hash(makeHash(Hash)), Data(std::forward<ArgsT>(Args)...) {}
212+
213+
static HashT makeHash(ArrayRef<uint8_t> HashRef) {
214+
HashT Hash;
215+
std::copy(HashRef.begin(), HashRef.end(), Hash.data());
216+
return Hash;
217+
}
218+
};
219+
220+
using ThreadSafeTrieRawHashMapBase::operator delete;
221+
using HashType = HashT;
222+
223+
using ThreadSafeTrieRawHashMapBase::dump;
224+
using ThreadSafeTrieRawHashMapBase::print;
225+
226+
private:
227+
template <class ValueT> class PointerImpl : PointerBase {
228+
friend class ThreadSafeTrieRawHashMap;
229+
230+
ValueT *get() const {
231+
if (void *B = PointerBase::get())
232+
return reinterpret_cast<ValueT *>(B);
233+
return nullptr;
234+
}
235+
236+
public:
237+
ValueT &operator*() const {
238+
assert(get());
239+
return *get();
240+
}
241+
ValueT *operator->() const {
242+
assert(get());
243+
return get();
244+
}
245+
explicit operator bool() const { return get(); }
246+
247+
PointerImpl() = default;
248+
PointerImpl(PointerImpl &&) = default;
249+
PointerImpl(const PointerImpl &) = default;
250+
PointerImpl &operator=(PointerImpl &&) = default;
251+
PointerImpl &operator=(const PointerImpl &) = default;
252+
253+
protected:
254+
PointerImpl(PointerBase Result) : PointerBase(Result) {}
255+
};
256+
257+
public:
258+
class pointer;
259+
class const_pointer;
260+
class pointer : public PointerImpl<value_type> {
261+
friend class ThreadSafeTrieRawHashMap;
262+
friend class const_pointer;
263+
264+
public:
265+
pointer() = default;
266+
pointer(pointer &&) = default;
267+
pointer(const pointer &) = default;
268+
pointer &operator=(pointer &&) = default;
269+
pointer &operator=(const pointer &) = default;
270+
271+
private:
272+
pointer(PointerBase Result) : pointer::PointerImpl(Result) {}
273+
};
274+
275+
class const_pointer : public PointerImpl<const value_type> {
276+
friend class ThreadSafeTrieRawHashMap;
277+
278+
public:
279+
const_pointer() = default;
280+
const_pointer(const_pointer &&) = default;
281+
const_pointer(const const_pointer &) = default;
282+
const_pointer &operator=(const_pointer &&) = default;
283+
const_pointer &operator=(const const_pointer &) = default;
284+
285+
const_pointer(const pointer &P) : const_pointer::PointerImpl(P) {}
286+
287+
private:
288+
const_pointer(PointerBase Result) : const_pointer::PointerImpl(Result) {}
289+
};
290+
291+
class LazyValueConstructor {
292+
public:
293+
value_type &operator()(T &&RHS) {
294+
assert(Mem && "Constructor already called, or moved away");
295+
return assign(::new (Mem) value_type(Hash, std::move(RHS)));
296+
}
297+
value_type &operator()(const T &RHS) {
298+
assert(Mem && "Constructor already called, or moved away");
299+
return assign(::new (Mem) value_type(Hash, RHS));
300+
}
301+
template <class... ArgsT> value_type &emplace(ArgsT &&...Args) {
302+
assert(Mem && "Constructor already called, or moved away");
303+
return assign(::new (Mem)
304+
value_type(Hash, typename value_type::EmplaceTag{},
305+
std::forward<ArgsT>(Args)...));
306+
}
307+
308+
LazyValueConstructor(LazyValueConstructor &&RHS)
309+
: Mem(RHS.Mem), Result(RHS.Result), Hash(RHS.Hash) {
310+
RHS.Mem = nullptr; // Moved away, cannot call.
311+
}
312+
~LazyValueConstructor() { assert(!Mem && "Constructor never called!"); }
313+
314+
private:
315+
value_type &assign(value_type *V) {
316+
Mem = nullptr;
317+
Result = V;
318+
return *V;
319+
}
320+
friend class ThreadSafeTrieRawHashMap;
321+
LazyValueConstructor() = delete;
322+
LazyValueConstructor(void *Mem, value_type *&Result, ArrayRef<uint8_t> Hash)
323+
: Mem(Mem), Result(Result), Hash(Hash) {
324+
assert(Hash.size() == sizeof(HashT) && "Invalid hash");
325+
assert(Mem && "Invalid memory for construction");
326+
}
327+
void *Mem;
328+
value_type *&Result;
329+
ArrayRef<uint8_t> Hash;
330+
};
331+
332+
/// Insert with a hint. Default-constructed hint will work, but it's
333+
/// recommended to start with a lookup to avoid overhead in object creation
334+
/// if it already exists.
335+
pointer insertLazy(const_pointer Hint, ArrayRef<uint8_t> Hash,
336+
function_ref<void(LazyValueConstructor)> OnConstruct) {
337+
return pointer(ThreadSafeTrieRawHashMapBase::insert(
338+
Hint, Hash, [&](void *Mem, ArrayRef<uint8_t> Hash) {
339+
value_type *Result = nullptr;
340+
OnConstruct(LazyValueConstructor(Mem, Result, Hash));
341+
return Result->Hash.data();
342+
}));
343+
}
344+
345+
pointer insertLazy(ArrayRef<uint8_t> Hash,
346+
function_ref<void(LazyValueConstructor)> OnConstruct) {
347+
return insertLazy(const_pointer(), Hash, OnConstruct);
348+
}
349+
350+
pointer insert(const_pointer Hint, value_type &&HashedData) {
351+
return insertLazy(Hint, HashedData.Hash, [&](LazyValueConstructor C) {
352+
C(std::move(HashedData.Data));
353+
});
354+
}
355+
356+
pointer insert(const_pointer Hint, const value_type &HashedData) {
357+
return insertLazy(Hint, HashedData.Hash,
358+
[&](LazyValueConstructor C) { C(HashedData.Data); });
359+
}
360+
361+
pointer find(ArrayRef<uint8_t> Hash) {
362+
assert(Hash.size() == std::tuple_size<HashT>::value);
363+
return ThreadSafeTrieRawHashMapBase::find(Hash);
364+
}
365+
366+
const_pointer find(ArrayRef<uint8_t> Hash) const {
367+
assert(Hash.size() == std::tuple_size<HashT>::value);
368+
return ThreadSafeTrieRawHashMapBase::find(Hash);
369+
}
370+
371+
ThreadSafeTrieRawHashMap(std::optional<size_t> NumRootBits = std::nullopt,
372+
std::optional<size_t> NumSubtrieBits = std::nullopt)
373+
: ThreadSafeTrieRawHashMapBase(DefaultContentAllocSize<value_type>,
374+
DefaultContentAllocAlign<value_type>,
375+
DefaultContentOffset<value_type>,
376+
NumRootBits, NumSubtrieBits) {}
377+
378+
~ThreadSafeTrieRawHashMap() {
379+
if constexpr (std::is_trivially_destructible<value_type>::value)
380+
this->destroyImpl(nullptr);
381+
else
382+
this->destroyImpl(
383+
[](void *P) { static_cast<value_type *>(P)->~value_type(); });
384+
}
385+
386+
// Move constructor okay.
387+
ThreadSafeTrieRawHashMap(ThreadSafeTrieRawHashMap &&) = default;
388+
389+
// No move assignment or any copy.
390+
ThreadSafeTrieRawHashMap &operator=(ThreadSafeTrieRawHashMap &&) = delete;
391+
ThreadSafeTrieRawHashMap(const ThreadSafeTrieRawHashMap &) = delete;
392+
ThreadSafeTrieRawHashMap &
393+
operator=(const ThreadSafeTrieRawHashMap &) = delete;
394+
};
395+
396+
} // namespace llvm
397+
398+
#endif // LLVM_ADT_TRIERAWHASHMAP_H

0 commit comments

Comments
 (0)