Skip to content

Commit a89c882

Browse files
authored
Merge pull request #35269 from CodaFi/hash-brown
Replace llvm::MD5 with SipHash-2-4
2 parents 8d7ac53 + 73ac8d3 commit a89c882

File tree

15 files changed

+724
-42
lines changed

15 files changed

+724
-42
lines changed

include/swift/AST/ParseRequests.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class ParseAbstractFunctionBodyRequest :
8787
struct SourceFileParsingResult {
8888
ArrayRef<Decl *> TopLevelDecls;
8989
Optional<ArrayRef<Token>> CollectedTokens;
90-
Optional<llvm::MD5> InterfaceHash;
90+
Optional<StableHasher> InterfaceHasher;
9191
Optional<syntax::SourceFileSyntax> SyntaxRoot;
9292
};
9393

include/swift/AST/SourceFile.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,10 @@ class SourceFile final : public FileUnit {
105105
SourceLoc MainDeclDiagLoc;
106106

107107
/// A hash of all interface-contributing tokens that have been lexed for
108-
/// this source file so far.
108+
/// this source file.
109+
///
109110
/// We only collect interface hash for primary input files.
110-
llvm::Optional<llvm::MD5> InterfaceHash;
111+
llvm::Optional<StableHasher> InterfaceHasher;
111112

112113
/// The ID for the memory buffer containing this file's source.
113114
///

include/swift/Basic/Fingerprint.h

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
#ifndef SWIFT_BASIC_FINGERPRINT_H
1414
#define SWIFT_BASIC_FINGERPRINT_H
1515

16+
#include "swift/Basic/StableHasher.h"
1617
#include "llvm/ADT/Hashing.h"
1718
#include "llvm/ADT/SmallString.h"
1819
#include "llvm/ADT/StringRef.h"
19-
#include "llvm/Support/MD5.h"
2020

2121
#include <string>
2222

@@ -52,11 +52,6 @@ namespace swift {
5252
/// iterable decl contexts to detect when the tokens in their bodies have
5353
/// changed. This makes them a coarse - yet safe - overapproximation for when a
5454
/// decl has changed semantically.
55-
///
56-
/// \c Fingerprints are currently implemented as a thin wrapper around an MD5
57-
/// hash. MD5 is known to be neither the fastest nor the most
58-
/// cryptographically capable algorithm, but it does afford us the avalanche
59-
/// effect we desire. We should revisit the modeling decision here.
6055
class Fingerprint final {
6156
public:
6257
/// The size (in bytes) of the raw value of all fingerprints.
@@ -66,6 +61,8 @@ class Fingerprint final {
6661
private:
6762
Core core;
6863

64+
friend struct StableHasher::Combiner<swift::Fingerprint>;
65+
6966
public:
7067
/// Creates a fingerprint value from a pair of 64-bit integers.
7168
explicit Fingerprint(Fingerprint::Core value) : core(value) {}
@@ -76,9 +73,9 @@ class Fingerprint final {
7673
/// Strings that violate this invariant will return a null optional.
7774
static llvm::Optional<Fingerprint> fromString(llvm::StringRef value);
7875

79-
/// Creates a fingerprint value by consuming the given \c MD5Result from LLVM.
80-
explicit Fingerprint(llvm::MD5::MD5Result &&MD5Value)
81-
: core{MD5Value.words()} {}
76+
/// Creates a fingerprint value by consuming the given \c StableHasher.
77+
explicit Fingerprint(StableHasher &&stableHasher)
78+
: core{std::move(stableHasher).finalize()} {}
8279

8380
public:
8481
/// Retrieve the raw underlying bytes of this fingerprint.
@@ -100,7 +97,7 @@ class Fingerprint final {
10097
public:
10198
/// The fingerprint value consisting of 32 bytes of zeroes.
10299
///
103-
/// This fingerprint is a perfectly fine value for an MD5 hash, but it is
100+
/// This fingerprint is a perfectly fine value for a hash, but it is
104101
/// completely arbitrary.
105102
static Fingerprint ZERO() {
106103
return Fingerprint(Fingerprint::Core{0, 0});
@@ -118,6 +115,22 @@ class Fingerprint final {
118115
void simple_display(llvm::raw_ostream &out, const Fingerprint &fp);
119116
}; // namespace swift
120117

118+
namespace swift {
119+
120+
template <> struct StableHasher::Combiner<Fingerprint> {
121+
static void combine(StableHasher &hasher, const Fingerprint &Val) {
122+
// Our underlying buffer is already byte-swapped. Combine the
123+
// raw bytes from the core by hand.
124+
uint8_t buffer[8];
125+
memcpy(buffer, &Val.core.first, sizeof(buffer));
126+
hasher.combine(buffer);
127+
memcpy(buffer, &Val.core.second, sizeof(buffer));
128+
hasher.combine(buffer);
129+
}
130+
};
131+
132+
}; // namespace swift
133+
121134
namespace llvm {
122135
class raw_ostream;
123136
raw_ostream &operator<<(raw_ostream &OS, const swift::Fingerprint &fp);

include/swift/Basic/StableHasher.h

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
//===--- StableHasher.h - Stable Hashing ------------------------*- C++ -*-===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
//
13+
// An implementation of stable hashing for Swift.
14+
//
15+
// Derived from the reference implementation for SipHash 2-4:
16+
// https://github.com/veorq/SipHash
17+
//
18+
// With inline buffering derived from the hash implementation in the Swift
19+
// Standard Library.
20+
//
21+
//===----------------------------------------------------------------------===//
22+
23+
#ifndef SWIFT_BASIC_STABLEHASHER_H
24+
#define SWIFT_BASIC_STABLEHASHER_H
25+
26+
#include "llvm/Support/Endian.h"
27+
#include "llvm/ADT/StringRef.h"
28+
#include <algorithm>
29+
#include <cstring>
30+
#include <string>
31+
#include <vector>
32+
#include <tuple>
33+
#include <utility>
34+
35+
namespace swift {
36+
37+
/// A \c StableHasher is an implementation of a 128-bit stable hash - built for
38+
/// speed.
39+
///
40+
/// A "stable" hash in this context refers to the idea that the output of this
41+
/// hasher is deterministic across instantiations of the compiler. In order to
42+
/// support this goal, this hasher disallows run-dependent or otherwise
43+
/// unstable values from entering into the hash-combiner. For example, this
44+
/// hasher will statically reject attempts to hash-combine pointers and
45+
/// aggregates of pointers. Note that this relies on user cooperation as well.
46+
/// The order of hash-combines is pivotal, thus e.g. collection values should
47+
/// have a guaranteed order or be sorted before being hash-combined.
48+
///
49+
/// Stable hash values must also be independent of the host architecture. For
50+
/// integral types and enumerations, the default hash-combiner will
51+
/// automatically byte-swap to a common little-endian format.
52+
///
53+
/// This hasher also allows for extending the hash-combiner to user-defined
54+
/// types. To do so, define a (partial) specialization of
55+
/// \c swift::StableHasher::Combiner<T>, for example:
56+
///
57+
/// template <typename T>
58+
/// struct swift::StableHasher::Combiner<std::optional<T>> {
59+
/// static void combine(StableHasher &hasher, const std::optional<T> &O) {
60+
/// if (!O.has_value()) {
61+
/// hasher.combine(0);
62+
/// } else {
63+
/// hasher.combine(1);
64+
/// swift::StableHasher::Combiner<T>::combine(hasher, O.value());
65+
/// }
66+
/// }
67+
/// };
68+
///
69+
/// The current implementation is the 128-bit (extended) SipHash 2-4, which
70+
/// has been empirically demonstrated to have the best throughput relative to
71+
/// the other SipHash tunings.
72+
class StableHasher final {
73+
private:
74+
struct State {
75+
uint64_t v0 = 0x736F6D6570736575;
76+
uint64_t v1 = 0x646F72616E646f6D;
77+
uint64_t v2 = 0x6C7967656E657261;
78+
uint64_t v3 = 0x7465646279746573;
79+
} state;
80+
81+
// A buffer of up to 8 bytes that this hasher uses to amortize the cost
82+
// of the hashing function for hash-combines shorter than 64 bits.
83+
uint8_t byteBuffer[8] = {0};
84+
// msb lsb
85+
// +---------+-------+-------+-------+-------+-------+-------+-------+
86+
// |byteCount| length (<= 56 bits) |
87+
// +---------+-------+-------+-------+-------+-------+-------+-------+
88+
uint64_t lengthAndByteCount = 0;
89+
90+
public:
91+
static StableHasher defaultHasher() { StableHasher hasher{0, 0}; return hasher; }
92+
93+
explicit StableHasher(uint64_t leftSeed, uint64_t rightSeed) {
94+
state.v3 ^= rightSeed;
95+
state.v2 ^= leftSeed;
96+
state.v1 ^= rightSeed;
97+
state.v0 ^= leftSeed;
98+
99+
state.v1 ^= 0xEE;
100+
}
101+
102+
public:
103+
template <typename T> struct Combiner {
104+
// static void combine(StableHasher &hasher, const T &Val);
105+
};
106+
107+
public:
108+
/// Consume this stable hasher and compute the final 128-bit stable hash value.
109+
std::pair<uint64_t, uint64_t> finalize() &&;
110+
111+
template <uint64_t N> void combine(uint8_t (&bits)[N]) {
112+
static_assert(N > 0, "Cannot append VLA");
113+
static_assert(N <= 8, "Can only append up to 64 bits at a time");
114+
115+
lengthAndByteCount += N;
116+
117+
const uint64_t bufLen = getBufferLength();
118+
const uint64_t available = sizeof(byteBuffer) - bufLen;
119+
120+
// Cram as many bytes into the buffer as we can.
121+
const uint64_t nhead = std::min(N, available);
122+
if (nhead == sizeof(byteBuffer)) {
123+
// We have headroom available for all 64 bits. Eagerly compress the
124+
// now-full buffer into our state.
125+
std::copy(bits, bits + sizeof(byteBuffer), byteBuffer);
126+
} else if (N >= available) {
127+
// There was some excess - append as many bytes as we can hold and
128+
// compress the buffer into our state.
129+
std::copy(bits, bits + nhead, byteBuffer + bufLen);
130+
} else {
131+
// We have headroom available for these bits.
132+
std::copy(bits, bits + N, byteBuffer + bufLen);
133+
return setBufferLength(bufLen + N);
134+
}
135+
136+
constexpr auto endian = llvm::support::endianness::little;
137+
compress(llvm::support::endian::read<uint64_t>(byteBuffer, endian));
138+
139+
// Now reseed the buffer with the remaining bytes.
140+
const uint64_t remainder = N - available;
141+
std::copy(bits + available, bits + N, byteBuffer);
142+
return setBufferLength(remainder);
143+
}
144+
145+
template <
146+
typename T,
147+
typename std::enable_if<std::is_integral<T>::value>::type * = nullptr>
148+
void combine(T bits) {
149+
constexpr auto endian = llvm::support::endianness::little;
150+
uint8_t buf[sizeof(T)] = {0};
151+
bits = llvm::support::endian::byte_swap<T>(bits, endian);
152+
std::memcpy(buf, &bits, sizeof(T));
153+
combine<sizeof(T)>(buf);
154+
}
155+
156+
template <
157+
typename EnumType,
158+
typename std::enable_if<std::is_enum<EnumType>::value>::type * = nullptr>
159+
void combine(EnumType value) {
160+
using Underlying = typename std::underlying_type<EnumType>::type;
161+
return this->template combine<Underlying>(static_cast<Underlying>(value));
162+
}
163+
164+
template <typename T>
165+
auto combine(const T *ptr) -> decltype("Cannot hash-combine pointers!"){};
166+
167+
template <typename T, typename... Ts>
168+
void combine(const T &arg, const Ts &... args) {
169+
return combine_many(arg, args...);
170+
}
171+
172+
template <typename T, typename U> void combine(const std::pair<T, U> &arg) {
173+
return combine_many(arg.first, arg.second);
174+
}
175+
176+
template <typename T> void combine(const std::basic_string<T> &arg) {
177+
return combine_range(arg.begin(), arg.end());
178+
}
179+
180+
void combine(llvm::StringRef arg) {
181+
return combine_range(arg.begin(), arg.end());
182+
}
183+
184+
template <typename T,
185+
decltype(StableHasher::Combiner<T>::combine) * = nullptr>
186+
void combine(const T &val) {
187+
return StableHasher::Combiner<T>::combine(*this, val);
188+
}
189+
190+
template <typename ValueT> void combine_range(ValueT first, ValueT last) {
191+
combine(std::distance(first, last));
192+
while (first != last) {
193+
combine(*first++);
194+
}
195+
}
196+
197+
template <typename... Ts> void combine(const std::tuple<Ts...> &arg) {
198+
return combine_tuple(arg, typename std::index_sequence_for<Ts...>{});
199+
}
200+
201+
private:
202+
template <typename... Ts, unsigned... Indices>
203+
void combine_tuple(const std::tuple<Ts...> &arg,
204+
std::index_sequence<Indices...> indices) {
205+
return combine_many(hash_value(std::get<Indices>(arg))...);
206+
}
207+
208+
// base case.
209+
void combine_many() {}
210+
211+
// recursive case
212+
template <typename T, typename... Ts>
213+
void combine_many(const T &arg, const Ts &... args) {
214+
combine(arg);
215+
return combine_many(args...);
216+
}
217+
218+
private:
219+
/// Return the number of bytes in the inline buffer.
220+
uint64_t getBufferLength() const { return lengthAndByteCount >> 56; }
221+
/// Set the number of bytes in the inline buffer.
222+
void setBufferLength(uint64_t newLen) {
223+
lengthAndByteCount = getDigestLength() | (newLen << 56);
224+
}
225+
226+
/// Return the number of bytes that have been hash-combined so far.
227+
uint64_t getDigestLength() const {
228+
return lengthAndByteCount & ~(uint64_t(0xFF) << 56);
229+
}
230+
231+
void compress(uint64_t value);
232+
};
233+
234+
} // namespace swift
235+
236+
#endif // SWIFT_BASIC_STABLEHASHER_H

include/swift/Parse/Parser.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ class Parser {
127127

128128
/// The current token hash, or \c None if the parser isn't computing a hash
129129
/// for the token stream.
130-
Optional<llvm::MD5> CurrentTokenHash;
130+
Optional<StableHasher> CurrentTokenHash;
131131

132132
void recordTokenHash(const Token Tok) {
133133
if (!Tok.getText().empty())

lib/AST/Module.cpp

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,22 +1092,21 @@ Fingerprint SourceFile::getInterfaceHash() const {
10921092
assert(hasInterfaceHash() && "Interface hash not enabled");
10931093
auto &eval = getASTContext().evaluator;
10941094
auto *mutableThis = const_cast<SourceFile *>(this);
1095-
auto md5 = *evaluateOrDefault(eval, ParseSourceFileRequest{mutableThis}, {})
1096-
.InterfaceHash;
1097-
llvm::MD5::MD5Result result;
1098-
md5.final(result);
1099-
return Fingerprint{std::move(result)};
1095+
Optional<StableHasher> interfaceHasher =
1096+
evaluateOrDefault(eval, ParseSourceFileRequest{mutableThis}, {})
1097+
.InterfaceHasher;
1098+
return Fingerprint{StableHasher{interfaceHasher.getValue()}.finalize()};
11001099
}
11011100

11021101
Fingerprint SourceFile::getInterfaceHashIncludingTypeMembers() const {
11031102
/// FIXME: Gross. Hashing multiple "hash" values.
1104-
llvm::MD5 hash;
1105-
hash.update(getInterfaceHash().getRawValue());
1103+
auto hash = StableHasher::defaultHasher();
1104+
hash.combine(getInterfaceHash());
11061105

11071106
std::function<void(IterableDeclContext *)> hashTypeBodyFingerprints =
11081107
[&](IterableDeclContext *IDC) {
11091108
if (auto fp = IDC->getBodyFingerprint())
1110-
hash.update(fp->getRawValue());
1109+
hash.combine(*fp);
11111110
for (auto *member : IDC->getParsedMembers())
11121111
if (auto *childIDC = dyn_cast<IterableDeclContext>(member))
11131112
hashTypeBodyFingerprints(childIDC);
@@ -1118,9 +1117,7 @@ Fingerprint SourceFile::getInterfaceHashIncludingTypeMembers() const {
11181117
hashTypeBodyFingerprints(IDC);
11191118
}
11201119

1121-
llvm::MD5::MD5Result result;
1122-
hash.final(result);
1123-
return Fingerprint{std::move(result)};
1120+
return Fingerprint{std::move(hash)};
11241121
}
11251122

11261123
syntax::SourceFileSyntax SourceFile::getSyntaxRoot() const {

lib/Basic/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ add_swift_host_library(swiftBasic STATIC
6464
Program.cpp
6565
QuotedString.cpp
6666
SourceLoc.cpp
67+
StableHasher.cpp
6768
Statistic.cpp
6869
StringExtras.cpp
6970
TaskQueue.cpp

0 commit comments

Comments
 (0)