Skip to content

Replace llvm::MD5 with SipHash-2-4 #35269

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/swift/AST/ParseRequests.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class ParseAbstractFunctionBodyRequest :
struct SourceFileParsingResult {
ArrayRef<Decl *> TopLevelDecls;
Optional<ArrayRef<Token>> CollectedTokens;
Optional<llvm::MD5> InterfaceHash;
Optional<StableHasher> InterfaceHasher;
Optional<syntax::SourceFileSyntax> SyntaxRoot;
};

Expand Down
5 changes: 3 additions & 2 deletions include/swift/AST/SourceFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,10 @@ class SourceFile final : public FileUnit {
SourceLoc MainDeclDiagLoc;

/// A hash of all interface-contributing tokens that have been lexed for
/// this source file so far.
/// this source file.
///
/// We only collect interface hash for primary input files.
llvm::Optional<llvm::MD5> InterfaceHash;
llvm::Optional<StableHasher> InterfaceHasher;

/// The ID for the memory buffer containing this file's source.
///
Expand Down
33 changes: 23 additions & 10 deletions include/swift/Basic/Fingerprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
#ifndef SWIFT_BASIC_FINGERPRINT_H
#define SWIFT_BASIC_FINGERPRINT_H

#include "swift/Basic/StableHasher.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"

#include <string>

Expand Down Expand Up @@ -52,11 +52,6 @@ namespace swift {
/// iterable decl contexts to detect when the tokens in their bodies have
/// changed. This makes them a coarse - yet safe - overapproximation for when a
/// decl has changed semantically.
///
/// \c Fingerprints are currently implemented as a thin wrapper around an MD5
/// hash. MD5 is known to be neither the fastest nor the most
/// cryptographically capable algorithm, but it does afford us the avalanche
/// effect we desire. We should revisit the modeling decision here.
class Fingerprint final {
public:
/// The size (in bytes) of the raw value of all fingerprints.
Expand All @@ -66,6 +61,8 @@ class Fingerprint final {
private:
Core core;

friend struct StableHasher::Combiner<swift::Fingerprint>;

public:
/// Creates a fingerprint value from a pair of 64-bit integers.
explicit Fingerprint(Fingerprint::Core value) : core(value) {}
Expand All @@ -76,9 +73,9 @@ class Fingerprint final {
/// Strings that violate this invariant will return a null optional.
static llvm::Optional<Fingerprint> fromString(llvm::StringRef value);

/// Creates a fingerprint value by consuming the given \c MD5Result from LLVM.
explicit Fingerprint(llvm::MD5::MD5Result &&MD5Value)
: core{MD5Value.words()} {}
/// Creates a fingerprint value by consuming the given \c StableHasher.
explicit Fingerprint(StableHasher &&stableHasher)
: core{std::move(stableHasher).finalize()} {}

public:
/// Retrieve the raw underlying bytes of this fingerprint.
Expand All @@ -100,7 +97,7 @@ class Fingerprint final {
public:
/// The fingerprint value consisting of 32 bytes of zeroes.
///
/// This fingerprint is a perfectly fine value for an MD5 hash, but it is
/// This fingerprint is a perfectly fine value for a hash, but it is
/// completely arbitrary.
static Fingerprint ZERO() {
return Fingerprint(Fingerprint::Core{0, 0});
Expand All @@ -118,6 +115,22 @@ class Fingerprint final {
void simple_display(llvm::raw_ostream &out, const Fingerprint &fp);
}; // namespace swift

namespace swift {

template <> struct StableHasher::Combiner<Fingerprint> {
  /// Hash-combines \p Val into \p hasher by feeding the two words of its
  /// core, \c first followed by \c second, as raw 8-byte buffers.
  static void combine(StableHasher &hasher, const Fingerprint &Val) {
    // The fingerprint's underlying storage is already byte-swapped, so the
    // words are handed over as raw bytes rather than as integers.
    const auto feedWord = [&hasher](const void *word) {
      uint8_t raw[8];
      memcpy(raw, word, sizeof(raw));
      hasher.combine(raw);
    };

    feedWord(&Val.core.first);
    feedWord(&Val.core.second);
  }
};

}; // namespace swift

namespace llvm {
class raw_ostream;
raw_ostream &operator<<(raw_ostream &OS, const swift::Fingerprint &fp);
Expand Down
236 changes: 236 additions & 0 deletions include/swift/Basic/StableHasher.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
//===--- StableHasher.h - Stable Hashing ------------------------*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// An implementation of stable hashing for Swift.
//
// Derived from the reference implementation for SipHash 2-4:
// https://github.com/veorq/SipHash
//
// With inline buffering derived from the hash implementation in the Swift
// Standard Library.
//
//===----------------------------------------------------------------------===//

#ifndef SWIFT_BASIC_STABLEHASHER_H
#define SWIFT_BASIC_STABLEHASHER_H

#include "llvm/Support/Endian.h"
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <cstring>
#include <string>
#include <vector>
#include <tuple>
#include <utility>

namespace swift {

/// A \c StableHasher is an implementation of a 128-bit stable hash - built for
/// speed.
///
/// A "stable" hash in this context refers to the idea that the output of this
/// hasher is deterministic across instantiations of the compiler. In order to
/// support this goal, this hasher disallows run-dependent or otherwise
/// unstable values from entering into the hash-combiner. For example, this
/// hasher will statically reject attempts to hash-combine pointers and
/// aggregates of pointers. Note that this relies on user cooperation as well.
/// The order of hash-combines is pivotal, thus e.g. collection values should
/// have a guaranteed order or be sorted before being hash-combined.
///
/// Stable hash values must also be independent of the host architecture. For
/// integral types and enumerations, the default hash-combiner will
/// automatically byte-swap to a common little-endian format.
///
/// This hasher also allows for extending the hash-combiner to user-defined
/// types. To do so, define a (partial) specialization of
/// \c swift::StableHasher::Combiner<T>
///
/// template <typename T>
/// struct swift::StableHasher::Combiner<std::optional<T>> {
/// static void combine(StableHasher &hasher, const std::optional<T> &O) {
/// if (!O.has_value()) {
/// hasher.combine(0);
/// } else {
/// hasher.combine(1);
/// swift::StableHasher::Combiner<T>::combine(hasher, O.value());
/// }
/// }
/// };
///
/// The current implementation is the 128-bit (extended) SipHash 2-4, which
/// has been empirically demonstrated to have the best throughput relative to
/// the other SipHash tunings.
class StableHasher final {
private:
  /// The four 64-bit words of hash state. They start out as the SipHash
  /// initialization constants; the constructor XORs the seeds into them.
  struct State {
    uint64_t v0 = 0x736F6D6570736575;
    uint64_t v1 = 0x646F72616E646f6D;
    uint64_t v2 = 0x6C7967656E657261;
    uint64_t v3 = 0x7465646279746573;
  } state;

  // A buffer of up to 8 bytes that this hasher uses to amortize the cost
  // of the hashing function for hash-combines shorter than 64-bits.
  uint8_t byteBuffer[8] = {0};
  // msb                                                             lsb
  // +---------+-------+-------+-------+-------+-------+-------+-------+
  // |byteCount|                 length (<= 56 bits)                   |
  // +---------+-------+-------+-------+-------+-------+-------+-------+
  uint64_t lengthAndByteCount = 0;

public:
  /// Returns a hasher whose two seeds are both zero.
  static StableHasher defaultHasher() {
    StableHasher hasher{0, 0};
    return hasher;
  }

  /// Creates a hasher seeded with the given pair of 64-bit values.
  explicit StableHasher(uint64_t leftSeed, uint64_t rightSeed) {
    state.v3 ^= rightSeed;
    state.v2 ^= leftSeed;
    state.v1 ^= rightSeed;
    state.v0 ^= leftSeed;

    // Tweak used by the SipHash reference implementation when producing
    // 128-bit output.
    state.v1 ^= 0xEE;
  }

public:
  /// Extension point for user-defined types. Specialize this template and
  /// provide the static member shown below to make \c T hash-combinable.
  template <typename T> struct Combiner {
    // static void combine(StableHasher &hasher, const T &Val);
  };

public:
  /// Consume this stable hasher and compute the final 128-bit stable hash value.
  std::pair<uint64_t, uint64_t> finalize() &&;

  /// Hash-combines between 1 and 8 raw bytes.
  ///
  /// Bytes accumulate in the inline buffer. Once 8 bytes are available they
  /// are read as one little-endian 64-bit word and compressed into the state,
  /// and any bytes that did not fit start the next buffer.
  template <uint64_t N> void combine(uint8_t (&bits)[N]) {
    static_assert(N > 0, "Cannot append VLA");
    static_assert(N <= 8, "Can only append up to 64 bits at a time");

    lengthAndByteCount += N;

    const uint64_t bufLen = getBufferLength();
    const uint64_t available = sizeof(byteBuffer) - bufLen;

    // Cram as many bytes into the buffer as we can.
    const uint64_t nhead = std::min(N, available);
    std::copy(bits, bits + nhead, byteBuffer + bufLen);

    if (N < available) {
      // We still have headroom left after these bits; nothing to compress.
      return setBufferLength(bufLen + N);
    }

    // The buffer is now full - compress it into our state.
    constexpr auto endian = llvm::support::endianness::little;
    compress(llvm::support::endian::read<uint64_t>(byteBuffer, endian));

    // Now reseed the buffer with the remaining bytes.
    const uint64_t remainder = N - available;
    std::copy(bits + available, bits + N, byteBuffer);
    return setBufferLength(remainder);
  }

  /// Hash-combines an integral value after converting it to little-endian
  /// byte order, so the bytes fed to the hash do not depend on the host.
  template <
      typename T,
      typename std::enable_if<std::is_integral<T>::value>::type * = nullptr>
  void combine(T bits) {
    constexpr auto endian = llvm::support::endianness::little;
    uint8_t buf[sizeof(T)] = {0};
    bits = llvm::support::endian::byte_swap<T>(bits, endian);
    std::memcpy(buf, &bits, sizeof(T));
    combine<sizeof(T)>(buf);
  }

  /// Hash-combines an enumerator as a value of its underlying integral type.
  template <
      typename EnumType,
      typename std::enable_if<std::is_enum<EnumType>::value>::type * = nullptr>
  void combine(EnumType value) {
    using Underlying = typename std::underlying_type<EnumType>::type;
    // Let deduction pick the integral overload instead of substituting
    // Underlying into every overload's template parameters.
    return combine(static_cast<Underlying>(value));
  }

  /// Cannot hash-combine pointers!
  ///
  /// Deleted so that selecting this overload is a compile-time error rather
  /// than a call that silently does nothing.
  template <typename T> void combine(const T *ptr) = delete;

  /// Hash-combines two or more values in argument order.
  ///
  /// At least two arguments are required: a single value must be handled by
  /// one of the dedicated overloads or by a \c Combiner specialization, since
  /// a one-argument catch-all could only call back into itself.
  template <typename T, typename U, typename... Ts>
  void combine(const T &arg, const U &next, const Ts &... args) {
    return combine_many(arg, next, args...);
  }

  /// Hash-combines \c first and then \c second.
  template <typename T, typename U> void combine(const std::pair<T, U> &arg) {
    return combine_many(arg.first, arg.second);
  }

  /// Hash-combines the length of the string followed by each character.
  template <typename T> void combine(const std::basic_string<T> &arg) {
    return combine_range(arg.begin(), arg.end());
  }

  /// Hash-combines the length of the string followed by each character.
  void combine(llvm::StringRef arg) {
    return combine_range(arg.begin(), arg.end());
  }

  /// Hash-combines a value of any type that provides
  /// \c StableHasher::Combiner<T>::combine.
  template <typename T,
            decltype(StableHasher::Combiner<T>::combine) * = nullptr>
  void combine(const T &val) {
    return StableHasher::Combiner<T>::combine(*this, val);
  }

  /// Hash-combines the number of elements in [first, last) followed by each
  /// element in iteration order.
  template <typename ValueT> void combine_range(ValueT first, ValueT last) {
    // The iterator difference type differs in width between hosts; hash the
    // count as a fixed-width integer so the result does not depend on it.
    combine(static_cast<uint64_t>(std::distance(first, last)));
    for (; first != last; ++first) {
      combine(*first);
    }
  }

  /// Hash-combines every element of the tuple, in order.
  template <typename... Ts> void combine(const std::tuple<Ts...> &arg) {
    return combine_tuple(arg, std::index_sequence_for<Ts...>{});
  }

private:
  // Expands the tuple into its elements. The indices are std::size_t so that
  // they can be deduced from std::index_sequence.
  template <typename... Ts, std::size_t... Indices>
  void combine_tuple(const std::tuple<Ts...> &arg,
                     std::index_sequence<Indices...>) {
    return combine_many(std::get<Indices>(arg)...);
  }

  // base case.
  void combine_many() {}

  // recursive case
  template <typename T, typename... Ts>
  void combine_many(const T &arg, const Ts &... args) {
    combine(arg);
    return combine_many(args...);
  }

private:
  /// Return the number of bytes in the inline buffer.
  uint64_t getBufferLength() const { return lengthAndByteCount >> 56; }
  /// Set the number of bytes in the inline buffer, leaving the digest length
  /// stored in the low 56 bits untouched.
  void setBufferLength(uint64_t newLen) {
    lengthAndByteCount = getDigestLength() | (newLen << 56);
  }

  /// Return the number of bytes that have been hash-combined so far.
  uint64_t getDigestLength() const {
    return lengthAndByteCount & ~(uint64_t(0xFF) << 56);
  }

  /// Folds one full 64-bit word into the hash state. Defined out of line.
  void compress(uint64_t value);
};

} // namespace swift

#endif // SWIFT_BASIC_STABLEHASHER_H
2 changes: 1 addition & 1 deletion include/swift/Parse/Parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class Parser {

/// The current token hash, or \c None if the parser isn't computing a hash
/// for the token stream.
Optional<llvm::MD5> CurrentTokenHash;
Optional<StableHasher> CurrentTokenHash;

void recordTokenHash(const Token Tok) {
if (!Tok.getText().empty())
Expand Down
19 changes: 8 additions & 11 deletions lib/AST/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1092,22 +1092,21 @@ Fingerprint SourceFile::getInterfaceHash() const {
assert(hasInterfaceHash() && "Interface hash not enabled");
auto &eval = getASTContext().evaluator;
auto *mutableThis = const_cast<SourceFile *>(this);
auto md5 = *evaluateOrDefault(eval, ParseSourceFileRequest{mutableThis}, {})
.InterfaceHash;
llvm::MD5::MD5Result result;
md5.final(result);
return Fingerprint{std::move(result)};
Optional<StableHasher> interfaceHasher =
evaluateOrDefault(eval, ParseSourceFileRequest{mutableThis}, {})
.InterfaceHasher;
return Fingerprint{StableHasher{interfaceHasher.getValue()}.finalize()};
}

Fingerprint SourceFile::getInterfaceHashIncludingTypeMembers() const {
/// FIXME: Gross. Hashing multiple "hash" values.
llvm::MD5 hash;
hash.update(getInterfaceHash().getRawValue());
auto hash = StableHasher::defaultHasher();
hash.combine(getInterfaceHash());

std::function<void(IterableDeclContext *)> hashTypeBodyFingerprints =
[&](IterableDeclContext *IDC) {
if (auto fp = IDC->getBodyFingerprint())
hash.update(fp->getRawValue());
hash.combine(*fp);
for (auto *member : IDC->getParsedMembers())
if (auto *childIDC = dyn_cast<IterableDeclContext>(member))
hashTypeBodyFingerprints(childIDC);
Expand All @@ -1118,9 +1117,7 @@ Fingerprint SourceFile::getInterfaceHashIncludingTypeMembers() const {
hashTypeBodyFingerprints(IDC);
}

llvm::MD5::MD5Result result;
hash.final(result);
return Fingerprint{std::move(result)};
return Fingerprint{std::move(hash)};
}

syntax::SourceFileSyntax SourceFile::getSyntaxRoot() const {
Expand Down
1 change: 1 addition & 0 deletions lib/Basic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ add_swift_host_library(swiftBasic STATIC
Program.cpp
QuotedString.cpp
SourceLoc.cpp
StableHasher.cpp
Statistic.cpp
StringExtras.cpp
TaskQueue.cpp
Expand Down
Loading