Skip to content

Commit dbb48a0

Browse files
[CAS] Add implementation for current OnDisk CAS + abstractions
Add the current downstream CAS API and implementation, which includes the OnDiskCAS implementation, different levels of abstraction for CAS, and various utilities.
1 parent 6c7d348 commit dbb48a0

35 files changed

+7139
-25
lines changed

llvm/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,13 @@ option (LLVM_ENABLE_SPHINX "Use Sphinx to generate llvm documentation." OFF)
758758
option (LLVM_ENABLE_OCAMLDOC "Build OCaml bindings documentation." ON)
759759
option (LLVM_ENABLE_BINDINGS "Build bindings." ON)
760760

761+
# Pick the default for LLVM_ENABLE_ONDISK_CAS: ON only when targeting UNIX with
# pointers of at least 8 bytes (CMAKE_SIZEOF_VOID_P >= 8), OFF otherwise. The
# user can still override the default through the option() below.
if(UNIX AND CMAKE_SIZEOF_VOID_P GREATER_EQUAL 8)
762+
set(LLVM_ENABLE_ONDISK_CAS_default ON)
763+
else()
764+
set(LLVM_ENABLE_ONDISK_CAS_default OFF)
765+
endif()
766+
option(LLVM_ENABLE_ONDISK_CAS "Build OnDiskCAS." ${LLVM_ENABLE_ONDISK_CAS_default})
767+
761768
set(LLVM_INSTALL_DOXYGEN_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/doxygen-html"
762769
CACHE STRING "Doxygen-generated HTML documentation install directory")
763770
set(LLVM_INSTALL_OCAMLDOC_HTML_DIR "${CMAKE_INSTALL_DOCDIR}/llvm/ocaml-html"

llvm/include/llvm/CAS/ActionCache.h

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
//===- llvm/CAS/ActionCache.h -----------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_CASACTIONCACHE_H
10+
#define LLVM_CAS_CASACTIONCACHE_H
11+
12+
#include "llvm/ADT/FunctionExtras.h"
13+
#include "llvm/ADT/StringRef.h"
14+
#include "llvm/CAS/CASID.h"
15+
#include "llvm/CAS/CASReference.h"
16+
#include "llvm/Support/Error.h"
17+
#include <future>
18+
19+
namespace llvm::cas {
20+
21+
class ObjectStore;
22+
class CASID;
23+
class ObjectProxy;
24+
25+
/// A key for caching an operation.
26+
/// It is implemented as a bag of bytes and provides a convenient constructor
27+
/// for CAS types.
28+
class CacheKey {
29+
public:
30+
/// Return a view of the stored key bytes.
StringRef getKey() const { return Key; }
31+
32+
// TODO: Support a CacheKey built from any array of bytes, not just a CASID.
33+
// To do that, ActionCache needs to be able to rehash the key into the index,
34+
// so that a `getOrCompute` method can be used to avoid multiple calls to
35+
// the hash function.
36+
CacheKey(const CASID &ID);
37+
CacheKey(const ObjectProxy &Proxy);
38+
CacheKey(const ObjectStore &CAS, const ObjectRef &Ref);
39+
40+
private:
41+
/// Owned storage for the key bytes exposed by getKey().
std::string Key;
42+
};
43+
44+
/// Value type carried by the \c std::future that \c ActionCache::getFuture
/// returns.
using AsyncCASIDValue = AsyncValue<CASID>;
45+
46+
/// This is used to workaround the issue of MSVC needing default-constructible
47+
/// types for \c std::promise/future.
48+
struct AsyncErrorValue {
49+
/// Move the wrapped \c Error out to the caller.
Error take() { return std::move(Value); }
50+
51+
/// Default-construct holding \c Error::success().
AsyncErrorValue() : Value(Error::success()) {}
52+
/// Wrap an existing \c Error by moving it into this object.
AsyncErrorValue(Error &&E) : Value(std::move(E)) {}
53+
54+
private:
55+
Error Value;
56+
};
57+
58+
/// A cache from a key describing an action to the result of doing it.
59+
///
60+
/// Actions are expected to be pure (collision is an error).
61+
class ActionCache {
62+
// Out-of-line virtual method; presumably anchors the vtable to a single
// translation unit per the usual LLVM convention (confirm in the .cpp).
virtual void anchor();
63+
64+
public:
65+
/// Get a previously computed result for \p ActionKey.
66+
///
67+
/// \param Globally if true it is a hint to the underlying implementation that
68+
/// the lookup is profitable to be done on a distributed caching level, not
69+
/// just locally. The implementation is free to ignore this flag.
70+
Expected<std::optional<CASID>> get(const CacheKey &ActionKey,
71+
bool Globally = false) const {
72+
return getImpl(arrayRefFromStringRef(ActionKey.getKey()), Globally);
73+
}
74+
75+
/// Asynchronous version of \c get that hands the result back through a
/// \c std::future.
76+
std::future<AsyncCASIDValue> getFuture(const CacheKey &ActionKey,
77+
bool Globally = false) const;
78+
79+
/// Asynchronous version of \c get that forwards the key bytes and
/// \p Callback to \c getImplAsync.
80+
void getAsync(
81+
const CacheKey &ActionKey, bool Globally,
82+
unique_function<void(Expected<std::optional<CASID>>)> Callback) const {
83+
return getImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Globally,
84+
std::move(Callback));
85+
}
86+
87+
/// Cache \p Result for the \p ActionKey computation.
88+
///
89+
/// \param Globally if true it is a hint to the underlying implementation that
90+
/// the association is profitable to be done on a distributed caching level,
91+
/// not just locally. The implementation is free to ignore this flag.
92+
Error put(const CacheKey &ActionKey, const CASID &Result,
93+
bool Globally = false) {
94+
// The result's context must use the same hash schema as this cache's context.
assert(Result.getContext().getHashSchemaIdentifier() ==
95+
getContext().getHashSchemaIdentifier() &&
96+
"Hash schema mismatch");
97+
return putImpl(arrayRefFromStringRef(ActionKey.getKey()), Result, Globally);
98+
}
99+
100+
/// Asynchronous version of \c put that reports completion through a
/// \c std::future.
101+
std::future<AsyncErrorValue> putFuture(const CacheKey &ActionKey,
102+
const CASID &Result,
103+
bool Globally = false);
104+
105+
/// Asynchronous version of \c put that forwards the key bytes, \p Result and
/// \p Callback to \c putImplAsync.
106+
void putAsync(const CacheKey &ActionKey, const CASID &Result, bool Globally,
107+
unique_function<void(Error)> Callback) {
108+
// Same schema check as the synchronous \c put.
assert(Result.getContext().getHashSchemaIdentifier() ==
109+
getContext().getHashSchemaIdentifier() &&
110+
"Hash schema mismatch");
111+
return putImplAsync(arrayRefFromStringRef(ActionKey.getKey()), Result,
112+
Globally, std::move(Callback));
113+
}
114+
115+
virtual ~ActionCache() = default;
116+
117+
protected:
118+
// Implementation hooks. getImpl and putImpl are pure virtual and must be
// provided by subclasses; the async variants are virtual but not pure, and
// their bodies are not in this header.
virtual Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ResolvedKey,
119+
bool Globally) const = 0;
120+
virtual void getImplAsync(
121+
ArrayRef<uint8_t> ResolvedKey, bool Globally,
122+
unique_function<void(Expected<std::optional<CASID>>)> Callback) const;
123+
124+
virtual Error putImpl(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
125+
bool Globally) = 0;
126+
virtual void putImplAsync(ArrayRef<uint8_t> ResolvedKey, const CASID &Result,
127+
bool Globally,
128+
unique_function<void(Error)> Callback);
129+
130+
/// Bind the cache to \p Context. Only a reference is stored, so \p Context
/// has to outlive this object.
ActionCache(const CASContext &Context) : Context(Context) {}
131+
132+
/// The context this cache was constructed with.
const CASContext &getContext() const { return Context; }
133+
134+
private:
135+
const CASContext &Context;
136+
};
137+
138+
/// Create an action cache in memory.
139+
std::unique_ptr<ActionCache> createInMemoryActionCache();
140+
141+
/// Get a reasonable default on-disk path for a persistent ActionCache for the
142+
/// current user.
143+
std::string getDefaultOnDiskActionCachePath();
144+
145+
/// Create an action cache on disk.
146+
Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path);
147+
} // end namespace llvm::cas
148+
149+
#endif // LLVM_CAS_CASACTIONCACHE_H
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
//===- BuiltinCASContext.h --------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_BUILTINCASCONTEXT_H
10+
#define LLVM_CAS_BUILTINCASCONTEXT_H
11+
12+
#include "llvm/CAS/CASID.h"
13+
#include "llvm/Support/BLAKE3.h"
14+
#include "llvm/Support/Error.h"
15+
16+
namespace llvm::cas::builtin {
17+
18+
/// Current hash type for the builtin CAS.
19+
///
20+
/// FIXME: This should be configurable via an enum to allow configuring the hash
21+
/// function. The enum should be sent into \a createInMemoryCAS() and \a
22+
/// createOnDiskCAS().
23+
///
24+
/// This is important (at least) for future-proofing, when we want to make new
25+
/// CAS instances use BLAKE7, but still know how to read/write BLAKE3.
26+
///
27+
/// Even just for BLAKE3, it would be useful to have these values:
28+
///
29+
/// BLAKE3 => 32B hash from BLAKE3
30+
/// BLAKE3_16B => 16B hash from BLAKE3 (truncated)
31+
///
32+
/// ... where BLAKE3_16B uses \a TruncatedBLAKE3<16>.
33+
///
34+
/// Motivation for a truncated hash is that it's cheaper to store. It's not
35+
/// clear if we always (or ever) need the full 32B, and for an ephemeral
36+
/// in-memory CAS, we almost certainly don't need it.
37+
///
38+
/// Note that the cost is linear in the number of objects for the builtin CAS,
39+
/// since we're using internal offsets and/or pointers as an optimization.
40+
///
41+
/// However, it's possible we'll want to hook up a local builtin CAS to, e.g.,
42+
/// a distributed generic hash map to use as an ActionCache. In that scenario,
43+
/// the transitive closure of the structured objects that are the results of
44+
/// the cached actions would need to be serialized into the map, something
45+
/// like:
46+
///
47+
/// "action:<schema>:<key>" -> "0123"
48+
/// "object:<schema>:0123" -> "3,4567,89AB,CDEF,9,some data"
49+
/// "object:<schema>:4567" -> ...
50+
/// "object:<schema>:89AB" -> ...
51+
/// "object:<schema>:CDEF" -> ...
52+
///
53+
/// These references would be full cost.
54+
using HasherT = BLAKE3;
55+
/// Digest type that \c HasherT::hash returns for an \c ArrayRef<uint8_t>.
using HashType = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
56+
57+
/// \c CASContext for the builtin CAS. Its hash schema identifier is derived
/// from the hash name returned by \c getHashName().
class BuiltinCASContext : public CASContext {
58+
void printIDImpl(raw_ostream &OS, const CASID &ID) const final;
59+
void anchor() override;
60+
61+
public:
62+
/// Get the name of the hash for any table identifiers.
63+
///
64+
/// FIXME: This should be configurable via an enum, with at least the following
65+
/// values:
66+
///
67+
/// "BLAKE3" => 32B hash from BLAKE3
68+
/// "BLAKE3.16" => 16B hash from BLAKE3 (truncated)
69+
///
70+
/// Enum can be sent into \a createInMemoryCAS() and \a createOnDiskCAS().
71+
static StringRef getHashName() { return "BLAKE3"; }
72+
/// Return "llvm.cas.builtin.v2[" + getHashName() + "]". The string is built
/// once and kept in a function-local static, so the returned \c StringRef
/// refers to storage that outlives the call.
StringRef getHashSchemaIdentifier() const final {
73+
static const std::string ID =
74+
("llvm.cas.builtin.v2[" + getHashName() + "]").str();
75+
return ID;
76+
}
77+
78+
static const BuiltinCASContext &getDefaultContext();
79+
80+
BuiltinCASContext() = default;
81+
82+
/// Convert a printed digest back into a \c HashType; failure is reported
/// through the \c Expected return value.
static Expected<HashType> parseID(StringRef PrintedDigest);
83+
/// Print \p Digest to \p OS.
static void printID(ArrayRef<uint8_t> Digest, raw_ostream &OS);
84+
};
85+
86+
} // namespace llvm::cas::builtin
87+
88+
#endif // LLVM_CAS_BUILTINCASCONTEXT_H
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
//===- BuiltinObjectHasher.h ------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_BUILTINOBJECTHASHER_H
10+
#define LLVM_CAS_BUILTINOBJECTHASHER_H
11+
12+
#include "llvm/ADT/StringRef.h"
13+
#include "llvm/CAS/ObjectStore.h"
14+
#include "llvm/Support/Endian.h"
15+
16+
namespace llvm::cas {
17+
18+
/// Computes the hash of an object made of a list of references plus a data
/// blob, using \p HasherT as the underlying hash function. Instances are only
/// created inside the static \c hashObject helpers (the constructor is
/// private).
template <class HasherT> class BuiltinObjectHasher {
19+
public:
20+
/// Digest type that \c HasherT::hash returns for an \c ArrayRef<uint8_t>.
using HashT = decltype(HasherT::hash(std::declval<ArrayRef<uint8_t> &>()));
21+
22+
/// Hash an object whose references are \c ObjectRef values belonging to
/// \p CAS. Feeds, in order: the number of refs, the ID hash of each ref
/// (looked up through \p CAS), then the size-prefixed \p Data.
static HashT hashObject(const ObjectStore &CAS, ArrayRef<ObjectRef> Refs,
23+
ArrayRef<char> Data) {
24+
BuiltinObjectHasher H;
25+
H.updateSize(Refs.size());
26+
for (const ObjectRef &Ref : Refs)
27+
H.updateRef(CAS, Ref);
28+
H.updateArray(Data);
29+
return H.finish();
30+
}
31+
32+
/// Same as above, but each reference is already given as its raw hash bytes,
/// so no \c ObjectStore lookup is needed.
static HashT hashObject(ArrayRef<ArrayRef<uint8_t>> Refs,
33+
ArrayRef<char> Data) {
34+
BuiltinObjectHasher H;
35+
H.updateSize(Refs.size());
36+
for (const ArrayRef<uint8_t> &Ref : Refs)
37+
H.updateID(Ref);
38+
H.updateArray(Data);
39+
return H.finish();
40+
}
41+
42+
private:
43+
/// Finalize the underlying hasher and return the digest.
HashT finish() { return Hasher.final(); }
44+
45+
/// Resolve \p Ref to its ID via \p CAS and hash that ID.
void updateRef(const ObjectStore &CAS, ObjectRef Ref) {
46+
updateID(CAS.getID(Ref));
47+
}
48+
49+
/// Hash the raw hash bytes of \p ID.
void updateID(const CASID &ID) { updateID(ID.getHash()); }
50+
51+
/// Hash the raw bytes of a reference's hash, without a length prefix.
void updateID(ArrayRef<uint8_t> Hash) {
52+
// NOTE: Does not hash the size of the hash. That's a CAS implementation
53+
// detail that shouldn't leak into the UUID for an object.
54+
assert(Hash.size() == sizeof(HashT) &&
55+
"Expected object ref to match the hash size");
56+
Hasher.update(Hash);
57+
}
58+
59+
/// Hash the byte count of \p Bytes followed by the bytes themselves.
void updateArray(ArrayRef<uint8_t> Bytes) {
60+
updateSize(Bytes.size());
61+
Hasher.update(Bytes);
62+
}
63+
64+
/// \c char overload: reinterprets the buffer as \c uint8_t and forwards to
/// the overload above.
void updateArray(ArrayRef<char> Bytes) {
65+
updateArray(ArrayRef(reinterpret_cast<const uint8_t *>(Bytes.data()),
66+
Bytes.size()));
67+
}
68+
69+
/// Hash \p Size as \c sizeof(uint64_t) raw bytes after passing it through
/// \c byte_swap with \c endianness::little -- presumably so the digest does
/// not depend on host byte order (confirm against Endian.h).
void updateSize(uint64_t Size) {
70+
Size = support::endian::byte_swap(Size, support::endianness::little);
71+
Hasher.update(
72+
ArrayRef(reinterpret_cast<const uint8_t *>(&Size), sizeof(Size)));
73+
}
74+
75+
BuiltinObjectHasher() = default;
76+
~BuiltinObjectHasher() = default;
77+
HasherT Hasher;
78+
};
79+
80+
} // namespace llvm::cas
81+
82+
#endif // LLVM_CAS_BUILTINOBJECTHASHER_H
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//===- BuiltinUnifiedCASDatabases.h -----------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
10+
#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
11+
12+
#include "llvm/Support/Error.h"
13+
14+
namespace llvm::cas {
15+
16+
class ActionCache;
17+
class ObjectStore;
18+
19+
/// Create on-disk \c ObjectStore and \c ActionCache instances based on
20+
/// \c ondisk::UnifiedOnDiskCache, with built-in hashing.
21+
Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
22+
createOnDiskUnifiedCASDatabases(StringRef Path);
23+
24+
} // namespace llvm::cas
25+
26+
#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H

0 commit comments

Comments
 (0)