Skip to content

Commit f97d13d

Browse files
authored
Merge pull request #18889 from ahoppen/bytetree-forward-compatible
[libSyntax] Make the ByteTree format forwards-compatible
2 parents 99d0147 + 34a89d4 commit f97d13d

File tree

6 files changed

+125
-27
lines changed

6 files changed

+125
-27
lines changed

docs/ByteTree.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,26 @@ The ByteTree format consists of two different constructs: *objects* and *scalars
77

88
## Serialization of scalars
99

10-
A scalar is encoded as its size followed by the data. Size is a `uint_32` that represents the size of the data in bytes in little endian order.
10+
A scalar is encoded as its size followed by the data. Size is a `uint_32` that represents the size of the data in bytes in little endian order. It always has its most significant bit set to 0 (to distinguish objects from scalars, see *Forwards compatibility*).
1111

1212
For example, the string "Hello World" would be encoded as `(uint32_t)11` `"Hello World"`, or in hex `0B 00 00 00 48 65 6C 6C 6F 20 57 6F 72 6C 64`.
1313

1414
## Serialization of objects
1515

1616
An object consists of its size, measured in the number of fields and represented as a `uint_32t` in little endian order, followed by the direct concatenation of its fields. Because each field is again prefixed with its size, no delimites are necessary in between the fields.
1717

18+
To distinguish scalars and objects, the size of objects has its most-siginificant bit set to 1. It must be ignored to retrieve the number of fields in the object.
19+
1820
Arrays are modelled as objects whose fields are all of the same type and whose length is variadic (and is indicated by the object's size).
1921

2022
## Versioning
2123

2224
The ByteTree format is prepended by a 4-byte protocol version number that describes the version of the object tree that was serialized. Its exact semantics are up to each specific application, but it is encouraged to interpret it as a two-comentent number where the first component, consisting of the first three bytes, is incremented for breaking changes and the last byte is incremented for backwards-compatible changes.
2325

26+
## Forward compatilibity
27+
28+
Fields may be added to the end of objects in a backwards compatible manner (older deserialisers will still be able to deserialise the format). It does so by skipping over all fields that are not read during deserialisation. Newer versions of the deserialiser can detect if recently added fields are not present in the serialised data by inspecting the `numFields` property passed during deserialisation.
29+
2430
## Serialization safety
2531

2632
Since all fields in objects are accessed by their index, issues quickly arise if a new field is accidentally added at the beginning of an object. To prevent issues like this, the ByteTree serialiser and deserialiser requires the explicit specification of each field's index within the object. These indicies are never serialised. Their sole purpose is to check that all fields are read in the correct order in assertion builds.

include/swift/Basic/ByteTreeSerialization.h

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,15 @@ class ByteTreeWriter {
167167
"NumFields may not be reset since it has already been written to "
168168
"the byte stream");
169169
assert((this->NumFields == UINT_MAX) && "NumFields has already been set");
170-
171-
auto Error = StreamWriter.writeInteger(NumFields);
170+
// Num fields cannot exceed (1 << 31) since it would otherwise interfere
171+
// with the bitflag that indicates if the next construct in the tree is an
172+
// object or a scalar.
173+
assert((NumFields & ((uint32_t)1 << 31)) == 0 && "Field size too large");
174+
175+
// Set the most significant bit to indicate that the next construct is an
176+
// object and not a scalar.
177+
uint32_t ToWrite = NumFields | (1 << 31);
178+
auto Error = StreamWriter.writeInteger(ToWrite);
172179
(void)Error;
173180
assert(!Error);
174181

@@ -229,6 +236,10 @@ class ByteTreeWriter {
229236
validateAndIncreaseFieldIndex(Index);
230237

231238
uint32_t ValueSize = ScalarTraits<T>::size(Value);
239+
// Size cannot exceed (1 << 31) since it would otherwise interfere with the
240+
// bitflag that indicates if the next construct in the tree is an object
241+
// or a scalar.
242+
assert((ValueSize & ((uint32_t)1 << 31)) == 0 && "Value size too large");
232243
auto SizeError = StreamWriter.writeInteger(ValueSize);
233244
(void)SizeError;
234245
assert(!SizeError);
@@ -301,13 +312,16 @@ struct ScalarTraits<llvm::StringRef> {
301312
};
302313

303314
template <>
304-
struct ScalarTraits<llvm::NoneType> {
305-
// Serialize llvm::None as a value with 0 length
306-
static unsigned size(const llvm::NoneType &None) { return 0; }
307-
static llvm::Error write(llvm::BinaryStreamWriter &Writer,
308-
const llvm::NoneType &None) {
315+
struct ObjectTraits<llvm::NoneType> {
316+
// Serialize llvm::None as an object without any elements
317+
static unsigned numFields(const llvm::NoneType &Object,
318+
UserInfoMap &UserInfo) {
319+
return 0;
320+
}
321+
322+
static void write(ByteTreeWriter &Writer, const llvm::NoneType &Object,
323+
UserInfoMap &UserInfo) {
309324
// Nothing to write
310-
return llvm::ErrorSuccess();
311325
}
312326
};
313327

include/swift/Syntax/Serialization/SyntaxSerialization.h

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,11 @@ namespace byteTree {
208208
/// shall be omitted when the syntax tree gets serialized.
209209
static void *UserInfoKeyReusedNodeIds = &UserInfoKeyReusedNodeIds;
210210

211+
/// The key for a ByteTree serializion user info interpreted as `bool`.
212+
/// If specified, additional fields will be added to objects in the ByteTree
213+
/// to test forward compatibility.
214+
static void *UserInfoKeyAddInvalidFields = &UserInfoKeyAddInvalidFields;
215+
211216
template <>
212217
struct WrapperTypeTraits<tok> {
213218
static uint8_t numericValue(const tok &Value);
@@ -302,13 +307,21 @@ struct ObjectTraits<syntax::RawSyntax> {
302307

303308
static unsigned numFields(const syntax::RawSyntax &Syntax,
304309
UserInfoMap &UserInfo) {
305-
switch (nodeKind(Syntax, UserInfo)) {
306-
case Token:
307-
return 6;
308-
case Layout:
309-
return 5;
310-
case Omitted:
311-
return 2;
310+
// FIXME: We know this is never set in production builds. Should we
311+
// disable this code altogether in that case
312+
// (e.g. if assertions are not enabled?)
313+
if (UserInfo[UserInfoKeyAddInvalidFields]) {
314+
switch (nodeKind(Syntax, UserInfo)) {
315+
case Token: return 7;
316+
case Layout: return 6;
317+
case Omitted: return 2;
318+
}
319+
} else {
320+
switch (nodeKind(Syntax, UserInfo)) {
321+
case Token: return 6;
322+
case Layout: return 5;
323+
case Omitted: return 2;
324+
}
312325
}
313326
}
314327

@@ -326,11 +339,28 @@ struct ObjectTraits<syntax::RawSyntax> {
326339
/*Index=*/3);
327340
Writer.write(Syntax.getLeadingTrivia(), /*Index=*/4);
328341
Writer.write(Syntax.getTrailingTrivia(), /*Index=*/5);
342+
// FIXME: We know this is never set in production builds. Should we
343+
// disable this code altogether in that case
344+
// (e.g. if assertions are not enabled?)
345+
if (UserInfo[UserInfoKeyAddInvalidFields]) {
346+
// Test adding a new scalar field
347+
StringRef Str = "invalid forward compatible field";
348+
Writer.write(Str, /*Index=*/6);
349+
}
329350
break;
330351
case Layout:
331352
Writer.write(Syntax.getPresence(), /*Index=*/2);
332353
Writer.write(Syntax.getKind(), /*Index=*/3);
333354
Writer.write(Syntax.getLayout(), /*Index=*/4);
355+
// FIXME: We know this is never set in production builds. Should we
356+
// disable this code altogether in that case
357+
// (e.g. if assertions are not enabled?)
358+
if (UserInfo[UserInfoKeyAddInvalidFields]) {
359+
// Test adding a new object
360+
auto Piece = syntax::TriviaPiece::spaces(2);
361+
ArrayRef<syntax::TriviaPiece> SomeTrivia(Piece);
362+
Writer.write(SomeTrivia, /*Index=*/5);
363+
}
334364
break;
335365
case Omitted:
336366
// Nothing more to write

test/Syntax/round_trip_function.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
1+
// We need to require macOS because swiftSyntax currently doesn't build on Linux
2+
// REQUIRES: OS=macosx
3+
4+
// RUN: %empty-directory(%t)
15
// RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %s
26

7+
// RUN: %swift-syntax-test --serialize-raw-tree --serialize-byte-tree --input-source-filename %s --output-filename %t/tree.bin
8+
// RUN: %swift-swiftsyntax-test -deserialize -serialization-format byteTree -pre-edit-tree %t/tree.bin -out %t/afterRoundtrip.swift
9+
// RUN: diff -u %t/afterRoundtrip.swift %s
10+
11+
// RUN: %swift-syntax-test --serialize-raw-tree --serialize-byte-tree --input-source-filename %s --output-filename %t/tree_with_additional_fields.bin --add-bytetree-fields
12+
// RUN: %swift-swiftsyntax-test -deserialize -serialization-format byteTree -pre-edit-tree %t/tree_with_additional_fields.bin -out %t/afterRoundtripWithAdditionalFields.swift
13+
// RUN: diff -u %t/afterRoundtripWithAdditionalFields.swift %s
14+
315
func noArgs() {}
416
func oneArg(x: Int) {}
517
func oneUnlabeledArg(_ x: Int) {}

tools/SwiftSyntax/ByteTreeDeserialization.swift

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -215,12 +215,30 @@ class ByteTreeReader {
215215
return result
216216
}
217217

218-
/// Read the number of fields in an object or the binary length of a scalar
219-
/// field.
218+
private func readFieldLength() -> (isObject: Bool, length: Int) {
219+
let raw = UInt32(littleEndian: readRaw(UInt32.self))
220+
let isObject = (raw & (UInt32(1) << 31)) != 0
221+
let length = Int(raw & ~(UInt32(1) << 31))
222+
return (isObject, length)
223+
}
224+
225+
226+
/// Read the number of fields in an object.
220227
///
221-
/// - Returns: The read value
222-
private func readFieldLength() -> Int {
223-
return Int(UInt32(littleEndian: readRaw(UInt32.self)))
228+
/// - Returns: The number of fields in the following object
229+
private func readObjectLength() -> Int {
230+
let (isObject, length) = readFieldLength()
231+
assert(isObject)
232+
return length
233+
}
234+
235+
/// Read the size of a scalar in bytes
236+
///
237+
/// - Returns: The size of the following scalar in bytes
238+
private func readScalarLength() -> Int {
239+
let (isObject, length) = readFieldLength()
240+
assert(!isObject)
241+
return length
224242
}
225243

226244
/// Read the protocol version and validate that it can be read using the given
@@ -246,7 +264,7 @@ class ByteTreeReader {
246264
fileprivate func read<T: ByteTreeObjectDecodable>(
247265
_ objectType: T.Type
248266
) -> T {
249-
let numFields = readFieldLength()
267+
let numFields = readObjectLength()
250268
let objectReader = ByteTreeObjectReader(reader: self,
251269
numFields: numFields)
252270
return T.read(from: objectReader, numFields: numFields, userInfo: userInfo)
@@ -259,7 +277,7 @@ class ByteTreeReader {
259277
fileprivate func read<T: ByteTreeScalarDecodable>(
260278
_ scalarType: T.Type
261279
) -> T {
262-
let fieldSize = readFieldLength()
280+
let fieldSize = readScalarLength()
263281
defer {
264282
pointer = pointer.advanced(by: fieldSize)
265283
}
@@ -268,10 +286,16 @@ class ByteTreeReader {
268286

269287
/// Discard the next scalar field, advancing the pointer to the next field
270288
fileprivate func discardField() {
271-
// FIXME: This can currently only discard scalar fields. Object fields
272-
// should also be discardable
273-
let fieldSize = readFieldLength()
274-
pointer = pointer.advanced(by: fieldSize)
289+
let (isObject, length) = readFieldLength()
290+
if isObject {
291+
// Discard object by discarding all its objects
292+
for _ in 0..<length {
293+
discardField()
294+
}
295+
} else {
296+
// Discard scalar
297+
pointer = pointer.advanced(by: length)
298+
}
275299
}
276300
}
277301

tools/swift-syntax-test/swift-syntax-test.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,15 @@ SerializeAsByteTree("serialize-byte-tree",
156156
"serialized in the ByteTree format instead "
157157
"of JSON."));
158158

159+
static llvm::cl::opt<bool>
160+
AddByteTreeFields("add-bytetree-fields",
161+
llvm::cl::desc("If specified, further fields will be added "
162+
"to the syntax tree if it is serialized as a "
163+
"ByteTree. This is to test forward "
164+
"compatibility with future versions of "
165+
"SwiftSyntax that might add more fields to "
166+
"syntax nodes."));
167+
159168
static llvm::cl::opt<bool>
160169
IncrementalSerialization("incremental-serialization",
161170
llvm::cl::desc("If specified, the serialized syntax "
@@ -724,6 +733,9 @@ int doSerializeRawTree(const char *MainExecutablePath,
724733
llvm::BinaryStreamWriter Writer(Stream);
725734
std::map<void *, void *> UserInfo;
726735
UserInfo[swift::byteTree::UserInfoKeyReusedNodeIds] = &ReusedNodeIds;
736+
if (options::AddByteTreeFields) {
737+
UserInfo[swift::byteTree::UserInfoKeyAddInvalidFields] = (void *)true;
738+
}
727739
swift::byteTree::ByteTreeWriter::write(/*ProtocolVersion=*/1, Writer,
728740
*Root, UserInfo);
729741
auto OutputBufferOrError = llvm::FileOutputBuffer::create(

0 commit comments

Comments
 (0)