swiftlang · ahoppen · Aug 24, 2018 · Jul 27, 2018 · rintaro · Aug 23, 2018
diff --git a/docs/ByteTree.md b/docs/ByteTree.md
@@ -7,20 +7,26 @@ The ByteTree format consists of two different constructs: *objects* and *scalars
 
 ## Serialization of scalars
 
-A scalar is encoded as its size followed by the data. Size is a `uint_32` that represents the size of the data in bytes in little endian order.
+A scalar is encoded as its size followed by the data. Size is a `uint_32` that represents the size of the data in bytes in little endian order. It always has its most significant bit set to 0 (to distinguish objects from scalars, see *Forwards compatibility*).
 
 For example, the string "Hello World" would be encoded as `(uint32_t)11` `"Hello World"`, or in hex `0B 00 00 00   48 65 6C 6C 6F 20 57 6F 72 6C 64`.
 
 ## Serialization of objects
 
 An object consists of its size, measured in the number of fields and represented as a `uint_32t` in little endian order, followed by the direct concatenation of its fields. Because each field is again prefixed with its size, no delimites are necessary in between the fields.
 
+To distinguish scalars and objects, the size of objects has its most-siginificant bit set to 1. It must be ignored to retrieve the number of fields in the object.
+
 Arrays are modelled as objects whose fields are all of the same type and whose length is variadic (and is indicated by the object's size).
 
 ## Versioning
 
 The ByteTree format is prepended by a 4-byte protocol version number that describes the version of the object tree that was serialized. Its exact semantics are up to each specific application, but it is encouraged to interpret it as a two-comentent number where the first component, consisting of the first three bytes, is incremented for breaking changes and the last byte is incremented for backwards-compatible changes.
 
+## Forward compatilibity
+
+Fields may be added to the end of objects in a backwards compatible manner (older deserialisers will still be able to deserialise the format). It does so by skipping over all fields that are not read during deserialisation. Newer versions of the deserialiser can detect if recently added fields are not present in the serialised data by inspecting the `numFields` property passed during deserialisation.
+
 ## Serialization safety
 
 Since all fields in objects are accessed by their index, issues quickly arise if a new field is accidentally added at the beginning of an object. To prevent issues like this, the ByteTree serialiser and deserialiser requires the explicit specification of each field's index within the object. These indicies are never serialised. Their sole purpose is to check that all fields are read in the correct order in assertion builds.
diff --git a/include/swift/Basic/ByteTreeSerialization.h b/include/swift/Basic/ByteTreeSerialization.h
@@ -167,8 +167,15 @@ class ByteTreeWriter {
            "NumFields may not be reset since it has already been written to "
            "the byte stream");
     assert((this->NumFields == UINT_MAX) && "NumFields has already been set");
-
-    auto Error = StreamWriter.writeInteger(NumFields);
+    // Num fields cannot exceed (1 << 31) since it would otherwise interfere
+    // with the bitflag that indicates if the next construct in the tree is an
+    // object or a scalar.
+    assert((NumFields & ((uint32_t)1 << 31)) == 0 && "Field size too large");
+
+    // Set the most significant bit to indicate that the next construct is an
+    // object and not a scalar.
+    uint32_t ToWrite = NumFields | (1 << 31);
+    auto Error = StreamWriter.writeInteger(ToWrite);
     (void)Error;
     assert(!Error);
 
@@ -229,6 +236,10 @@ class ByteTreeWriter {
     validateAndIncreaseFieldIndex(Index);
 
     uint32_t ValueSize = ScalarTraits<T>::size(Value);
+    // Size cannot exceed (1 << 31) since it would otherwise interfere with the
+    // bitflag that indicates if the next construct in the tree is an object
+    // or a scalar.
+    assert((ValueSize & ((uint32_t)1 << 31)) == 0 && "Value size too large");
     auto SizeError = StreamWriter.writeInteger(ValueSize);
     (void)SizeError;
     assert(!SizeError);
@@ -301,13 +312,16 @@ struct ScalarTraits<llvm::StringRef> {
 };
 
 template <>
-struct ScalarTraits<llvm::NoneType> {
-  // Serialize llvm::None as a value with 0 length
-  static unsigned size(const llvm::NoneType &None) { return 0; }
-  static llvm::Error write(llvm::BinaryStreamWriter &Writer,
-                           const llvm::NoneType &None) {
+struct ObjectTraits<llvm::NoneType> {
+  // Serialize llvm::None as an object without any elements
+  static unsigned numFields(const llvm::NoneType &Object,
+                            UserInfoMap &UserInfo) {
+    return 0;
+  }
+
+  static void write(ByteTreeWriter &Writer, const llvm::NoneType &Object,
+                    UserInfoMap &UserInfo) {
     // Nothing to write
-    return llvm::ErrorSuccess();
   }
 };
 

diff --git a/include/swift/Syntax/Serialization/SyntaxSerialization.h b/include/swift/Syntax/Serialization/SyntaxSerialization.h
@@ -208,6 +208,11 @@ namespace byteTree {
 /// shall be omitted when the syntax tree gets serialized.
 static void *UserInfoKeyReusedNodeIds = &UserInfoKeyReusedNodeIds;
 
+/// The key for a ByteTree serializion user info interpreted as `bool`.
+/// If specified, additional fields will be added to objects in the ByteTree
+/// to test forward compatibility.
+static void *UserInfoKeyAddInvalidFields = &UserInfoKeyAddInvalidFields;
+
 template <>
 struct WrapperTypeTraits<tok> {
   static uint8_t numericValue(const tok &Value);
@@ -302,13 +307,21 @@ struct ObjectTraits<syntax::RawSyntax> {
 
   static unsigned numFields(const syntax::RawSyntax &Syntax,
                             UserInfoMap &UserInfo) {
-    switch (nodeKind(Syntax, UserInfo)) {
-    case Token:
-      return 6;
-    case Layout:
-      return 5;
-    case Omitted:
-      return 2;
+    // FIXME: We know this is never set in production builds. Should we
+    // disable this code altogether in that case
+    // (e.g. if assertions are not enabled?)
+    if (UserInfo[UserInfoKeyAddInvalidFields]) {
+      switch (nodeKind(Syntax, UserInfo)) {
+      case Token: return 7;
+      case Layout: return 6;
+      case Omitted: return 2;
+      }
+    } else {
+      switch (nodeKind(Syntax, UserInfo)) {
+      case Token: return 6;
+      case Layout: return 5;
+      case Omitted: return 2;
+      }
     }
   }
 
@@ -326,11 +339,28 @@ struct ObjectTraits<syntax::RawSyntax> {
                    /*Index=*/3);
       Writer.write(Syntax.getLeadingTrivia(), /*Index=*/4);
       Writer.write(Syntax.getTrailingTrivia(), /*Index=*/5);
+      // FIXME: We know this is never set in production builds. Should we
+      // disable this code altogether in that case
+      // (e.g. if assertions are not enabled?)
+      if (UserInfo[UserInfoKeyAddInvalidFields]) {
+        // Test adding a new scalar field
+        StringRef Str = "invalid forward compatible field";
+        Writer.write(Str, /*Index=*/6);
+      }
       break;
     case Layout:
       Writer.write(Syntax.getPresence(), /*Index=*/2);
       Writer.write(Syntax.getKind(), /*Index=*/3);
       Writer.write(Syntax.getLayout(), /*Index=*/4);
+      // FIXME: We know this is never set in production builds. Should we
+      // disable this code altogether in that case
+      // (e.g. if assertions are not enabled?)
+      if (UserInfo[UserInfoKeyAddInvalidFields]) {
+        // Test adding a new object
+        auto Piece = syntax::TriviaPiece::spaces(2);
+        ArrayRef<syntax::TriviaPiece> SomeTrivia(Piece);
+        Writer.write(SomeTrivia, /*Index=*/5);
+      }
       break;
     case Omitted:
       // Nothing more to write

diff --git a/test/Syntax/round_trip_function.swift b/test/Syntax/round_trip_function.swift
@@ -1,5 +1,17 @@
+// We need to require macOS because swiftSyntax currently doesn't build on Linux
+// REQUIRES: OS=macosx
+
+// RUN: %empty-directory(%t)
 // RUN: %round-trip-syntax-test --swift-syntax-test %swift-syntax-test --file %s
 
+// RUN: %swift-syntax-test --serialize-raw-tree --serialize-byte-tree --input-source-filename %s --output-filename %t/tree.bin
+// RUN: %swift-swiftsyntax-test -deserialize -serialization-format byteTree -pre-edit-tree %t/tree.bin -out %t/afterRoundtrip.swift
+// RUN: diff -u %t/afterRoundtrip.swift %s
+
+// RUN: %swift-syntax-test --serialize-raw-tree --serialize-byte-tree --input-source-filename %s --output-filename %t/tree_with_additional_fields.bin --add-bytetree-fields
+// RUN: %swift-swiftsyntax-test -deserialize -serialization-format byteTree -pre-edit-tree %t/tree_with_additional_fields.bin -out %t/afterRoundtripWithAdditionalFields.swift
+// RUN: diff -u %t/afterRoundtripWithAdditionalFields.swift %s
+
 func noArgs() {}
 func oneArg(x: Int) {}
 func oneUnlabeledArg(_ x: Int) {}

diff --git a/tools/SwiftSyntax/ByteTreeDeserialization.swift b/tools/SwiftSyntax/ByteTreeDeserialization.swift
@@ -215,12 +215,30 @@ class ByteTreeReader {
     return result
   }
 
-  /// Read the number of fields in an object or the binary length of a scalar
-  /// field.
+  private func readFieldLength() -> (isObject: Bool, length: Int) {
+    let raw = UInt32(littleEndian: readRaw(UInt32.self))
+    let isObject = (raw & (UInt32(1) << 31)) != 0
+    let length = Int(raw & ~(UInt32(1) << 31))
+    return (isObject, length)
+  }
+
+
+  /// Read the number of fields in an object.
   ///
-  /// - Returns: The read value
-  private func readFieldLength() -> Int {
-    return Int(UInt32(littleEndian: readRaw(UInt32.self)))
+  /// - Returns: The number of fields in the following object
+  private func readObjectLength() -> Int {
+    let (isObject, length) = readFieldLength()
+    assert(isObject)
+    return length
+  }
+
+  /// Read the size of a scalar in bytes
+  ///
+  /// - Returns: The size of the following scalar in bytes
+  private func readScalarLength() -> Int {
+    let (isObject, length) = readFieldLength()
+    assert(!isObject)
+    return length
   }
 
   /// Read the protocol version and validate that it can be read using the given
@@ -246,7 +264,7 @@ class ByteTreeReader {
   fileprivate func read<T: ByteTreeObjectDecodable>(
     _ objectType: T.Type
   ) -> T {
-    let numFields = readFieldLength()
+    let numFields = readObjectLength()
     let objectReader = ByteTreeObjectReader(reader: self,
                                             numFields: numFields)
     return T.read(from: objectReader, numFields: numFields, userInfo: userInfo)
@@ -259,7 +277,7 @@ class ByteTreeReader {
   fileprivate func read<T: ByteTreeScalarDecodable>(
     _ scalarType: T.Type
   ) -> T {
-    let fieldSize = readFieldLength()
+    let fieldSize = readScalarLength()
     defer {
       pointer = pointer.advanced(by: fieldSize)
     }
@@ -268,10 +286,16 @@ class ByteTreeReader {
 
   /// Discard the next scalar field, advancing the pointer to the next field
   fileprivate func discardField() {
-    // FIXME: This can currently only discard scalar fields. Object fields
-    // should also be discardable
-    let fieldSize = readFieldLength()
-    pointer = pointer.advanced(by: fieldSize)
+    let (isObject, length) = readFieldLength()
+    if isObject {
+      // Discard object by discarding all its objects
+      for _ in 0..<length {
+        discardField()
+      }
+    } else {
+      // Discard scalar
+      pointer = pointer.advanced(by: length)
+    }
   }
 }
 

diff --git a/tools/swift-syntax-test/swift-syntax-test.cpp b/tools/swift-syntax-test/swift-syntax-test.cpp
@@ -156,6 +156,15 @@ SerializeAsByteTree("serialize-byte-tree",
                                    "serialized in the ByteTree format instead "
                                    "of JSON."));
 
+static llvm::cl::opt<bool>
+AddByteTreeFields("add-bytetree-fields",
+                  llvm::cl::desc("If specified, further fields will be added "
+                                 "to the syntax tree if it is serialized as a "
+                                 "ByteTree. This is to test forward "
+                                 "compatibility with future versions of "
+                                 "SwiftSyntax that might add more fields to "
+                                 "syntax nodes."));
+
 static llvm::cl::opt<bool>
 IncrementalSerialization("incremental-serialization",
                          llvm::cl::desc("If specified, the serialized syntax "
@@ -724,6 +733,9 @@ int doSerializeRawTree(const char *MainExecutablePath,
       llvm::BinaryStreamWriter Writer(Stream);
       std::map<void *, void *> UserInfo;
       UserInfo[swift::byteTree::UserInfoKeyReusedNodeIds] = &ReusedNodeIds;
+      if (options::AddByteTreeFields) {
+        UserInfo[swift::byteTree::UserInfoKeyAddInvalidFields] = (void *)true;
+      }
       swift::byteTree::ByteTreeWriter::write(/*ProtocolVersion=*/1, Writer,
                                              *Root, UserInfo);
       auto OutputBufferOrError = llvm::FileOutputBuffer::create(