swiftlang · ahoppen · Jan 12, 2024 · Jan 10, 2024 · Jan 10, 2024 · ahoppen
@@ -156,12 +156,6 @@ extension Unicode.Scalar {
     // including and above the DEL character U+7F.
     return self.value >= 0x20 && self.value < 0x7F
   }
-
-  var isStartOfUTF8Character: Bool {
-    // RFC 2279: The octet values FE and FF never appear.
-    // RFC 3629: The octet values C0, C1, F5 to FF never appear.
-    return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
-  }
 }
 
 extension Unicode.Scalar {
@@ -179,20 +173,25 @@ extension Unicode.Scalar {
       return Unicode.Scalar(curByte)
     }
 
-    // Read the number of high bits set, which indicates the number of bytes in
-    // the character.
-    let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
-
-    // If this is 0b10XXXXXX, then it is a continuation character.
-    if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
+    // If this is not the start of a UTF-8 character, then it is either a
+    // continuation byte or a byte that never appears in valid UTF-8.
+    if !curByte.isStartOfUTF8Character {
       // Skip until we get the start of another character.  This is guaranteed to
       // at least stop at the nul at the end of the buffer.
-      while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
+      while let peeked = peek(), !peeked.isStartOfUTF8Character {
         _ = advance()
       }
       return nil
     }
 
+    // Read the number of high bits set, which indicates the number of bytes in
+    // the character.
+    let encodedBytes = (~curByte).leadingZeroBitCount
+    // At this point we have a multi-byte UTF-8 scalar:
+    // single-byte scalars were handled at the start of the function by checking `curByte < 0x80`,
+    // and `isStartOfUTF8Character` guarantees that `curByte` has 2 to 4 leading one bits.
+    precondition(encodedBytes >= 2 && encodedBytes <= 4)
+
     // Drop the high bits indicating the # bytes of the result.
     var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
 
@@ -252,3 +251,11 @@ extension Unicode.Scalar {
     return self.lexing(advance: advance, peek: peek)
   }
 }
+
+extension UInt8 {
+  var isStartOfUTF8Character: Bool {
+    // True for an ASCII byte (< 0x80) or a multi-byte lead byte (0xC2...0xF4); 0x80...0xBF are continuation bytes.
+    // RFC 3629: the octets C0, C1 and F5 to FF never appear in UTF-8 (RFC 2279 already excluded FE and FF).
+    return self < 0x80 || (self >= 0xC2 && self < 0xF5)
+  }
+}
@@ -1504,4 +1504,23 @@ public class LexerTests: ParserTestCase {
       ]
     )
   }
+
+  func testUnicodeContainTheEdgeContinuationByte() {
+    // UTF-8 continuation bytes lie in the range 0x80...0xBF. Every identifier below
+    // ends in a continuation byte on an edge of that range (0x80 or 0xBF).
+
+    // À(0xC3 0x80), 㗀(0xE3 0x97 0x80), 🀀(0xF0 0x9F 0x80 0x80),
+    // ÿ(0xC3 0xBF), 俿(0xE4 0xBF 0xBF), 𐐿(0xF0 0x90 0x90 0xBF)
+    assertLexemes(
+      "À 㗀 🀀 ÿ 俿 𐐿",
+      lexemes: [
+        LexemeSpec(.identifier, text: "À", trailing: " "),
+        LexemeSpec(.identifier, text: "㗀", trailing: " "),
+        LexemeSpec(.identifier, text: "🀀", trailing: " "),
+        LexemeSpec(.identifier, text: "ÿ", trailing: " "),
+        LexemeSpec(.identifier, text: "俿", trailing: " "),
+        LexemeSpec(.identifier, text: "𐐿"),
+      ]
+    )
+  }
 }