Skip to content

Commit d46a292

Browse files
committed
[SwiftParser] Correct the range for the start byte of a UTF8 character
1 parent 8018224 commit d46a292

File tree

1 file changed

+17
-13
lines changed

1 file changed

+17
-13
lines changed

Sources/SwiftParser/Lexer/UnicodeScalarExtensions.swift

Lines changed: 17 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -156,12 +156,6 @@ extension Unicode.Scalar {
156156
// including and above the DEL character U+7F.
157157
return self.value >= 0x20 && self.value < 0x7F
158158
}
159-
160-
var isStartOfUTF8Character: Bool {
161-
// RFC 2279: The octet values FE and FF never appear.
162-
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
163-
return self.value <= 0x80 || (self.value >= 0xC2 && self.value < 0xF5)
164-
}
165159
}
166160

167161
extension Unicode.Scalar {
@@ -179,20 +173,22 @@ extension Unicode.Scalar {
179173
return Unicode.Scalar(curByte)
180174
}
181175

182-
// Read the number of high bits set, which indicates the number of bytes in
183-
// the character.
184-
let encodedBytes = (~(UInt32(curByte) << 24)).leadingZeroBitCount
185-
186-
// If this is 0b10XXXXXX, then it is a continuation character.
187-
if encodedBytes == 1 || !Unicode.Scalar(curByte).isStartOfUTF8Character {
176+
// If this is not the start of a UTF8 character,
177+
// then it is either a continuation byte or an invalid UTF8 code point.
178+
if !curByte.isStartOfUTF8Character {
188179
// Skip until we get the start of another character. This is guaranteed to
189180
// at least stop at the nul at the end of the buffer.
190-
while let peeked = peek(), !Unicode.Scalar(peeked).isStartOfUTF8Character {
181+
while let peeked = peek(), !peeked.isStartOfUTF8Character {
191182
_ = advance()
192183
}
193184
return nil
194185
}
195186

187+
// Read the number of high bits set, which indicates the number of bytes in
188+
// the character.
189+
let encodedBytes = (~curByte).leadingZeroBitCount
190+
precondition(encodedBytes >= 2 && encodedBytes <= 4)
191+
196192
// Drop the high bits indicating the # bytes of the result.
197193
var charValue = UInt32(curByte << encodedBytes) >> encodedBytes
198194

@@ -252,3 +248,11 @@ extension Unicode.Scalar {
252248
return self.lexing(advance: advance, peek: peek)
253249
}
254250
}
251+
252+
extension UInt8 {
253+
var isStartOfUTF8Character: Bool {
254+
// RFC 2279: The octet values FE and FF never appear.
255+
// RFC 3629: The octet values C0, C1, F5 to FF never appear.
256+
return self < 0x80 || (self >= 0xC2 && self < 0xF5)
257+
}
258+
}

0 commit comments

Comments
 (0)