Skip to content

Commit 39fa244

Browse files
authored
Introduce SyntaxText - light weight string representation
`SyntaxText` represents a string without owning its data. It will be used in the syntax tree as the string representation. It provides basic `String`-like functionality, including: - `Hashable` and `Equatable` conformance - `Collection` of `UInt8` - Slicing - Prefix/suffix checks - Interconversion with `Swift.String`
1 parent 5b5383c commit 39fa244

File tree

3 files changed

+327
-0
lines changed

3 files changed

+327
-0
lines changed

Sources/SwiftSyntax/SyntaxText.swift

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
//===---------- SyntaxText.swift - Unowned String Representation ---------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#if canImport(Darwin)
14+
import Darwin
15+
#elseif canImport(Glibc)
16+
import Glibc
17+
#endif
18+
19+
/// Represents a string.
20+
///
21+
/// This type does not own the string data. The data reside in some other buffer
22+
/// whose lifetime extends past that of the SyntaxText.
23+
///
24+
/// `SyntaxText` is a `Collection` of `UInt8` which is _expected_ to be a UTF8
25+
/// encoded byte sequence. However, since that is essentially just a span of a
26+
/// memory buffer, it may contain ill-formed UTF8 sequences. And their
27+
/// comparison (e.g. `==`, `hasPrefix()`) is purely based on the byte sequences,
28+
/// without any Unicode normalization or anything.
29+
///
30+
/// Since it's just a byte sequence, `SyntaxText` can represent the exact source
31+
/// buffer regardless of whether it is a valid UTF8. When creating
32+
/// `Swift.String`, ill-formed UTF8 sequences are replaced with the Unicode
33+
/// replacement character (`\u{FFFD}`).
34+
@_spi(Testing) // SPI name is subject to change
public struct SyntaxText {
  /// The span of bytes this text refers to. The memory is not owned by this
  /// value.
  var buffer: UnsafeBufferPointer<UInt8>

  /// Creates a `SyntaxText` referring to `count` bytes starting at
  /// `baseAddress`.
  ///
  /// - Parameters:
  ///   - baseAddress: Start of the memory range. May be `nil` only when
  ///     `count` is zero (checked with `assert`).
  ///   - count: Number of bytes in the range.
  public init(baseAddress: UnsafePointer<UInt8>?, count: Int) {
    assert(
      baseAddress != nil || count == 0,
      "If count is not zero, base address must be exist")
    self.buffer = UnsafeBufferPointer(start: baseAddress, count: count)
  }

  /// Creates an empty `SyntaxText` with a `nil` base address.
  public init() {
    self.init(baseAddress: nil, count: 0)
  }

  /// Creates a `SyntaxText` referring to the UTF-8 storage of a
  /// `StaticString`.
  public init(_ string: StaticString) {
    self.init(
      baseAddress: string.utf8Start,
      count: string.utf8CodeUnitCount)
  }

  /// Creates a `SyntaxText` that refers to the same memory as `slice`, with
  /// its base address moved forward to the start of the slice.
  public init(rebasing slice: SubSequence) {
    let origin = slice.base.baseAddress
    self.init(
      baseAddress: origin?.advanced(by: slice.startIndex),
      count: slice.count)
  }

  /// The address of the first byte this text refers to, if any.
  public var baseAddress: UnsafePointer<UInt8>? {
    return buffer.baseAddress
  }

  /// The length of this text in bytes.
  public var count: Int {
    return buffer.count
  }

  /// `true` if this text contains no bytes.
  public var isEmpty: Bool {
    return buffer.isEmpty
  }

  /// Returns `true` if the memory range of this text lies entirely within the
  /// memory range of `other`.
  ///
  /// For a non-empty `text`, `SyntaxText(rebasing: text[n ..< m]).isSlice(of: text)`
  /// is `true` whenever `n ..< m` is a valid, non-empty range. An empty text
  /// is only considered a slice of another empty text.
  public func isSlice(of other: SyntaxText) -> Bool {
    // Empty texts have no meaningful address range to compare: the answer is
    // 'true' only when both sides are empty.
    if self.isEmpty || other.isEmpty {
      return self.isEmpty && other.isEmpty
    }
    // Both sides are non-empty here, so the force unwraps are safe.
    let selfStart = self.baseAddress!
    let otherStart = other.baseAddress!
    guard otherStart <= selfStart else {
      return false
    }
    return selfStart + self.count <= otherStart + other.count
  }

  /// Returns `true` if the bytes of `other` occur somewhere in this text.
  ///
  /// An empty `other` is never considered to be contained.
  public func contains(_ other: SyntaxText) -> Bool {
    guard firstRange(of: other) != nil else {
      return false
    }
    return true
  }

  /// Returns the range of the first occurrence of the bytes of `other` in this
  /// text, or `nil` if there is none.
  ///
  /// Returns `nil` when `other` is empty.
  public func firstRange(of other: SyntaxText) -> Range<Index>? {
    guard !other.isEmpty else {
      return nil
    }
    let needleLength = other.count
    let lastCandidate = self.count - needleLength
    // If 'other' is longer than 'self', 'lastCandidate' is negative and there
    // is no position left to try.
    guard lastCandidate >= 0 else {
      return nil
    }
    for offset in 0...lastCandidate {
      // 'other' is not empty and 'self' is at least as long, so both base
      // addresses can be force unwrapped.
      if compareMemory(self.baseAddress! + offset, other.baseAddress!, needleLength) {
        return offset ..< (offset + needleLength)
      }
    }
    return nil
  }

  /// Returns `true` if this text starts with the bytes of `other`.
  ///
  /// An empty `other` is a prefix of every text.
  public func hasPrefix(_ other: SyntaxText) -> Bool {
    if other.count > self.count {
      return false
    }
    if other.isEmpty {
      return true
    }
    let head = SyntaxText(rebasing: self[0 ..< other.count])
    return head == other
  }

  /// Returns `true` if this text ends with the bytes of `other`.
  ///
  /// An empty `other` is a suffix of every text.
  public func hasSuffix(_ other: SyntaxText) -> Bool {
    if other.count > self.count {
      return false
    }
    if other.isEmpty {
      return true
    }
    let tailStart = self.count - other.count
    let tail = SyntaxText(rebasing: self[tailStart ..< self.count])
    return tail == other
  }
}
131+
132+
/// `SyntaxText` is a random-access collection of `UInt8` whose indices and
/// elements are forwarded from the underlying buffer.
extension SyntaxText: RandomAccessCollection {
  public typealias Element = UInt8
  public typealias Index = Int
  public typealias SubSequence = Slice<SyntaxText>

  /// Position of the first byte, taken from the underlying buffer.
  public var startIndex: Index {
    return buffer.startIndex
  }

  /// Position one past the last byte, taken from the underlying buffer.
  public var endIndex: Index {
    return buffer.endIndex
  }

  /// Accesses the byte at `position`.
  public subscript(position: Index) -> Element {
    return buffer[position]
  }
}
145+
146+
extension SyntaxText: Hashable {
  /// Two texts are equal when they have the same length and the same bytes,
  /// regardless of where those bytes live in memory.
  public static func ==(lhs: SyntaxText, rhs: SyntaxText) -> Bool {
    guard lhs.buffer.count == rhs.buffer.count else {
      return false
    }
    // With equal lengths, two empty texts are equal, and so are two texts
    // that start at the same address; neither case needs a byte comparison.
    guard !lhs.isEmpty, lhs.buffer.baseAddress != rhs.buffer.baseAddress else {
      return true
    }
    // Both sides are non-empty here, so the force unwraps are safe.
    return compareMemory(lhs.baseAddress!, rhs.baseAddress!, lhs.count)
  }

  /// Feeds the referenced bytes into `hasher`.
  public func hash(into hasher: inout Hasher) {
    let rawBytes = UnsafeRawBufferPointer(buffer)
    hasher.combine(bytes: rawBytes)
  }
}
161+
162+
/// Allows a `SyntaxText` to be written as a string literal. Every literal form
/// is received as a `StaticString` and forwarded to `init(_:)`.
extension SyntaxText: ExpressibleByStringLiteral {
  public init(stringLiteral value: StaticString) {
    self.init(value)
  }

  public init(unicodeScalarLiteral value: StaticString) {
    self.init(value)
  }

  public init(extendedGraphemeClusterLiteral value: StaticString) {
    self.init(value)
  }
}
167+
168+
extension SyntaxText: CustomStringConvertible {
  /// The text converted to a `String` via `String(syntaxText:)`.
  public var description: String {
    return String(syntaxText: self)
  }
}
171+
172+
extension SyntaxText: CustomDebugStringConvertible {
  /// The debug description of this text's `description` string.
  public var debugDescription: String {
    let text: String = description
    return text.debugDescription
  }
}
175+
176+
extension String {
  /// Creates a `String` holding a copy of the bytes of `syntaxText`.
  ///
  /// Ill-formed UTF-8 sequences in `syntaxText` are replaced with the Unicode
  /// replacement character `\u{FFFD}`.
  @_spi(Testing)
  public init(syntaxText: SyntaxText) {
    if syntaxText.isEmpty {
      self = ""
      return
    }
    if #available(macOS 11.0, iOS 14.0, watchOS 7.0, tvOS 14.0, *) {
      // Copy the bytes straight into the new string's storage and report how
      // many of them were written.
      self.init(unsafeUninitializedCapacity: syntaxText.count) { storage in
        let (_, initializedCount) = storage.initialize(from: syntaxText.buffer)
        return initializedCount
      }
    } else {
      // Where the initializer above is unavailable, decode the bytes as UTF-8.
      self.init(decoding: syntaxText, as: UTF8.self)
    }
  }

  /// Calls `body` with a `SyntaxText` that refers to the contiguous UTF-8
  /// memory of this string and returns its result.
  ///
  /// Like `String.withUTF8(_:)`, this may mutate the string if it was not
  /// already contiguous.
  @_spi(Testing)
  public mutating func withSyntaxText<R>(
    _ body: (SyntaxText) throws -> R
  ) rethrows -> R {
    return try withUTF8 {
      try body(SyntaxText(baseAddress: $0.baseAddress, count: $0.count))
    }
  }
}
208+
209+
/// Returns `true` if the `count` bytes starting at `s1` are identical to the
/// `count` bytes starting at `s2`.
///
/// Uses `memcmp` from Darwin or Glibc when either can be imported, and falls
/// back to an element-wise comparison otherwise. `count` must be positive
/// (checked with `assert`).
private func compareMemory(
  _ s1: UnsafePointer<UInt8>,
  _ s2: UnsafePointer<UInt8>,
  _ count: Int
) -> Bool {
  assert(count > 0)
#if canImport(Darwin)
  let difference = Darwin.memcmp(s1, s2, count)
  return difference == 0
#elseif canImport(Glibc)
  let difference = Glibc.memcmp(s1, s2, count)
  return difference == 0
#else
  let lhsBytes = UnsafeBufferPointer(start: s1, count: count)
  let rhsBytes = UnsafeBufferPointer(start: s2, count: count)
  return lhsBytes.elementsEqual(rhsBytes)
#endif
}
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import XCTest
2+
@_spi(Testing) import SwiftSyntax
3+
4+
/// Tests for `SyntaxText` and its conversions to and from `String`.
final class SyntaxTextTests: XCTestCase {
  /// String literals of various shapes round-trip through `SyntaxText` back
  /// to an equal `String`.
  func testLiteral() throws {
    let strBasic: SyntaxText = "foobar"
    let strASCII: SyntaxText = "A"
    let strEmpty: SyntaxText = ""
    // A single multi-byte, non-ASCII character. The literal was previously
    // empty, so this case was never actually exercised.
    let strHiragana: SyntaxText = "あ"
    let strEmojiSingle: SyntaxText = "🤖"
    let strUTF8: SyntaxText = "こんにちは世界!"

    XCTAssertEqual(String(syntaxText: strBasic), String("foobar"))
    XCTAssertEqual(String(syntaxText: strASCII), String("A"))
    XCTAssertEqual(String(syntaxText: strEmpty), String(""))
    XCTAssertEqual(String(syntaxText: strHiragana), String("あ"))
    XCTAssertEqual(String(syntaxText: strEmojiSingle), String("🤖"))
    XCTAssertEqual(String(syntaxText: strUTF8), String("こんにちは世界!"))
  }

  /// A truncated UTF-8 sequence is replaced with U+FFFD when converting to
  /// `String`.
  func testInvalid() throws {
    let invalidUTF8: [UInt8] = [0x43, 0x61, 0x66, 0xC3]
    invalidUTF8.withUnsafeBufferPointer { buffer in
      let fromData = SyntaxText(baseAddress: buffer.baseAddress, count: buffer.count)
      XCTAssertEqual(String(syntaxText: fromData), "Caf\u{FFFD}")
    }
  }

  /// Rebased slices are recognized as slices of their base text and can be
  /// found in it again.
  func testSlice() throws {
    let text: SyntaxText = "0123456789"

    let slice1 = SyntaxText(rebasing: text[0..<4])
    let slice2 = SyntaxText(rebasing: text[0..<text.count])
    let slice3 = SyntaxText(rebasing: text[3..<text.count])
    XCTAssertTrue(slice1.isSlice(of: text))
    XCTAssertTrue(slice2.isSlice(of: text))
    XCTAssertTrue(slice3.isSlice(of: text))
    XCTAssertNotNil(text.firstRange(of: slice1))
    XCTAssertNotNil(text.firstRange(of: slice2))
    XCTAssertNotNil(text.firstRange(of: slice3))

    let empty: SyntaxText = ""
    let emptySlice: SyntaxText = SyntaxText(rebasing: empty[...])
    XCTAssertTrue(emptySlice.isSlice(of: empty))
    XCTAssertEqual(emptySlice, "")

    // Empty slices compare equal even when they start at different addresses.
    XCTAssertEqual(SyntaxText(rebasing: text[2..<2]), SyntaxText(rebasing: text[3..<3]))
  }

  /// `firstRange(of:)` returns `nil` for empty, absent, or too-long needles,
  /// and otherwise the range of the first match relative to the receiver.
  func testFirstRange() throws {
    let text: SyntaxText = "0123456789012345"

    XCTAssertNil(text.firstRange(of: ""))
    XCTAssertNil(text.firstRange(of: SyntaxText(rebasing: SyntaxText("012")[1..<1])))
    XCTAssertNil(text.firstRange(of: "abc"))
    XCTAssertNil(text.firstRange(of: "01234567890123456"))

    XCTAssertEqual(text.firstRange(of: "0"), 0 ..< 1)
    XCTAssertEqual(text.firstRange(of: "1"), 1 ..< 2)
    XCTAssertEqual(text.firstRange(of: "5"), 5 ..< 6)
    XCTAssertEqual(text.firstRange(of: "012"), 0 ..< 3)
    XCTAssertEqual(text.firstRange(of: "234"), 2 ..< 5)
    XCTAssertEqual(text.firstRange(of: "9012345"), 9 ..< 16)

    // Ranges found in a rebased slice are relative to the slice, not to the
    // text it was taken from.
    XCTAssertNil(SyntaxText(rebasing: text[2..<12]).firstRange(of: "123"))
    XCTAssertEqual(SyntaxText(rebasing: text[5...]).firstRange(of: "5"), 0 ..< 1)
    XCTAssertEqual(SyntaxText(rebasing: text[5...]).firstRange(of: "0"), 5 ..< 6)
  }

  /// `contains(_:)` is `true` for present substrings and `false` for empty,
  /// absent, or too-long ones.
  func testContains() throws {
    let text: SyntaxText = "0123456789012345"
    XCTAssertTrue(text.contains("123"))
    XCTAssertTrue(text.contains("0123456789012345"))
    XCTAssertTrue(text.contains("9012345"))

    XCTAssertFalse(text.contains(""))
    XCTAssertFalse(text.contains("foo"))
    XCTAssertFalse(text.contains("01234567890123456"))
  }

  /// `hasPrefix(_:)` and `hasSuffix(_:)` accept the empty text and the whole
  /// text, and reject non-matching input.
  func testHasPrefixSuffix() throws {
    let text: SyntaxText = "0123456789012345"

    XCTAssertTrue(text.hasPrefix(""))
    XCTAssertTrue(text.hasPrefix("0"))
    XCTAssertTrue(text.hasPrefix("0123"))
    XCTAssertTrue(text.hasPrefix("0123456789012345"))
    XCTAssertFalse(text.hasPrefix("345"))
    XCTAssertFalse(text.hasPrefix("abc"))

    XCTAssertTrue(text.hasSuffix(""))
    XCTAssertTrue(text.hasSuffix("5"))
    XCTAssertTrue(text.hasSuffix("12345"))
    XCTAssertTrue(text.hasSuffix("0123456789012345"))
    XCTAssertFalse(text.hasSuffix("012"))
    XCTAssertFalse(text.hasSuffix("abc"))
  }

  /// `String.withSyntaxText(_:)` exposes the string's bytes as a `SyntaxText`.
  func testWithSyntaxText() throws {
    var str = "Lorem ipsum"
    str.withSyntaxText { text in
      XCTAssertEqual(SyntaxText(rebasing: text[0..<5]), "Lorem")
      XCTAssertEqual(SyntaxText(rebasing: text[6..<9]), "ips")
    }
  }
}
105+

utils/group.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"SyntaxClassification.swift",
3030
"SyntaxFactory.swift",
3131
"SyntaxRewriter.swift",
32+
"SyntaxText.swift",
3233
"SyntaxVisitor.swift",
3334
"Misc.swift",
3435
],

0 commit comments

Comments
 (0)