Skip to content

Commit 1acca94

Browse files
authored
[Optimization] Specialized quantification instruction (#577)
Implements a specialized quantification instruction for repeated matching of a character, dot, character class, or custom character class
1 parent 405fbcb commit 1acca94

13 files changed

+598
-40
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,10 @@ fileprivate extension Compiler.ByteCodeGen {
471471
let minTrips = low
472472
assert((extraTrips ?? 1) >= 0)
473473

474+
if tryEmitFastQuant(child, updatedKind, minTrips, extraTrips) {
475+
return
476+
}
477+
474478
// The below is a general algorithm for bounded and unbounded
475479
// quantification. It can be specialized when the min
476480
// is 0 or 1, or when extra trips is 1 or unbounded.
@@ -655,6 +659,80 @@ fileprivate extension Compiler.ByteCodeGen {
655659
builder.label(exit)
656660
}
657661

662+
/// Attempts to emit a single fused quantify instruction for `child` instead
/// of the general quantification loop.
///
/// The fast path is only taken when optimizations are enabled, both trip
/// counts fit in `QuantifyPayload.maxStorableTrips`, matching uses grapheme
/// cluster semantics, and the quantifier is not reluctant.
///
/// Nodes that can be emitted this way:
/// - a `.char` atom with a single-scalar ASCII value
/// - a `.customCharacterClass` that can be expressed as an ASCII bitset
/// - a built-in `.characterClass` whose runtime model consumes a single grapheme
/// - `.any`, `.anyNonNewline`, `.dot`
///
/// `.convertedRegexLiteral` wrappers and `.nonCapture` groups are looked
/// through, and their inner node is tried instead.
///
/// - Returns: `true` if the builder was asked to emit a quantify instruction,
///   `false` if the caller must fall back to the general algorithm.
mutating func tryEmitFastQuant(
  _ child: DSLTree.Node,
  _ kind: AST.Quantification.Kind,
  _ minTrips: Int,
  _ extraTrips: Int?
) -> Bool {
  // Preconditions are checked in the same order as a chained `&&`.
  guard optimizationsEnabled,
        minTrips <= QuantifyPayload.maxStorableTrips,
        extraTrips ?? 0 <= QuantifyPayload.maxStorableTrips,
        options.semanticLevel == .graphemeCluster,
        kind != .reluctant
  else {
    return false
  }

  switch child {
  case .customCharacterClass(let customClass):
    // Only custom character classes expressible as an ASCII bitset qualify.
    guard let asciiBitset = customClass.asAsciiBitset(options) else {
      return false
    }
    builder.buildQuantify(bitset: asciiBitset, kind, minTrips, extraTrips)
    return true

  case .atom(.char(let character)):
    // Only a character made of one ASCII scalar qualifies.
    guard let asciiValue = character._singleScalarAsciiValue else {
      return false
    }
    builder.buildQuantify(asciiChar: asciiValue, kind, minTrips, extraTrips)
    return true

  case .atom(.any):
    builder.buildQuantifyAny(
      matchesNewlines: true, kind, minTrips, extraTrips)
    return true

  case .atom(.anyNonNewline):
    builder.buildQuantifyAny(
      matchesNewlines: false, kind, minTrips, extraTrips)
    return true

  case .atom(.dot):
    // Whether dot matches newlines depends on the current options.
    builder.buildQuantifyAny(
      matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips)
    return true

  case .atom(.characterClass(let builtinClass)):
    // Built-in character class; it must consume exactly one grapheme.
    let runtimeModel = builtinClass.asRuntimeModel(options)
    guard runtimeModel.consumesSingleGrapheme else {
      return false
    }
    builder.buildQuantify(model: runtimeModel, kind, minTrips, extraTrips)
    return true

  case .convertedRegexLiteral(let inner, _):
    return tryEmitFastQuant(inner, kind, minTrips, extraTrips)

  case .nonCapturingGroup(let groupKind, let inner):
    // .nonCapture nonCapturingGroups are ignored during compilation
    guard groupKind.ast == .nonCapture else {
      return false
    }
    return tryEmitFastQuant(inner, kind, minTrips, extraTrips)

  default:
    // Every other node (including all remaining atoms) is unsupported.
    return false
  }
}
735+
658736
/// Coalesce any adjacent scalar members in a custom character class together.
659737
/// This is required in order to produce correct grapheme matching behavior.
660738
func coalescingCustomCharacterClassMembers(

Sources/_StringProcessing/Engine/Backtracking.swift

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,12 @@
1010
//===----------------------------------------------------------------------===//
1111

1212
extension Processor {
13-
14-
// TODO: What all do we want to save? Configurable?
15-
// TODO: Do we need to save any registers?
16-
// TODO: Is this the right place to do function stack unwinding?
1713
struct SavePoint {
1814
var pc: InstructionAddress
1915
var pos: Position?
20-
16+
// Quantifiers may store a range of positions to restore to
17+
var rangeStart: Position?
18+
var rangeEnd: Position?
2119
// The end of the call stack, so we can slice it off
2220
// when failing inside a call.
2321
//
@@ -43,7 +41,35 @@ extension Processor {
4341
intRegisters: [Int],
4442
PositionRegister: [Input.Index]
4543
) {
46-
(pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
44+
return (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
45+
}
46+
47+
/// Whether this save point currently has no range end recorded.
var rangeIsEmpty: Bool {
  return rangeEnd == nil
}
48+
49+
/// Extends the stored range so that it ends at `newEnd`.
///
/// The first call on an empty range also records `newEnd` as the start.
mutating func updateRange(newEnd: Input.Index) {
  // Keep an existing start; otherwise the range begins at the new end.
  rangeStart = rangeStart ?? newEnd
  rangeEnd = newEnd
}
55+
56+
/// Moves the last position of the range into `pos` and removes it from the
/// range.
mutating func takePositionFromRange(_ input: Input) {
  assert(!rangeIsEmpty)
  let restored = rangeEnd!
  pos = restored
  shrinkRange(input)
}
62+
63+
/// Drops the last index from the save point's range, shrinking it by one.
///
/// When the start and end coincide, the range becomes empty.
mutating func shrinkRange(_ input: Input) {
  assert(!rangeIsEmpty)
  guard rangeEnd != rangeStart else {
    // Only one position was left, so the range is now empty.
    rangeStart = nil
    rangeEnd = nil
    return
  }
  input.formIndex(before: &rangeEnd!)
}
4874
}
4975

@@ -54,6 +80,21 @@ extension Processor {
5480
SavePoint(
5581
pc: pc,
5682
pos: addressOnly ? nil : currentPosition,
83+
rangeStart: nil,
84+
rangeEnd: nil,
85+
stackEnd: .init(callStack.count),
86+
captureEnds: storedCaptures,
87+
intRegisters: registers.ints,
88+
posRegisters: registers.positions)
89+
}
90+
91+
func startQuantifierSavePoint() -> SavePoint {
92+
// Restores to the instruction AFTER the current quantifier instruction
93+
SavePoint(
94+
pc: controller.pc + 1,
95+
pos: nil,
96+
rangeStart: nil,
97+
rangeEnd: nil,
5798
stackEnd: .init(callStack.count),
5899
captureEnds: storedCaptures,
59100
intRegisters: registers.ints,

Sources/_StringProcessing/Engine/InstPayload.swift

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12+
@_implementationOnly import _RegexParser
13+
1214
extension Instruction {
1315
/// An instruction's payload packs operands and destination
1416
/// registers.
@@ -330,7 +332,9 @@ extension Instruction.Payload {
330332
) {
331333
interpretPair()
332334
}
335+
333336
// MARK: Struct payloads
337+
334338
init(_ model: _CharacterClassModel) {
335339
self.init(CharacterClassPayload(model).rawValue)
336340
}
@@ -342,11 +346,169 @@ extension Instruction.Payload {
342346
self.init(rawValue: payload.rawValue)
343347
}
344348
var assertion: AssertionPayload {
345-
AssertionPayload.init(rawValue: self.rawValue & _payloadMask)
349+
AssertionPayload.init(rawValue: rawValue & _payloadMask)
350+
}
351+
/// Creates an instruction payload from the raw bits of a quantify payload.
init(quantify: QuantifyPayload) {
  let packedBits = quantify.rawValue
  self.init(packedBits)
}
354+
/// This payload's bits, masked with `_payloadMask`, viewed as a quantify
/// payload.
var quantify: QuantifyPayload {
  let operandBits = rawValue & _payloadMask
  return QuantifyPayload(rawValue: operandBits)
}
347357
}
348358

349359
// MARK: Struct definitions
360+
/// Operand bits for the fused `quantify` instruction.
///
/// Packs what is being repeated (a payload type plus a type-specific value)
/// together with how it is repeated (quantification kind, minimum trips and
/// optional extra trips) into a single `UInt64`.
struct QuantifyPayload: RawRepresentable {
  let rawValue: UInt64

  /// Selects how the low 16 "payload value" bits are interpreted.
  enum PayloadType: UInt64 {
    case bitset = 0
    case asciiChar = 1
    case any = 2
    case builtin = 4
  }

  // Future work: optimize this layout -> payload type should be a fast switch
  // The top 8 bits are reserved for the opcode so we have 56 bits to work with
  // b55-b38 - Unused
  // b37-b35 - Payload type (one of 4 types, stored on 3 bits)
  // b34-b27 - minTrips (8 bit int)
  // b26-b18 - extraTrips (9 bits: lowest bit set means nil/unbounded,
  //           upper 8 bits hold the value)
  // b17-b16 - Quantification type (one of three types)
  // b15-b0  - Payload value (depends on payload type)
  static var quantKindShift: UInt64 { 16 }
  static var extraTripsShift: UInt64 { 18 }
  static var minTripsShift: UInt64 { 27 }
  static var typeShift: UInt64 { 35 }
  /// Largest trip count that fits in the 8-bit trip fields.
  static var maxStorableTrips: UInt64 { (1 << 8) - 1 }

  var quantKindMask: UInt64 { 3 }
  var extraTripsMask: UInt64 { 0x1FF }
  var minTripsMask: UInt64 { 0xFF }
  var typeMask: UInt64 { 7 }
  var payloadMask: UInt64 { 0xFF_FF }

  /// Packs everything except the payload value (bits 16 and up).
  ///
  /// - Parameters:
  ///   - kind: The quantification kind, stored in 2 bits.
  ///   - minTrips: Minimum number of trips; must fit in 8 bits.
  ///   - extraTrips: Extra trips beyond the minimum; `nil` is encoded by
  ///     setting the lowest bit of the extra-trips field. Must fit in 8 bits
  ///     when present.
  ///   - type: The payload type tag.
  /// - Returns: The packed bits, with the low 16 payload-value bits zero.
  static func packInfoValues(
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?,
    _ type: PayloadType
  ) -> UInt64 {
    // A trip count wider than its field would spill into the neighboring
    // field, so catch it in debug builds instead of packing garbage.
    assert(minTrips >= 0 && minTrips <= QuantifyPayload.maxStorableTrips)
    assert((extraTrips ?? 0) >= 0
           && (extraTrips ?? 0) <= QuantifyPayload.maxStorableTrips)

    let kindVal: UInt64
    switch kind {
    case .eager:
      kindVal = 0
    case .reluctant:
      kindVal = 1
    case .possessive:
      kindVal = 2
    }
    // nil -> 1 (flag bit only); otherwise the value shifted past the flag bit.
    let extraTripsVal: UInt64 = extraTrips.map { UInt64($0) << 1 } ?? 1
    return (kindVal << QuantifyPayload.quantKindShift) +
      (extraTripsVal << QuantifyPayload.extraTripsShift) +
      (UInt64(minTrips) << QuantifyPayload.minTripsShift) +
      (type.rawValue << QuantifyPayload.typeShift)
  }

  /// Wraps already-packed bits; the opcode bits must be clear.
  init(rawValue: UInt64) {
    self.rawValue = rawValue
    assert(rawValue & _opcodeMask == 0)
  }

  /// Quantifies an ASCII bitset stored in the given register.
  init(
    bitset: AsciiBitsetRegister,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    // Only the low 16 bits are reserved for the payload value; a larger
    // register index would overwrite the quantification kind field.
    assert(bitset.bits <= 0xFF_FF)
    self.rawValue = bitset.bits
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset)
  }

  /// Quantifies a single ASCII character value.
  init(
    asciiChar: UInt8,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    self.rawValue = UInt64(asciiChar)
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar)
  }

  /// Quantifies "any character"; bit 0 records whether newlines match.
  init(
    matchesNewlines: Bool,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    self.rawValue = (matchesNewlines ? 1 : 0)
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any)
  }

  /// Quantifies a built-in character class model.
  ///
  /// The class representation goes in the low 8 bits, the inverted flag in
  /// bit 9 and the strict-ASCII flag in bit 10.
  init(
    model: _CharacterClassModel,
    _ kind: AST.Quantification.Kind,
    _ minTrips: Int,
    _ extraTrips: Int?
  ) {
    assert(model.cc.rawValue < 0xFF)
    assert(model.matchLevel != .unicodeScalar)
    let packedModel = model.cc.rawValue
      + (model.isInverted ? 1 << 9 : 0)
      + (model.isStrictASCII ? 1 << 10 : 0)
    self.rawValue = packedModel
      + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin)
  }

  /// The payload type tag stored in the type field.
  var type: PayloadType {
    PayloadType(rawValue: (rawValue >> QuantifyPayload.typeShift) & typeMask)!
  }

  /// The quantification kind stored in the 2-bit kind field.
  var quantKind: AST.Quantification.Kind {
    switch (rawValue >> QuantifyPayload.quantKindShift) & quantKindMask {
    case 0: return .eager
    case 1: return .reluctant
    case 2: return .possessive
    default:
      fatalError("Unreachable")
    }
  }

  /// The minimum number of trips.
  var minTrips: UInt64 {
    (rawValue >> QuantifyPayload.minTripsShift) & minTripsMask
  }

  /// The number of extra trips, or `nil` when the nil flag encoding (field
  /// value 1) is stored.
  var extraTrips: UInt64? {
    let val = (rawValue >> QuantifyPayload.extraTripsShift) & extraTripsMask
    if val == 1 {
      return nil
    } else {
      return val >> 1
    }
  }

  /// The payload value interpreted as an ASCII bitset register.
  var bitset: AsciiBitsetRegister {
    TypedInt(rawValue & payloadMask)
  }

  /// The payload value interpreted as an ASCII character.
  var asciiChar: UInt8 {
    UInt8(asserting: rawValue & payloadMask)
  }

  /// For `.any` payloads: whether newlines are matched (bit 0).
  var anyMatchesNewline: Bool {
    (rawValue & 1) == 1
  }

  /// For `.builtin` payloads: the character class representation (low 8 bits).
  var builtin: _CharacterClassModel.Representation {
    _CharacterClassModel.Representation(rawValue: rawValue & 0xFF)!
  }
  /// For `.builtin` payloads: the inverted flag (bit 9).
  var builtinIsInverted: Bool {
    (rawValue >> 9) & 1 == 1
  }
  /// For `.builtin` payloads: the strict-ASCII flag (bit 10).
  var builtinIsStrict: Bool {
    (rawValue >> 10) & 1 == 1
  }
}
511+
350512
struct CharacterClassPayload: RawRepresentable {
351513
let rawValue: UInt64
352514
// Layout:

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,13 @@ extension Instruction {
193193
///
194194
case splitSaving
195195

196+
/// Fused quantify, execute, save instruction
197+
/// Quantifies the stored instruction in an inner loop instead of looping through instructions in processor
198+
/// Only quantifies specific nodes
199+
///
200+
/// quantify(_:QuantifyPayload)
201+
///
202+
case quantify
196203
/// Begin the given capture
197204
///
198205
/// beginCapture(_:CapReg)

0 commit comments

Comments
 (0)