Parse end-of-line comments in custom character classes

hamishknight · hamishknight · commit a6bf9b816eb8 · 2022-05-27T11:30:31.000+01:00
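Previously we would only parse non-semantic whitespace
inside custom character classes. Now we also parse
end-of-line comments there, which are supported by ICU.
Inline `(?#...)` comments are still not lexed inside
custom character classes.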
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -639,7 +639,7 @@ extension Source {
   ///
   mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
     let trivia: Located<String>? = try recordLoc { src in
-      if src.tryEat(sequence: "(?#") {
+      if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") {
         return try src.lexUntil(eating: ")").value
       }
       if context.experimentalComments, src.tryEat(sequence: "/*") {
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -515,10 +515,8 @@ extension Parser {
         continue
       }
 
-      // Lex non-semantic whitespace if we're allowed.
-      // TODO: ICU allows end-of-line comments in custom character classes,
-      // which we ought to support if we want to support multi-line regex.
-      if let trivia = source.lexNonSemanticWhitespace(context: context) {
+      // Lex trivia if we're allowed.
+      if let trivia = try source.lexTrivia(context: context) {
         members.append(.trivia(trivia))
         continue
       }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -1677,12 +1677,8 @@ extension RegexTests {
       )
     )
 
-    // End of line comments aren't applicable in custom char classes.
-    // TODO: ICU supports this.
-    parseTest("(?x)[ # abc]", concat(
-      changeMatchingOptions(matchingOptions(adding: .extended)),
-      charClass("#", "a", "b", "c")
-    ))
+    parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c"))
+    parseTest("[#]", charClass("#"))
 
     parseTest("(?x)a b c[d e f]", concat(
       changeMatchingOptions(matchingOptions(adding: .extended)),
@@ -2115,6 +2111,15 @@ extension RegexTests {
       /#
       """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}"))
 
+    parseWithDelimitersTest(#"""
+      #/
+      [
+        a # interesting
+      b-c #a
+        d]
+      /#
+      """#, charClass("a", range_m("b", "c"), "d"))
+
     // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
     // if it's clear that it's part of the regex syntax.
 
@@ -2544,6 +2549,12 @@ extension RegexTests {
     diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b"))
     diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}"))
 
+    diagnosticTest("(?x)[(?#)]", .expected("]"))
+    diagnosticTest("(?x)[(?#abc)]", .expected("]"))
+
+    diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers)
+
     // MARK: Bad escapes
 
     diagnosticTest("\\", .expectedEscape)

Original file line number	Diff line number	Diff line change
`@@ -639,7 +639,7 @@ extension Source {`
`639`	`639`	`///`
`640`	`640`	`mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {`
`641`	`641`	`let trivia: Located<String>? = try recordLoc { src in`
`642`		`- if src.tryEat(sequence: "(?#") {`
	`642`	`+ if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") {`
`643`	`643`	`return try src.lexUntil(eating: ")").value`
`644`	`644`	`}`
`645`	`645`	`if context.experimentalComments, src.tryEat(sequence: "/*") {`
Original file line number	Diff line number	Diff line change
`@@ -515,10 +515,8 @@ extension Parser {`
`515`	`515`	`continue`
`516`	`516`	`}`
`517`	`517`
`518`		`- // Lex non-semantic whitespace if we're allowed.`
`519`		`- // TODO: ICU allows end-of-line comments in custom character classes,`
`520`		`- // which we ought to support if we want to support multi-line regex.`
`521`		`- if let trivia = source.lexNonSemanticWhitespace(context: context) {`
	`518`	`+ // Lex trivia if we're allowed.`
	`519`	`+ if let trivia = try source.lexTrivia(context: context) {`
`522`	`520`	`members.append(.trivia(trivia))`
`523`	`521`	`continue`
`524`	`522`	`}`