Skip to content

Commit 28b7e28

Browse files
authored
[Support] Add \g<ref> backreferences in Regex::sub() (#67220)
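The existing backreference format, `\<ref>`, does not allow a literal digit to be placed directly after the reference, because any digits that follow are parsed as part of the reference number. The new `\g<ref>` format solves this problem by enclosing the reference number in explicit `<` and `>` delimiters, so that, for example, `\g<1>5` means "group 1 followed by the literal character 5".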
1 parent c1b6ed4 commit 28b7e28

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

llvm/include/llvm/Support/Regex.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,9 @@ namespace llvm {
8585
std::string *Error = nullptr) const;
8686

8787
/// sub - Return the result of replacing the first match of the regex in
88-
/// \p String with the \p Repl string. Backreferences like "\0" in the
89-
/// replacement string are replaced with the appropriate match substring.
88+
/// \p String with the \p Repl string. Backreferences like "\0" and "\g<1>"
89+
/// in the replacement string are replaced with the appropriate match
90+
/// substring.
9091
///
9192
/// Note that the replacement string has backslash escaping performed on
9293
/// it. Invalid backreferences are ignored (replaced by empty strings).

llvm/lib/Support/Regex.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,25 @@ std::string Regex::sub(StringRef Repl, StringRef String,
163163

164164
// FIXME: We should have a StringExtras function for mapping C99 escapes.
165165
switch (Repl[0]) {
166+
167+
// Backreference with the "\g<ref>" syntax
168+
case 'g':
169+
if (Repl.size() >= 4 && Repl[1] == '<') {
170+
size_t End = Repl.find('>');
171+
StringRef Ref = Repl.slice(2, End);
172+
unsigned RefValue;
173+
if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) {
174+
Repl = Repl.substr(End + 1);
175+
if (RefValue < Matches.size())
176+
Res += Matches[RefValue];
177+
else if (Error && Error->empty())
178+
*Error =
179+
("invalid backreference string 'g<" + Twine(Ref) + ">'").str();
180+
break;
181+
}
182+
}
183+
[[fallthrough]];
184+
166185
// Treat all unrecognized characters as self-quoting.
167186
default:
168187
Res += Repl[0];

llvm/unittests/Support/RegexTest.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,34 @@ TEST_F(RegexTest, Substitution) {
127127

128128
EXPECT_EQ("aber", Regex("a[0-9]+b").sub("a\\100b", "a1234ber", &Error));
129129
EXPECT_EQ(Error, "invalid backreference string '100'");
130+
131+
EXPECT_EQ("012345", Regex("a([0-9]+).*").sub("0\\g<1>5", "a1234ber", &Error));
132+
EXPECT_EQ("", Error);
133+
134+
EXPECT_EQ("0a1234ber5",
135+
Regex("a([0-9]+).*").sub("0\\g<0>5", "a1234ber", &Error));
136+
EXPECT_EQ("", Error);
137+
138+
EXPECT_EQ("0A5", Regex("a(.)(.)(.)(.)(.)(.)(.)(.)(.)(.).*")
139+
.sub("0\\g<10>5", "a123456789Aber", &Error));
140+
EXPECT_EQ("", Error);
141+
142+
EXPECT_EQ("0g<-1>5",
143+
Regex("a([0-9]+).*").sub("0\\g<-1>5", "a1234ber", &Error));
144+
EXPECT_EQ("", Error);
145+
146+
EXPECT_EQ("0g<15", Regex("a([0-9]+).*").sub("0\\g<15", "a1234ber", &Error));
147+
EXPECT_EQ("", Error);
148+
149+
EXPECT_EQ("0g<>15", Regex("a([0-9]+).*").sub("0\\g<>15", "a1234ber", &Error));
150+
EXPECT_EQ("", Error);
151+
152+
EXPECT_EQ("0g<3e>1",
153+
Regex("a([0-9]+).*").sub("0\\g<3e>1", "a1234ber", &Error));
154+
EXPECT_EQ("", Error);
155+
156+
EXPECT_EQ("aber", Regex("a([0-9]+)b").sub("a\\g<100>b", "a1234ber", &Error));
157+
EXPECT_EQ(Error, "invalid backreference string 'g<100>'");
130158
}
131159

132160
TEST_F(RegexTest, IsLiteralERE) {

0 commit comments

Comments
 (0)