Skip to content

Commit fbef1f8

Browse files
authored
[clang-format][NFC] Make formatting Verilog faster (#121139)
A regular expression was used in the lexing process. It made the program take more than linear time with regard to the length of the input. It looked like the entire buffer could be scanned for every token lexed. Now the regular expression is replaced with code. Previously it took 20 minutes for the program to format 125 000 lines of code on my computer. Now it takes 315 milliseconds.
1 parent 493c066 commit fbef1f8

File tree

2 files changed

+71
-20
lines changed

2 files changed

+71
-20
lines changed

clang/lib/Format/FormatTokenLexer.cpp

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,34 +1392,51 @@ FormatToken *FormatTokenLexer::getNextToken() {
13921392
}
13931393

13941394
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1395+
const char *Start = Lex->getBufferLocation();
1396+
size_t Len;
1397+
switch (Start[0]) {
13951398
// In Verilog the quote is not a character literal.
1396-
//
1399+
case '\'':
1400+
Len = 1;
1401+
break;
13971402
// Make the backtick and double backtick identifiers to match against them
13981403
// more easily.
1399-
//
1400-
// In Verilog an escaped identifier starts with backslash and ends with
1401-
// whitespace. Unless that whitespace is an escaped newline. A backslash can
1402-
// also begin an escaped newline outside of an escaped identifier. We check
1403-
// for that outside of the Regex since we can't use negative lookahead
1404-
// assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1405-
// identifier may have a length of 0 according to Section A.9.3.
1404+
case '`':
1405+
if (Start[1] == '`')
1406+
Len = 2;
1407+
else
1408+
Len = 1;
1409+
break;
1410+
// In Verilog an escaped identifier starts with a backslash and ends with
1411+
// whitespace. Unless that whitespace is an escaped newline.
14061412
// FIXME: If there is an escaped newline in the middle of an escaped
14071413
// identifier, allow for pasting the two lines together. But escaped
14081414
// identifiers usually occur only in generated code anyway.
1409-
static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1410-
"(\r?\n|\r)|[^[:space:]])*)");
1411-
1412-
SmallVector<StringRef, 4> Matches;
1413-
const char *Start = Lex->getBufferLocation();
1414-
if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1415-
&Matches)) {
1415+
case '\\':
1416+
// A backslash can also begin an escaped newline outside of an escaped
1417+
// identifier.
1418+
if (Start[1] == '\r' || Start[1] == '\n')
1419+
return false;
1420+
Len = 1;
1421+
while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
1422+
Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
1423+
Start[Len] != ' ') {
1424+
// There is a null byte at the end of the buffer, so we don't have to
1425+
// check whether the next byte is within the buffer.
1426+
if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
1427+
Start[Len + 2] == '\n') {
1428+
Len += 3;
1429+
} else if (Start[Len] == '\\' &&
1430+
(Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
1431+
Len += 2;
1432+
} else {
1433+
Len += 1;
1434+
}
1435+
}
1436+
break;
1437+
default:
14161438
return false;
14171439
}
1418-
// There is a null byte at the end of the buffer, so we don't have to check
1419-
// Start[1] is within the buffer.
1420-
if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1421-
return false;
1422-
size_t Len = Matches[0].size();
14231440

14241441
// The kind has to be an identifier so we can match it against those defined
14251442
// in Keywords. The kind has to be set before the length because the setLength

clang/unittests/Format/TokenAnnotatorTest.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2638,6 +2638,40 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
26382638
"endmodule");
26392639
ASSERT_EQ(Tokens.size(), 11u) << Tokens;
26402640
EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_VerilogMultiLineListLParen);
2641+
2642+
// Escaped identifiers.
2643+
Tokens = Annotate(R"(\busa+index)");
2644+
ASSERT_EQ(Tokens.size(), 2u) << Tokens;
2645+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2646+
Tokens = Annotate(R"(\busa+index ;)");
2647+
ASSERT_EQ(Tokens.size(), 3u) << Tokens;
2648+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2649+
EXPECT_EQ(Tokens[0]->TokenText, R"(\busa+index)");
2650+
EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
2651+
Tokens = Annotate(R"(\busa+index
2652+
;)");
2653+
ASSERT_EQ(Tokens.size(), 3u) << Tokens;
2654+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2655+
EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
2656+
// The escaped identifier can be broken by an escaped newline. The result is
2657+
// still 1 identifier.
2658+
Tokens = Annotate(R"(\busa+index\
2659+
+
2660+
;)");
2661+
ASSERT_EQ(Tokens.size(), 3u) << Tokens;
2662+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2663+
EXPECT_EQ(Tokens[0]->TokenText, R"(\busa+index\
2664+
+)");
2665+
EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
2666+
// An escaped newline should not be treated as an escaped identifier.
2667+
Tokens = Annotate("\\\n");
2668+
ASSERT_EQ(Tokens.size(), 1u) << Tokens;
2669+
EXPECT_TOKEN(Tokens[0], tok::eof, TT_Unknown);
2670+
// Macros.
2671+
Tokens = Annotate("`define x x``x");
2672+
ASSERT_EQ(Tokens.size(), 7u) << Tokens;
2673+
EXPECT_TOKEN(Tokens[0], tok::hash, TT_Unknown);
2674+
EXPECT_TOKEN(Tokens[4], tok::hashhash, TT_Unknown);
26412675
}
26422676

26432677
TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {

0 commit comments

Comments
 (0)