llvm · sstwcw · Jan 14, 2025 · Dec 26, 2024
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
@@ -1389,34 +1389,51 @@ FormatToken *FormatTokenLexer::getNextToken() {
 }
 
 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
+  const char *Start = Lex->getBufferLocation();
+  size_t Len;
+  switch (Start[0]) {
   // In Verilog the quote is not a character literal.
-  //
+  case '\'':
+    Len = 1;
+    break;
   // Make the backtick and double backtick identifiers to match against them
   // more easily.
-  //
-  // In Verilog an escaped identifier starts with backslash and ends with
-  // whitespace. Unless that whitespace is an escaped newline. A backslash can
-  // also begin an escaped newline outside of an escaped identifier. We check
-  // for that outside of the Regex since we can't use negative lookhead
-  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
-  // identifier may have a length of 0 according to Section A.9.3.
+  case '`':
+    if (Start[1] == '`')
+      Len = 2;
+    else
+      Len = 1;
+    break;
+  // In Verilog an escaped identifier starts with a backslash and ends with
+  // whitespace, unless that whitespace is an escaped newline.
   // FIXME: If there is an escaped newline in the middle of an escaped
   // identifier, allow for pasting the two lines together, But escaped
   // identifiers usually occur only in generated code anyway.
-  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
-                                        "(\r?\n|\r)|[^[:space:]])*)");
-
-  SmallVector<StringRef, 4> Matches;
-  const char *Start = Lex->getBufferLocation();
-  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
-                          &Matches)) {
+  case '\\':
+    // A backslash can also begin an escaped newline outside of an escaped
+    // identifier.
+    if (Start[1] == '\r' || Start[1] == '\n')
+      return false;
+    Len = 1;
+    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
+           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
+           Start[Len] != ' ') {
+      // The buffer is null-terminated and && short-circuits, so the lookahead
+      // below never reads past the end of the buffer.
+      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
+          Start[Len + 2] == '\n') {
+        Len += 3;
+      } else if (Start[Len] == '\\' &&
+                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
+        Len += 2;
+      } else {
+        Len += 1;
+      }
+    }
+    break;
+  default:
     return false;
   }
-  // There is a null byte at the end of the buffer, so we don't have to check
-  // Start[1] is within the buffer.
-  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
-    return false;
-  size_t Len = Matches[0].size();
 
   // The kind has to be an identifier so we can match it against those defined
   // in Keywords. The kind has to be set before the length because the setLength

diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2623,6 +2623,40 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
                     "endmodule");
   ASSERT_EQ(Tokens.size(), 11u) << Tokens;
   EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_VerilogMultiLineListLParen);
+
+  // Escaped identifiers.
+  Tokens = Annotate(R"(\busa+index)");
+  ASSERT_EQ(Tokens.size(), 2u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  Tokens = Annotate(R"(\busa+index ;)");
+  ASSERT_EQ(Tokens.size(), 3u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  EXPECT_EQ(Tokens[0]->TokenText, R"(\busa+index)");
+  EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
+  Tokens = Annotate(R"(\busa+index
+;)");
+  ASSERT_EQ(Tokens.size(), 3u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
+  // The escaped identifier can be broken by an escaped newline. The result is
+  // still 1 identifier.
+  Tokens = Annotate(R"(\busa+index\
++
+;)");
+  ASSERT_EQ(Tokens.size(), 3u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  EXPECT_EQ(Tokens[0]->TokenText, R"(\busa+index\
++)");
+  EXPECT_TOKEN(Tokens[1], tok::semi, TT_Unknown);
+  // An escaped newline should not be treated as an escaped identifier.
+  Tokens = Annotate("\\\n");
+  ASSERT_EQ(Tokens.size(), 1u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::eof, TT_Unknown);
+  // Macros.
+  Tokens = Annotate("`define x x``x");
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::hash, TT_Unknown);
+  EXPECT_TOKEN(Tokens[4], tok::hashhash, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {