Skip to content

Commit fcb6737

Browse files
authored
[clang-format] Support of TableGen identifiers beginning with a number. (#78571)
TableGen allows identifiers that begin with a digit. This patch adds support for recognizing such identifiers.
1 parent a8a3711 commit fcb6737

File tree

3 files changed

+68
-1
lines changed

3 files changed

+68
-1
lines changed

clang/lib/Format/FormatTokenLexer.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
9393
// string literals are correctly identified.
9494
handleCSharpVerbatimAndInterpolatedStrings();
9595
}
96-
if (Style.isTableGen())
96+
if (Style.isTableGen()) {
9797
handleTableGenMultilineString();
98+
handleTableGenNumericLikeIdentifier();
99+
}
98100
if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
99101
FirstInLineIndex = Tokens.size() - 1;
100102
} while (Tokens.back()->isNot(tok::eof));
@@ -804,6 +806,44 @@ void FormatTokenLexer::handleTableGenMultilineString() {
804806
FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
805807
}
806808

809+
void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
810+
FormatToken *Tok = Tokens.back();
811+
// TableGen identifiers can begin with digits. Such tokens are lexed as
812+
// numeric_constant now.
813+
if (Tok->isNot(tok::numeric_constant))
814+
return;
815+
StringRef Text = Tok->TokenText;
816+
// The following check is based on llvm::TGLexer::LexToken.
817+
// That lexes the token as a number if any of the following holds:
818+
// 1. It starts with '+', '-'.
819+
// 2. All the characters are digits.
820+
// 3. The first non-digit character is 'b', and the next is '0' or '1'.
821+
// 4. The first non-digit character is 'x', and the next is a hex digit.
822+
// Note that in the case 3 and 4, if the next character does not exists in
823+
// this token, the token is an identifier.
824+
if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
825+
return;
826+
const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
827+
// All the characters are digits
828+
if (NonDigitPos == StringRef::npos)
829+
return;
830+
char FirstNonDigit = Text[NonDigitPos];
831+
if (NonDigitPos < Text.size() - 1) {
832+
char TheNext = Text[NonDigitPos + 1];
833+
// Regarded as a binary number.
834+
if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
835+
return;
836+
// Regarded as hex number.
837+
if (FirstNonDigit == 'x' && isxdigit(TheNext))
838+
return;
839+
}
840+
if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
841+
// This is actually an identifier in TableGen.
842+
Tok->Tok.setKind(tok::identifier);
843+
Tok->Tok.setIdentifierInfo(nullptr);
844+
}
845+
}
846+
807847
void FormatTokenLexer::handleTemplateStrings() {
808848
FormatToken *BacktickToken = Tokens.back();
809849

clang/lib/Format/FormatTokenLexer.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ class FormatTokenLexer {
9797

9898
// Handles TableGen multiline strings. It has the form [{ ... }].
9999
void handleTableGenMultilineString();
100+
// Handles TableGen numeric like identifiers.
101+
// They have the form [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*), but only in the
102+
// case where the token is not lexed as an integer.
103+
void handleTableGenNumericLikeIdentifier();
100104

101105
void tryParsePythonComment();
102106

clang/unittests/Format/TokenAnnotatorTest.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2209,6 +2209,29 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
22092209
EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1);
22102210
EXPECT_TRUE(Tokens[0]->IsMultiline);
22112211
EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof(" the string. }]") - 1);
2212+
2213+
// Identifier tokens. In TableGen, identifiers can begin with a number.
2214+
// In ambiguous cases, the lexer tries to lex it as a number.
2215+
// Even if that attempt fails, it does not fall back to identifier lexing and
2216+
// treats the token as an error.
2217+
// The ambiguity is not documented. The results of these tests are based on the
2218+
// implementation of llvm::TGLexer::LexToken.
2219+
Tokens = Annotate("1234");
2220+
EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
2221+
Tokens = Annotate("0x1abC");
2222+
EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
2223+
// This is not valid number syntax, but it is not an identifier either.
2224+
Tokens = Annotate("0x1234x");
2225+
EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
2226+
Tokens = Annotate("identifier");
2227+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2228+
// Identifier beginning with a number.
2229+
Tokens = Annotate("0x");
2230+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2231+
Tokens = Annotate("2dVector");
2232+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
2233+
Tokens = Annotate("01234Vector");
2234+
EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
22122235
}
22132236

22142237
TEST_F(TokenAnnotatorTest, UnderstandConstructors) {

0 commit comments

Comments
 (0)