Skip to content

Commit 913961e

Browse files
Fznamznon authored and yuxuanchen1997 committed
[clang] Inject tokens containing #embed back into token stream (#97274)
Instead of playing "whack a mole" with places where #embed should be expanded as a comma-separated list, just inject each byte as a token back into the stream, separated by commas.
1 parent 4e50ae9 commit 913961e

File tree

10 files changed

+63
-79
lines changed

10 files changed

+63
-79
lines changed

clang/include/clang/Basic/TokenKinds.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ TOK(raw_identifier) // Used only in raw lexing mode.
165165
// C99 6.4.4.2: Floating Constants
166166
TOK(numeric_constant) // 0x123
167167

168+
// Directly holds numerical value. Used to process C23 #embed.
169+
TOK(binary_data)
170+
168171
// C99 6.4.4: Character Constants
169172
TOK(char_constant) // 'a'
170173
TOK(wide_char_constant) // L'b'

clang/include/clang/Basic/TokenKinds.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ inline bool isLiteral(TokenKind K) {
9898
return K == tok::numeric_constant || K == tok::char_constant ||
9999
K == tok::wide_char_constant || K == tok::utf8_char_constant ||
100100
K == tok::utf16_char_constant || K == tok::utf32_char_constant ||
101-
isStringLiteral(K) || K == tok::header_name;
101+
isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
102102
}
103103

104104
/// Return true if this is any of tok::annot_* kinds.

clang/include/clang/Lex/Preprocessor.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,17 +2123,18 @@ class Preprocessor {
21232123
char
21242124
getSpellingOfSingleCharacterNumericConstant(const Token &Tok,
21252125
bool *Invalid = nullptr) const {
2126-
assert(Tok.is(tok::numeric_constant) &&
2126+
assert((Tok.is(tok::numeric_constant) || Tok.is(tok::binary_data)) &&
21272127
Tok.getLength() == 1 && "Called on unsupported token");
21282128
assert(!Tok.needsCleaning() && "Token can't need cleaning with length 1");
21292129

21302130
// If the token is carrying a literal data pointer, just use it.
21312131
if (const char *D = Tok.getLiteralData())
2132-
return *D;
2132+
return (Tok.getKind() == tok::binary_data) ? *D : *D - '0';
21332133

2134+
assert(Tok.is(tok::numeric_constant) && "binary data with no data");
21342135
// Otherwise, fall back on getCharacterData, which is slower, but always
21352136
// works.
2136-
return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid);
2137+
return *SourceMgr.getCharacterData(Tok.getLocation(), Invalid) - '0';
21372138
}
21382139

21392140
/// Retrieve the name of the immediate macro expansion.

clang/include/clang/Parse/Parser.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2127,7 +2127,7 @@ class Parser : public CodeCompletionHandler {
21272127
};
21282128
ExprResult ParseInitializerWithPotentialDesignator(DesignatorCompletionInfo);
21292129
ExprResult createEmbedExpr();
2130-
void ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs);
2130+
void injectEmbedTokens();
21312131

21322132
//===--------------------------------------------------------------------===//
21332133
// clang Expressions
@@ -3834,7 +3834,6 @@ class Parser : public CodeCompletionHandler {
38343834
AnnotateTemplateIdTokenAsType(CXXScopeSpec &SS,
38353835
ImplicitTypenameContext AllowImplicitTypename,
38363836
bool IsClassName = false);
3837-
void ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs);
38383837
bool ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
38393838
TemplateTy Template, SourceLocation OpenLoc);
38403839
ParsedTemplateArgument ParseTemplateTemplateArgument();

clang/lib/Parse/ParseExpr.cpp

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
10991099

11001100
// primary-expression
11011101
case tok::numeric_constant:
1102+
case tok::binary_data:
11021103
// constant: integer-constant
11031104
// constant: floating-constant
11041105

@@ -1148,18 +1149,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
11481149
}
11491150

11501151
case tok::annot_embed: {
1151-
// We've met #embed in a context where a single value is expected. Take last
1152-
// element from #embed data as if it were a comma expression.
1153-
EmbedAnnotationData *Data =
1154-
reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
1155-
SourceLocation StartLoc = ConsumeAnnotationToken();
1156-
ASTContext &Context = Actions.getASTContext();
1157-
Res = IntegerLiteral::Create(Context,
1158-
llvm::APInt(CHAR_BIT, Data->BinaryData.back()),
1159-
Context.UnsignedCharTy, StartLoc);
1160-
if (Data->BinaryData.size() > 1)
1161-
Diag(StartLoc, diag::warn_unused_comma_left_operand);
1162-
break;
1152+
injectEmbedTokens();
1153+
return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast,
1154+
isVectorLiteral, NotPrimaryExpression);
11631155
}
11641156

11651157
case tok::kw___super:
@@ -3584,15 +3576,29 @@ ExprResult Parser::ParseFoldExpression(ExprResult LHS,
35843576
T.getCloseLocation());
35853577
}
35863578

3587-
void Parser::ExpandEmbedDirective(SmallVectorImpl<Expr *> &Exprs) {
3579+
void Parser::injectEmbedTokens() {
35883580
EmbedAnnotationData *Data =
35893581
reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
3590-
SourceLocation StartLoc = ConsumeAnnotationToken();
3591-
ASTContext &Context = Actions.getASTContext();
3592-
for (auto Byte : Data->BinaryData) {
3593-
Exprs.push_back(IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
3594-
Context.UnsignedCharTy, StartLoc));
3595-
}
3582+
MutableArrayRef<Token> Toks(PP.getPreprocessorAllocator().Allocate<Token>(
3583+
Data->BinaryData.size() * 2 - 1),
3584+
Data->BinaryData.size() * 2 - 1);
3585+
unsigned I = 0;
3586+
for (auto &Byte : Data->BinaryData) {
3587+
Toks[I].startToken();
3588+
Toks[I].setKind(tok::binary_data);
3589+
Toks[I].setLocation(Tok.getLocation());
3590+
Toks[I].setLength(1);
3591+
Toks[I].setLiteralData(&Byte);
3592+
if (I != ((Data->BinaryData.size() - 1) * 2)) {
3593+
Toks[I + 1].startToken();
3594+
Toks[I + 1].setKind(tok::comma);
3595+
Toks[I + 1].setLocation(Tok.getLocation());
3596+
}
3597+
I += 2;
3598+
}
3599+
PP.EnterTokenStream(std::move(Toks), /*DisableMacroExpansion=*/true,
3600+
/*IsReinject=*/false);
3601+
ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
35963602
}
35973603

35983604
/// ParseExpressionList - Used for C/C++ (argument-)expression-list.
@@ -3630,17 +3636,8 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
36303636
if (getLangOpts().CPlusPlus11 && Tok.is(tok::l_brace)) {
36313637
Diag(Tok, diag::warn_cxx98_compat_generalized_initializer_lists);
36323638
Expr = ParseBraceInitializer();
3633-
} else if (Tok.is(tok::annot_embed)) {
3634-
ExpandEmbedDirective(Exprs);
3635-
if (Tok.isNot(tok::comma))
3636-
break;
3637-
Token Comma = Tok;
3638-
ConsumeToken();
3639-
checkPotentialAngleBracketDelimiter(Comma);
3640-
continue;
3641-
} else {
3639+
} else
36423640
Expr = ParseAssignmentExpression();
3643-
}
36443641

36453642
if (EarlyTypoCorrection)
36463643
Expr = Actions.CorrectDelayedTyposInExpr(Expr);

clang/lib/Parse/ParseTemplate.cpp

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1523,19 +1523,6 @@ ParsedTemplateArgument Parser::ParseTemplateArgument() {
15231523
ExprArg.get(), Loc);
15241524
}
15251525

1526-
void Parser::ExpandEmbedIntoTemplateArgList(TemplateArgList &TemplateArgs) {
1527-
EmbedAnnotationData *Data =
1528-
reinterpret_cast<EmbedAnnotationData *>(Tok.getAnnotationValue());
1529-
SourceLocation StartLoc = ConsumeAnnotationToken();
1530-
ASTContext &Context = Actions.getASTContext();
1531-
for (auto Byte : Data->BinaryData) {
1532-
Expr *E = IntegerLiteral::Create(Context, llvm::APInt(CHAR_BIT, Byte),
1533-
Context.UnsignedCharTy, StartLoc);
1534-
TemplateArgs.push_back(
1535-
ParsedTemplateArgument(ParsedTemplateArgument::NonType, E, StartLoc));
1536-
}
1537-
}
1538-
15391526
/// ParseTemplateArgumentList - Parse a C++ template-argument-list
15401527
/// (C++ [temp.names]). Returns true if there was an error.
15411528
///
@@ -1560,24 +1547,20 @@ bool Parser::ParseTemplateArgumentList(TemplateArgList &TemplateArgs,
15601547

15611548
do {
15621549
PreferredType.enterFunctionArgument(Tok.getLocation(), RunSignatureHelp);
1563-
if (Tok.is(tok::annot_embed)) {
1564-
ExpandEmbedIntoTemplateArgList(TemplateArgs);
1565-
} else {
1566-
ParsedTemplateArgument Arg = ParseTemplateArgument();
1567-
SourceLocation EllipsisLoc;
1568-
if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
1569-
Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
1570-
1571-
if (Arg.isInvalid()) {
1572-
if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
1573-
RunSignatureHelp();
1574-
return true;
1575-
}
1576-
1577-
// Save this template argument.
1578-
TemplateArgs.push_back(Arg);
1550+
ParsedTemplateArgument Arg = ParseTemplateArgument();
1551+
SourceLocation EllipsisLoc;
1552+
if (TryConsumeToken(tok::ellipsis, EllipsisLoc))
1553+
Arg = Actions.ActOnPackExpansion(Arg, EllipsisLoc);
1554+
1555+
if (Arg.isInvalid()) {
1556+
if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
1557+
RunSignatureHelp();
1558+
return true;
15791559
}
15801560

1561+
// Save this template argument.
1562+
TemplateArgs.push_back(Arg);
1563+
15811564
// If the next token is a comma, consume it and keep reading
15821565
// arguments.
15831566
} while (TryConsumeToken(tok::comma));

clang/lib/Sema/SemaExpr.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3643,9 +3643,9 @@ bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero) {
36433643
ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
36443644
// Fast path for a single digit (which is quite common). A single digit
36453645
// cannot have a trigraph, escaped newline, radix prefix, or suffix.
3646-
if (Tok.getLength() == 1) {
3646+
if (Tok.getLength() == 1 || Tok.getKind() == tok::binary_data) {
36473647
const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok);
3648-
return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
3648+
return ActOnIntegerConstant(Tok.getLocation(), Val);
36493649
}
36503650

36513651
SmallString<128> SpellingBuffer;

clang/test/Preprocessor/embed_codegen.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ a
4343
};
4444

4545
// CHECK: store i32 107, ptr %b, align 4
46-
int b =
46+
int b = (
4747
#embed<jk.txt>
48+
)
4849
;
4950

5051

clang/test/Preprocessor/embed_constexpr.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -Wno-c23-extensions
22
// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -fexperimental-new-constant-interpreter -Wno-c23-extensions
3+
// expected-no-diagnostics
34

45
constexpr int value(int a, int b) {
56
return a + b;
@@ -46,7 +47,7 @@ int array[
4647
static_assert(sizeof(array) / sizeof(int) == 'j');
4748

4849
constexpr int comma_expr = (
49-
#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
50+
#embed <jk.txt>
5051
);
5152
static_assert(comma_expr == 'k');
5253

clang/test/Preprocessor/embed_weird.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,17 @@ _Static_assert(
2727
_Static_assert(sizeof(
2828
#embed <single_byte.txt>
2929
) ==
30-
sizeof(unsigned char)
30+
sizeof(int)
3131
, ""
3232
);
3333
_Static_assert(sizeof
3434
#embed <single_byte.txt>
3535
, ""
3636
);
3737
_Static_assert(sizeof(
38-
#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
38+
#embed <jk.txt>
3939
) ==
40-
sizeof(unsigned char)
40+
sizeof(int)
4141
, ""
4242
);
4343

@@ -73,10 +73,10 @@ void do_stuff() {
7373
// Ensure that we don't accidentally allow you to initialize an unsigned char *
7474
// from embedded data; the data is modeled as a string literal internally, but
7575
// is not actually a string literal.
76-
const unsigned char *ptr =
76+
const unsigned char *ptr = (
7777
#embed <jk.txt> // expected-warning {{left operand of comma operator has no effect}}
78-
; // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'unsigned char'}} \
79-
cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'unsigned char'}}
78+
); // c-error@-2 {{incompatible integer to pointer conversion initializing 'const unsigned char *' with an expression of type 'int'}} \
79+
cxx-error@-2 {{cannot initialize a variable of type 'const unsigned char *' with an rvalue of type 'int'}}
8080

8181
// However, there are some cases where this is fine and should work.
8282
const unsigned char *null_ptr_1 =
@@ -101,11 +101,10 @@ constexpr unsigned char ch =
101101
;
102102
static_assert(ch == 0);
103103

104-
void foobar(float x, char y, char z); // cxx-note {{candidate function not viable: requires 3 arguments, but 1 was provided}}
105-
// c-note@-1 {{declared here}}
106-
void g1() { foobar((float) // cxx-error {{no matching function for call to 'foobar'}}
107-
#embed "numbers.txt" limit(3) // expected-warning {{left operand of comma operator has no effect}}
108-
); // c-error {{too few arguments to function call, expected 3, have 1}}
104+
void foobar(float x, char y, char z);
105+
void g1() { foobar((float)
106+
#embed "numbers.txt" limit(3)
107+
);
109108
}
110109

111110
#if __cplusplus

0 commit comments

Comments
 (0)