Skip to content

Commit c92056d

Browse files
committed
[Clang][C++23] P2071 Named universal character escapes
Implements [[ https://wg21.link/p2071r1 | P2071 Named Universal Character Escapes ]] as an extension in all language modes. The change to not warn in C++23 mode will be done in a later patch, once this paper is plenary approved (in July).

We add:
* A code generator that transforms `UnicodeData.txt` and `NameAliases.txt` into a space-efficient data structure that can be queried in `O(NameLength)`.
* A set of functions in `Unicode.h` to query that data, including:
  * A function to find an exact match of a given Unicode character name.
  * A function to perform loose matching (ignoring case, space, underscore, and medial hyphen).
  * A function returning the best matching codepoint for a given string per edit distance.
* Support for `\N{}` escape sequences in string and character literals, with loose-matching and typo diagnostics/fix-its.
* Support for `\N{}` as a UCN, with loose-matching diagnostics/fix-its.

Loose matching is considered an error, to closely match the semantics of P2071.

The generated data contributes 280kB of data to the binaries.

`UnicodeData.txt` and `NameAliases.txt` are not committed to the repository in this patch, and regenerating the data is a manual process.

Reviewed By: tahonermann

Differential Revision: https://reviews.llvm.org/D123064
1 parent f8c1c9a commit c92056d

18 files changed

+22720
-53
lines changed

clang/include/clang/Basic/DiagnosticLexKinds.td

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
128128
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
129129

130130
def ext_delimited_escape_sequence : Extension<
131-
"delimited escape sequences are a Clang extension">,
131+
"%select{delimited|named}0 escape sequences are a Clang extension">,
132132
InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
133133
def err_delimited_escape_empty : Error<
134134
"delimited escape sequence cannot be empty">;
@@ -138,17 +138,24 @@ def err_delimited_escape_invalid : Error<
138138
"invalid digit '%0' in escape sequence">;
139139
def err_hex_escape_no_digits : Error<
140140
"\\%0 used with no following hex digits">;
141+
def err_invalid_ucn_name : Error<
142+
"'%0' is not a valid Unicode character name">;
143+
def note_invalid_ucn_name_loose_matching : Note<
144+
"characters names in Unicode escape sequences are sensitive to case and whitespaces">;
145+
def note_invalid_ucn_name_candidate : Note<
146+
"did you mean %0 ('%2' U+%1)?">;
147+
141148
def warn_ucn_escape_no_digits : Warning<
142149
"\\%0 used with no following hex digits; "
143150
"treating as '\\' followed by identifier">, InGroup<Unicode>;
144151
def err_ucn_escape_incomplete : Error<
145152
"incomplete universal character name">;
146153
def warn_delimited_ucn_incomplete : Warning<
147154
"incomplete delimited universal character name; "
148-
"treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
155+
"treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
149156
def warn_delimited_ucn_empty : Warning<
150157
"empty delimited universal character name; "
151-
"treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
158+
"treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
152159
def warn_ucn_escape_incomplete : Warning<
153160
"incomplete universal character name; "
154161
"treating as '\\' followed by identifier">, InGroup<Unicode>;

clang/include/clang/Lex/Lexer.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -769,6 +769,11 @@ class Lexer : public PreprocessorLexer {
769769
void codeCompleteIncludedFile(const char *PathStart,
770770
const char *CompletionPoint, bool IsAngled);
771771

772+
llvm::Optional<uint32_t>
773+
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
774+
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
775+
Token *Result);
776+
772777
/// Read a universal character name.
773778
///
774779
/// \param StartPtr The position in the source buffer after the initial '\'.

clang/lib/Lex/Lexer.cpp

Lines changed: 127 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "llvm/Support/MathExtras.h"
3838
#include "llvm/Support/MemoryBufferRef.h"
3939
#include "llvm/Support/NativeFormatting.h"
40+
#include "llvm/Support/Unicode.h"
4041
#include "llvm/Support/UnicodeCharRanges.h"
4142
#include <algorithm>
4243
#include <cassert>
@@ -3119,27 +3120,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
31193120
return false;
31203121
}
31213122

3122-
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3123-
Token *Result) {
3123+
llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3124+
const char *SlashLoc,
3125+
Token *Result) {
31243126
unsigned CharSize;
31253127
char Kind = getCharAndSize(StartPtr, CharSize);
3126-
bool Delimited = false;
3127-
bool FoundEndDelimiter = false;
3128-
unsigned Count = 0;
3129-
bool Diagnose = Result && !isLexingRawMode();
3128+
assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
31303129

31313130
unsigned NumHexDigits;
31323131
if (Kind == 'u')
31333132
NumHexDigits = 4;
31343133
else if (Kind == 'U')
31353134
NumHexDigits = 8;
3136-
else
3137-
return 0;
3135+
3136+
bool Delimited = false;
3137+
bool FoundEndDelimiter = false;
3138+
unsigned Count = 0;
3139+
bool Diagnose = Result && !isLexingRawMode();
31383140

31393141
if (!LangOpts.CPlusPlus && !LangOpts.C99) {
31403142
if (Diagnose)
31413143
Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3142-
return 0;
3144+
return llvm::None;
31433145
}
31443146

31453147
const char *CurPtr = StartPtr + CharSize;
@@ -3166,14 +3168,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
31663168
break;
31673169
if (Diagnose)
31683170
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
3169-
<< StringRef(&C, 1);
3170-
return 0;
3171+
<< StringRef(KindLoc, 1);
3172+
return llvm::None;
31713173
}
31723174

31733175
if (CodePoint & 0xF000'0000) {
31743176
if (Diagnose)
31753177
Diag(KindLoc, diag::err_escape_too_large) << 0;
3176-
return 0;
3178+
return llvm::None;
31773179
}
31783180

31793181
CodePoint <<= 4;
@@ -3187,7 +3189,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
31873189
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
31883190
: diag::warn_ucn_escape_no_digits)
31893191
<< StringRef(KindLoc, 1);
3190-
return 0;
3192+
return llvm::None;
3193+
}
3194+
3195+
if (Delimited && Kind == 'U') {
3196+
if (Diagnose)
3197+
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3198+
return llvm::None;
31913199
}
31923200

31933201
if (!Delimited && Count != NumHexDigits) {
@@ -3200,11 +3208,11 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
32003208
<< FixItHint::CreateReplacement(URange, "u");
32013209
}
32023210
}
3203-
return 0;
3211+
return llvm::None;
32043212
}
32053213

32063214
if (Delimited && PP) {
3207-
Diag(BufferPtr, diag::ext_delimited_escape_sequence);
3215+
Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
32083216
}
32093217

32103218
if (Result) {
@@ -3217,6 +3225,110 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
32173225
} else {
32183226
StartPtr = CurPtr;
32193227
}
3228+
return CodePoint;
3229+
}
3230+
3231+
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3232+
Token *Result) {
3233+
unsigned CharSize;
3234+
bool Diagnose = Result && !isLexingRawMode();
3235+
3236+
char C = getCharAndSize(StartPtr, CharSize);
3237+
assert(C == 'N' && "expected \\N{...}");
3238+
3239+
const char *CurPtr = StartPtr + CharSize;
3240+
const char *KindLoc = &CurPtr[-1];
3241+
3242+
C = getCharAndSize(CurPtr, CharSize);
3243+
if (C != '{') {
3244+
if (Diagnose)
3245+
Diag(StartPtr, diag::warn_ucn_escape_incomplete);
3246+
return llvm::None;
3247+
}
3248+
CurPtr += CharSize;
3249+
const char *StartName = CurPtr;
3250+
bool FoundEndDelimiter = false;
3251+
llvm::SmallVector<char, 30> Buffer;
3252+
while (C) {
3253+
C = getCharAndSize(CurPtr, CharSize);
3254+
CurPtr += CharSize;
3255+
if (C == '}') {
3256+
FoundEndDelimiter = true;
3257+
break;
3258+
}
3259+
3260+
if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
3261+
break;
3262+
Buffer.push_back(C);
3263+
}
3264+
3265+
if (!FoundEndDelimiter || Buffer.empty()) {
3266+
if (Diagnose)
3267+
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3268+
: diag::warn_delimited_ucn_incomplete)
3269+
<< StringRef(KindLoc, 1);
3270+
return llvm::None;
3271+
}
3272+
3273+
StringRef Name(Buffer.data(), Buffer.size());
3274+
llvm::Optional<char32_t> Res =
3275+
llvm::sys::unicode::nameToCodepointStrict(Name);
3276+
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3277+
if (!Res) {
3278+
if (!isLexingRawMode()) {
3279+
Diag(StartPtr, diag::err_invalid_ucn_name)
3280+
<< StringRef(Buffer.data(), Buffer.size());
3281+
LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3282+
if (LooseMatch) {
3283+
Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3284+
<< FixItHint::CreateReplacement(
3285+
makeCharRange(*this, StartName, CurPtr - CharSize),
3286+
LooseMatch->Name);
3287+
}
3288+
}
3289+
// When finding a match using Unicode loose matching rules
3290+
// recover after having emitted a diagnostic.
3291+
if (!LooseMatch)
3292+
return llvm::None;
3293+
// We do not offer missspelled character names suggestions here
3294+
// as the set of what would be a valid suggestion depends on context,
3295+
// and we should not make invalid suggestions.
3296+
}
3297+
3298+
if (Diagnose && PP && !LooseMatch)
3299+
Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1;
3300+
3301+
if (LooseMatch)
3302+
Res = LooseMatch->CodePoint;
3303+
3304+
if (Result) {
3305+
Result->setFlag(Token::HasUCN);
3306+
if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4))
3307+
StartPtr = CurPtr;
3308+
else
3309+
while (StartPtr != CurPtr)
3310+
(void)getAndAdvanceChar(StartPtr, *Result);
3311+
} else {
3312+
StartPtr = CurPtr;
3313+
}
3314+
return *Res;
3315+
}
3316+
3317+
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3318+
Token *Result) {
3319+
3320+
unsigned CharSize;
3321+
llvm::Optional<uint32_t> CodePointOpt;
3322+
char Kind = getCharAndSize(StartPtr, CharSize);
3323+
if (Kind == 'u' || Kind == 'U')
3324+
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3325+
else if (Kind == 'N')
3326+
CodePointOpt = tryReadNamedUCN(StartPtr, Result);
3327+
3328+
if (!CodePointOpt)
3329+
return 0;
3330+
3331+
uint32_t CodePoint = *CodePointOpt;
32203332

32213333
// Don't apply C family restrictions to UCNs in assembly mode
32223334
if (LangOpts.AsmPreprocessor)

0 commit comments

Comments
 (0)