37
37
#include " llvm/Support/MathExtras.h"
38
38
#include " llvm/Support/MemoryBufferRef.h"
39
39
#include " llvm/Support/NativeFormatting.h"
40
+ #include " llvm/Support/Unicode.h"
40
41
#include " llvm/Support/UnicodeCharRanges.h"
41
42
#include < algorithm>
42
43
#include < cassert>
@@ -3119,27 +3120,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3119
3120
return false ;
3120
3121
}
3121
3122
3122
- uint32_t Lexer::tryReadUCN (const char *&StartPtr, const char *SlashLoc,
3123
- Token *Result) {
3123
+ llvm::Optional<uint32_t > Lexer::tryReadNumericUCN (const char *&StartPtr,
3124
+ const char *SlashLoc,
3125
+ Token *Result) {
3124
3126
unsigned CharSize;
3125
3127
char Kind = getCharAndSize (StartPtr, CharSize);
3126
- bool Delimited = false ;
3127
- bool FoundEndDelimiter = false ;
3128
- unsigned Count = 0 ;
3129
- bool Diagnose = Result && !isLexingRawMode ();
3128
+ assert ((Kind == ' u' || Kind == ' U' ) && " expected a UCN" );
3130
3129
3131
3130
unsigned NumHexDigits;
3132
3131
if (Kind == ' u' )
3133
3132
NumHexDigits = 4 ;
3134
3133
else if (Kind == ' U' )
3135
3134
NumHexDigits = 8 ;
3136
- else
3137
- return 0 ;
3135
+
3136
+ bool Delimited = false ;
3137
+ bool FoundEndDelimiter = false ;
3138
+ unsigned Count = 0 ;
3139
+ bool Diagnose = Result && !isLexingRawMode ();
3138
3140
3139
3141
if (!LangOpts.CPlusPlus && !LangOpts.C99 ) {
3140
3142
if (Diagnose)
3141
3143
Diag (SlashLoc, diag::warn_ucn_not_valid_in_c89);
3142
- return 0 ;
3144
+ return llvm::None ;
3143
3145
}
3144
3146
3145
3147
const char *CurPtr = StartPtr + CharSize;
@@ -3166,14 +3168,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3166
3168
break ;
3167
3169
if (Diagnose)
3168
3170
Diag (BufferPtr, diag::warn_delimited_ucn_incomplete)
3169
- << StringRef (&C , 1 );
3170
- return 0 ;
3171
+ << StringRef (KindLoc , 1 );
3172
+ return llvm::None ;
3171
3173
}
3172
3174
3173
3175
if (CodePoint & 0xF000'0000 ) {
3174
3176
if (Diagnose)
3175
3177
Diag (KindLoc, diag::err_escape_too_large) << 0 ;
3176
- return 0 ;
3178
+ return llvm::None ;
3177
3179
}
3178
3180
3179
3181
CodePoint <<= 4 ;
@@ -3187,7 +3189,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3187
3189
Diag (StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3188
3190
: diag::warn_ucn_escape_no_digits)
3189
3191
<< StringRef (KindLoc, 1 );
3190
- return 0 ;
3192
+ return llvm::None;
3193
+ }
3194
+
3195
+ if (Delimited && Kind == ' U' ) {
3196
+ if (Diagnose)
3197
+ Diag (StartPtr, diag::err_hex_escape_no_digits) << StringRef (KindLoc, 1 );
3198
+ return llvm::None;
3191
3199
}
3192
3200
3193
3201
if (!Delimited && Count != NumHexDigits) {
@@ -3200,11 +3208,11 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3200
3208
<< FixItHint::CreateReplacement (URange, " u" );
3201
3209
}
3202
3210
}
3203
- return 0 ;
3211
+ return llvm::None ;
3204
3212
}
3205
3213
3206
3214
if (Delimited && PP) {
3207
- Diag (BufferPtr, diag::ext_delimited_escape_sequence);
3215
+ Diag (BufferPtr, diag::ext_delimited_escape_sequence) << /* delimited */ 0 ;
3208
3216
}
3209
3217
3210
3218
if (Result) {
@@ -3217,6 +3225,110 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3217
3225
} else {
3218
3226
StartPtr = CurPtr;
3219
3227
}
3228
+ return CodePoint;
3229
+ }
3230
+
3231
+ llvm::Optional<uint32_t > Lexer::tryReadNamedUCN (const char *&StartPtr,
3232
+ Token *Result) {
3233
+ unsigned CharSize;
3234
+ bool Diagnose = Result && !isLexingRawMode ();
3235
+
3236
+ char C = getCharAndSize (StartPtr, CharSize);
3237
+ assert (C == ' N' && " expected \\ N{...}" );
3238
+
3239
+ const char *CurPtr = StartPtr + CharSize;
3240
+ const char *KindLoc = &CurPtr[-1 ];
3241
+
3242
+ C = getCharAndSize (CurPtr, CharSize);
3243
+ if (C != ' {' ) {
3244
+ if (Diagnose)
3245
+ Diag (StartPtr, diag::warn_ucn_escape_incomplete);
3246
+ return llvm::None;
3247
+ }
3248
+ CurPtr += CharSize;
3249
+ const char *StartName = CurPtr;
3250
+ bool FoundEndDelimiter = false ;
3251
+ llvm::SmallVector<char , 30 > Buffer;
3252
+ while (C) {
3253
+ C = getCharAndSize (CurPtr, CharSize);
3254
+ CurPtr += CharSize;
3255
+ if (C == ' }' ) {
3256
+ FoundEndDelimiter = true ;
3257
+ break ;
3258
+ }
3259
+
3260
+ if (!isAlphanumeric (C) && C != ' _' && C != ' -' && C != ' ' )
3261
+ break ;
3262
+ Buffer.push_back (C);
3263
+ }
3264
+
3265
+ if (!FoundEndDelimiter || Buffer.empty ()) {
3266
+ if (Diagnose)
3267
+ Diag (StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3268
+ : diag::warn_delimited_ucn_incomplete)
3269
+ << StringRef (KindLoc, 1 );
3270
+ return llvm::None;
3271
+ }
3272
+
3273
+ StringRef Name (Buffer.data (), Buffer.size ());
3274
+ llvm::Optional<char32_t > Res =
3275
+ llvm::sys::unicode::nameToCodepointStrict (Name);
3276
+ llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3277
+ if (!Res) {
3278
+ if (!isLexingRawMode ()) {
3279
+ Diag (StartPtr, diag::err_invalid_ucn_name)
3280
+ << StringRef (Buffer.data (), Buffer.size ());
3281
+ LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching (Name);
3282
+ if (LooseMatch) {
3283
+ Diag (StartName, diag::note_invalid_ucn_name_loose_matching)
3284
+ << FixItHint::CreateReplacement (
3285
+ makeCharRange (*this , StartName, CurPtr - CharSize),
3286
+ LooseMatch->Name );
3287
+ }
3288
+ }
3289
+ // When finding a match using Unicode loose matching rules
3290
+ // recover after having emitted a diagnostic.
3291
+ if (!LooseMatch)
3292
+ return llvm::None;
3293
+ // We do not offer missspelled character names suggestions here
3294
+ // as the set of what would be a valid suggestion depends on context,
3295
+ // and we should not make invalid suggestions.
3296
+ }
3297
+
3298
+ if (Diagnose && PP && !LooseMatch)
3299
+ Diag (BufferPtr, diag::ext_delimited_escape_sequence) << /* named*/ 1 ;
3300
+
3301
+ if (LooseMatch)
3302
+ Res = LooseMatch->CodePoint ;
3303
+
3304
+ if (Result) {
3305
+ Result->setFlag (Token::HasUCN);
3306
+ if (CurPtr - StartPtr == (ptrdiff_t )(Buffer.size () + 4 ))
3307
+ StartPtr = CurPtr;
3308
+ else
3309
+ while (StartPtr != CurPtr)
3310
+ (void )getAndAdvanceChar (StartPtr, *Result);
3311
+ } else {
3312
+ StartPtr = CurPtr;
3313
+ }
3314
+ return *Res;
3315
+ }
3316
+
3317
+ uint32_t Lexer::tryReadUCN (const char *&StartPtr, const char *SlashLoc,
3318
+ Token *Result) {
3319
+
3320
+ unsigned CharSize;
3321
+ llvm::Optional<uint32_t > CodePointOpt;
3322
+ char Kind = getCharAndSize (StartPtr, CharSize);
3323
+ if (Kind == ' u' || Kind == ' U' )
3324
+ CodePointOpt = tryReadNumericUCN (StartPtr, SlashLoc, Result);
3325
+ else if (Kind == ' N' )
3326
+ CodePointOpt = tryReadNamedUCN (StartPtr, Result);
3327
+
3328
+ if (!CodePointOpt)
3329
+ return 0 ;
3330
+
3331
+ uint32_t CodePoint = *CodePointOpt;
3220
3332
3221
3333
// Don't apply C family restrictions to UCNs in assembly mode
3222
3334
if (LangOpts.AsmPreprocessor )
0 commit comments