-
Notifications
You must be signed in to change notification settings - Fork 10.5k
[SR-331] Generate fix-its for confusable characters #732
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
//===--- Confusables.h - Swift Confusable Character Diagnostics -*- C++ -*-===// | ||
// | ||
// This source file is part of the Swift.org open source project | ||
// | ||
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors | ||
// Licensed under Apache License v2.0 with Runtime Library Exception | ||
// | ||
// See http://swift.org/LICENSE.txt for license information | ||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef SWIFT_CONFUSABLES_H | ||
#define SWIFT_CONFUSABLES_H | ||
|
||
#include <stdint.h> | ||
|
||
namespace swift { | ||
namespace confusable { | ||
/// Given a UTF-8 codepoint, determines whether it appears on the Unicode | ||
/// specification table of confusable characters and maps to punctuation, | ||
/// and either returns either the expected ASCII character or 0. | ||
uint32_t tryConvertConfusableCharacterToASCII(uint32_t codepoint); | ||
} | ||
} | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
add_swift_library(swiftParse | ||
Confusables.cpp.gyb | ||
Lexer.cpp | ||
Parser.cpp | ||
ParseDecl.cpp | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
%# -*- mode: C++ -*- | ||
|
||
%# Ignore the following admonition; it applies to the resulting .cpp file only | ||
//// Automatically Generated From Confusables.cpp.gyb. | ||
//// Do Not Edit Directly! | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// This source file is part of the Swift.org open source project | ||
// | ||
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors | ||
// Licensed under Apache License v2.0 with Runtime Library Exception | ||
// | ||
// See http://swift.org/LICENSE.txt for license information | ||
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
%{ | ||
import re | ||
|
||
characters = [ | ||
u"(", u")", u"{", | ||
u"}", u"[", u"]", | ||
u".", u",", u":", | ||
u";", u"=", u"@", | ||
u"#", u"&", u"/", | ||
u"|", u"\\", u"-", | ||
u"*", u"+", u">", | ||
u"<", u"!", u"?" | ||
] | ||
|
||
hexValues = [ hex(ord(char)) for char in characters ] | ||
|
||
modifiedHex = [] | ||
for hexString in hexValues: | ||
noPrefix = hexString[2:] | ||
modifiedHex.append((((4 - len(noPrefix)) * "0") + noPrefix).upper()) | ||
|
||
pairs = [] | ||
with open('../../utils/UnicodeData/confusables.txt', 'r') as f: | ||
pattern = re.compile("(.+)\W+;\W+(.+)\W+;") | ||
for line in f: | ||
match = pattern.match(line) | ||
if match != None: | ||
confusedString = match.group(1).replace(" ", "") | ||
normalString = match.group(2).replace(" ", "") | ||
for hexValue in modifiedHex: | ||
if hexValue == normalString: | ||
confused = hex(int(confusedString, 16)) | ||
normal = hex(int(normalString, 16)) | ||
pairs.append((confused, normal)) | ||
}% | ||
|
||
#include "swift/Parse/Confusables.h" | ||
|
||
uint32_t swift::confusable::tryConvertConfusableCharacterToASCII(uint32_t codepoint) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this really only supports ASCII, why return |
||
switch (codepoint) { | ||
% for (confused, expected) in pairs: | ||
case ${confused}: return ${expected}; | ||
% end | ||
default: return 0; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
//===----------------------------------------------------------------------===// | ||
|
||
#include "swift/Parse/Lexer.h" | ||
#include "swift/Parse/Confusables.h" | ||
#include "swift/AST/DiagnosticsParse.h" | ||
#include "swift/AST/Identifier.h" | ||
#include "swift/Basic/Fallthrough.h" | ||
|
@@ -44,11 +45,7 @@ using clang::isWhitespace; | |
// UTF8 Validation/Encoding/Decoding helper functions | ||
//===----------------------------------------------------------------------===// | ||
|
||
/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream. Return | ||
/// true if it is an erroneous code point. | ||
static bool EncodeToUTF8(unsigned CharValue, | ||
SmallVectorImpl<char> &Result) { | ||
assert(CharValue >= 0x80 && "Single-byte encoding should be already handled"); | ||
bool EncodeToUTF8(unsigned CharValue, SmallVectorImpl<char> &Result) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't used outside the file anymore, so you can leave it alone. |
||
// Number of bits in the value, ignoring leading zeros. | ||
unsigned NumBits = 32-llvm::countLeadingZeros(CharValue); | ||
|
||
|
@@ -99,10 +96,7 @@ static bool isStartOfUTF8Character(unsigned char C) { | |
return (signed char)C >= 0 || C >= 0xC0; // C0 = 0b11000000 | ||
} | ||
|
||
/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a | ||
/// UTF8 character, validate it and advance the lexer past it. This returns the | ||
/// encoded character or ~0U if the encoding is invalid. | ||
static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr, | ||
uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr, | ||
const char *End) { | ||
if (Ptr >= End) | ||
return ~0U; | ||
|
@@ -483,7 +477,7 @@ static bool isValidIdentifierStartCodePoint(uint32_t c) { | |
static bool advanceIf(char const *&ptr, char const *end, | ||
bool (*predicate)(uint32_t)) { | ||
char const *next = ptr; | ||
uint32_t c = validateUTF8CharacterAndAdvance(next, end); | ||
uint32_t c = swift::validateUTF8CharacterAndAdvance(next, end); | ||
if (c == ~0U) | ||
return false; | ||
if (predicate(c)) { | ||
|
@@ -1477,6 +1471,22 @@ void Lexer::lexImpl() { | |
} else { | ||
diagnose(CurPtr-1, diag::lex_invalid_character) | ||
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " "); | ||
|
||
llvm::SmallString<64> buffer; | ||
EncodeToUTF8(codepoint, buffer); | ||
StringRef confusedChar = buffer.str(); | ||
uint32_t expectedCodepoint; | ||
if ((expectedCodepoint = | ||
confusable::tryConvertConfusableCharacterToASCII(codepoint))) { | ||
std::string expected = std::string(1, (char)expectedCodepoint); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use a SmallString here too (which implicitly converts to StringRef), so that we don't have any heap allocation. |
||
StringRef expectedChar = StringRef(expected); | ||
diagnose(CurPtr-1, diag::lex_confusable_character, | ||
confusedChar, expectedChar) | ||
.fixItReplaceChars(getSourceLoc(CurPtr-1), | ||
getSourceLoc(tmp), | ||
expectedChar); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Recovering here would mean jumping to the top of the switch with the expected character instead, but I'm not sure we're set up to handle that very well. We probably assume all over the place that all the ASCII characters are only one byte wide. I guess it's okay not to do anything here. |
||
|
||
CurPtr = tmp; | ||
goto Restart; // Skip presumed whitespace. | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
#include "swift/AST/TypeCheckerDebugConsumer.h" | ||
#include "swift/Basic/Fallthrough.h" | ||
#include "swift/Parse/Lexer.h" | ||
#include "swift/Parse/Confusables.h" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please keep the includes sorted. |
||
#include "llvm/ADT/APInt.h" | ||
#include "llvm/ADT/DenseMap.h" | ||
#include "llvm/ADT/FoldingSet.h" | ||
|
@@ -451,6 +452,36 @@ resolveDeclRefExpr(UnresolvedDeclRefExpr *UDRE, DeclContext *DC) { | |
diagnose(Loc, diag::use_unresolved_identifier, Name, | ||
UDRE->getName().isOperator()) | ||
.highlight(Loc); | ||
|
||
const char *buffer = Name.get(); | ||
std::string expectedIdentifier = std::string(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that this is at the top level, you can use a SmallString instead of std::string here too. |
||
bool isConfused = false; | ||
uint32_t codepoint; | ||
int offset = 0; | ||
while ((codepoint = validateUTF8CharacterAndAdvance(buffer, | ||
buffer + | ||
strlen(buffer))) | ||
!= ~0U) { | ||
int length = (buffer - Name.get()) - offset; | ||
uint32_t expectedCodepoint; | ||
if ((expectedCodepoint = | ||
confusable::tryConvertConfusableCharacterToASCII(codepoint))) { | ||
isConfused = true; | ||
expectedIdentifier += (char)expectedCodepoint; | ||
} else { | ||
expectedIdentifier += (char)codepoint; | ||
} | ||
|
||
offset += length; | ||
} | ||
|
||
if (isConfused) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, to recover I would call (Well, maybe it will because maybe you've confused characters and juxtaposed operators, but that seems kind of unlikely.) |
||
diagnose(Loc, diag::confusable_character, | ||
UDRE->getName().isOperator(), | ||
Name.str(), StringRef(expectedIdentifier.c_str())) | ||
.fixItReplaceChars(Loc, Loc.getAdvancedLoc(Name.getLength()), | ||
expectedIdentifier.c_str()); | ||
} | ||
} | ||
return new (Context) ErrorExpr(Loc); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// RUN: %target-parse-verify-swift | ||
|
||
// expected-error @+4 {{type annotation missing in pattern}} | ||
// expected-error @+3 {{use of unresolved operator '⁚'}} | ||
// expected-error @+2 {{operator with postfix spacing cannot start a subexpression}} | ||
// expected-error @+1 {{consecutive statements on a line must be separated by ';'}} | ||
let number⁚ Int // expected-note {{operator '⁚' contains possibly confused characters; did you mean to use ':'?}} {{11-14=:}} | ||
|
||
// expected-error @+3 2 {{invalid character in source file}} | ||
// expected-error @+2 {{consecutive statements on a line must be separated by ';'}} | ||
// expected-note @+1 2 {{unicode character '‒' looks similar to '-'; did you mean to use '-'?}} {{3-6=-}} | ||
5 ‒ 5 | ||
|
||
// expected-error @+2 {{use of unresolved identifier 'ꝸꝸꝸ'}} | ||
// expected-error @+1 2 {{expected ',' separator}} | ||
if (true ꝸꝸꝸ false) {} // expected-note {{identifier 'ꝸꝸꝸ' contains possibly confused characters; did you mean to use '&&&'?}} {{10-19=&&&}} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add the final line: