Skip to content

[SR-331] Generate fix-its for confusable characters #732

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/swift/AST/DiagnosticsParse.def
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ ERROR(lex_single_quote_string,lexing,none,
"single-quoted string literal found, use '\"'", ())
ERROR(lex_invalid_curly_quote,lexing,none,
"unicode curly quote found, replace with '\"'", ())

NOTE(lex_confusable_character,lexing,none,
"unicode character '%0' looks similar to '%1'; did you mean to use '%1'?",
(StringRef, StringRef))

ERROR(lex_unterminated_block_comment,lexing,none,
"unterminated '/*' comment", ())
Expand Down
4 changes: 4 additions & 0 deletions include/swift/AST/DiagnosticsSema.def
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,10 @@ ERROR(unspaced_unary_operator,sema_nb,none,

ERROR(use_unresolved_identifier,sema_nb,none,
"use of unresolved %select{identifier|operator}1 %0", (Identifier, bool))
NOTE(confusable_character,sema_nb,none,
"%select{identifier|operator}0 '%1' contains possibly confused characters; "
"did you mean to use '%2'?",
(bool, StringRef, StringRef))
ERROR(use_undeclared_type,sema_nb,none,
"use of undeclared type %0", (Identifier))
ERROR(use_undeclared_type_did_you_mean,sema_nb,none,
Expand Down
27 changes: 27 additions & 0 deletions include/swift/Parse/Confusables.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
//===--- Confusables.h - Swift Confusable Character Diagnostics -*- C++ -*-===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add the final line:

//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//

#ifndef SWIFT_CONFUSABLES_H
#define SWIFT_CONFUSABLES_H

#include <stdint.h>

namespace swift {
namespace confusable {
/// Given a UTF-8 codepoint, determines whether it appears on the Unicode
/// specification table of confusable characters and maps to punctuation,
/// and either returns either the expected ASCII character or 0.
uint32_t tryConvertConfusableCharacterToASCII(uint32_t codepoint);
}
}

#endif
5 changes: 5 additions & 0 deletions include/swift/Parse/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
#include "llvm/ADT/SmallVector.h"

namespace swift {
/// Given a pointer to the starting byte of a UTF8 character, validate it and
/// advance the lexer past it. This returns the encoded character or ~0U if
/// the encoding is invalid.
uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr, const char *End);

class DiagnosticEngine;
class InFlightDiagnostic;
class LangOptions;
Expand Down
1 change: 1 addition & 0 deletions lib/Parse/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_swift_library(swiftParse
Confusables.cpp.gyb
Lexer.cpp
Parser.cpp
ParseDecl.cpp
Expand Down
63 changes: 63 additions & 0 deletions lib/Parse/Confusables.cpp.gyb
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
%# -*- mode: C++ -*-

%# Ignore the following admonition; it applies to the resulting .cpp file only
//// Automatically Generated From Confusables.cpp.gyb.
//// Do Not Edit Directly!
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

%{
import re

characters = [
u"(", u")", u"{",
u"}", u"[", u"]",
u".", u",", u":",
u";", u"=", u"@",
u"#", u"&", u"/",
u"|", u"\\", u"-",
u"*", u"+", u">",
u"<", u"!", u"?"
]

hexValues = [ hex(ord(char)) for char in characters ]

modifiedHex = []
for hexString in hexValues:
noPrefix = hexString[2:]
modifiedHex.append((((4 - len(noPrefix)) * "0") + noPrefix).upper())

pairs = []
with open('../../utils/UnicodeData/confusables.txt', 'r') as f:
pattern = re.compile("(.+)\W+;\W+(.+)\W+;")
for line in f:
match = pattern.match(line)
if match != None:
confusedString = match.group(1).replace(" ", "")
normalString = match.group(2).replace(" ", "")
for hexValue in modifiedHex:
if hexValue == normalString:
confused = hex(int(confusedString, 16))
normal = hex(int(normalString, 16))
pairs.append((confused, normal))
}%

#include "swift/Parse/Confusables.h"

uint32_t swift::confusable::tryConvertConfusableCharacterToASCII(uint32_t codepoint) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this really only supports ASCII, why return uint32_t instead of char?

switch (codepoint) {
% for (confused, expected) in pairs:
case ${confused}: return ${expected};
% end
default: return 0;
}
}
30 changes: 20 additions & 10 deletions lib/Parse/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//

#include "swift/Parse/Lexer.h"
#include "swift/Parse/Confusables.h"
#include "swift/AST/DiagnosticsParse.h"
#include "swift/AST/Identifier.h"
#include "swift/Basic/Fallthrough.h"
Expand Down Expand Up @@ -44,11 +45,7 @@ using clang::isWhitespace;
// UTF8 Validation/Encoding/Decoding helper functions
//===----------------------------------------------------------------------===//

/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream. Return
/// true if it is an erroneous code point.
static bool EncodeToUTF8(unsigned CharValue,
SmallVectorImpl<char> &Result) {
assert(CharValue >= 0x80 && "Single-byte encoding should be already handled");
bool EncodeToUTF8(unsigned CharValue, SmallVectorImpl<char> &Result) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't used outside the file anymore, so you can leave it alone.

// Number of bits in the value, ignoring leading zeros.
unsigned NumBits = 32-llvm::countLeadingZeros(CharValue);

Expand Down Expand Up @@ -99,10 +96,7 @@ static bool isStartOfUTF8Character(unsigned char C) {
return (signed char)C >= 0 || C >= 0xC0; // C0 = 0b11000000
}

/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
/// UTF8 character, validate it and advance the lexer past it. This returns the
/// encoded character or ~0U if the encoding is invalid.
static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
const char *End) {
if (Ptr >= End)
return ~0U;
Expand Down Expand Up @@ -483,7 +477,7 @@ static bool isValidIdentifierStartCodePoint(uint32_t c) {
static bool advanceIf(char const *&ptr, char const *end,
bool (*predicate)(uint32_t)) {
char const *next = ptr;
uint32_t c = validateUTF8CharacterAndAdvance(next, end);
uint32_t c = swift::validateUTF8CharacterAndAdvance(next, end);
if (c == ~0U)
return false;
if (predicate(c)) {
Expand Down Expand Up @@ -1477,6 +1471,22 @@ void Lexer::lexImpl() {
} else {
diagnose(CurPtr-1, diag::lex_invalid_character)
.fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");

llvm::SmallString<64> buffer;
EncodeToUTF8(codepoint, buffer);
StringRef confusedChar = buffer.str();
uint32_t expectedCodepoint;
if ((expectedCodepoint =
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
std::string expected = std::string(1, (char)expectedCodepoint);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use a SmallString here too (which implicitly converts to StringRef), so that we don't have any heap allocation.

StringRef expectedChar = StringRef(expected);
diagnose(CurPtr-1, diag::lex_confusable_character,
confusedChar, expectedChar)
.fixItReplaceChars(getSourceLoc(CurPtr-1),
getSourceLoc(tmp),
expectedChar);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recovering here would mean jumping to the top of the switch with the expected character instead, but I'm not sure we're set up to handle that very well. We probably assume all over the place that all the ASCII characters are only one byte wide. I guess it's okay not to do anything here.


CurPtr = tmp;
goto Restart; // Skip presumed whitespace.
}
Expand Down
31 changes: 31 additions & 0 deletions lib/Sema/TypeCheckConstraints.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "swift/AST/TypeCheckerDebugConsumer.h"
#include "swift/Basic/Fallthrough.h"
#include "swift/Parse/Lexer.h"
#include "swift/Parse/Confusables.h"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep the includes sorted.

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
Expand Down Expand Up @@ -451,6 +452,36 @@ resolveDeclRefExpr(UnresolvedDeclRefExpr *UDRE, DeclContext *DC) {
diagnose(Loc, diag::use_unresolved_identifier, Name,
UDRE->getName().isOperator())
.highlight(Loc);

const char *buffer = Name.get();
std::string expectedIdentifier = std::string();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that this is at the top level, you can use a SmallString instead of std::string here too.

bool isConfused = false;
uint32_t codepoint;
int offset = 0;
while ((codepoint = validateUTF8CharacterAndAdvance(buffer,
buffer +
strlen(buffer)))
!= ~0U) {
int length = (buffer - Name.get()) - offset;
uint32_t expectedCodepoint;
if ((expectedCodepoint =
confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
isConfused = true;
expectedIdentifier += (char)expectedCodepoint;
} else {
expectedIdentifier += (char)codepoint;
}

offset += length;
}

if (isConfused) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, to recover I would call lookupUnqualified again here and see if you get results back. If you do get results, you can diagnose this as a single ERROR with a fix-it and then continue in this function. If you don't get any results, I'm not sure we should even say it's confused; using the ASCII characters won't improve anything.

(Well, maybe it will because maybe you've confused characters and juxtaposed operators, but that seems kind of unlikely.)

diagnose(Loc, diag::confusable_character,
UDRE->getName().isOperator(),
Name.str(), StringRef(expectedIdentifier.c_str()))
.fixItReplaceChars(Loc, Loc.getAdvancedLoc(Name.getLength()),
expectedIdentifier.c_str());
}
}
return new (Context) ErrorExpr(Loc);
}
Expand Down
17 changes: 17 additions & 0 deletions test/Parse/confusables.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// RUN: %target-parse-verify-swift

// expected-error @+4 {{type annotation missing in pattern}}
// expected-error @+3 {{use of unresolved operator '⁚'}}
// expected-error @+2 {{operator with postfix spacing cannot start a subexpression}}
// expected-error @+1 {{consecutive statements on a line must be separated by ';'}}
let number⁚ Int // expected-note {{operator '⁚' contains possibly confused characters; did you mean to use ':'?}} {{11-14=:}}

// expected-error @+3 2 {{invalid character in source file}}
// expected-error @+2 {{consecutive statements on a line must be separated by ';'}}
// expected-note @+1 2 {{unicode character '‒' looks similar to '-'; did you mean to use '-'?}} {{3-6=-}}
5 ‒ 5

// expected-error @+2 {{use of unresolved identifier 'ꝸꝸꝸ'}}
// expected-error @+1 2 {{expected ',' separator}}
if (true ꝸꝸꝸ false) {} // expected-note {{identifier 'ꝸꝸꝸ' contains possibly confused characters; did you mean to use '&&&'?}} {{10-19=&&&}}

Loading