swiftlang · amackworth · Dec 22, 2015 · gribozavr · Dec 24, 2015 · jrose-apple
diff --git a/include/swift/AST/DiagnosticsParse.def b/include/swift/AST/DiagnosticsParse.def
@@ -91,7 +91,9 @@ ERROR(lex_single_quote_string,lexing,none,
       "single-quoted string literal found, use '\"'", ())
 ERROR(lex_invalid_curly_quote,lexing,none,
       "unicode curly quote found, replace with '\"'", ())
-
+NOTE(lex_confusable_character,lexing,none,
+      "unicode character '%0' looks similar to '%1'; did you mean to use '%1'?",
+      (StringRef, StringRef))
 
 ERROR(lex_unterminated_block_comment,lexing,none,
       "unterminated '/*' comment", ())

diff --git a/include/swift/AST/DiagnosticsSema.def b/include/swift/AST/DiagnosticsSema.def
@@ -436,6 +436,10 @@ ERROR(unspaced_unary_operator,sema_nb,none,
 
 ERROR(use_unresolved_identifier,sema_nb,none,
       "use of unresolved %select{identifier|operator}1 %0", (Identifier, bool))
+NOTE(confusable_character,sema_nb,none,
+      "%select{identifier|operator}0 '%1' contains possibly confused characters; "
+      "did you mean to use '%2'?",
+      (bool, StringRef, StringRef))
 ERROR(use_undeclared_type,sema_nb,none,
       "use of undeclared type %0", (Identifier))
 ERROR(use_undeclared_type_did_you_mean,sema_nb,none,

diff --git a/include/swift/Parse/Confusables.h b/include/swift/Parse/Confusables.h
@@ -0,0 +1,27 @@
+//===--- Confusables.h - Swift Confusable Character Diagnostics -*- C++ -*-===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SWIFT_CONFUSABLES_H
+#define SWIFT_CONFUSABLES_H
+
+#include <stdint.h>
+
+namespace swift {
+namespace confusable {
+  /// Given a UTF-8 codepoint, determines whether it appears on the Unicode
+  /// specification table of confusable characters and maps to punctuation,
+  /// and either returns either the expected ASCII character or 0.
+  uint32_t tryConvertConfusableCharacterToASCII(uint32_t codepoint);
+}
+}
+
+#endif
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
@@ -24,6 +24,11 @@
 #include "llvm/ADT/SmallVector.h"
 
 namespace swift {
+  /// Given a pointer to the starting byte of a UTF8 character, validate it and
+  /// advance the lexer past it.  This returns the encoded character or ~0U if
+  /// the encoding is invalid.
+  uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr, const char *End);
+
   class DiagnosticEngine;
   class InFlightDiagnostic;
   class LangOptions;

diff --git a/lib/Parse/CMakeLists.txt b/lib/Parse/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_swift_library(swiftParse
+  Confusables.cpp.gyb
   Lexer.cpp
   Parser.cpp
   ParseDecl.cpp

diff --git a/lib/Parse/Confusables.cpp.gyb b/lib/Parse/Confusables.cpp.gyb
@@ -0,0 +1,63 @@
+%# -*- mode: C++ -*-
+
+%# Ignore the following admonition; it applies to the resulting .cpp file only
+//// Automatically Generated From Confusables.cpp.gyb.
+//// Do Not Edit Directly!
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See http://swift.org/LICENSE.txt for license information
+// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+%{
+import re
+
+characters = [
+u"(", u")", u"{",
+u"}", u"[", u"]",
+u".", u",", u":",
+u";", u"=", u"@",
+u"#", u"&", u"/",
+u"|", u"\\", u"-",
+u"*", u"+", u">",
+u"<", u"!", u"?"
+]
+
+hexValues = [ hex(ord(char)) for char in characters ]
+
+modifiedHex = []
+for hexString in hexValues:
+    noPrefix = hexString[2:]
+    modifiedHex.append((((4 - len(noPrefix)) * "0") + noPrefix).upper())
+
+pairs = []
+with open('../../utils/UnicodeData/confusables.txt', 'r') as f:
+    pattern = re.compile("(.+)\W+;\W+(.+)\W+;")
+    for line in f:
+        match = pattern.match(line)
+        if match != None:
+            confusedString = match.group(1).replace(" ", "")
+            normalString = match.group(2).replace(" ", "")
+            for hexValue in modifiedHex:
+                if hexValue == normalString:
+                    confused = hex(int(confusedString, 16))
+                    normal = hex(int(normalString, 16))
+                    pairs.append((confused, normal))
+}%
+
+#include "swift/Parse/Confusables.h"
+
+uint32_t swift::confusable::tryConvertConfusableCharacterToASCII(uint32_t codepoint) {
+  switch (codepoint) {
+% for (confused, expected) in pairs:
+  case ${confused}: return ${expected};
+% end
+  default: return 0;
+  }
+}
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "swift/Parse/Lexer.h"
+#include "swift/Parse/Confusables.h"
 #include "swift/AST/DiagnosticsParse.h"
 #include "swift/AST/Identifier.h"
 #include "swift/Basic/Fallthrough.h"
@@ -44,11 +45,7 @@ using clang::isWhitespace;
 // UTF8 Validation/Encoding/Decoding helper functions
 //===----------------------------------------------------------------------===//
 
-/// EncodeToUTF8 - Encode the specified code point into a UTF8 stream.  Return
-/// true if it is an erroneous code point.
-static bool EncodeToUTF8(unsigned CharValue,
-                         SmallVectorImpl<char> &Result) {
-  assert(CharValue >= 0x80 && "Single-byte encoding should be already handled");
+bool EncodeToUTF8(unsigned CharValue, SmallVectorImpl<char> &Result) {
   // Number of bits in the value, ignoring leading zeros.
   unsigned NumBits = 32-llvm::countLeadingZeros(CharValue);
 
@@ -99,10 +96,7 @@ static bool isStartOfUTF8Character(unsigned char C) {
   return (signed char)C >= 0 || C >= 0xC0;  // C0 = 0b11000000
 }
 
-/// validateUTF8CharacterAndAdvance - Given a pointer to the starting byte of a
-/// UTF8 character, validate it and advance the lexer past it.  This returns the
-/// encoded character or ~0U if the encoding is invalid.
-static uint32_t validateUTF8CharacterAndAdvance(const char *&Ptr,
+uint32_t swift::validateUTF8CharacterAndAdvance(const char *&Ptr,
                                                 const char *End) {
   if (Ptr >= End)
     return ~0U;
@@ -483,7 +477,7 @@ static bool isValidIdentifierStartCodePoint(uint32_t c) {
 static bool advanceIf(char const *&ptr, char const *end,
                       bool (*predicate)(uint32_t)) {
   char const *next = ptr;
-  uint32_t c = validateUTF8CharacterAndAdvance(next, end);
+  uint32_t c = swift::validateUTF8CharacterAndAdvance(next, end);
   if (c == ~0U)
     return false;
   if (predicate(c)) {
@@ -1477,6 +1471,22 @@ void Lexer::lexImpl() {
       } else {
         diagnose(CurPtr-1, diag::lex_invalid_character)
           .fixItReplaceChars(getSourceLoc(CurPtr-1), getSourceLoc(tmp), " ");
+
+        llvm::SmallString<64> buffer;
+        EncodeToUTF8(codepoint, buffer);
+        StringRef confusedChar = buffer.str();
+        uint32_t expectedCodepoint;
+        if ((expectedCodepoint =
+            confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
+          std::string expected = std::string(1, (char)expectedCodepoint);
+          StringRef expectedChar = StringRef(expected);
+          diagnose(CurPtr-1, diag::lex_confusable_character,
+                   confusedChar, expectedChar)
+            .fixItReplaceChars(getSourceLoc(CurPtr-1),
+                               getSourceLoc(tmp),
+                               expectedChar);
+        }
+
         CurPtr = tmp;
         goto Restart;  // Skip presumed whitespace.
       }

diff --git a/lib/Sema/TypeCheckConstraints.cpp b/lib/Sema/TypeCheckConstraints.cpp
@@ -27,6 +27,7 @@
 #include "swift/AST/TypeCheckerDebugConsumer.h"
 #include "swift/Basic/Fallthrough.h"
 #include "swift/Parse/Lexer.h"
+#include "swift/Parse/Confusables.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
@@ -451,6 +452,36 @@ resolveDeclRefExpr(UnresolvedDeclRefExpr *UDRE, DeclContext *DC) {
       diagnose(Loc, diag::use_unresolved_identifier, Name,
                UDRE->getName().isOperator())
         .highlight(Loc);
+
+      const char *buffer = Name.get();
+      std::string expectedIdentifier = std::string();
+      bool isConfused = false;
+      uint32_t codepoint;
+      int offset = 0;
+      while ((codepoint = validateUTF8CharacterAndAdvance(buffer,
+                                                          buffer +
+                                                          strlen(buffer)))
+             != ~0U) {
+        int length = (buffer - Name.get()) - offset;
+        uint32_t expectedCodepoint;
+        if ((expectedCodepoint =
+             confusable::tryConvertConfusableCharacterToASCII(codepoint))) {
+          isConfused = true;
+          expectedIdentifier += (char)expectedCodepoint;
+        } else {
+          expectedIdentifier += (char)codepoint;
+        }
+
+        offset += length;
+      }
+
+      if (isConfused) {
+        diagnose(Loc, diag::confusable_character,
+                 UDRE->getName().isOperator(),
+                 Name.str(), StringRef(expectedIdentifier.c_str()))
+        .fixItReplaceChars(Loc, Loc.getAdvancedLoc(Name.getLength()),
+                           expectedIdentifier.c_str());
+      }
     }
     return new (Context) ErrorExpr(Loc);
   }

diff --git a/test/Parse/confusables.swift b/test/Parse/confusables.swift
@@ -0,0 +1,17 @@
+// RUN: %target-parse-verify-swift
+
+// expected-error @+4 {{type annotation missing in pattern}}
+// expected-error @+3 {{use of unresolved operator '⁚'}}
+// expected-error @+2 {{operator with postfix spacing cannot start a subexpression}}
+// expected-error @+1 {{consecutive statements on a line must be separated by ';'}}
+let number⁚ Int // expected-note {{operator '⁚' contains possibly confused characters; did you mean to use ':'?}} {{11-14=:}}
+
+// expected-error @+3 2 {{invalid character in source file}}
+// expected-error @+2 {{consecutive statements on a line must be separated by ';'}}
+// expected-note @+1 2 {{unicode character '‒' looks similar to '-'; did you mean to use '-'?}} {{3-6=-}}
+5 ‒ 5
+
+// expected-error @+2 {{use of unresolved identifier 'ꝸꝸꝸ'}}
+// expected-error @+1 2 {{expected ',' separator}}
+if (true ꝸꝸꝸ false) {} // expected-note {{identifier 'ꝸꝸꝸ' contains possibly confused characters; did you mean to use '&&&'?}} {{10-19=&&&}}
+