
Commit a468926

Warn on confusable non-ascii identifiers (UTS 39, C2) (#11582)

1 parent 3f062a4 commit a468926

File tree: 8 files changed, +9800 -23 lines changed

Makefile

Lines changed: 1 addition & 0 deletions

@@ -105,6 +105,7 @@ unicode: $(UNICODE)
 $(UNICODE): lib/elixir/unicode/*
 	@ echo "==> unicode (compile)";
 	$(Q) $(ELIXIRC) lib/elixir/unicode/unicode.ex -o lib/elixir/ebin;
+	$(Q) $(ELIXIRC) lib/elixir/unicode/security.ex -o lib/elixir/ebin;
 	$(Q) $(ELIXIRC) lib/elixir/unicode/tokenizer.ex -o lib/elixir/ebin;
 
 $(eval $(call APP_TEMPLATE,ex_unit,ExUnit))

lib/elixir/pages/unicode-security.md

Lines changed: 13 additions & 3 deletions

@@ -14,12 +14,22 @@ Elixir will not allow tokenization of identifiers with codepoints in `\p{Identif
 
 For instance, the 'HANGUL FILLER' (``) character, which is often invisible, is an uncommon codepoint and will trigger this warning.
 
-## C2, C3 (planned)
+## C2. Confusable detection
 
-Elixir may implement Confusable Detection, and Mixed-Script Confusable detection, in the future, and will likely emit warnings in those cases; there is a reference implementation.
+Elixir will warn on identifiers that look the same, but aren't. Examples: in `а = a = 1`, the two 'a' characters are Cyrillic and Latin, and could be confused for each other; in `力 = カ = 1`, both are Japanese, but they are different codepoints from different scripts of that writing system. Confusable identifiers can lead to hard-to-catch bugs (say, due to copy-pasted code) and can be unsafe, so we will warn about identifiers within a single file that could be confused with each other.
+
+We use the means described in Section 4, 'Confusable Detection', with one noted modification:
+
+> Alternatively, it shall declare that it uses a modification, and provide a precise list of character mappings that are added to or removed from the provided ones.
+
+Elixir will not warn on confusability for identifiers made up exclusively of characters in a-z, A-Z, 0-9, and _. This is because ASCII identifiers have existed for so long that the programming community has had its own means of dealing with confusability between identifiers like `l,1` or `O,0` (for instance, fonts designed for programming usually make it easy to differentiate between those characters).
+
+## C3. (not yet implemented)
+
+C3 has to do with detecting mixed-script-confusable characters -- like, say, a few Cyrillic 'a' characters appearing in a file of mostly Latin identifiers. Conformance with this clause is not yet claimed.
 
 ## C4, C5 (inapplicable)
 
-'C4 - Restriction Level detection' conformance is not claimed and is inapplicable. (It applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels).
+'C4 - Restriction Level detection' conformance is not claimed and does not apply to identifiers in code; rather, it applies to classifying the level of safety of a given arbitrary string into one of 5 restriction levels.
 
 'C5 - Mixed number detection' conformance is inapplicable as Elixir does not support Unicode numbers.
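
The C2 text added above relies on UTS 39 Section 4, which compares identifiers by their "skeletons": normalize, map each codepoint to its prototype from Unicode's confusables.txt, and normalize again; two identifiers are confusable when their skeletons match but the identifiers differ. The following is a minimal, illustrative sketch only -- the module name, function names, and the tiny prototype table are assumptions, not the implementation added in lib/elixir/unicode/security.ex by this commit:

# Illustrative sketch of UTS 39 Section 4 skeleton comparison.
# The @prototypes excerpt is hypothetical; the real mapping is generated
# from Unicode's confusables.txt data file.
defmodule ConfusableSketch do
  # Prototype excerpt: Cyrillic 'а' -> Latin 'a', Katakana 'カ' -> CJK '力'.
  @prototypes %{?а => "a", ?カ => "力"}

  # skeleton(X): NFD-normalize, replace each codepoint by its prototype, NFD again.
  def skeleton(identifier) when is_binary(identifier) do
    identifier
    |> String.normalize(:nfd)
    |> String.to_charlist()
    |> Enum.map(fn cp -> Map.get(@prototypes, cp, <<cp::utf8>>) end)
    |> IO.iodata_to_binary()
    |> String.normalize(:nfd)
  end

  # Two distinct identifiers are confusable when their skeletons are equal.
  def confusable?(a, b), do: a != b and skeleton(a) == skeleton(b)
end

ConfusableSketch.confusable?("аdmin", "admin") #=> true  (Cyrillic vs Latin 'a')
ConfusableSketch.confusable?("admin", "admin") #=> false

In the actual tokenizer (see elixir_tokenizer.erl below), this comparison is only reached when the file contains at least one non-ASCII identifier, which implements the ASCII exemption quoted in the C2 section.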

lib/elixir/src/elixir.hrl

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@
   static_atoms_encoder=nil,
   preserve_comments=nil,
   identifier_tokenizer=elixir_tokenizer,
+  ascii_identifiers_only=true,
   indentation=0,
   mismatch_hints=[],
   warn_on_unnecessary_quotes=true,

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 29 additions & 17 deletions

@@ -134,14 +134,15 @@ tokenize(String, Line, Column, Opts) ->
 tokenize(String, Line, Opts) ->
   tokenize(String, Line, 1, Opts).
 
-tokenize([], Line, Column, #elixir_tokenizer{cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
-  #elixir_tokenizer{terminators=Terminators, warnings=Warnings} = Scope,
+tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
+  #elixir_tokenizer{file=File, terminators=Terminators, warnings=Warnings} = Scope,
 
   {CursorColumn, CursorTerminators, CursorTokens} =
     add_cursor(Line, Column, Cursor, Terminators, Tokens),
 
+  AllWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File, Warnings),
   AccTokens = cursor_complete(Line, CursorColumn, CursorTerminators, CursorTokens),
-  {ok, Line, Column, Warnings, AccTokens};
+  {ok, Line, Column, AllWarnings, AccTokens};
 
 tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine, _} | _]} = Scope, Tokens) ->
   End = terminator(Start),

@@ -150,8 +151,9 @@ tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine,
   Formatted = io_lib:format(Message, [End, Start, StartLine]),
   error({EndLine, Column, [Formatted, Hint], []}, [], Scope, Tokens);
 
-tokenize([], Line, Column, #elixir_tokenizer{warnings=Warnings}, Tokens) ->
-  {ok, Line, Column, Warnings, lists:reverse(Tokens)};
+tokenize([], Line, Column, #elixir_tokenizer{ascii_identifiers_only=Ascii, file=File, warnings=Warnings}, Tokens) ->
+  AllWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, File, Warnings),
+  {ok, Line, Column, AllWarnings, lists:reverse(Tokens)};
 
 % VC merge conflict

@@ -541,10 +543,11 @@ tokenize([$:, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H)
 
 tokenize([$: | String] = Original, Line, Column, Scope, Tokens) ->
   case tokenize_identifier(String, Line, Column, Scope, false) of
-    {_Kind, Unencoded, Atom, Rest, Length, _Ascii, _Special} ->
+    {_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} ->
       NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope),
+      TrackedScope = track_ascii(Ascii, NewScope),
       Token = {atom, {Line, Column, nil}, Atom},
-      tokenize(Rest, Line, Column + 1 + Length, NewScope, [Token | Tokens]);
+      tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]);
     empty when Scope#elixir_tokenizer.cursor_completion == false ->
       unexpected_token(Original, Line, Column, Scope, Tokens);
     empty ->

@@ -643,10 +646,11 @@ tokenize([$. | T], Line, Column, Scope, Tokens) ->
 
 % Identifiers
 
-tokenize(String, Line, Column, Scope, Tokens) ->
-  case tokenize_identifier(String, Line, Column, Scope, not previous_was_dot(Tokens)) of
+tokenize(String, Line, Column, OriginalScope, Tokens) ->
+  case tokenize_identifier(String, Line, Column, OriginalScope, not previous_was_dot(Tokens)) of
     {Kind, Unencoded, Atom, Rest, Length, Ascii, Special} ->
       HasAt = lists:member($@, Special),
+      Scope = track_ascii(Ascii, OriginalScope),
 
       case Rest of
         [$: | T] when ?is_space(hd(T)) ->

@@ -678,20 +682,20 @@ tokenize(String, Line, Column, Scope, Tokens) ->
       end;
 
     {keyword, Atom, Type, Rest, Length} ->
-      tokenize_keyword(Type, Rest, Line, Column, Atom, Length, Scope, Tokens);
+      tokenize_keyword(Type, Rest, Line, Column, Atom, Length, OriginalScope, Tokens);
 
-    empty when Scope#elixir_tokenizer.cursor_completion == false ->
-      unexpected_token(String, Line, Column, Scope, Tokens);
+    empty when OriginalScope#elixir_tokenizer.cursor_completion == false ->
+      unexpected_token(String, Line, Column, OriginalScope, Tokens);
 
     empty ->
       case String of
-        [$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, Scope, Tokens);
-        [$~] -> tokenize([], Line, Column, Scope, Tokens);
-        _ -> unexpected_token(String, Line, Column, Scope, Tokens)
+        [$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, OriginalScope, Tokens);
+        [$~] -> tokenize([], Line, Column, OriginalScope, Tokens);
+        _ -> unexpected_token(String, Line, Column, OriginalScope, Tokens)
      end;
 
    {error, Reason} ->
-      error(Reason, String, Scope, Tokens)
+      error(Reason, String, OriginalScope, Tokens)
  end.
 
 previous_was_dot([{'.', _} | _]) -> true;

@@ -1578,6 +1582,14 @@ maybe_warn_for_ambiguous_bang_before_equals(_Kind, _Atom, _Rest, _Line, _Column,
 prepend_warning(Line, Column, File, Msg, #elixir_tokenizer{warnings=Warnings} = Scope) ->
   Scope#elixir_tokenizer{warnings = [{{Line, Column}, File, Msg} | Warnings]}.
 
+track_ascii(true, Scope) -> Scope;
+track_ascii(false, Scope) -> Scope#elixir_tokenizer{ascii_identifiers_only=false}.
+
+maybe_unicode_lint_warnings(_Ascii=false, Tokens, File, Warnings) ->
+  'Elixir.String.Tokenizer.Security':unicode_lint_warnings(lists:reverse(Tokens), File) ++ Warnings;
+maybe_unicode_lint_warnings(_Ascii=true, _Tokens, _File, Warnings) ->
+  Warnings.
+
 error(Reason, Rest, #elixir_tokenizer{warnings=Warnings}, Tokens) ->
   {error, Reason, Rest, Warnings, Tokens}.

@@ -1667,4 +1679,4 @@ prune_tokens([], [], Terminators) ->
 
 drop_including([{Token, _} | Tokens], Token) -> Tokens;
 drop_including([_ | Tokens], Token) -> drop_including(Tokens, Token);
-drop_including([], _Token) -> [].
+drop_including([], _Token) -> [].
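
Taken together, `track_ascii/2` flips `ascii_identifiers_only` to false the first time a non-ASCII identifier or atom is tokenized, and `maybe_unicode_lint_warnings/4` only calls `String.Tokenizer.Security.unicode_lint_warnings/2` in that case, so purely ASCII sources skip the lint pass entirely. A rough usage sketch of the observable behavior -- the warning text is taken from the new test below, and other warnings (e.g. unused variables) may also be printed:

# Two confusable identifiers in one source: the lint pass runs and a
# warning is printed to stderr when the code is tokenized/evaluated.
Code.eval_string("аdmin = 1; admin = 1")
#=> warning: confusable identifier: 'admin' looks like 'аdmin' on line 1

# ASCII-only source: ascii_identifiers_only stays true and the lint pass is skipped.
Code.eval_string("x0 = xO = 1")
#=> no confusable warning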

lib/elixir/test/elixir/kernel/warning_test.exs

Lines changed: 14 additions & 0 deletions

@@ -26,6 +26,20 @@ defmodule Kernel.WarningTest do
     assert {:error, _} = Code.string_to_quoted(~s[:"foobar" do])
   end
 
+  describe "unicode identifier security" do
+    test "warns on confusables" do
+      assert capture_err(fn -> Code.eval_string("аdmin=1; admin=1") end) =~
+               "confusable identifier: 'admin' looks like 'аdmin' on line 1"
+
+      assert capture_err(fn -> Code.eval_string("力=1; カ=1") end) =~
+               "confusable identifier: 'カ' looks like '力' on line 1"
+
+      # by convention, doesn't warn on ascii-only confusables
+      assert capture_err(fn -> Code.eval_string("x0 = xO = 1") end) == ""
+      assert capture_err(fn -> Code.eval_string("l1 = ll = 1") end) == ""
+    end
+  end
+
   test "operators formed by many of the same character followed by that character" do
     output =
       capture_err(fn ->
