Commit b401d15

Run unicode security linting on more tokens

1 parent a468926

File tree: 5 files changed, +179 -153 lines changed


lib/elixir/src/elixir_tokenizer.erl

Lines changed: 24 additions & 26 deletions
@@ -354,7 +354,7 @@ tokenize([$:, T | Rest], Line, Column, Scope, Tokens) when

 tokenize("..." ++ Rest, Line, Column, Scope, Tokens) ->
   NewScope = maybe_warn_too_many_of_same_char("...", Rest, Line, Column, Scope),
-  Token = check_call_identifier(Line, Column, '...', Rest),
+  Token = check_call_identifier(Line, Column, "...", '...', Rest),
   tokenize(Rest, Line, Column + 3, NewScope, [Token | Tokens]);

 tokenize("=>" ++ Rest, Line, Column, Scope, Tokens) ->
@@ -546,7 +546,7 @@ tokenize([$: | String] = Original, Line, Column, Scope, Tokens) ->
     {_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} ->
       NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope),
       TrackedScope = track_ascii(Ascii, NewScope),
-      Token = {atom, {Line, Column, nil}, Atom},
+      Token = {atom, {Line, Column, Unencoded}, Atom},
       tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]);
     empty when Scope#elixir_tokenizer.cursor_completion == false ->
       unexpected_token(Original, Line, Column, Scope, Tokens);
@@ -654,7 +654,7 @@ tokenize(String, Line, Column, OriginalScope, Tokens) ->

       case Rest of
         [$: | T] when ?is_space(hd(T)) ->
-          Token = {kw_identifier, {Line, Column, nil}, Atom},
+          Token = {kw_identifier, {Line, Column, Unencoded}, Atom},
           tokenize(T, Line, Column + Length + 1, Scope, [Token | Tokens]);

         [$: | T] when hd(T) =/= $: ->
@@ -670,11 +670,11 @@ tokenize(String, Line, Column, OriginalScope, Tokens) ->
           error({Line, Column, "reserved token: ", atom_to_list(Atom)}, Rest, Scope, Tokens);

         _ when Kind == alias ->
-          tokenize_alias(Rest, Line, Column, Atom, Length, Ascii, Special, Scope, Tokens);
+          tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scope, Tokens);

         _ when Kind == identifier ->
           NewScope = maybe_warn_for_ambiguous_bang_before_equals(identifier, Unencoded, Rest, Line, Column, Scope),
-          Token = check_call_identifier(Line, Column, Atom, Rest),
+          Token = check_call_identifier(Line, Column, Unencoded, Atom, Rest),
           tokenize(Rest, Line, Column + Length, NewScope, [Token | Tokens]);

         _ ->
@@ -876,19 +876,19 @@ handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
 handle_dot([$., T1, T2, T3 | Rest], Line, Column, DotInfo, Scope, Tokens) when
     ?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3);
     ?arrow_op3(T1, T2, T3); ?xor_op3(T1, T2, T3); ?concat_op3(T1, T2, T3) ->
-  handle_call_identifier(Rest, Line, Column, DotInfo, 3, list_to_atom([T1, T2, T3]), Scope, Tokens);
+  handle_call_identifier(Rest, Line, Column, DotInfo, 3, [T1, T2, T3], Scope, Tokens);

 % ## Two Token Operators
 handle_dot([$., T1, T2 | Rest], Line, Column, DotInfo, Scope, Tokens) when
     ?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2);
     ?arrow_op(T1, T2); ?in_match_op(T1, T2); ?concat_op(T1, T2); ?power_op(T1, T2); ?type_op(T1, T2) ->
-  handle_call_identifier(Rest, Line, Column, DotInfo, 2, list_to_atom([T1, T2]), Scope, Tokens);
+  handle_call_identifier(Rest, Line, Column, DotInfo, 2, [T1, T2], Scope, Tokens);

 % ## Single Token Operators
 handle_dot([$., T | Rest], Line, Column, DotInfo, Scope, Tokens) when
     ?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T);
     ?rel_op(T); ?match_op(T); ?pipe_op(T) ->
-  handle_call_identifier(Rest, Line, Column, DotInfo, 1, list_to_atom([T]), Scope, Tokens);
+  handle_call_identifier(Rest, Line, Column, DotInfo, 1, [T], Scope, Tokens);

 % ## Exception for .( as it needs to be treated specially in the parser
 handle_dot([$., $( | Rest], Line, Column, DotInfo, Scope, Tokens) ->
@@ -914,7 +914,7 @@ handle_dot([$., H | T] = Original, Line, Column, DotInfo, Scope, Tokens) when ?i

       case unsafe_to_atom(Part, Line, Column, NewScope) of
         {ok, Atom} ->
-          Token = check_call_identifier(Line, Column, Atom, Rest),
+          Token = check_call_identifier(Line, Column, Part, Atom, Rest),
           TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
           tokenize(Rest, NewLine, NewColumn, NewScope, [Token | TokensSoFar]);

@@ -932,8 +932,8 @@ handle_dot([$. | Rest], Line, Column, DotInfo, Scope, Tokens) ->
   TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
   tokenize(Rest, Line, Column, Scope, TokensSoFar).

-handle_call_identifier(Rest, Line, Column, DotInfo, Length, Op, Scope, Tokens) ->
-  Token = check_call_identifier(Line, Column, Op, Rest),
+handle_call_identifier(Rest, Line, Column, DotInfo, Length, UnencodedOp, Scope, Tokens) ->
+  Token = check_call_identifier(Line, Column, UnencodedOp, list_to_atom(UnencodedOp), Rest),
   TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
   tokenize(Rest, Line, Column + Length, Scope, [Token | TokensSoFar]).

@@ -1263,30 +1263,28 @@ maybe_keyword(_) -> true.
 list_to_codepoint_hex(List) ->
   [io_lib:format(" 0x~4.16.0B", [Codepoint]) || Codepoint <- List].

-tokenize_alias(Rest, Line, Column, Atom, Length, Ascii, Special, Scope, Tokens) ->
+tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scope, Tokens) ->
   if
     not Ascii ->
-      AtomName = atom_to_list(Atom),
-      Invalid = hd([C || C <- AtomName, C > 127]),
-      Reason = {Line, Column, invalid_character_error("alias (only ASCII characters are allowed)", Invalid), AtomName},
-      error(Reason, AtomName ++ Rest, Scope, Tokens);
+      Invalid = hd([C || C <- Unencoded, C > 127]),
+      Reason = {Line, Column, invalid_character_error("alias (only ASCII characters are allowed)", Invalid), Unencoded},
+      error(Reason, Unencoded ++ Rest, Scope, Tokens);
     Special /= [] ->
-      AtomName = atom_to_list(Atom),
-      Reason = {Line, Column, invalid_character_error("alias", hd(Special)), AtomName},
-      error(Reason, AtomName ++ Rest, Scope, Tokens);
+      Reason = {Line, Column, invalid_character_error("alias", hd(Special)), Unencoded},
+      error(Reason, Unencoded ++ Rest, Scope, Tokens);
     true ->
-      AliasesToken = {alias, {Line, Column, nil}, Atom},
+      AliasesToken = {alias, {Line, Column, Unencoded}, Atom},
       tokenize(Rest, Line, Column + Length, Scope, [AliasesToken | Tokens])
   end.

 %% Check if it is a call identifier (paren | bracket | do)

-check_call_identifier(Line, Column, Atom, [$( | _]) ->
-  {paren_identifier, {Line, Column, nil}, Atom};
-check_call_identifier(Line, Column, Atom, [$[ | _]) ->
-  {bracket_identifier, {Line, Column, nil}, Atom};
-check_call_identifier(Line, Column, Atom, _Rest) ->
-  {identifier, {Line, Column, nil}, Atom}.
+check_call_identifier(Line, Column, Unencoded, Atom, [$( | _]) ->
+  {paren_identifier, {Line, Column, Unencoded}, Atom};
+check_call_identifier(Line, Column, Unencoded, Atom, [$[ | _]) ->
+  {bracket_identifier, {Line, Column, Unencoded}, Atom};
+check_call_identifier(Line, Column, Unencoded, Atom, _Rest) ->
+  {identifier, {Line, Column, Unencoded}, Atom}.

 add_token_with_eol({unary_op, _, _} = Left, T) -> [Left | T];
 add_token_with_eol(Left, [{eol, _} | T]) -> [Left | T];
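
Taken together, the elixir_tokenizer.erl changes above thread the original, unencoded spelling of a token (as a charlist) into the third element of its location tuple, where previously nil was stored. Atoms, keyword identifiers, aliases, and plain and dotted call identifiers now all carry it, which is what lets the Unicode security lint cover those tokens. An illustrative sketch of the user-visible effect, run from Elixir; the warning text follows the expectations in the tests below, and the leading 'а' in аdmin is the Cyrillic letter, not ASCII 'a':

    # Confusable-identifier warnings are no longer limited to plain variables.
    # Each snippet mixes a Cyrillic 'а' with its ASCII look-alike.
    Code.string_to_quoted("[{:аdmin, 1}, {:admin, 1}]")
    # stderr: confusable identifier: 'admin' looks like 'аdmin' on line 1

    Code.string_to_quoted("[аdmin: 1, admin: 1]")
    # stderr: confusable identifier: 'admin' looks like 'аdmin' on line 1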

lib/elixir/test/elixir/kernel/warning_test.exs

Lines changed: 21 additions & 4 deletions
@@ -28,15 +28,32 @@ defmodule Kernel.WarningTest do

   describe "unicode identifier security" do
     test "warns on confusables" do
-      assert capture_err(fn -> Code.eval_string("аdmin=1; admin=1") end) =~
+      assert capture_err(fn -> Code.string_to_quoted("аdmin=1; admin=1") end) =~
               "confusable identifier: 'admin' looks like 'аdmin' on line 1"

-      assert capture_err(fn -> Code.eval_string("力=1; カ=1") end) =~
+      assert capture_err(fn -> Code.string_to_quoted("[{:аdmin, 1}, {:admin, 1}]") end) =~
+               "confusable identifier: 'admin' looks like 'аdmin' on line 1"
+
+      assert capture_err(fn -> Code.string_to_quoted("[аdmin: 1, admin: 1]") end) =~
+               "confusable identifier: 'admin' looks like 'аdmin' on line 1"
+
+      assert capture_err(fn -> Code.string_to_quoted("quote do: [аdmin(1), admin(1)]") end) =~
+               "confusable identifier: 'admin' looks like 'аdmin' on line 1"
+
+      assert capture_err(fn -> Code.string_to_quoted("力=1; カ=1") end) =~
               "confusable identifier: 'カ' looks like '力' on line 1"

       # by convention, doesn't warn on ascii-only confusables
-      assert capture_err(fn -> Code.eval_string("x0 = xO = 1") end) == ""
-      assert capture_err(fn -> Code.eval_string("l1 = ll = 1") end) == ""
+      assert capture_err(fn -> Code.string_to_quoted("x0 = xO = 1") end) == ""
+      assert capture_err(fn -> Code.string_to_quoted("l1 = ll = 1") end) == ""
+
+      # works with a custom atom encoder
+      assert capture_err(fn ->
+               Code.string_to_quoted("[{:аdmin, 1}, {:admin, 1}]",
+                 static_atoms_encoder: fn token, _ -> {:ok, {:wrapped, token}} end
+               )
+             end) =~
+               "confusable identifier: 'admin' looks like 'аdmin' on line 1"
     end
   end
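
The last test case exercises Code.string_to_quoted/2 with the :static_atoms_encoder option: because the lint now works from the unencoded token rather than the created atom, the warning is still emitted when the caller intercepts atom creation. A standalone sketch of that usage, assuming ExUnit.CaptureIO is available as in the test suite (the suite's capture_err helper presumably wraps the same capture_io(:stderr, ...) call); the {:wrapped, token} tag is just an illustrative value:

    import ExUnit.CaptureIO

    # Tokenizer warnings go to standard error, so capture stderr to assert on them.
    warning =
      capture_io(:stderr, fn ->
        Code.string_to_quoted("[{:аdmin, 1}, {:admin, 1}]",
          static_atoms_encoder: fn token, _meta -> {:ok, {:wrapped, token}} end
        )
      end)

    warning =~ "confusable identifier: 'admin' looks like 'аdmin'"
    #=> true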

lib/elixir/test/erlang/string_test.erl

Lines changed: 5 additions & 5 deletions
@@ -28,18 +28,18 @@ extract_interpolations_with_escaped_interpolation_test() ->

 extract_interpolations_with_interpolation_test() ->
   ["f",
-   {{1, 2, nil}, {1, 6, nil}, [{atom, {1, 4, nil}, o}]},
+   {{1, 2, nil}, {1, 6, nil}, [{atom, {1, 4, _}, o}]},
    "o"] = extract_interpolations("f#{:o}o").

 extract_interpolations_with_two_interpolations_test() ->
   ["f",
-   {{1, 2, nil}, {1, 6, nil}, [{atom, {1, 4, nil}, o}]},
-   {{1, 7, nil}, {1, 11, nil}, [{atom, {1, 9, nil}, o}]},
+   {{1, 2, nil}, {1, 6, nil}, [{atom, {1, 4, _}, o}]},
+   {{1, 7, nil}, {1, 11, nil}, [{atom, {1, 9, _}, o}]},
    "o"] = extract_interpolations("f#{:o}#{:o}o").

 extract_interpolations_with_only_two_interpolations_test() ->
-  [{{1, 1, nil}, {1, 5, nil}, [{atom, {1, 3, nil}, o}]},
-   {{1, 6, nil}, {1, 10, nil}, [{atom, {1, 8, nil}, o}]}] = extract_interpolations("#{:o}#{:o}").
+  [{{1, 1, nil}, {1, 5, nil}, [{atom, {1, 3, _}, o}]},
+   {{1, 6, nil}, {1, 10, nil}, [{atom, {1, 8, _}, o}]}] = extract_interpolations("#{:o}#{:o}").

 extract_interpolations_with_tuple_inside_interpolation_test() ->
   ["f",
