|
| 1 | +%% Copyright (c) 2016-2024, Loïc Hoguin <[email protected]> |
| 2 | +%% |
| 3 | +%% Permission to use, copy, modify, and/or distribute this software for any |
| 4 | +%% purpose with or without fee is hereby granted, provided that the above |
| 5 | +%% copyright notice and this permission notice appear in all copies. |
| 6 | +%% |
| 7 | +%% THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 8 | +%% WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 9 | +%% MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| 10 | +%% ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 11 | +%% WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 12 | +%% ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| 13 | +%% OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 14 | + |
| 15 | +%% ------------------------------------------------------------------------- %% |
| 16 | +%% This file is a partial copy of |
| 17 | +%% https://github.com/ninenines/cowlib/blob/optimise-urldecode/src/cow_uri.erl |
| 18 | +%% We use this copy because: |
| 19 | +%% 1. uri_string:unquote/1 is lax: It doesn't validate that characters that are |
| 20 | +%% required to be percent encoded are indeed percent encoded. In RabbitMQ, |
| 21 | +%% we want to enforce that proper percent encoding is done by AMQP clients. |
| 22 | +%% 2. uri_string:unquote/1 and cow_uri:urldecode/1 in cowlib v2.13.0 are both |
| 23 | +%% slow because they allocate a new binary for the common case where no |
| 24 | +%% character was percent encoded. |
| 25 | +%% When a new cowlib version is released, we should make app rabbit depend on |
| 26 | +%% app cowlib calling cow_uri:urldecode/1 and delete this file (rabbit_uri.erl). |
| 27 | +%% ------------------------------------------------------------------------- %% |
| 28 | + |
| 29 | +-module(rabbit_uri). |
| 30 | + |
| 31 | +-export([urldecode/1]). |
| 32 | + |
%% ?UNHEX(C): numeric value (0..15) of the ASCII hexadecimal digit C.
%% Both uppercase and lowercase digits are accepted. There is no
%% catch-all clause, so any other value fails with {case_clause, C}.
-define(UNHEX(C),
    case C of
        $0 -> 0; $1 -> 1; $2 -> 2; $3 -> 3; $4 -> 4;
        $5 -> 5; $6 -> 6; $7 -> 7; $8 -> 8; $9 -> 9;
        $A -> 10; $a -> 10;
        $B -> 11; $b -> 11;
        $C -> 12; $c -> 12;
        $D -> 13; $d -> 13;
        $E -> 14; $e -> 14;
        $F -> 15; $f -> 15
    end
).

%% ?UNHEX(H, L): the byte written as two hex digits, H being the
%% high nibble and L the low nibble.
-define(UNHEX(H, L), ((?UNHEX(H) bsl 4) bor ?UNHEX(L))).
| 61 | + |
| 62 | +%% Decode a percent encoded string. (RFC3986 2.1) |
| 63 | +%% |
| 64 | +%% Inspiration for some of the optimisations done here comes |
| 65 | +%% from the new `json` module as it was in mid-2024. |
| 66 | +%% |
| 67 | +%% Possible input includes: |
| 68 | +%% |
| 69 | +%% * nothing encoded (no % character): |
| 70 | +%% We want to return the binary as-is to avoid an allocation. |
| 71 | +%% |
| 72 | +%% * small number of encoded characters: |
| 73 | +%% We can "skip" words of text. |
| 74 | +%% |
| 75 | +%% * mostly encoded characters (non-ascii languages) |
| 76 | +%% We can decode characters in bulk. |
| 77 | + |
%% ?IS_PLAIN(C): true when C is a character that is accepted as-is,
%% i.e. without percent encoding. The set is the RFC 3986 unreserved
%% characters and sub-delimiters plus `:` and `@`. Everything else
%% (including `%`, `/`, `?`, `#` and space) is not plain.
%%
%% The expression only uses exact comparisons and orelse, so it can
%% be used in guards.
-define(IS_PLAIN(C), (
    %% Lowercase letters.
    (C =:= $a) orelse (C =:= $b) orelse (C =:= $c) orelse (C =:= $d) orelse
    (C =:= $e) orelse (C =:= $f) orelse (C =:= $g) orelse (C =:= $h) orelse
    (C =:= $i) orelse (C =:= $j) orelse (C =:= $k) orelse (C =:= $l) orelse
    (C =:= $m) orelse (C =:= $n) orelse (C =:= $o) orelse (C =:= $p) orelse
    (C =:= $q) orelse (C =:= $r) orelse (C =:= $s) orelse (C =:= $t) orelse
    (C =:= $u) orelse (C =:= $v) orelse (C =:= $w) orelse (C =:= $x) orelse
    (C =:= $y) orelse (C =:= $z) orelse
    %% Uppercase letters.
    (C =:= $A) orelse (C =:= $B) orelse (C =:= $C) orelse (C =:= $D) orelse
    (C =:= $E) orelse (C =:= $F) orelse (C =:= $G) orelse (C =:= $H) orelse
    (C =:= $I) orelse (C =:= $J) orelse (C =:= $K) orelse (C =:= $L) orelse
    (C =:= $M) orelse (C =:= $N) orelse (C =:= $O) orelse (C =:= $P) orelse
    (C =:= $Q) orelse (C =:= $R) orelse (C =:= $S) orelse (C =:= $T) orelse
    (C =:= $U) orelse (C =:= $V) orelse (C =:= $W) orelse (C =:= $X) orelse
    (C =:= $Y) orelse (C =:= $Z) orelse
    %% Digits.
    (C =:= $0) orelse (C =:= $1) orelse (C =:= $2) orelse (C =:= $3) orelse
    (C =:= $4) orelse (C =:= $5) orelse (C =:= $6) orelse (C =:= $7) orelse
    (C =:= $8) orelse (C =:= $9) orelse
    %% Remaining unreserved characters.
    (C =:= $-) orelse (C =:= $.) orelse (C =:= $_) orelse (C =:= $~) orelse
    %% Sub-delimiters.
    (C =:= $!) orelse (C =:= $$) orelse (C =:= $&) orelse (C =:= $') orelse
    (C =:= $() orelse (C =:= $)) orelse (C =:= $*) orelse (C =:= $+) orelse
    (C =:= $,) orelse (C =:= $;) orelse (C =:= $=) orelse
    %% Also accepted unencoded.
    (C =:= $:) orelse (C =:= $@)
)).
| 100 | + |
%% Decode a percent-encoded binary (RFC 3986, section 2.1).
%%
%% Returns the input binary itself when it contains no percent-encoded
%% sequence; otherwise returns a new binary in which every %XX sequence
%% has been replaced by the byte it encodes.
%%
%% Fails with:
%%   error:{invalid_byte, Byte} - Byte is neither a plain character nor
%%       the start of a complete %XX sequence (this includes a `%` that
%%       is followed by fewer than two characters);
%%   error:{case_clause, Char}  - Char follows a `%` but is not a
%%       hexadecimal digit.
-spec urldecode(binary()) -> binary().
urldecode(Binary) ->
    skip_dec(Binary, Binary, 0).
| 103 | + |
%% Fast path that avoids a binary allocation when there is nothing
%% to decode.
%%
%% The input is consumed four plain bytes at a time and only their
%% count (Len) is tracked. As soon as the next four bytes are not all
%% plain, or fewer than four bytes remain, the rest is handed to dec/5
%% with an empty accumulator and a start offset of 0.
skip_dec(<<C1, C2, C3, C4, Rest/bits>>, Orig, Len)
        when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
        andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
    skip_dec(Rest, Orig, Len + 4);
skip_dec(Binary, Orig, Len) ->
    dec(Binary, [], Orig, 0, Len).
| 115 | + |
%% The output is accumulated as an improper list (a binary in the tail
%% position), which iolist_to_binary/1 accepts. Tell Dialyzer not to
%% warn about it.
-dialyzer({no_improper_lists, [dec/5]}).

%% dec(Input, Out, Orig, Start, Run)
%%
%%   Input - bytes that remain to be examined
%%   Out   - iolist of everything emitted so far
%%   Orig  - the complete original binary
%%   Start - offset in Orig of the first byte that has not been emitted
%%   Run   - number of plain bytes seen since Start, not yet emitted
%%
%% Plain bytes are only counted in Run; they are sliced out of Orig with
%% binary_part/3 once a percent sequence or the end of input is reached.

%% Four percent sequences in a row, no pending plain run.
%% Handling them together speeds up highly encoded values.
dec(<<$%, H1, L1, $%, H2, L2, $%, H3, L3, $%, H4, L4, Tail/bits>>, Out, Orig, Start, 0) ->
    B1 = ?UNHEX(H1, L1),
    B2 = ?UNHEX(H2, L2),
    B3 = ?UNHEX(H3, L3),
    B4 = ?UNHEX(H4, L4),
    dec(Tail, [Out|<<B1, B2, B3, B4>>], Orig, Start + 12, 0);
%% Same, but the pending plain run is emitted first.
dec(<<$%, H1, L1, $%, H2, L2, $%, H3, L3, $%, H4, L4, Tail/bits>>, Out, Orig, Start, Run) ->
    B1 = ?UNHEX(H1, L1),
    B2 = ?UNHEX(H2, L2),
    B3 = ?UNHEX(H3, L3),
    B4 = ?UNHEX(H4, L4),
    Plain = binary_part(Orig, Start, Run),
    dec(Tail, [Out, Plain|<<B1, B2, B3, B4>>], Orig, Start + Run + 12, 0);
%% A single percent sequence, no pending plain run.
dec(<<$%, H, L, Tail/bits>>, Out, Orig, Start, 0) ->
    B = ?UNHEX(H, L),
    dec(Tail, [Out|<<B>>], Orig, Start + 3, 0);
%% A single percent sequence preceded by a pending plain run.
dec(<<$%, H, L, Tail/bits>>, Out, Orig, Start, Run) ->
    B = ?UNHEX(H, L),
    Plain = binary_part(Orig, Start, Run),
    dec(Tail, [Out, Plain|<<B>>], Orig, Start + Run + 3, 0);
%% Four plain bytes at once: speeds up barely encoded values.
dec(<<C1, C2, C3, C4, Tail/bits>>, Out, Orig, Start, Run)
        when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
        andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
    dec(Tail, Out, Orig, Start, Run + 4);
%% One plain byte.
dec(<<C, Tail/bits>>, Out, Orig, Start, Run) when ?IS_PLAIN(C) ->
    dec(Tail, Out, Orig, Start, Run + 1);
%% End of input and Start never moved, so no percent sequence was
%% decoded: return the original binary untouched.
dec(<<>>, _, Orig, 0, _) ->
    Orig;
%% End of input with no pending plain run.
dec(<<>>, Out, _, _, 0) ->
    iolist_to_binary(Out);
%% End of input with a trailing plain run still to emit.
dec(<<>>, Out, Orig, Start, Run) ->
    Plain = binary_part(Orig, Start, Run),
    iolist_to_binary([Out|Plain]);
%% Anything else: the next byte is neither plain nor the start of a
%% complete percent sequence. Start + Run is its offset in Orig.
dec(_, _, Orig, Start, Run) ->
    error({invalid_byte, binary:at(Orig, Start + Run)}).
| 155 | + |
-ifdef(TEST).
%% EUnit test generator for urldecode/1.
%%
%% Each entry is {Input, Expected}; Expected is either the decoded
%% binary or the atom `error` when decoding must raise an error.
urldecode_test_() ->
    Tests = [
        {<<"%20">>, <<" ">>},
        {<<"+">>, <<"+">>},
        {<<"%00">>, <<0>>},
        {<<"%fF">>, <<255>>},
        {<<"123">>, <<"123">>},
        {<<"%i5">>, error},
        {<<"%5">>, error},
        %% Empty input and input made only of plain bytes
        %% (exercises the four-bytes-at-a-time skip).
        {<<>>, <<>>},
        {<<"abcd">>, <<"abcd">>},
        {<<"abcdefghi">>, <<"abcdefghi">>},
        %% Plain runs before and/or after a percent sequence.
        {<<"abcdefgh%20">>, <<"abcdefgh ">>},
        {<<"abcd%20efgh">>, <<"abcd efgh">>},
        {<<"%20abc">>, <<" abc">>},
        %% Four percent sequences in a row (bulk clause), with and
        %% without a pending plain run.
        {<<"%41%42%43%44">>, <<"ABCD">>},
        {<<"ab%41%42%43%44cd">>, <<"abABCDcd">>},
        {<<"%41%42%43%44%45">>, <<"ABCDE">>},
        %% Bytes that must be percent encoded, and truncated sequences.
        {<<"a b">>, error},
        {<<"a/b">>, error},
        {<<"%">>, error},
        {<<"%41%42%43%4">>, error},
        {<<"%41%42%4g%44">>, error}
    ],
    [{Qs, fun() ->
        %% E is already bound by the generator, so this is an assertion.
        E = try urldecode(Qs) of
            R -> R
        catch error:_ ->
            error
        end
    end} || {Qs, E} <- Tests].
-endif.
0 commit comments