Skip to content

Commit 622806c

Browse files
blake2-ppc, thestinger
authored and committed
---
yaml --- r: 79392 b: refs/heads/snap-stage3 c: b153219 h: refs/heads/master v: v3
1 parent c26ad12 commit 622806c

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
refs/heads/master: 124eb2119c78651cfaaa7a046a101fa2e20f83ca
33
refs/heads/snap-stage1: e33de59e47c5076a89eadeb38f4934f58a3618a6
4-
refs/heads/snap-stage3: b49e9fa794addc197e58743bdc120cb9740b73c0
4+
refs/heads/snap-stage3: b153219556e20cb9f0e70c6a064cdfd10469ea32
55
refs/heads/try: ac820906c0e53eab79a98ee64f7231f57c3887b4
66
refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
77
refs/heads/ndm: f3868061cd7988080c30d6d5bf352a5a5fe2460b

branches/snap-stage3/src/libstd/str.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,8 @@ pub fn is_utf8(v: &[u8]) -> bool {
799799
// first C2 80 last DF BF
800800
// 3-byte encoding is for codepoints \u0800 to \uffff
801801
// first E0 A0 80 last EF BF BF
802+
// excluding surrogate codepoints \ud800 to \udfff
803+
// ED A0 80 to ED BF BF
802804
// 4-byte encoding is for codepoints \u10000 to \u10ffff
803805
// first F0 90 80 80 last F4 8F BF BF
804806
//
@@ -812,8 +814,6 @@ pub fn is_utf8(v: &[u8]) -> bool {
812814
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
813815
// %xF4 %x80-8F 2( UTF8-tail )
814816
// UTF8-tail = %x80-BF
815-
// --
816-
// This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
817817
match w {
818818
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
819819
return false
@@ -822,7 +822,9 @@ pub fn is_utf8(v: &[u8]) -> bool {
822822
unsafe_get(v, i + 1),
823823
unsafe_get(v, i + 2) & 192u8) {
824824
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
825-
(0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
825+
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826+
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
827+
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826828
_ => return false,
827829
},
828830
_ => match (v_i,
@@ -3012,6 +3014,7 @@ mod tests {
30123014
30133015
#[test]
30143016
fn test_is_utf8() {
3017+
// deny overlong encodings
30153018
assert!(!is_utf8([0xc0, 0x80]));
30163019
assert!(!is_utf8([0xc0, 0xae]));
30173020
assert!(!is_utf8([0xe0, 0x80, 0x80]));
@@ -3020,9 +3023,15 @@ mod tests {
30203023
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
30213024
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
30223025
3026+
// deny surrogates
3027+
assert!(!is_utf8([0xED, 0xA0, 0x80]));
3028+
assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3029+
30233030
assert!(is_utf8([0xC2, 0x80]));
30243031
assert!(is_utf8([0xDF, 0xBF]));
30253032
assert!(is_utf8([0xE0, 0xA0, 0x80]));
3033+
assert!(is_utf8([0xED, 0x9F, 0xBF]));
3034+
assert!(is_utf8([0xEE, 0x80, 0x80]));
30263035
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
30273036
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
30283037
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));

0 commit comments

Comments
 (0)