Skip to content

Commit 05eb149

Browse files
blake2-ppc authored and thestinger committed
---
yaml --- r: 144725 b: refs/heads/try2 c: b153219 h: refs/heads/master i: 144723: 885da3d v: v3
1 parent c125ccb commit 05eb149

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ refs/heads/snap-stage3: 78a7676898d9f80ab540c6df5d4c9ce35bb50463
55
refs/heads/try: 519addf6277dbafccbb4159db4b710c37eaa2ec5
66
refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
77
refs/heads/ndm: f3868061cd7988080c30d6d5bf352a5a5fe2460b
8-
refs/heads/try2: b49e9fa794addc197e58743bdc120cb9740b73c0
8+
refs/heads/try2: b153219556e20cb9f0e70c6a064cdfd10469ea32
99
refs/heads/dist-snap: ba4081a5a8573875fed17545846f6f6902c8ba8d
1010
refs/tags/release-0.2: c870d2dffb391e14efb05aa27898f1f6333a9596
1111
refs/tags/release-0.3: b5f0d0f648d9a6153664837026ba1be43d3e2503

branches/try2/src/libstd/str.rs

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -799,6 +799,8 @@ pub fn is_utf8(v: &[u8]) -> bool {
799799
// first C2 80 last DF BF
800800
// 3-byte encoding is for codepoints \u0800 to \uffff
801801
// first E0 A0 80 last EF BF BF
802+
// excluding surrogates codepoints \ud800 to \udfff
803+
// ED A0 80 to ED BF BF
802804
// 4-byte encoding is for codepoints \u10000 to \u10ffff
803805
// first F0 90 80 80 last F4 8F BF BF
804806
//
@@ -812,8 +814,6 @@ pub fn is_utf8(v: &[u8]) -> bool {
812814
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
813815
// %xF4 %x80-8F 2( UTF8-tail )
814816
// UTF8-tail = %x80-BF
815-
// --
816-
// This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
817817
match w {
818818
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
819819
return false
@@ -822,7 +822,9 @@ pub fn is_utf8(v: &[u8]) -> bool {
822822
unsafe_get(v, i + 1),
823823
unsafe_get(v, i + 2) & 192u8) {
824824
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
825-
(0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
825+
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826+
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
827+
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826828
_ => return false,
827829
},
828830
_ => match (v_i,
@@ -3012,6 +3014,7 @@ mod tests {
30123014
30133015
#[test]
30143016
fn test_is_utf8() {
3017+
// deny overlong encodings
30153018
assert!(!is_utf8([0xc0, 0x80]));
30163019
assert!(!is_utf8([0xc0, 0xae]));
30173020
assert!(!is_utf8([0xe0, 0x80, 0x80]));
@@ -3020,9 +3023,15 @@ mod tests {
30203023
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
30213024
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
30223025
3026+
// deny surrogates
3027+
assert!(!is_utf8([0xED, 0xA0, 0x80]));
3028+
assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3029+
30233030
assert!(is_utf8([0xC2, 0x80]));
30243031
assert!(is_utf8([0xDF, 0xBF]));
30253032
assert!(is_utf8([0xE0, 0xA0, 0x80]));
3033+
assert!(is_utf8([0xED, 0x9F, 0xBF]));
3034+
assert!(is_utf8([0xEE, 0x80, 0x80]));
30263035
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
30273036
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
30283037
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));

0 commit comments

Comments
 (0)