Skip to content

Commit a5eb196

Browse files
Rollup merge of #76971 - bugadani:issue-75659, r=dtolnay
Refactor memchr to allow optimization Closes #75659 The implementation already uses naive search if the slice is short enough, but the case is complicated enough to not be optimized away. This PR refactors memchr so that it exits early when the slice is short enough. Codegen-wise, as shown in #75659, memchr was not inlined previously, so the only way I could find to test this is to check that there is no memchr call. Let me know if there is a more robust solution here.
2 parents 24d0040 + 37f08c7 commit a5eb196

File tree

2 files changed

+59
-14
lines changed

2 files changed

+59
-14
lines changed

library/core/src/slice/memchr.rs

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,17 @@ pub fn memchr(x: u8, text: &[u8]) -> Option<usize> {
4646
// - body, scan by 2 words at a time
4747
// - the last remaining part, < 2 word size
4848
let len = text.len();
49-
let ptr = text.as_ptr();
5049
let usize_bytes = mem::size_of::<usize>();
5150

51+
// Fast path for small slices
52+
if len < 2 * usize_bytes {
53+
return text.iter().position(|elt| *elt == x);
54+
}
55+
5256
// search up to an aligned boundary
57+
let ptr = text.as_ptr();
5358
let mut offset = ptr.align_offset(usize_bytes);
59+
5460
if offset > 0 {
5561
offset = cmp::min(offset, len);
5662
if let Some(index) = text[..offset].iter().position(|elt| *elt == x) {
@@ -60,22 +66,19 @@ pub fn memchr(x: u8, text: &[u8]) -> Option<usize> {
6066

6167
// search the body of the text
6268
let repeated_x = repeat_byte(x);
69+
while offset <= len - 2 * usize_bytes {
70+
unsafe {
71+
let u = *(ptr.add(offset) as *const usize);
72+
let v = *(ptr.add(offset + usize_bytes) as *const usize);
6373

64-
if len >= 2 * usize_bytes {
65-
while offset <= len - 2 * usize_bytes {
66-
unsafe {
67-
let u = *(ptr.add(offset) as *const usize);
68-
let v = *(ptr.add(offset + usize_bytes) as *const usize);
69-
70-
// break if there is a matching byte
71-
let zu = contains_zero_byte(u ^ repeated_x);
72-
let zv = contains_zero_byte(v ^ repeated_x);
73-
if zu || zv {
74-
break;
75-
}
74+
// break if there is a matching byte
75+
let zu = contains_zero_byte(u ^ repeated_x);
76+
let zv = contains_zero_byte(v ^ repeated_x);
77+
if zu || zv {
78+
break;
7679
}
77-
offset += usize_bytes * 2;
7880
}
81+
offset += usize_bytes * 2;
7982
}
8083

8184
// Find the byte after the point the body loop stopped.

src/test/codegen/issue-75659.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// This test checks that the call to memchr is optimized away when searching in small slices.
2+
3+
// compile-flags: -O
4+
5+
#![crate_type = "lib"]
6+
7+
type T = u8;
8+
9+
// CHECK-LABEL: @foo1
10+
#[no_mangle]
11+
pub fn foo1(x: T, data: &[T; 1]) -> bool {
12+
// CHECK-NOT: memchr
13+
data.contains(&x)
14+
}
15+
16+
// CHECK-LABEL: @foo2
17+
#[no_mangle]
18+
pub fn foo2(x: T, data: &[T; 2]) -> bool {
19+
// CHECK-NOT: memchr
20+
data.contains(&x)
21+
}
22+
23+
// CHECK-LABEL: @foo3
24+
#[no_mangle]
25+
pub fn foo3(x: T, data: &[T; 3]) -> bool {
26+
// CHECK-NOT: memchr
27+
data.contains(&x)
28+
}
29+
30+
// CHECK-LABEL: @foo4
31+
#[no_mangle]
32+
pub fn foo4(x: T, data: &[T; 4]) -> bool {
33+
// CHECK-NOT: memchr
34+
data.contains(&x)
35+
}
36+
37+
// CHECK-LABEL: @foo16
38+
#[no_mangle]
39+
pub fn foo16(x: T, data: &[T; 16]) -> bool {
40+
// CHECK-NOT: memchr
41+
data.contains(&x)
42+
}

0 commit comments

Comments
 (0)