Skip to content

Implement faster memcmp for x86_64 #467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 31, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#![feature(compiler_builtins)]
#![feature(core_ffi_c)]
#![feature(core_intrinsics)]
#![feature(inline_const)]
#![feature(lang_items)]
#![feature(linkage)]
#![feature(naked_functions)]
Expand Down
14 changes: 14 additions & 0 deletions src/mem/impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
}
set_bytes_bytes(s, c, n);
}

/// Compares the first `n` bytes at `s1` with the first `n` bytes at `s2`.
///
/// Returns `0` when all `n` bytes match. Otherwise returns the byte from `s1`
/// minus the byte from `s2` at the first mismatching position, both widened
/// to `i32`, so the sign of the result tells which side is greater.
///
/// # Safety
///
/// The function dereferences `s1.add(i)` and `s2.add(i)` for every `i` up to
/// the first mismatch (at most `n` positions); the caller must guarantee that
/// those reads are valid.
#[inline(always)]
pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
    for offset in 0..n {
        let lhs = *s1.add(offset);
        let rhs = *s2.add(offset);
        if lhs != rhs {
            // Widen before subtracting so the difference cannot wrap.
            return i32::from(lhs) - i32::from(rhs);
        }
    }
    0
}
11 changes: 1 addition & 10 deletions src/mem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,7 @@ intrinsics! {
// Exported C-ABI `memcmp`: compares the first `n` bytes at `s1` and `s2` and
// returns an `i32` result. The `cfg_attr` below gives the symbol weak linkage
// on every target except windows-gnu.
#[mem_builtin]
#[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
// NOTE(review): this hunk is reported as "1 addition & 10 deletions" and is
// shown without +/- markers. The ten lines from `let mut i = 0;` through the
// bare `0` look like the removed inline byte loop -- confirm against the
// actual file before treating them as live code.
let mut i = 0;
while i < n {
let a = *s1.add(i);
let b = *s2.add(i);
if a != b {
// First mismatch: the result is the signed difference of the two bytes.
return a as i32 - b as i32;
}
i += 1;
}
0
// Presumably the single added line of the hunk: the comparison is delegated
// to `impls::compare_bytes`, called with the same three arguments.
impls::compare_bytes(s1, s2, n)
}

#[mem_builtin]
Expand Down
46 changes: 46 additions & 0 deletions src/mem/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
Expand Down Expand Up @@ -98,3 +100,47 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
options(att_syntax, nostack, preserves_flags)
);
}

// Compares the first `n` bytes at `a` and `b`. Returns 0 when they are equal,
// otherwise the first mismatching byte of `a` minus the corresponding byte of
// `b`, both widened to `i32`.
//
// The work is done in progressively narrower chunks: 32 bytes at a time, then
// 16, 8, 4, 2 and finally single bytes. Each level only tests whole chunks for
// inequality; as soon as a chunk differs, or fewer bytes than one chunk
// remain, the next narrower level takes over for just those bytes.
//
// Safety: chunks are loaded with `read_unaligned`, so no alignment is assumed
// by this code. Reads stay within the first `n` bytes behind each pointer, so
// the caller must make sure those bytes are readable.
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
// One level of the cascade. `T` is the chunk type compared at this level, `U`
// the chunk type of the next narrower level, and `f` that narrower level's
// comparison routine. `n` is always a length in bytes, not in chunks.
#[inline(always)]
unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
where
T: Clone + Copy + Eq,
U: Clone + Copy + Eq,
F: FnOnce(*const U, *const U, usize) -> i32,
{
// Just to be sure we're actually working with powers of two...
// `count_ones()` is unsigned, so each subtraction underflows (an error during
// const evaluation) unless the bit count satisfies the bound noted on its
// line; together they require exactly one set bit in `size_of::<T>()`.
let _ = const { 1 - mem::size_of::<T>().count_ones() }; // <= 1
let _ = const { mem::size_of::<T>().count_ones() - 1 }; // >= 1

// `end` points one past the last whole `T`-sized chunk contained in `n` bytes.
// The shift is equivalent to dividing `n` by `size_of::<T>()` for power-of-two
// sizes; per the original author's note, the division form still left a call
// to panic in the generated code, so the shift is used instead.
let end = a.add(n >> mem::size_of::<T>().trailing_zeros());
while a != end {
if a.read_unaligned() != b.read_unaligned() {
// The chunks differ somewhere: let the narrower level locate the mismatch
// inside exactly this chunk (`size_of::<T>()` bytes).
return f(a.cast(), b.cast(), mem::size_of::<T>());
}
a = a.add(1);
b = b.add(1);
}
// All whole chunks matched. Hand the tail (`n` modulo `size_of::<T>()` bytes,
// computed with a mask for the same reason the shift is used above) to the
// narrower level.
f(a.cast(), b.cast(), n & (mem::size_of::<T>() - 1))
}
// Base case: byte-by-byte comparison of `n` bytes, returning the difference
// of the first mismatching pair or 0 when none differ.
let c1 = |mut a: *const u8, mut b: *const u8, n| {
for _ in 0..n {
if a.read() != b.read() {
return i32::from(a.read()) - i32::from(b.read());
}
a = a.add(1);
b = b.add(1);
}
0
};
// Build the cascade from narrow to wide; each closure compares in chunks of
// its own pointee type and falls back to the previously defined closure.
let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
let c32 = |a: *const [u128; 2], b, n| cmp(a, b, n, c16);
// Start at the widest level (32-byte chunks).
c32(a.cast(), b.cast(), n)
}
104 changes: 104 additions & 0 deletions testcrate/benches/mem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,18 @@ fn memcmp_builtin(b: &mut Bencher, n: usize) {
})
}

/// Benchmarks slice `cmp` on two buffers whose compared regions start one
/// byte apart.
///
/// Two buffers are created with `AlignedVec::new(0, n)` (presumably `n`
/// bytes filled with 0 -- confirm against `AlignedVec`), and the last byte of
/// the second is set to 1. Each iteration compares the whole first buffer
/// with the second buffer minus its first byte. Throughput is reported
/// against `n` bytes.
fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
    let lhs = AlignedVec::new(0, n);
    let rhs = {
        let mut buf = AlignedVec::new(0, n);
        // Plant the only differing byte at the very end of the buffer.
        buf[n - 1] = 1;
        buf
    };
    b.bytes = n as u64;
    b.iter(|| {
        let left: &[u8] = black_box(&lhs[0..]);
        // Dropping the first byte shifts this slice by one relative to `left`.
        let right: &[u8] = black_box(&rhs[1..]);
        left.cmp(right)
    })
}

fn memcmp_rust(b: &mut Bencher, n: usize) {
let v1 = AlignedVec::new(0, n);
let mut v2 = AlignedVec::new(0, n);
Expand All @@ -108,6 +120,18 @@ fn memcmp_rust(b: &mut Bencher, n: usize) {
})
}

/// Benchmarks this crate's `memcmp` on two buffers whose compared regions
/// start one byte apart.
///
/// Two buffers are created with `AlignedVec::new(0, n)` (presumably `n`
/// bytes filled with 0 -- confirm against `AlignedVec`), and the last byte of
/// the second is set to 1. Each iteration calls `memcmp` on the start of the
/// first buffer and on the second buffer offset by one byte, for `n - 1`
/// bytes. Throughput is reported against `n` bytes.
fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
    let lhs = AlignedVec::new(0, n);
    let rhs = {
        let mut buf = AlignedVec::new(0, n);
        // Plant the only differing byte at the very end of the buffer.
        buf[n - 1] = 1;
        buf
    };
    b.bytes = n as u64;
    b.iter(|| {
        let left: &[u8] = black_box(&lhs[0..]);
        // Dropping the first byte shifts this slice by one relative to `left`.
        let right: &[u8] = black_box(&rhs[1..]);
        // `right` holds only `n - 1` bytes, so that is the length compared.
        unsafe { memcmp(left.as_ptr(), right.as_ptr(), n - 1) }
    })
}

fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
let mut v = AlignedVec::new(0, n + n / 2 + offset);
b.bytes = n as u64;
Expand Down Expand Up @@ -209,6 +233,38 @@ fn memset_rust_1048576_offset(b: &mut Bencher) {
memset_rust(b, 1048576, 65)
}

// Small-size memcmp benchmarks (8, 16, 32 and 64 bytes). Each size is run
// twice: once through the `memcmp_builtin` helper and once through the
// `memcmp_rust` helper, so the two results can be compared side by side.
#[bench]
fn memcmp_builtin_8(b: &mut Bencher) {
memcmp_builtin(b, 8)
}
#[bench]
fn memcmp_rust_8(b: &mut Bencher) {
memcmp_rust(b, 8)
}
#[bench]
fn memcmp_builtin_16(b: &mut Bencher) {
memcmp_builtin(b, 16)
}
#[bench]
fn memcmp_rust_16(b: &mut Bencher) {
memcmp_rust(b, 16)
}
#[bench]
fn memcmp_builtin_32(b: &mut Bencher) {
memcmp_builtin(b, 32)
}
#[bench]
fn memcmp_rust_32(b: &mut Bencher) {
memcmp_rust(b, 32)
}
#[bench]
fn memcmp_builtin_64(b: &mut Bencher) {
memcmp_builtin(b, 64)
}
#[bench]
fn memcmp_rust_64(b: &mut Bencher) {
memcmp_rust(b, 64)
}
#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
memcmp_builtin(b, 4096)
Expand All @@ -225,6 +281,54 @@ fn memcmp_builtin_1048576(b: &mut Bencher) {
fn memcmp_rust_1048576(b: &mut Bencher) {
memcmp_rust(b, 1048576)
}
// Unaligned memcmp benchmarks. The number in each name is one less than the
// size passed to the helper because the helpers compare `n - 1` bytes:
// `memcmp_rust_unaligned` passes `n - 1` as the length, and
// `memcmp_builtin_unaligned` compares against a slice that skips the first
// byte of an `n`-byte buffer.
#[bench]
fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 8)
}
#[bench]
fn memcmp_rust_unaligned_7(b: &mut Bencher) {
memcmp_rust_unaligned(b, 8)
}
#[bench]
fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 16)
}
#[bench]
fn memcmp_rust_unaligned_15(b: &mut Bencher) {
memcmp_rust_unaligned(b, 16)
}
#[bench]
fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 32)
}
#[bench]
fn memcmp_rust_unaligned_31(b: &mut Bencher) {
memcmp_rust_unaligned(b, 32)
}
#[bench]
fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 64)
}
#[bench]
fn memcmp_rust_unaligned_63(b: &mut Bencher) {
memcmp_rust_unaligned(b, 64)
}
#[bench]
fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 4096)
}
#[bench]
fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
memcmp_rust_unaligned(b, 4096)
}
#[bench]
fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
memcmp_builtin_unaligned(b, 1048576)
}
#[bench]
fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
memcmp_rust_unaligned(b, 1048576)
}

#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {
Expand Down
25 changes: 15 additions & 10 deletions testcrate/tests/mem.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,21 +116,26 @@ fn memset_nonzero() {

// Checks that `memcmp` reports equality (returns 0) for buffers with the same
// contents.
// NOTE(review): this listing looks like a diff hunk rendered without +/-
// markers. The two `[u8; 8]` arrays, the first `unsafe {` and the two
// fixed-length assertions appear to be the removed version; everything from
// `gen_arr::<256>()` onward appears to be the replacement. Confirm against
// the real file before relying on this text.
#[test]
fn memcmp_eq() {
let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
unsafe {
assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0);
assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 3), 0);
// `arr1 @ arr2` binds the same generated value to both names.
// Presumably `.0` is the underlying byte storage -- TODO confirm against `gen_arr`.
let arr1 @ arr2 = gen_arr::<256>();
// Exercise every length from 0 up to 255 (256 itself is not covered),
// with the arguments in both orders.
for i in 0..256 {
unsafe {
assert_eq!(memcmp(arr1.0.as_ptr(), arr2.0.as_ptr(), i), 0);
assert_eq!(memcmp(arr2.0.as_ptr(), arr1.0.as_ptr(), i), 0);
}
}
}

// Checks that the sign of `memcmp`'s result matches the ordering of the
// first differing byte.
// NOTE(review): this listing looks like a diff hunk rendered without +/-
// markers. The two `[u8; 8]` arrays, the first `unsafe {` and the two `<` / `>`
// assertions appear to be the removed version; everything from
// `gen_arr::<256>()` onward appears to be the replacement. Confirm against
// the real file before relying on this text.
#[test]
fn memcmp_ne() {
let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 7, 7];
unsafe {
assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8) < 0);
assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 8) > 0);
// `arr1 @ arr2` binds the same generated value to both names.
let arr1 @ arr2 = gen_arr::<256>();
for i in 0..256 {
// Work on a copy whose byte `i` is overwritten with 127; all other bytes
// still equal `arr2`.
let mut diff_arr = arr1;
diff_arr.0[i] = 127;
// Expected ordering of the (possibly) differing byte. It is `Equal` when
// `arr2.0[i]` already holds 127.
let expect = diff_arr.0[i].cmp(&arr2.0[i]);
// Try every length that includes index `i`, up to 255.
for k in i + 1..256 {
let result = unsafe { memcmp(diff_arr.0.as_ptr(), arr2.0.as_ptr(), k) };
// Only the sign of the result is checked, not its magnitude.
assert_eq!(expect, result.cmp(&0));
}
}
}

Expand Down