Skip to content

Commit f18ce3c

Browse files
committed
Implement faster memcmp for x86_64
x86_64 can load unaligned words in a single cache line as fast as aligned words. Even when crossing cache or page boundaries it is just as fast to do an unaligned word read instead of multiple byte reads. Also add a couple more tests & benchmarks.
1 parent b5065a0 commit f18ce3c

File tree

5 files changed

+193
-15
lines changed

5 files changed

+193
-15
lines changed

src/mem/impls.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,3 +265,17 @@ pub unsafe fn set_bytes(mut s: *mut u8, c: u8, mut n: usize) {
265265
}
266266
set_bytes_bytes(s, c, n);
267267
}
268+
269+
#[inline(always)]
270+
pub unsafe fn compare_bytes(s1: *const u8, s2: *const u8, n: usize) -> i32 {
271+
let mut i = 0;
272+
while i < n {
273+
let a = *s1.add(i);
274+
let b = *s2.add(i);
275+
if a != b {
276+
return a as i32 - b as i32;
277+
}
278+
i += 1;
279+
}
280+
0
281+
}

src/mem/mod.rs

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,7 @@ intrinsics! {
5151
#[mem_builtin]
5252
#[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), linkage = "weak")]
5353
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
54-
let mut i = 0;
55-
while i < n {
56-
let a = *s1.add(i);
57-
let b = *s2.add(i);
58-
if a != b {
59-
return a as i32 - b as i32;
60-
}
61-
i += 1;
62-
}
63-
0
54+
impls::compare_bytes(s1, s2, n)
6455
}
6556

6657
#[mem_builtin]

src/mem/x86_64.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
// feature is present at compile-time. We don't bother detecting other features.
1717
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
1818

19+
use core::mem;
20+
1921
#[inline(always)]
2022
#[cfg(target_feature = "ermsb")]
2123
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
@@ -98,3 +100,42 @@ pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
98100
options(att_syntax, nostack, preserves_flags)
99101
);
100102
}
103+
104+
#[inline(always)]
105+
pub unsafe fn compare_bytes(
106+
a: *const u8,
107+
b: *const u8,
108+
n: usize,
109+
) -> i32 {
110+
unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
111+
where
112+
T: Clone + Copy + Eq,
113+
U: Clone + Copy + Eq,
114+
F: FnOnce(*const U, *const U, usize) -> i32,
115+
{
116+
for _ in 0..n / mem::size_of::<T>() {
117+
if a.read_unaligned() != b.read_unaligned() {
118+
return f(a.cast(), b.cast(), mem::size_of::<T>());
119+
}
120+
a = a.add(1);
121+
b = b.add(1);
122+
}
123+
f(a.cast(), b.cast(), n % mem::size_of::<T>())
124+
}
125+
let c1 = |mut a: *const u8, mut b: *const u8, n| {
126+
for _ in 0..n {
127+
if a.read() != b.read() {
128+
return i32::from(a.read()) - i32::from(b.read());
129+
}
130+
a = a.add(1);
131+
b = b.add(1);
132+
}
133+
0
134+
};
135+
let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
136+
let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
137+
let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
138+
let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
139+
let c32 = |a: *const [u128; 2], b, n| cmp(a, b, n, c16);
140+
c32(a.cast(), b.cast(), n)
141+
}

testcrate/benches/mem.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,18 @@ fn memcmp_builtin(b: &mut Bencher, n: usize) {
9696
})
9797
}
9898

99+
fn memcmp_builtin_unaligned(b: &mut Bencher, n: usize) {
100+
let v1 = AlignedVec::new(0, n);
101+
let mut v2 = AlignedVec::new(0, n);
102+
v2[n - 1] = 1;
103+
b.bytes = n as u64;
104+
b.iter(|| {
105+
let s1: &[u8] = black_box(&v1[0..]);
106+
let s2: &[u8] = black_box(&v2[1..]);
107+
s1.cmp(s2)
108+
})
109+
}
110+
99111
fn memcmp_rust(b: &mut Bencher, n: usize) {
100112
let v1 = AlignedVec::new(0, n);
101113
let mut v2 = AlignedVec::new(0, n);
@@ -108,6 +120,18 @@ fn memcmp_rust(b: &mut Bencher, n: usize) {
108120
})
109121
}
110122

123+
fn memcmp_rust_unaligned(b: &mut Bencher, n: usize) {
124+
let v1 = AlignedVec::new(0, n);
125+
let mut v2 = AlignedVec::new(0, n);
126+
v2[n - 1] = 1;
127+
b.bytes = n as u64;
128+
b.iter(|| {
129+
let s1: &[u8] = black_box(&v1[0..]);
130+
let s2: &[u8] = black_box(&v2[1..]);
131+
unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n - 1) }
132+
})
133+
}
134+
111135
fn memmove_builtin(b: &mut Bencher, n: usize, offset: usize) {
112136
let mut v = AlignedVec::new(0, n + n / 2 + offset);
113137
b.bytes = n as u64;
@@ -209,6 +233,38 @@ fn memset_rust_1048576_offset(b: &mut Bencher) {
209233
memset_rust(b, 1048576, 65)
210234
}
211235

236+
#[bench]
237+
fn memcmp_builtin_8(b: &mut Bencher) {
238+
memcmp_builtin(b, 8)
239+
}
240+
#[bench]
241+
fn memcmp_rust_8(b: &mut Bencher) {
242+
memcmp_rust(b, 8)
243+
}
244+
#[bench]
245+
fn memcmp_builtin_16(b: &mut Bencher) {
246+
memcmp_builtin(b, 16)
247+
}
248+
#[bench]
249+
fn memcmp_rust_16(b: &mut Bencher) {
250+
memcmp_rust(b, 16)
251+
}
252+
#[bench]
253+
fn memcmp_builtin_32(b: &mut Bencher) {
254+
memcmp_builtin(b, 32)
255+
}
256+
#[bench]
257+
fn memcmp_rust_32(b: &mut Bencher) {
258+
memcmp_rust(b, 32)
259+
}
260+
#[bench]
261+
fn memcmp_builtin_64(b: &mut Bencher) {
262+
memcmp_builtin(b, 64)
263+
}
264+
#[bench]
265+
fn memcmp_rust_64(b: &mut Bencher) {
266+
memcmp_rust(b, 64)
267+
}
212268
#[bench]
213269
fn memcmp_builtin_4096(b: &mut Bencher) {
214270
memcmp_builtin(b, 4096)
@@ -225,6 +281,54 @@ fn memcmp_builtin_1048576(b: &mut Bencher) {
225281
fn memcmp_rust_1048576(b: &mut Bencher) {
226282
memcmp_rust(b, 1048576)
227283
}
284+
#[bench]
285+
fn memcmp_builtin_unaligned_7(b: &mut Bencher) {
286+
memcmp_builtin_unaligned(b, 8)
287+
}
288+
#[bench]
289+
fn memcmp_rust_unaligned_7(b: &mut Bencher) {
290+
memcmp_rust_unaligned(b, 8)
291+
}
292+
#[bench]
293+
fn memcmp_builtin_unaligned_15(b: &mut Bencher) {
294+
memcmp_builtin_unaligned(b, 16)
295+
}
296+
#[bench]
297+
fn memcmp_rust_unaligned_15(b: &mut Bencher) {
298+
memcmp_rust_unaligned(b, 16)
299+
}
300+
#[bench]
301+
fn memcmp_builtin_unaligned_31(b: &mut Bencher) {
302+
memcmp_builtin_unaligned(b, 32)
303+
}
304+
#[bench]
305+
fn memcmp_rust_unaligned_31(b: &mut Bencher) {
306+
memcmp_rust_unaligned(b, 32)
307+
}
308+
#[bench]
309+
fn memcmp_builtin_unaligned_63(b: &mut Bencher) {
310+
memcmp_builtin_unaligned(b, 64)
311+
}
312+
#[bench]
313+
fn memcmp_rust_unaligned_63(b: &mut Bencher) {
314+
memcmp_rust_unaligned(b, 64)
315+
}
316+
#[bench]
317+
fn memcmp_builtin_unaligned_4095(b: &mut Bencher) {
318+
memcmp_builtin_unaligned(b, 4096)
319+
}
320+
#[bench]
321+
fn memcmp_rust_unaligned_4095(b: &mut Bencher) {
322+
memcmp_rust_unaligned(b, 4096)
323+
}
324+
#[bench]
325+
fn memcmp_builtin_unaligned_1048575(b: &mut Bencher) {
326+
memcmp_builtin_unaligned(b, 1048576)
327+
}
328+
#[bench]
329+
fn memcmp_rust_unaligned_1048575(b: &mut Bencher) {
330+
memcmp_rust_unaligned(b, 1048576)
331+
}
228332

229333
#[bench]
230334
fn memmove_builtin_4096(b: &mut Bencher) {

testcrate/tests/mem.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,13 @@ fn memset_nonzero() {
116116

117117
#[test]
118118
fn memcmp_eq() {
119-
let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
120-
let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
121-
unsafe {
122-
assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0);
123-
assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 3), 0);
119+
let arr1: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
120+
let arr2: [u8; 32] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
121+
for i in 0..32 {
122+
unsafe {
123+
assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), i), 0);
124+
assert_eq!(memcmp(arr2.as_ptr(), arr1.as_ptr(), i), 0);
125+
}
124126
}
125127
}
126128

@@ -134,6 +136,32 @@ fn memcmp_ne() {
134136
}
135137
}
136138

139+
#[test]
140+
fn memcmp_ne_16() {
141+
let arr1: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
142+
let arr2: [u8; 16] = [0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15];
143+
unsafe {
144+
assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 16) < 0);
145+
assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 16) > 0);
146+
}
147+
}
148+
149+
#[test]
150+
fn memcmp_ne_32() {
151+
let arr1: [u8; 32] = [
152+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
153+
0, 0, 0, 0,
154+
];
155+
let arr2: [u8; 32] = [
156+
0, 1, 2, 3, 4, 5, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
157+
0, 0, 0, 0,
158+
];
159+
unsafe {
160+
assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 32) < 0);
161+
assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 32) > 0);
162+
}
163+
}
164+
137165
#[derive(Clone, Copy)]
138166
struct AlignedStorage<const N: usize>([u8; N], [usize; 0]);
139167

0 commit comments

Comments
 (0)