Commit 5e32765
[libc] Improve memcmp latency and codegen
This is based on ideas from @nafi to:
- use a branchless version of 'cmp' for 'uint32_t',
- completely resolve the lexicographic comparison through vector
  operations when wide types are available.

We also get rid of byte reloads and of the serializing
'__builtin_ctzll'.

I did not include the suggestion to replace comparisons of 'uint16_t'
with two 'uint8_t', as it did not seem to help the codegen. This can be
revisited in subsequent patches.

The code has been rewritten to reduce nested function calls, making the
inliner's job easier and preventing harmful code duplication.

Reviewed By: nafi3000

Differential Revision: https://reviews.llvm.org/D148717
1 parent aa28875 commit 5e32765
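To make the summary concrete, here is a minimal sketch (illustrative only; 'load_be32', 'cmp_u32', and 'memcmp4' are hypothetical names, and a little-endian target is assumed) of the scalar technique: a big-endian load turns lexicographic byte order into unsigned integer order, and a branchless three-way compare then resolves the result without per-byte reloads and without a serializing '__builtin_ctzll' to locate the first differing byte.

#include <cstdint>
#include <cstring>

// Hypothetical helper: load 4 bytes as big-endian so that unsigned integer
// order matches the lexicographic (memcmp) order of the underlying bytes.
static inline uint32_t load_be32(const unsigned char *p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v)); // alignment-safe load, compiles to one mov
  return __builtin_bswap32(v);   // byte swap assumes a little-endian target
}

// Branchless three-way compare: (a > b) and (a < b) lower to flag-setting
// instructions, so the -1/0/+1 result is computed without a jump.
static inline int cmp_u32(uint32_t a, uint32_t b) {
  return static_cast<int>(a > b) - static_cast<int>(a < b);
}

// memcmp over exactly 4 bytes: two loads, one branchless compare.
static inline int memcmp4(const unsigned char *p1, const unsigned char *p2) {
  return cmp_u32(load_be32(p1), load_be32(p2));
}

The same shape generalizes to wider types: with SIMD registers the whole block comparison can be resolved through vector operations, as the per-architecture files below show.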

16 files changed: +901 additions, -644 deletions

libc/src/__support/macros/properties/architectures.h

Lines changed: 7 additions & 2 deletions

@@ -45,6 +45,10 @@
 #define LIBC_TARGET_ARCH_IS_AARCH64
 #endif

+#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
+#define LIBC_TARGET_ARCH_IS_ANY_ARM
+#endif
+
 #if defined(__riscv) && (__riscv_xlen == 64)
 #define LIBC_TARGET_ARCH_IS_RISCV64
 #endif
@@ -53,8 +57,9 @@
 #define LIBC_TARGET_ARCH_IS_RISCV32
 #endif

-#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
-#define LIBC_TARGET_ARCH_IS_ANY_ARM
+#if (defined(LIBC_TARGET_ARCH_IS_RISCV64) ||                                   \
+     defined(LIBC_TARGET_ARCH_IS_RISCV32))
+#define LIBC_TARGET_ARCH_IS_ANY_RISCV
 #endif

 #endif // LLVM_LIBC_SUPPORT_MACROS_PROPERTIES_ARCHITECTURES_H
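The hunk moves the existing LIBC_TARGET_ARCH_IS_ANY_ARM aggregate ahead of the RISC-V checks and adds a matching LIBC_TARGET_ARCH_IS_ANY_RISCV. A hypothetical consumer ('USE_STRICT_ALIGNMENT' is an invented name, not part of the patch) would dispatch on them like this:

// Hypothetical consumer of the aggregate macros (not part of this patch).
#if defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
#define USE_STRICT_ALIGNMENT 1 // e.g. RV32/RV64 without fast unaligned loads
#elif defined(LIBC_TARGET_ARCH_IS_ANY_ARM)
#define USE_STRICT_ALIGNMENT 0 // e.g. ARM/AArch64 with NEON code paths
#else
#define USE_STRICT_ALIGNMENT 0
#endif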

libc/src/string/CMakeLists.txt

Lines changed: 7 additions & 1 deletion

@@ -450,6 +450,12 @@ function(add_implementation name impl_name)
     endforeach()
   endif()

+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+    # Prevent warning when passing x86 SIMD types as template arguments.
+    # e.g. "warning: ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
+    list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes")
+  endif()
+
   add_entrypoint_object(${impl_name}
     NAME ${name}
     SRCS ${ADD_IMPL_SRCS}
@@ -564,7 +570,7 @@ endfunction()
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
   add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
-  add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
+  add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
   add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
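For context, the silenced warning is easy to reproduce (the 'Op' template below is illustrative, not from the patch): GCC drops vector and alignment attributes when a type such as __m128i is used as a template argument, and reports it under -Wignored-attributes.

#include <immintrin.h>

template <typename T> struct Op {};

// g++ (without -Wno-ignored-attributes) warns here:
//   warning: ignoring attributes on template argument '__m128i'
//   [-Wignored-attributes]
using SseOp = Op<__m128i>;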

libc/src/string/memory_utils/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -24,6 +24,7 @@ add_header_library(
     libc.src.__support.CPP.type_traits
     libc.src.__support.macros.config
     libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.architectures
 )

 add_header_library(

libc/src/string/memory_utils/aarch64/memcmp_implementations.h

Lines changed: 17 additions & 16 deletions

@@ -19,47 +19,48 @@ namespace __llvm_libc {
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 384)) {
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
+    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
   }
-  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1, p2, count);
 }

 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 128)) { // [128, ∞]
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
+    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+    return generic::Memcmp<uint8x16x2_t>::loop_and_tail(p1, p2, count);
   }
-  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
-    return generic::Memcmp<16>::block(p1, p2);
+  if (generic::Bcmp<uint8x16_t>::block(p1, p2)) // [16, 16]
+    return generic::Memcmp<uint8x16_t>::block(p1, p2);
   if (count < 32) // [17, 31]
-    return generic::Memcmp<16>::tail(p1, p2, count);
-  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
-    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
+    return generic::Memcmp<uint8x16_t>::tail(p1, p2, count);
+  if (generic::Bcmp<uint8x16_t>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<uint8x16_t>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return generic::Memcmp<32>::tail(p1, p2, count);
+    return generic::Memcmp<uint8x16x2_t>::tail(p1, p2, count);
   // [64, 127]
-  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1 + 32, p2 + 32,
+                                                    count - 32);
 }

 LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2,
                                                    size_t count) {
   if (count == 0)
     return MemcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Memcmp<1>::block(p1, p2);
+    return generic::Memcmp<uint8_t>::block(p1, p2);
   if (count == 2)
-    return generic::Memcmp<2>::block(p1, p2);
+    return generic::Memcmp<uint16_t>::block(p1, p2);
   if (count == 3)
-    return generic::Memcmp<3>::block(p1, p2);
+    return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
   if (count <= 8)
-    return generic::Memcmp<4>::head_tail(p1, p2, count);
+    return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
   if (count <= 16)
-    return generic::Memcmp<8>::head_tail(p1, p2, count);
+    return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
   if constexpr (aarch64::kNeon)
     return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
   else
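The recurring change in this file: generic::Memcmp is now parameterized by the load type (uint8_t through uint64_t, NEON's uint8x16_t and uint8x16x2_t) instead of a block size in bytes, so each specialization can pick operations natural to that type. Below is a standalone, simplified sketch of the call-site contract (not the real op_generic.h code; std::memcmp stands in for the type-specific compare):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Standalone sketch of a comparator parameterized by the load type T.
template <typename T> struct Memcmp {
  static constexpr size_t SIZE = sizeof(T);

  // Compare one SIZE-byte block at p1/p2. The real code resolves this with
  // a type-specific branchless or vector compare, not a libc call.
  static int block(const unsigned char *p1, const unsigned char *p2) {
    T a, b;
    std::memcpy(&a, p1, SIZE);
    std::memcpy(&b, p2, SIZE);
    return std::memcmp(&a, &b, SIZE); // placeholder for the real compare
  }
  // Compare the last SIZE bytes of a count-byte region (SIZE <= count).
  static int tail(const unsigned char *p1, const unsigned char *p2,
                  size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }
  // Compare first and last SIZE bytes; they overlap when count < 2 * SIZE.
  static int head_tail(const unsigned char *p1, const unsigned char *p2,
                       size_t count) {
    if (int value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }
  // Compare SIZE-byte blocks in a loop (count > SIZE), then the tail.
  static int loop_and_tail(const unsigned char *p1, const unsigned char *p2,
                           size_t count) {
    size_t offset = 0;
    do {
      if (int value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};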

libc/src/string/memory_utils/bcmp_implementations.h

Lines changed: 59 additions & 69 deletions

@@ -15,28 +15,25 @@
 #include "src/string/memory_utils/op_aarch64.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/op_x86.h"

 #include <stddef.h> // size_t

 namespace __llvm_libc {

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
-  LIBC_LOOP_NOUNROLL
-  for (; offset < count; ++offset)
-    if (p1[offset] != p2[offset])
-      return BcmpReturnType::NONZERO();
-  return BcmpReturnType::ZERO();
+inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
+  return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint64_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -55,16 +52,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint32_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -80,89 +77,82 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }

 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count < 256)
-    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = generic::Bcmp<64>::block(p1, p2))
-    return value;
-  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+  return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
        // defined(LIBC_TARGET_ARCH_IS_AARCH64)

 #if defined(LIBC_TARGET_ARCH_IS_X86)
+#if defined(__SSE4_1__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count < 256)
-    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
-    return value;
-  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __SSE4_1__

+#if defined(__AVX__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __AVX__

+#if defined(__AVX512BW__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
   if (count <= 128)
-    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __AVX512BW__

 [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
                                                             size_t count) {
   if (count == 0)
     return BcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   if (count == 2)
-    return generic::Bcmp<2>::block(p1, p2);
-  if (count <= 4)
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
-  if (count <= 8)
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
+  if (count == 3)
+    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+  if (count == 4)
+    return generic::Bcmp<uint32_t>::block(p1, p2);
+  if (count == 5)
+    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
+  if (count == 6)
+    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
+  if (count == 7)
+    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
+  if (count == 8)
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   if (count <= 16)
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
-  if constexpr (x86::kAvx512BW)
-    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-  else if constexpr (x86::kAvx2)
-    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
-  else if constexpr (x86::kSse2)
-    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
-  else
-    return inline_bcmp_generic_gt16(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+#if defined(__AVX512BW__)
+  return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+#elif defined(__AVX__)
+  return inline_bcmp_x86_avx_gt16(p1, p2, count);
+#elif defined(__SSE4_1__)
+  return inline_bcmp_x86_sse41_gt16(p1, p2, count);
+#else
+  return inline_bcmp_generic_gt16(p1, p2, count);
+#endif
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86)

@@ -178,27 +168,27 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   case 0:
     return BcmpReturnType::ZERO();
   case 1:
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   case 2:
-    return generic::Bcmp<2>::block(p1, p2);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
   case 3:
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::head_tail(p1, p2, count);
   case 4:
-    return generic::Bcmp<4>::block(p1, p2);
+    return generic::Bcmp<uint32_t>::block(p1, p2);
   case 5:
   case 6:
   case 7:
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint32_t>::head_tail(p1, p2, count);
   case 8:
-    return generic::Bcmp<8>::block(p1, p2);
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   case 9:
   case 10:
   case 11:
   case 12:
   case 13:
   case 14:
   case 15:
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
   }
 }

@@ -225,7 +215,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
 #elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
   return inline_bcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count);
 #endif
 }
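The new count == 3 through count == 8 ladder replaces overlapping head_tail loads with exact decompositions into power-of-two loads. As an illustration (a sketch, not the library's code), generic::BcmpSequence<uint32_t, uint16_t, uint8_t> for count == 7 boils down to:

#include <cstdint>
#include <cstring>

// Sketch only: three non-overlapping loads whose XORs are OR-ed together,
// so the result is zero iff all 7 bytes compare equal -- no loop, no
// per-byte branch. Endianness does not matter for equality testing.
static inline uint32_t bcmp7(const unsigned char *p1,
                             const unsigned char *p2) {
  uint32_t a4, b4;
  uint16_t a2, b2;
  std::memcpy(&a4, p1, 4);     // bytes [0, 4)
  std::memcpy(&b4, p2, 4);
  std::memcpy(&a2, p1 + 4, 2); // bytes [4, 6)
  std::memcpy(&b2, p2 + 4, 2);
  const uint8_t a1 = p1[6], b1 = p2[6]; // byte 6
  return (a4 ^ b4) | uint32_t(a2 ^ b2) | uint32_t(a1 ^ b1);
}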
