Skip to content

Commit 0284148

Browse files
committed
[libc] Switch to new implementation of mem* functions
The new framework makes it explicit which processor feature is being used and allows for easier per-platform customization:
- ARM CPUs now use trivial implementations to reduce code size.
- Memcmp, Bcmp and Memmove have been optimized for x86.
- Bcmp has been optimized for aarch64.

This is a reland of https://reviews.llvm.org/D135134 (b3f1d58).

Differential Revision: https://reviews.llvm.org/D136595
1 parent 791fe26 commit 0284148

File tree

13 files changed

+523
-323
lines changed

13 files changed

+523
-323
lines changed

libc/src/stdio/printf_core/string_writer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
3333
len = available_capacity;
3434

3535
if (len > 0) {
36-
inline_memset(cur_buffer, new_char, len);
36+
inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
3737
cur_buffer += len;
3838
available_capacity -= len;
3939
}

libc/src/string/bcmp.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ namespace __llvm_libc {
1414

1515
LLVM_LIBC_FUNCTION(int, bcmp,
1616
(const void *lhs, const void *rhs, size_t count)) {
17-
return inline_bcmp(static_cast<const char *>(lhs),
18-
static_cast<const char *>(rhs), count);
17+
return inline_bcmp(lhs, rhs, count);
1918
}
2019

2120
} // namespace __llvm_libc

libc/src/string/memcmp.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@ namespace __llvm_libc {
1515

1616
LLVM_LIBC_FUNCTION(int, memcmp,
1717
(const void *lhs, const void *rhs, size_t count)) {
18-
return inline_memcmp(static_cast<const char *>(lhs),
19-
static_cast<const char *>(rhs), count);
18+
return inline_memcmp(lhs, rhs, count);
2019
}
2120

2221
} // namespace __llvm_libc

libc/src/string/memcpy.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@ namespace __llvm_libc {
1515
LLVM_LIBC_FUNCTION(void *, memcpy,
1616
(void *__restrict dst, const void *__restrict src,
1717
size_t size)) {
18-
inline_memcpy(reinterpret_cast<char *>(dst),
19-
reinterpret_cast<const char *>(src), size);
18+
inline_memcpy(dst, src, size);
2019
return dst;
2120
}
2221

libc/src/string/memmove.cpp

Lines changed: 86 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,42 +9,110 @@
99
#include "src/string/memmove.h"
1010

1111
#include "src/__support/common.h"
12-
#include "src/__support/integer_operations.h"
13-
#include "src/string/memory_utils/elements.h"
12+
#include "src/string/memory_utils/op_aarch64.h"
13+
#include "src/string/memory_utils/op_builtin.h"
14+
#include "src/string/memory_utils/op_generic.h"
15+
#include "src/string/memory_utils/op_x86.h"
1416
#include <stddef.h> // size_t, ptrdiff_t
1517

18+
#include <stdio.h>
19+
1620
namespace __llvm_libc {
1721

18-
static inline void inline_memmove(char *dst, const char *src, size_t count) {
19-
using namespace __llvm_libc::scalar;
22+
[[maybe_unused]] static inline void
23+
inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
24+
if ((count == 0) || (dst == src))
25+
return;
26+
if (dst < src) {
27+
#pragma nounroll
28+
for (size_t offset = 0; offset < count; ++offset)
29+
builtin::Memcpy<1>::block(dst + offset, src + offset);
30+
} else {
31+
#pragma nounroll
32+
for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
33+
builtin::Memcpy<1>::block(dst + offset, src + offset);
34+
}
35+
}
36+
37+
template <size_t MaxSize>
38+
[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
39+
size_t count) {
2040
if (count == 0)
2141
return;
2242
if (count == 1)
23-
return move<_1>(dst, src);
43+
return generic::Memmove<1, MaxSize>::block(dst, src);
2444
if (count <= 4)
25-
return move<HeadTail<_2>>(dst, src, count);
45+
return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
2646
if (count <= 8)
27-
return move<HeadTail<_4>>(dst, src, count);
47+
return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
2848
if (count <= 16)
29-
return move<HeadTail<_8>>(dst, src, count);
49+
return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
3050
if (count <= 32)
31-
return move<HeadTail<_16>>(dst, src, count);
51+
return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
3252
if (count <= 64)
33-
return move<HeadTail<_32>>(dst, src, count);
53+
return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
3454
if (count <= 128)
35-
return move<HeadTail<_64>>(dst, src, count);
55+
return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
56+
if (dst < src) {
57+
generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
58+
count);
59+
return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
60+
count);
61+
} else {
62+
generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
63+
count);
64+
return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
65+
count);
66+
}
67+
}
3668

37-
using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
38-
if (dst < src)
39-
return move<AlignedMoveLoop>(dst, src, count);
40-
else if (dst > src)
41-
return move_backward<AlignedMoveLoop>(dst, src, count);
69+
// Per-architecture memmove entry point used by the public `memmove` below.
//
// x86 and aarch64 share a single size ladder built on `generic::Memmove`,
// parameterised by `kMaxSize`, which is derived from the compile-time CPU
// feature flags (`x86::kAvx512F`/`kAvx`/`kSse2`, `aarch64::kNeon`). ARM uses
// the byte-wise `inline_memmove_embedded_tiny`. Any other target is rejected
// at compile time.
//
// NOTE(review): the x86/aarch64 ladder repeats the body of
// `inline_memmove_generic<kMaxSize>` defined earlier in this file; it looks
// like it could simply delegate to it -- confirm why the call was left
// disabled before merging the two.
static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
#if defined(LLVM_LIBC_ARCH_X86)
  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
                                     : x86::kAvx   ? 32
                                     : x86::kSse2  ? 16
                                                   : 8;
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
#endif
  using Move1 = generic::Memmove<1, kMaxSize>;
  using Move2 = generic::Memmove<2, kMaxSize>;
  using Move4 = generic::Memmove<4, kMaxSize>;
  using Move8 = generic::Memmove<8, kMaxSize>;
  using Move16 = generic::Memmove<16, kMaxSize>;
  using Move32 = generic::Memmove<32, kMaxSize>;
  using Move64 = generic::Memmove<64, kMaxSize>;

  // Up to 128 bytes: a single block, or a head/tail pair of blocks.
  if (count == 0)
    return;
  if (count == 1)
    return Move1::block(dst, src);
  if (count <= 4)
    return Move2::head_tail(dst, src, count);
  if (count <= 8)
    return Move4::head_tail(dst, src, count);
  if (count <= 16)
    return Move8::head_tail(dst, src, count);
  if (count <= 32)
    return Move16::head_tail(dst, src, count);
  if (count <= 64)
    return Move32::head_tail(dst, src, count);
  if (count <= 128)
    return Move64::head_tail(dst, src, count);

  // Above 128 bytes: align on the source with a 32-byte op, then run the
  // 64-byte loop in the direction chosen by the pointer order.
  if (dst < src) {
    Move32::align_forward<Arg::Src>(dst, src, count);
    return Move64::loop_and_tail_forward(dst, src, count);
  }
  Move32::align_backward<Arg::Src>(dst, src, count);
  return Move64::loop_and_tail_backward(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
  return inline_memmove_embedded_tiny(dst, src, count);
#else
#error "Unsupported platform"
#endif
}
43111

44112
LLVM_LIBC_FUNCTION(void *, memmove,
45113
(void *dst, const void *src, size_t count)) {
46-
inline_memmove(reinterpret_cast<char *>(dst),
47-
reinterpret_cast<const char *>(src), count);
114+
inline_memmove(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src),
115+
count);
48116
return dst;
49117
}
50118

libc/src/string/memory_utils/bcmp_implementations.h

Lines changed: 148 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,49 +11,169 @@
1111

1212
#include "src/__support/architectures.h"
1313
#include "src/__support/common.h"
14-
#include "src/string/memory_utils/elements.h"
14+
#include "src/string/memory_utils/op_aarch64.h"
15+
#include "src/string/memory_utils/op_builtin.h"
16+
#include "src/string/memory_utils/op_generic.h"
17+
#include "src/string/memory_utils/op_x86.h"
1518

1619
#include <stddef.h> // size_t
1720

1821
namespace __llvm_libc {
1922

20-
// Fixed-size difference between 'lhs' and 'rhs'.
21-
template <typename Element> bool differs(const char *lhs, const char *rhs) {
22-
return !Element::equals(lhs, rhs);
23+
// Minimal bcmp: compares the two buffers one `generic::Bcmp<1>` block at a
// time and returns the first non-zero block result. The loop is kept rolled
// (`#pragma nounroll`). Returns `BcmpReturnType::ZERO()` when every block
// compares as zero, which includes `count == 0`.
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
#pragma nounroll
  for (size_t index = 0; index < count; ++index) {
    auto difference = generic::Bcmp<1>::block(p1 + index, p2 + index);
    if (difference)
      return difference;
  }
  return BcmpReturnType::ZERO();
}
24-
// Runtime-size difference between 'lhs' and 'rhs'.
25-
template <typename Element>
26-
bool differs(const char *lhs, const char *rhs, size_t size) {
27-
return !Element::equals(lhs, rhs, size);
31+
32+
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
33+
// Feature-agnostic bcmp tail; `inline_bcmp_x86` falls back to it once counts
// of 16 or fewer have been handled and no SSE2/AVX2/AVX512BW flag is set.
//
// Below 256 bytes it runs a 16-byte loop. From 256 bytes up it checks the
// first 64 bytes, calls `align_to_next_boundary<64, Arg::P1>` (presumably
// advancing the pointers so `p1` is 64-byte aligned -- confirm in the utils
// header), then runs the 64-byte loop.
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
  using Small = generic::Bcmp<16>;
  using Large = generic::Bcmp<64>;
  if (count >= 256) {
    if (auto value = Large::block(p1, p2))
      return value;
    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
    return Large::loop_and_tail(p1, p2, count);
  }
  return Small::loop_and_tail(p1, p2, count);
}
42+
#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
2943

30-
static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
3144
#if defined(LLVM_LIBC_ARCH_X86)
32-
using namespace ::__llvm_libc::x86;
33-
#elif defined(LLVM_LIBC_ARCH_AARCH64)
34-
using namespace ::__llvm_libc::aarch64;
35-
#else
36-
using namespace ::__llvm_libc::scalar;
37-
#endif
45+
// SSE2 bcmp tail; `inline_bcmp_x86` selects it when `x86::kSse2` is the best
// available flag, after counts of 16 or fewer have been handled.
//
//   count <= 32 : head/tail pair of 16-byte compares
//   count < 256 : 16-byte loop
//   otherwise   : compare the first 16 bytes, call
//                 `align_to_next_boundary<16, Arg::P1>`, then 64-byte loop
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
  using Cmp16 = x86::sse2::Bcmp<16>;
  using Cmp64 = x86::sse2::Bcmp<64>;
  if (count <= 32)
    return Cmp16::head_tail(p1, p2, count);
  if (count < 256)
    return Cmp16::loop_and_tail(p1, p2, count);
  if (auto value = Cmp16::block(p1, p2))
    return value;
  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
  return Cmp64::loop_and_tail(p1, p2, count);
}
56+
57+
// AVX2 bcmp tail; `inline_bcmp_x86` selects it when `x86::kAvx2` is set and
// AVX512BW is not, after counts of 16 or fewer have been handled.
//
//   count <= 32  : SSE2 16-byte head/tail
//   count <= 64  : AVX2 32-byte head/tail
//   count <= 128 : AVX2 64-byte head/tail
//   otherwise    : 64-byte loop; for counts of 256 and up (hinted unlikely)
//                  the first 64 bytes are compared and
//                  `align_to_next_boundary<64, Arg::P1>` is called first
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
  using Sse16 = x86::sse2::Bcmp<16>;
  using Avx32 = x86::avx2::Bcmp<32>;
  using Avx64 = x86::avx2::Bcmp<64>;
  if (count <= 32)
    return Sse16::head_tail(p1, p2, count);
  if (count <= 64)
    return Avx32::head_tail(p1, p2, count);
  if (count <= 128)
    return Avx64::head_tail(p1, p2, count);
  if (unlikely(count >= 256)) {
    if (auto value = Avx64::block(p1, p2))
      return value;
    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
  }
  return Avx64::loop_and_tail(p1, p2, count);
}
72+
73+
// AVX512BW bcmp tail; `inline_bcmp_x86` selects it when `x86::kAvx512BW` is
// set, after counts of 16 or fewer have been handled.
//
//   count <= 32  : SSE2 16-byte head/tail
//   count <= 64  : AVX2 32-byte head/tail
//   count <= 128 : AVX512BW 64-byte head/tail
//   otherwise    : 64-byte loop; for counts of 256 and up (hinted unlikely)
//                  the first 64 bytes are compared and
//                  `align_to_next_boundary<64, Arg::P1>` is called first
[[maybe_unused]] static inline BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
  using Sse16 = x86::sse2::Bcmp<16>;
  using Avx32 = x86::avx2::Bcmp<32>;
  using Wide64 = x86::avx512bw::Bcmp<64>;
  if (count <= 32)
    return Sse16::head_tail(p1, p2, count);
  if (count <= 64)
    return Avx32::head_tail(p1, p2, count);
  if (count <= 128)
    return Wide64::head_tail(p1, p2, count);
  if (unlikely(count >= 256)) {
    if (auto value = Wide64::block(p1, p2))
      return value;
    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
  }
  return Wide64::loop_and_tail(p1, p2, count);
}
88+
89+
[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
90+
size_t count) {
3891
if (count == 0)
39-
return 0;
92+
return BcmpReturnType::ZERO();
4093
if (count == 1)
41-
return differs<_1>(lhs, rhs);
94+
return generic::Bcmp<1>::block(p1, p2);
4295
if (count == 2)
43-
return differs<_2>(lhs, rhs);
44-
if (count == 3)
45-
return differs<_3>(lhs, rhs);
96+
return generic::Bcmp<2>::block(p1, p2);
97+
if (count <= 4)
98+
return generic::Bcmp<2>::head_tail(p1, p2, count);
4699
if (count <= 8)
47-
return differs<HeadTail<_4>>(lhs, rhs, count);
100+
return generic::Bcmp<4>::head_tail(p1, p2, count);
48101
if (count <= 16)
49-
return differs<HeadTail<_8>>(lhs, rhs, count);
50-
if (count <= 32)
51-
return differs<HeadTail<_16>>(lhs, rhs, count);
102+
return generic::Bcmp<8>::head_tail(p1, p2, count);
103+
if constexpr (x86::kAvx512BW)
104+
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
105+
else if constexpr (x86::kAvx2)
106+
return inline_bcmp_x86_avx2_gt16(p1, p2, count);
107+
else if constexpr (x86::kSse2)
108+
return inline_bcmp_x86_sse2_gt16(p1, p2, count);
109+
else
110+
return inline_bcmp_generic_gt16(p1, p2, count);
111+
}
112+
#endif // defined(LLVM_LIBC_ARCH_X86)
113+
114+
#if defined(LLVM_LIBC_ARCH_AARCH64)
115+
[[maybe_unused]] static inline BcmpReturnType
116+
inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
117+
if (likely(count <= 32)) {
118+
if (unlikely(count >= 16)) {
119+
return generic::Bcmp<16>::head_tail(p1, p2, count);
120+
}
121+
switch (count) {
122+
case 0:
123+
return BcmpReturnType::ZERO();
124+
case 1:
125+
return generic::Bcmp<1>::block(p1, p2);
126+
case 2:
127+
return generic::Bcmp<2>::block(p1, p2);
128+
case 3:
129+
return generic::Bcmp<2>::head_tail(p1, p2, count);
130+
case 4:
131+
return generic::Bcmp<4>::block(p1, p2);
132+
case 5:
133+
case 6:
134+
case 7:
135+
return generic::Bcmp<4>::head_tail(p1, p2, count);
136+
case 8:
137+
return generic::Bcmp<8>::block(p1, p2);
138+
case 9:
139+
case 10:
140+
case 11:
141+
case 12:
142+
case 13:
143+
case 14:
144+
case 15:
145+
return generic::Bcmp<8>::head_tail(p1, p2, count);
146+
}
147+
}
148+
52149
if (count <= 64)
53-
return differs<HeadTail<_32>>(lhs, rhs, count);
54-
if (count <= 128)
55-
return differs<HeadTail<_64>>(lhs, rhs, count);
56-
return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
150+
return generic::Bcmp<32>::head_tail(p1, p2, count);
151+
152+
// Aligned loop if > 256, otherwise normal loop
153+
if (count > 256) {
154+
if (auto value = generic::Bcmp<32>::block(p1, p2))
155+
return value;
156+
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
157+
}
158+
return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
159+
}
160+
#endif // defined(LLVM_LIBC_ARCH_AARCH64)
161+
162+
static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
163+
#if defined(LLVM_LIBC_ARCH_X86)
164+
return inline_bcmp_x86(p1, p2, count);
165+
#elif defined(LLVM_LIBC_ARCH_AARCH64)
166+
return inline_bcmp_aarch64(p1, p2, count);
167+
#elif defined(LLVM_LIBC_ARCH_ARM)
168+
return inline_bcmp_embedded_tiny(p1, p2, count);
169+
#else
170+
#error "Unsupported platform"
171+
#endif
172+
}
173+
174+
static inline int inline_bcmp(const void *p1, const void *p2, size_t count) {
175+
return static_cast<int>(inline_bcmp(reinterpret_cast<CPtr>(p1),
176+
reinterpret_cast<CPtr>(p2), count));
57177
}
58178

59179
} // namespace __llvm_libc

libc/src/string/memory_utils/bzero_implementations.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,14 @@
1515

1616
namespace __llvm_libc {
1717

18-
inline static void inline_bzero(char *dst, size_t count) {
18+
inline static void inline_bzero(Ptr dst, size_t count) {
1919
inline_memset(dst, 0, count);
2020
}
2121

22+
inline static void inline_bzero(void *dst, size_t count) {
23+
inline_bzero(reinterpret_cast<Ptr>(dst), count);
24+
}
25+
2226
} // namespace __llvm_libc
2327

2428
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H

0 commit comments

Comments
 (0)