Skip to content

Commit b3f1d58

Browse files
committed
[libc] New version of the mem* framework
This version is more composable and also simpler at the expense of being more explicit and more verbose. It also provides minimal implementations for ARM platforms. Codegen can be checked here https://godbolt.org/z/chf1Y6eGM Differential Revision: https://reviews.llvm.org/D135134
1 parent 06da9b9 commit b3f1d58

26 files changed

+1740
-1895
lines changed

libc/src/stdio/printf_core/string_writer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
3333
len = available_capacity;
3434

3535
if (len > 0) {
36-
inline_memset(cur_buffer, new_char, len);
36+
inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
3737
cur_buffer += len;
3838
available_capacity -= len;
3939
}

libc/src/string/bcmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ namespace __llvm_libc {
1414

1515
LLVM_LIBC_FUNCTION(int, bcmp,
1616
(const void *lhs, const void *rhs, size_t count)) {
17-
return inline_bcmp(static_cast<const char *>(lhs),
18-
static_cast<const char *>(rhs), count);
17+
return static_cast<int>(inline_bcmp(static_cast<const char *>(lhs),
18+
static_cast<const char *>(rhs), count));
1919
}
2020

2121
} // namespace __llvm_libc

libc/src/string/memcmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ namespace __llvm_libc {
1515

1616
LLVM_LIBC_FUNCTION(int, memcmp,
1717
(const void *lhs, const void *rhs, size_t count)) {
18-
return inline_memcmp(static_cast<const char *>(lhs),
19-
static_cast<const char *>(rhs), count);
18+
return static_cast<int>(inline_memcmp(static_cast<const char *>(lhs),
19+
static_cast<const char *>(rhs), count));
2020
}
2121

2222
} // namespace __llvm_libc

libc/src/string/memmove.cpp

Lines changed: 84 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,104 @@
99
#include "src/string/memmove.h"
1010

1111
#include "src/__support/common.h"
12-
#include "src/__support/integer_operations.h"
13-
#include "src/string/memory_utils/elements.h"
12+
#include "src/string/memory_utils/op_aarch64.h"
13+
#include "src/string/memory_utils/op_builtin.h"
14+
#include "src/string/memory_utils/op_generic.h"
15+
#include "src/string/memory_utils/op_x86.h"
1416
#include <stddef.h> // size_t, ptrdiff_t
1517

18+
#include <stdio.h>
19+
1620
namespace __llvm_libc {
1721

18-
static inline void inline_memmove(char *dst, const char *src, size_t count) {
19-
using namespace __llvm_libc::scalar;
22+
[[maybe_unused]] static inline void
23+
inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
24+
if ((count == 0) || (dst == src))
25+
return;
26+
if (dst < src) {
27+
#pragma nounroll
28+
for (size_t offset = 0; offset < count; ++offset)
29+
builtin::Memcpy<1>::block(dst + offset, src + offset);
30+
} else {
31+
#pragma nounroll
32+
for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
33+
builtin::Memcpy<1>::block(dst + offset, src + offset);
34+
}
35+
}
36+
37+
// Size-dispatching memmove: fixed-size strategies cover 0..128 bytes with no
// loop; larger moves align on the source then run a 64-byte loop in the
// overlap-safe direction. MaxSize is the widest register/vector size to use.
template <size_t MaxSize>
[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
                                                           size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return generic::Memmove<1, MaxSize>::block(dst, src);
  // head_tail writes the first and last K bytes, which together cover the
  // whole buffer whenever count <= 2 * K (overlapping writes are harmless).
  if (count <= 4)
    return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
  if (count <= 8)
    return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
  if (count <= 16)
    return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
  if (count <= 32)
    return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
  if (count <= 64)
    return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
  if (count <= 128)
    return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
  // count > 128: loop forward when dst < src, backward otherwise, so bytes
  // are never clobbered before being read.
  if (dst < src) {
    generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
                                                                    count);
    return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
                                                                count);
  } else {
    generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
                                                                     count);
    return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
                                                                 count);
  }
}
3668

37-
using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
38-
if (dst < src)
39-
return move<AlignedMoveLoop>(dst, src, count);
40-
else if (dst > src)
41-
return move_backward<AlignedMoveLoop>(dst, src, count);
69+
// Platform entry point for the inlined memmove implementation.
// On x86/aarch64 it selects the widest available register size and delegates
// to the generic size-dispatching implementation; on ARM embedded targets it
// falls back to the tiny byte-loop version.
static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
#if defined(LLVM_LIBC_ARCH_X86)
  // Widest vector width known at compile time (bytes).
  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
                                     : x86::kAvx   ? 32
                                     : x86::kSse2  ? 16
                                                   : 8;
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
#endif
  // Delegate to the shared dispatcher instead of duplicating its body here
  // (the previous version inlined a verbatim copy of inline_memmove_generic
  // below a commented-out call to it — leftover debug scaffolding).
  return inline_memmove_generic<kMaxSize>(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
  return inline_memmove_embedded_tiny(dst, src, count);
#else
#error "Unsupported platform"
#endif
}
43111

44112
LLVM_LIBC_FUNCTION(void *, memmove,

libc/src/string/memory_utils/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22
add_header_library(
33
memory_utils
44
HDRS
5-
utils.h
6-
elements.h
75
bcmp_implementations.h
86
bzero_implementations.h
97
memcmp_implementations.h
108
memcpy_implementations.h
119
memset_implementations.h
10+
op_aarch64.h
11+
op_higher_order.h
12+
op_builtin.h
13+
op_generic.h
14+
op_x86.h
15+
utils.h
1216
DEPS
1317
libc.src.__support.CPP.bit
1418
)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# The mem* framework
2+
3+
The framework handles the following mem* functions:
4+
- `memcpy`
5+
- `memmove`
6+
- `memset`
7+
- `bzero`
8+
- `bcmp`
9+
- `memcmp`
10+
11+
## Building blocks
12+
13+
These functions can be built out of a set of lower-level operations:
14+
- **`block`** : operates on a block of `SIZE` bytes.
15+
- **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., the half-open range `[dst + count - SIZE, dst + count)`)
16+
- **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
17+
- **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation.
18+
19+
As an illustration, let's take the example of a trivial `memset` implementation:
20+
21+
```C++
22+
extern "C" void memset(char* dst, int value, size_t count) {
23+
if (count == 0) return;
24+
if (count == 1) return Memset<1>::block(dst, value);
25+
if (count == 2) return Memset<2>::block(dst, value);
26+
if (count == 3) return Memset<3>::block(dst, value);
27+
if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice.
28+
if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
29+
return Memset<16>::loop_and_tail(dst, value, count);
30+
}
31+
```
32+
33+
Now let's have a look into the `Memset` structure:
34+
35+
```C++
36+
template <size_t Size>
37+
struct Memset {
38+
static constexpr size_t SIZE = Size;
39+
40+
static inline void block(Ptr dst, uint8_t value) {
41+
// Implement me
42+
}
43+
44+
static inline void tail(Ptr dst, uint8_t value, size_t count) {
45+
block(dst + count - SIZE, value);
46+
}
47+
48+
static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
49+
block(dst, value);
50+
tail(dst, value, count);
51+
}
52+
53+
static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
54+
size_t offset = 0;
55+
do {
56+
block(dst + offset, value);
57+
offset += SIZE;
58+
} while (offset < count - SIZE);
59+
tail(dst, value, count);
60+
}
61+
};
62+
```
63+
64+
As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
65+
In earlier designs we were implementing these higher order functions with templated functions but it appears that it is more readable to have the implementation explicitly stated.
66+
**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
67+
68+
## Scoped specializations
69+
70+
We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation.
71+
72+
In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop.
73+
```C++
74+
extern "C" void memset(char* dst, int value, size_t count) {
75+
if (count == 0) return;
76+
if (count == 1) return generic::Memset<1>::block(dst, value);
77+
if (count == 2) return generic::Memset<2>::block(dst, value);
78+
if (count == 3) return generic::Memset<3>::block(dst, value);
79+
if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
80+
if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
81+
return x86::Memset<16>::loop_and_tail(dst, value, count);
82+
}
83+
```
84+
85+
### The `builtin` scope
86+
87+
Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset))
88+
89+
### The `generic` scope
90+
91+
In this scope we define pure C++ implementations using native integral types and clang vector extensions.
92+
93+
### The arch specific scopes
94+
95+
Then come the implementations that use specific architecture or microarchitecture features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
96+
97+
The purpose here is to rely on builtins as much as possible and to fall back to `asm volatile` only as a last resort.

0 commit comments

Comments
 (0)