Skip to content

Commit 3153aa4

Browse files
authored
[libc] Adding a version of memset with software prefetching (#70857)
Software prefetching helps recover performance when hardware prefetching is disabled. Defining the compile-time option 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING' enables the software-prefetching memset path added by this patch.
1 parent f7bbb58 commit 3153aa4

File tree

7 files changed

+79
-21
lines changed

7 files changed

+79
-21
lines changed

libc/config/config.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
2222
"value": false,
2323
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
24+
},
25+
"LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
26+
"value": false,
27+
"doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
2428
}
2529
}
2630
}

libc/src/string/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
33
if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
44
list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
55
endif()
6+
if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
7+
list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
8+
endif()
69
if(string_config_options)
710
list(PREPEND string_config_options "COMPILE_OPTIONS")
811
endif()
@@ -656,6 +659,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
656659
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
657660
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
658661
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
662+
add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
659663
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
660664
add_memset(memset)
661665
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})

libc/src/string/memory_utils/op_generic.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,15 +154,19 @@ template <typename T> struct Memset {
154154
tail(dst, value, count);
155155
}
156156

157-
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
157+
LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
158+
size_t count, size_t offset) {
158159
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
159-
size_t offset = 0;
160160
do {
161161
block(dst + offset, value);
162162
offset += SIZE;
163163
} while (offset < count - SIZE);
164164
tail(dst, value, count);
165165
}
166+
167+
LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
168+
return loop_and_tail_offset(dst, value, count, 0);
169+
}
166170
};
167171

168172
template <typename T, typename... TS> struct MemsetSequence {

libc/src/string/memory_utils/utils.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,14 @@ template <size_t SIZE> struct AlignHelper {
374374
uintptr_t offset_;
375375
};
376376

377+
LIBC_INLINE void prefetch_for_write(CPtr dst) {
378+
__builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
379+
}
380+
381+
LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
382+
__builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
383+
}
384+
377385
} // namespace LIBC_NAMESPACE
378386

379387
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H

libc/src/string/memory_utils/x86_64/inline_memcpy.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
4747

4848
} // namespace x86
4949

50-
// TODO: Move to a shared header when appropriate.
51-
[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
52-
__builtin_prefetch(addr, 0, 3);
53-
}
54-
5550
[[maybe_unused]] LIBC_INLINE void
5651
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
5752
size_t count) {

libc/src/string/memory_utils/x86_64/inline_memset.h

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,27 +16,67 @@
1616
#include <stddef.h> // size_t
1717

1818
namespace LIBC_NAMESPACE {
19+
namespace x86 {
20+
// Size of one cache line for software prefetching
21+
LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
22+
LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
23+
LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
24+
25+
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
26+
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
27+
28+
} // namespace x86
1929

20-
[[maybe_unused]] LIBC_INLINE static void
21-
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
2230
#if defined(__AVX512F__)
23-
using uint128_t = generic_v128;
24-
using uint256_t = generic_v256;
25-
using uint512_t = generic_v512;
31+
using uint128_t = generic_v128;
32+
using uint256_t = generic_v256;
33+
using uint512_t = generic_v512;
2634
#elif defined(__AVX__)
27-
using uint128_t = generic_v128;
28-
using uint256_t = generic_v256;
29-
using uint512_t = cpp::array<generic_v256, 2>;
35+
using uint128_t = generic_v128;
36+
using uint256_t = generic_v256;
37+
using uint512_t = cpp::array<generic_v256, 2>;
3038
#elif defined(__SSE2__)
31-
using uint128_t = generic_v128;
32-
using uint256_t = cpp::array<generic_v128, 2>;
33-
using uint512_t = cpp::array<generic_v128, 4>;
39+
using uint128_t = generic_v128;
40+
using uint256_t = cpp::array<generic_v128, 2>;
41+
using uint512_t = cpp::array<generic_v128, 4>;
3442
#else
35-
using uint128_t = cpp::array<uint64_t, 2>;
36-
using uint256_t = cpp::array<uint64_t, 4>;
37-
using uint512_t = cpp::array<uint64_t, 8>;
43+
using uint128_t = cpp::array<uint64_t, 2>;
44+
using uint256_t = cpp::array<uint64_t, 4>;
45+
using uint512_t = cpp::array<uint64_t, 8>;
3846
#endif
3947

48+
[[maybe_unused]] LIBC_INLINE static void
49+
inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
50+
constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
51+
constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
52+
constexpr size_t SIZE = sizeof(uint256_t);
53+
// Prefetch one cache line
54+
prefetch_for_write(dst + x86::kOneCachelineSize);
55+
if (count <= 128)
56+
return generic::Memset<uint512_t>::head_tail(dst, value, count);
57+
// Prefetch the second cache line
58+
prefetch_for_write(dst + x86::kTwoCachelinesSize);
59+
// Aligned loop
60+
generic::Memset<uint256_t>::block(dst, value);
61+
align_to_next_boundary<32>(dst, count);
62+
if (count <= 192) {
63+
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
64+
} else {
65+
generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
66+
size_t offset = 96;
67+
while (offset + PREFETCH_DEGREE + SIZE <= count) {
68+
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
69+
prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
70+
x86::kOneCachelineSize);
71+
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
72+
generic::Memset<uint256_t>::block(dst + offset, value);
73+
}
74+
generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
75+
}
76+
}
77+
78+
[[maybe_unused]] LIBC_INLINE static void
79+
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
4080
if (count == 0)
4181
return;
4282
if (count == 1)
@@ -53,6 +93,8 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
5393
return generic::Memset<uint128_t>::head_tail(dst, value, count);
5494
if (count <= 64)
5595
return generic::Memset<uint256_t>::head_tail(dst, value, count);
96+
if constexpr (x86::kUseSoftwarePrefetchingMemset)
97+
return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
5698
if (count <= 128)
5799
return generic::Memset<uint512_t>::head_tail(dst, value, count);
58100
// Aligned loop

utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ PRINTF_COPTS = [
3232
MEMORY_COPTS = [
3333
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
3434
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
35+
# "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
3536
]
3637

3738
# A flag to pick which `mpfr` to use for math tests.

0 commit comments

Comments
 (0)