Skip to content

[libc][x86] Use prefetch for write for memcpy #90450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions libc/src/string/memory_utils/x86_64/inline_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}

[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
CPtr __restrict src,
size_t distance) {
prefetch_to_local_cache(src + distance);
prefetch_for_write(dst + distance);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there be an option to turn that off?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The prefetching behavior is controlled by LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING; I don't want to split this further into separate "prefetch for write" and "prefetch for read" options, since we already have a lot of combinations to test.

}

[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + K_ONE_CACHELINE);
inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + K_TWO_CACHELINES);
inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
// Aligning 'dst' on a 32B boundary.
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
Expand All @@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
if (count < 352) {
// Two cache lines at a time.
while (offset + K_TWO_CACHELINES + 32 <= count) {
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
offset += K_TWO_CACHELINES;
}
} else {
// Three cache lines at a time.
while (offset + K_THREE_CACHELINES + 32 <= count) {
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
// It is likely that this copy will be turned into a 'rep;movsb' on
// non-AVX machines.
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
Expand All @@ -114,11 +121,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + K_ONE_CACHELINE);
inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + K_TWO_CACHELINES);
prefetch_to_local_cache(src + K_THREE_CACHELINES);
inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
// Aligning 'dst' on a 32B boundary.
Expand All @@ -133,9 +140,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
// - count >= 128.
while (offset + K_THREE_CACHELINES + 64 <= count) {
// Three cache lines at a time.
prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
offset += K_THREE_CACHELINES;
}
Expand Down