Skip to content

[libc] Add support for string/memory_utils functions for AArch64 without HW FP/SIMD #137592

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libc/src/__support/FPUtil/FEnvImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include "src/__support/macros/properties/architectures.h"
#include "src/errno/libc_errno.h"

#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
#if defined(__APPLE__)
#include "aarch64/fenv_darwin_impl.h"
#else
Expand Down
2 changes: 1 addition & 1 deletion libc/src/__support/FPUtil/nearest_integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#if (defined(LIBC_TARGET_ARCH_IS_X86_64) && defined(LIBC_TARGET_CPU_HAS_SSE4_2))
#include "x86_64/nearest_integer.h"
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#elif (defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP))
#include "aarch64/nearest_integer.h"
#elif defined(LIBC_TARGET_ARCH_IS_GPU)

Expand Down
2 changes: 1 addition & 1 deletion libc/src/__support/FPUtil/sqrt.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ template <> LIBC_INLINE double sqrt<double>(double x) {
// Use inline assembly when __builtin_elementwise_sqrt is not available.
#if defined(LIBC_TARGET_CPU_HAS_SSE2)
#include "x86_64/sqrt.h"
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
#include "aarch64/sqrt.h"
#elif defined(LIBC_TARGET_ARCH_IS_ARM)
#include "arm/sqrt.h"
Expand Down
50 changes: 47 additions & 3 deletions libc/src/string/memory_utils/aarch64/inline_bcmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,43 @@

namespace LIBC_NAMESPACE_DECL {

[[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_aarch64(CPtr p1,
CPtr p2,
size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aarch64_no_fp(CPtr p1, CPtr p2, size_t count) {
if (LIBC_LIKELY(count < 16)) {
switch (count) {
case 0:
return BcmpReturnType::zero();
case 1:
return generic::Bcmp<uint8_t>::block(p1, p2);
case 2:
return generic::Bcmp<uint16_t>::block(p1, p2);
case 3:
return generic::Bcmp<uint16_t>::head_tail(p1, p2, count);
case 4:
return generic::Bcmp<uint32_t>::block(p1, p2);
case 5:
case 6:
case 7:
return generic::Bcmp<uint32_t>::head_tail(p1, p2, count);
case 8:
return generic::Bcmp<uint64_t>::block(p1, p2);
case 9:
case 10:
case 11:
case 12:
case 13:
case 14:
case 15:
return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
}
}

return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
}

#ifdef __ARM_NEON
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aarch64_with_fp(CPtr p1, CPtr p2, size_t count) {
if (LIBC_LIKELY(count <= 32)) {
if (LIBC_UNLIKELY(count >= 16)) {
return aarch64::Bcmp<16>::head_tail(p1, p2, count);
Expand Down Expand Up @@ -65,6 +99,16 @@ namespace LIBC_NAMESPACE_DECL {
}
return aarch64::Bcmp<32>::loop_and_tail(p1, p2, count);
}
#endif

[[gnu::flatten]] LIBC_INLINE BcmpReturnType
inline_bcmp_aarch64_dispatch(CPtr p1, CPtr p2, size_t count) {
#if defined(__ARM_NEON)
return inline_bcmp_aarch64_with_fp(p1, p2, count);
#else
return inline_bcmp_aarch64_no_fp(p1, p2, count);
#endif
}

} // namespace LIBC_NAMESPACE_DECL

Expand Down
65 changes: 39 additions & 26 deletions libc/src/string/memory_utils/aarch64/inline_memcmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,40 @@
namespace LIBC_NAMESPACE_DECL {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<uint8x16_t>::loop_and_tail(p1, p2, count);
inline_memcmp_aarch64_no_fp(CPtr p1, CPtr p2, size_t count) {
if (count == 0)
return MemcmpReturnType::zero();
if (count == 1)
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
if (count <= 16)
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);

return generic::Memcmp<uint64_t>::loop_and_tail_align_above(384, p1, p2,
count);
}

#if defined(__ARM_NEON)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
inline_memcmp_aarch64_with_fp(CPtr p1, CPtr p2, size_t count) {
if (count == 0)
return MemcmpReturnType::zero();
if (count == 1)
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
if (count <= 16)
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);

if (LIBC_UNLIKELY(count >= 128)) { // [128, ∞]
if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
return value;
Expand All @@ -46,25 +69,15 @@ inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
return generic::Memcmp<uint8x16_t>::loop_and_tail(p1 + 32, p2 + 32,
count - 32);
}
#endif

LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2,
size_t count) {
if (count == 0)
return MemcmpReturnType::zero();
if (count == 1)
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
if (count <= 16)
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
if constexpr (aarch64::kNeon)
return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
[[gnu::flatten]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aarch64_dispatch(CPtr p1, CPtr p2, size_t count) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, the _dispatch functions should only contain the dispatch logic, each implementation should be able to evolve on their own.

#if defined(__ARM_NEON)
return inline_memcmp_aarch64_with_fp(p1, p2, count);
#else
return inline_memcmp_aarch64_no_fp(p1, p2, count);
#endif
}
} // namespace LIBC_NAMESPACE_DECL

Expand Down
4 changes: 1 addition & 3 deletions libc/src/string/memory_utils/aarch64/inline_memmove.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
#ifndef LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMMOVE_H
#define LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMMOVE_H

#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/string/memory_utils/op_aarch64.h" // aarch64::kNeon
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/utils.h"
Expand All @@ -19,7 +18,6 @@
namespace LIBC_NAMESPACE_DECL {

LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
static_assert(aarch64::kNeon, "aarch64 supports vector types");
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
Expand Down
60 changes: 51 additions & 9 deletions libc/src/string/memory_utils/aarch64/inline_memset.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@

namespace LIBC_NAMESPACE_DECL {

using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;

[[maybe_unused]] LIBC_INLINE static void
inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
static_assert(aarch64::kNeon, "aarch64 supports vector types");
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
inline_memset_aarch64_no_fp(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count <= 3) {
Expand All @@ -46,15 +46,57 @@ inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
generic::Memset<uint256_t>::tail(dst, value, count);
return;
}

generic::Memset<uint128_t>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<uint512_t>::loop_and_tail(dst, value, count);
}

#if defined(__ARM_NEON)
[[maybe_unused]] LIBC_INLINE static void
inline_memset_aarch64_with_fp(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count <= 3) {
generic::Memset<uint8_t>::block(dst, value);
if (count > 1)
generic::Memset<uint16_t>::tail(dst, value, count);
return;
}
if (count <= 8)
return generic::Memset<uint32_t>::head_tail(dst, value, count);
if (count <= 16)
return generic::Memset<uint64_t>::head_tail(dst, value, count);
if (count <= 32)
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= (32 + 64)) {
generic::Memset<uint256_t>::block(dst, value);
if (count <= 64)
return generic::Memset<uint256_t>::tail(dst, value, count);
generic::Memset<uint256_t>::block(dst + 32, value);
generic::Memset<uint256_t>::tail(dst, value, count);
return;
}

if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
generic::Memset<uint512_t>::block(dst, 0);
align_to_next_boundary<64>(dst, count);
return aarch64::neon::BzeroCacheLine::loop_and_tail(dst, 0, count);
} else {
generic::Memset<uint128_t>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<uint512_t>::loop_and_tail(dst, value, count);
}

generic::Memset<uint128_t>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<uint512_t>::loop_and_tail(dst, value, count);
}
#endif

[[gnu::flatten]] [[maybe_unused]] LIBC_INLINE static void
inline_memset_aarch64_dispatch(Ptr dst, uint8_t value, size_t count) {
#if defined(__ARM_NEON)
return inline_memset_aarch64_with_fp(dst, value, count);
#else
return inline_memset_aarch64_no_fp(dst, value, count);
#endif
}

} // namespace LIBC_NAMESPACE_DECL
Expand Down
2 changes: 1 addition & 1 deletion libc/src/string/memory_utils/inline_bcmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#define LIBC_SRC_STRING_MEMORY_UTILS_BCMP inline_bcmp_x86
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_bcmp.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_BCMP inline_bcmp_aarch64
#define LIBC_SRC_STRING_MEMORY_UTILS_BCMP inline_bcmp_aarch64_dispatch
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
#include "src/string/memory_utils/riscv/inline_bcmp.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_BCMP inline_bcmp_riscv
Expand Down
2 changes: 1 addition & 1 deletion libc/src/string/memory_utils/inline_memcmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP inline_memcmp_x86
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_memcmp.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP inline_memcmp_aarch64
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP inline_memcmp_aarch64_dispatch
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
#include "src/string/memory_utils/riscv/inline_memcmp.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP inline_memcmp_riscv
Expand Down
2 changes: 1 addition & 1 deletion libc/src/string/memory_utils/inline_memset.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_x86
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_memset.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_aarch64
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_aarch64_dispatch
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
#include "src/string/memory_utils/riscv/inline_memset.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_riscv
Expand Down
8 changes: 7 additions & 1 deletion libc/src/string/memory_utils/op_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace aarch64 {
Expand Down Expand Up @@ -176,6 +175,8 @@ template <size_t Size> struct Bcmp {
} // namespace aarch64
} // namespace LIBC_NAMESPACE_DECL

#endif //__ARM_NEON

namespace LIBC_NAMESPACE_DECL {
namespace generic {

Expand Down Expand Up @@ -225,6 +226,8 @@ LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
return MemcmpReturnType::zero();
}

#if defined(__ARM_NEON)

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint8x16_t
template <> struct is_vector<uint8x16_t> : cpp::true_type {};
Expand Down Expand Up @@ -269,6 +272,9 @@ LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
}
return MemcmpReturnType::zero();
}

#endif // __ARM_NEON

} // namespace generic
} // namespace LIBC_NAMESPACE_DECL

Expand Down
Loading