
Commit 75c3ff8

[compiler-rt][AArch64] Provide basic implementations of SME memcpy/memmove in case of strictly aligned memory access (llvm#138250)
The existing implementations, written in assembly, use unaligned accesses for performance. They are not compatible with strictly aligned configurations, i.e. builds with `-mno-unaligned-access`: if the functions are used in such a configuration, an exception is raised on the unaligned memory accesses. This patch reintroduces vanilla (plain C) implementations of these functions for use in strictly aligned configurations. The code is largely based on llvm#77496.
1 parent 3108cbd commit 75c3ff8
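
For context on why dedicated routines exist at all: code compiled as streaming or streaming-compatible cannot, in general, call the regular libc memory functions, so the compiler emits calls to the `__arm_sc_*` support routines that compiler-rt provides. Below is a minimal caller sketch, assuming a recent Clang targeting AArch64 with SME attribute support; the `copy_block` wrapper is hypothetical and only illustrates how such a call site looks.

#include <stddef.h>

// Declaration matching the routine this commit provides a C fallback for.
void *__arm_sc_memcpy(void *__restrict dest, const void *__restrict src,
                      size_t n) __arm_streaming_compatible;

// Hypothetical streaming-compatible helper: inside such code the ordinary
// libc memcpy may not be safe to call, so copies go through the
// streaming-compatible support routine instead.
void copy_block(void *dst, const void *src, size_t n)
    __arm_streaming_compatible {
  __arm_sc_memcpy(dst, src, n);
}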

File tree

7 files changed: +395 -141 lines changed

compiler-rt/cmake/builtin-config-ix.cmake

Lines changed: 18 additions & 0 deletions
@@ -50,6 +50,24 @@ void foo(void) __arm_streaming_compatible {
 }
 ")
 
+builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_UNALIGNED
+"
+void foo() {
+#ifndef __ARM_FEATURE_UNALIGNED
+#error \"Unaligned accesses unsupported\"
+#endif
+}
+")
+
+builtin_check_c_compiler_source(COMPILER_RT_HAS_ARM_FP
+"
+void foo() {
+#ifndef __ARM_FP
+#error \"No floating-point support\"
+#endif
+}
+")
+
 check_include_files("sys/auxv.h" COMPILER_RT_HAS_AUXV)
 
 if(ANDROID)
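
These probes key off standard ACLE feature-test macros. As a stand-alone illustration (not part of the commit; the messages are purely illustrative), the same checks can be observed directly in C by compiling with and without `-mno-unaligned-access`:

#include <stdio.h>

int main(void) {
#ifdef __ARM_FEATURE_UNALIGNED
  puts("unaligned accesses permitted: assembly-optimized SME routines are usable");
#else
  puts("strict alignment (e.g. -mno-unaligned-access): fall back to C memcpy/memmove");
#endif
#ifdef __ARM_FP
  puts("hardware floating point available: optimized memset/memchr are usable");
#else
  puts("no hardware floating point: fall back to C memset/memchr");
#endif
  return 0;
}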

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 10 additions & 2 deletions
@@ -600,9 +600,17 @@ if (COMPILER_RT_HAS_AARCH64_SME)
   set_source_files_properties(aarch64/arm_apple_sme_abi.s PROPERTIES COMPILE_FLAGS -march=armv8a+sme)
   message(STATUS "AArch64 Apple SME ABI routines enabled")
 elseif (NOT COMPILER_RT_DISABLE_AARCH64_FMV AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND COMPILER_RT_AARCH64_FMV_USES_GLOBAL_CONSTRUCTOR)
-  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-libc-mem-routines.S aarch64/sme-abi-assert.c aarch64/sme-libc-routines.c)
+  if(COMPILER_RT_HAS_ARM_UNALIGNED AND COMPILER_RT_HAS_ARM_FP)
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-opt-memset-memchr.S aarch64/sme-libc-opt-memcpy-memmove.S)
+  elseif(COMPILER_RT_HAS_ARM_UNALIGNED)
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-opt-memcpy-memmove.S)
+    message(WARNING "AArch64 SME ABI assembly-optimized memset/memchr disabled: target does not have hardware floating-point support.")
+  else()
+    list(APPEND aarch64_SOURCES aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c)
+    message(WARNING "AArch64 SME ABI assembly-optimized routines disabled: target does not support unaligned accesses.")
+  endif()
   message(STATUS "AArch64 SME ABI routines enabled")
-  set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
+  set_source_files_properties(aarch64/sme-libc-memset-memchr.c aarch64/sme-libc-memcpy-memmove.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
 else()
   if(COMPILER_RT_DISABLE_AARCH64_FMV)
     message(WARNING "AArch64 SME ABI routines require function multiversioning support.")
compiler-rt/lib/builtins/aarch64/sme-libc-memcpy-memmove.c

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains basic implementations of Scalable Matrix Extension (SME)
+/// compatible memcpy and memmove functions to be used when their assembly-
+/// optimized counterparts can't.
+///
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = srcp[i];
+  return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  while (n > 0) {
+    --n;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+extern void *__arm_sc_memcpy(void *__restrict dest, const void *__restrict src,
+                             size_t n) __arm_streaming_compatible {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+extern void *__arm_sc_memmove(void *dest, const void *src,
+                              size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy(dest, src, n);
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
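
An aside on the memmove logic above (not part of the commit): the overlap test picks the forward copy when the destination lies below the source and the reverse copy otherwise, because a forward byte copy would clobber source bytes that have not yet been read. A small stand-alone demonstration using standard `memmove`, which follows the same direction rule:

#include <assert.h>
#include <string.h>

int main(void) {
  // dest = buf + 1 overlaps [src, src + 5): a naive forward byte copy would
  // overwrite buf[1] before reading it and smear 'a' across the buffer.
  // Copying backwards (as __arm_sc_memcpy_rev does) preserves the data.
  char buf[] = "abcdef";
  memmove(buf + 1, buf, 5);
  assert(strcmp(buf, "aabcde") == 0);
  return 0;
}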
compiler-rt/lib/builtins/aarch64/sme-libc-memset-memchr.c

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains basic implementations of Scalable Matrix Extension (SME)
+/// compatible memset and memchr functions to be used when their assembly-
+/// optimized counterparts can't.
+///
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+extern void *__arm_sc_memset(void *dest, int c,
+                             size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = c8;
+
+  return dest;
+}
+
+extern const void *__arm_sc_memchr(const void *src, int c,
+                                   size_t n) __arm_streaming_compatible {
+  const unsigned char *srcp = (const unsigned char *)src;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    if (srcp[i] == c8)
+      return &srcp[i];
+
+  return NULL;
+}

compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S renamed to compiler-rt/lib/builtins/aarch64/sme-libc-opt-memcpy-memmove.S

Lines changed: 14 additions & 115 deletions
@@ -1,8 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// Routines taken from libc/AOR_v20.02/string/aarch64
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains assembly-optimized implementations of Scalable Matrix
+/// Extension (SME) compatible memcpy and memmove functions.
+///
+/// These implementations depend on unaligned access support.
+///
+/// Routines taken from libc/AOR_v20.02/string/aarch64.
+///
+//===----------------------------------------------------------------------===//
 
 #include "../assembly.h"
 

@@ -234,116 +246,3 @@ END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
 
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 
-// This version uses FP registers. Use this only on targets with them
-#if (defined(__aarch64__) && __ARM_FP != 0) || defined(__arm64ec__)
-//
-// __arm_sc_memset
-//
-
-#define dstin    x0
-#define val      x1
-#define valw     w1
-#define count    x2
-#define dst      x3
-#define dstend2  x4
-#define zva_val  x5
-
-DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
-#ifdef __ARM_FEATURE_SVE
-  mov z0.b, valw
-#else
-  bfi valw, valw, #8, #8
-  bfi valw, valw, #16, #16
-  bfi val, val, #32, #32
-  fmov d0, val
-  fmov v0.d[1], val
-#endif
-  add dstend2, dstin, count
-
-  cmp count, 96
-  b.hi 7f // set_long
-  cmp count, 16
-  b.hs 4f // set_medium
-  mov val, v0.D[0]
-
-  /* Set 0..15 bytes. */
-  tbz count, 3, 1f
-  str val, [dstin]
-  str val, [dstend2, -8]
-  ret
-  nop
-1: tbz count, 2, 2f
-  str valw, [dstin]
-  str valw, [dstend2, -4]
-  ret
-2: cbz count, 3f
-  strb valw, [dstin]
-  tbz count, 1, 3f
-  strh valw, [dstend2, -2]
-3: ret
-
-  /* Set 17..96 bytes. */
-4: // set_medium
-  str q0, [dstin]
-  tbnz count, 6, 6f // set96
-  str q0, [dstend2, -16]
-  tbz count, 5, 5f
-  str q0, [dstin, 16]
-  str q0, [dstend2, -32]
-5: ret
-
-  .p2align 4
-  /* Set 64..96 bytes. Write 64 bytes from the start and
-     32 bytes from the end. */
-6: // set96
-  str q0, [dstin, 16]
-  stp q0, q0, [dstin, 32]
-  stp q0, q0, [dstend2, -32]
-  ret
-
-  .p2align 4
-7: // set_long
-  and valw, valw, 255
-  bic dst, dstin, 15
-  str q0, [dstin]
-  cmp count, 160
-  ccmp valw, 0, 0, hs
-  b.ne 9f // no_zva
-
-#ifndef SKIP_ZVA_CHECK
-  mrs zva_val, dczid_el0
-  and zva_val, zva_val, 31
-  cmp zva_val, 4 /* ZVA size is 64 bytes. */
-  b.ne 9f // no_zva
-#endif
-  str q0, [dst, 16]
-  stp q0, q0, [dst, 32]
-  bic dst, dst, 63
-  sub count, dstend2, dst /* Count is now 64 too large. */
-  sub count, count, 128 /* Adjust count and bias for loop. */
-
-  .p2align 4
-8: // zva_loop
-  add dst, dst, 64
-  dc zva, dst
-  subs count, count, 64
-  b.hi 8b // zva_loop
-  stp q0, q0, [dstend2, -64]
-  stp q0, q0, [dstend2, -32]
-  ret
-
-9: // no_zva
-  sub count, dstend2, dst /* Count is 16 too large. */
-  sub dst, dst, 16 /* Dst is biased by -32. */
-  sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-10: // no_zva_loop
-  stp q0, q0, [dst, 32]
-  stp q0, q0, [dst, 64]!
-  subs count, count, 64
-  b.hi 10b // no_zva_loop
-  stp q0, q0, [dstend2, -64]
-  stp q0, q0, [dstend2, -32]
-  ret
-END_COMPILERRT_FUNCTION(__arm_sc_memset)
-
-#endif // __aarch64__