
Commit 903cc71

[libc] mem* framework v3
This version is more composable and also simpler at the expense of being more explicit and more verbose.

This patch provides the rationale for the framework, the implementation, and unit tests, but the functions themselves are still using the previous version. The change in implementation will come in a follow-up patch.

Differential Revision: https://reviews.llvm.org/D136292
1 parent e25ed05 commit 903cc71

10 files changed (+1572, -3 lines)
libc/src/string/memory_utils/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -2,13 +2,17 @@
 add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
+    elements.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
   DEPS
     libc.src.__support.CPP.bit
 )
libc/src/string/memory_utils/README.md

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
# The mem* framework

The framework handles the following mem* functions:
- `memcpy`
- `memmove`
- `memset`
- `bzero`
- `bcmp`
- `memcmp`

## Building blocks

These functions can be built out of a set of lower-level operations:
- **`block`** : operates on a block of `SIZE` bytes.
- **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`)
- **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
- **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation.

As an illustration, let's take the example of a trivial `memset` implementation:

```C++
extern "C" void memset(char* dst, int value, size_t count) {
  if (count == 0) return;
  if (count == 1) return Memset<1>::block(dst, value);
  if (count == 2) return Memset<2>::block(dst, value);
  if (count == 3) return Memset<3>::block(dst, value);
  if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
  if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
  return Memset<16>::loop_and_tail(dst, value, count);
}
```

Now let's have a look at the `Memset` structure:

```C++
template <size_t Size>
struct Memset {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t value) {
    // Implement me
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }
};
```

As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other; only `block` really needs to be implemented.
In earlier designs we implemented these higher-order functions with templated functions, but it turned out to be more readable to state the implementations explicitly.
**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.

## Scoped specializations

We can have several specializations of the `Memset` structure. Depending on the target requirements, we can use one or several scopes for the same implementation.

In the following example we use the `generic` implementation for the small sizes but the `x86` implementation for the loop.
```C++
extern "C" void memset(char* dst, int value, size_t count) {
  if (count == 0) return;
  if (count == 1) return generic::Memset<1>::block(dst, value);
  if (count == 2) return generic::Memset<2>::block(dst, value);
  if (count == 3) return generic::Memset<3>::block(dst, value);
  if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
  if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
  return x86::Memset<16>::loop_and_tail(dst, value, count);
}
```

### The `builtin` scope

Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)).

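As a rough sketch of the shape of this scope (an illustration only, not the exact contents of `op_builtin.h`; `Ptr` is assumed to be the framework's mutable byte-pointer alias), a `block` simply forwards to the builtin with a compile-time size:

```C++
namespace builtin {
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t value) {
    // The size is a compile-time constant, so the compiler is free to pick
    // the best instruction sequence for exactly SIZE bytes.
    __builtin_memset_inline(dst, value, SIZE);
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }
};
} // namespace builtin
```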
### The `generic` scope

In this scope we define pure C++ implementations using native integral types and Clang vector extensions.

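For illustration only (not the actual `op_generic.h` code), an 8-byte `block` built on a native integral type could look like the following; the real code also covers wider sizes through Clang vector extensions:

```C++
namespace generic {
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t value) {
    static_assert(Size == sizeof(uint64_t), "this sketch only covers 8 bytes");
    // Broadcast the byte into a native 64-bit integer, then store it with a
    // single (possibly unaligned) 8-byte write.
    const uint64_t splat = 0x0101010101010101ULL * value;
    __builtin_memcpy(dst, &splat, sizeof(splat));
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }
};
} // namespace generic
```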
### The arch-specific scopes

Then come implementations that use architecture- or microarchitecture-specific features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).

The purpose here is to rely on builtins as much as possible and to fall back to `asm volatile` as a last resort.
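As an illustration of the `asm volatile` fallback (a sketch only, not the actual `op_x86.h` code), an x86 copy loop can be written directly on top of `rep;movsb`:

```C++
namespace x86 {
// Copies `count` bytes from `src` to `dst` with the string-move instruction.
// The "+D", "+S" and "+c" constraints pin dst, src and count to the rdi, rsi
// and rcx registers that `rep movsb` implicitly reads and updates.
static inline void copy_repmovsb(char *dst, const char *src, size_t count) {
  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
} // namespace x86
```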
libc/src/string/memory_utils/op_aarch64.h

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

template <size_t Size> struct BzeroCacheLine {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t) {
    static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    // Unaligned store, we can't use 'dc zva' here.
    static constexpr size_t kMaxSize = kNeon ? 16 : 8;
    generic::Memset<Size, kMaxSize>::tail(dst, value, count);
  }
};

inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4] is zero.
  // BS, bits [3:0] is log2 of the block count in words.
  // So the next line checks whether the instruction is permitted and block
  // count is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

  static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size <= 8) {
      return generic::Bcmp<Size>::head_tail(p1, p2, count);
    } else if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1, "a loop of size 1 does not need tail");
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
