Skip to content

Commit b3f1d58

Browse files
committed
[libc] New version of the mem* framework
This version is more composable and also simpler at the expense of being more explicit and more verbose. It also provides minimal implementations for ARM platforms. Codegen can be checked here https://godbolt.org/z/chf1Y6eGM Differential Revision: https://reviews.llvm.org/D135134
1 parent 06da9b9 commit b3f1d58

26 files changed

+1740
-1895
lines changed

libc/src/stdio/printf_core/string_writer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
3333
len = available_capacity;
3434

3535
if (len > 0) {
36-
inline_memset(cur_buffer, new_char, len);
36+
inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
3737
cur_buffer += len;
3838
available_capacity -= len;
3939
}

libc/src/string/bcmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ namespace __llvm_libc {
1414

1515
LLVM_LIBC_FUNCTION(int, bcmp,
1616
(const void *lhs, const void *rhs, size_t count)) {
17-
return inline_bcmp(static_cast<const char *>(lhs),
18-
static_cast<const char *>(rhs), count);
17+
return static_cast<int>(inline_bcmp(static_cast<const char *>(lhs),
18+
static_cast<const char *>(rhs), count));
1919
}
2020

2121
} // namespace __llvm_libc

libc/src/string/memcmp.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ namespace __llvm_libc {
1515

1616
LLVM_LIBC_FUNCTION(int, memcmp,
1717
(const void *lhs, const void *rhs, size_t count)) {
18-
return inline_memcmp(static_cast<const char *>(lhs),
19-
static_cast<const char *>(rhs), count);
18+
return static_cast<int>(inline_memcmp(static_cast<const char *>(lhs),
19+
static_cast<const char *>(rhs), count));
2020
}
2121

2222
} // namespace __llvm_libc

libc/src/string/memmove.cpp

Lines changed: 84 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,104 @@
99
#include "src/string/memmove.h"
1010

1111
#include "src/__support/common.h"
12-
#include "src/__support/integer_operations.h"
13-
#include "src/string/memory_utils/elements.h"
12+
#include "src/string/memory_utils/op_aarch64.h"
13+
#include "src/string/memory_utils/op_builtin.h"
14+
#include "src/string/memory_utils/op_generic.h"
15+
#include "src/string/memory_utils/op_x86.h"
1416
#include <stddef.h> // size_t, ptrdiff_t
1517

18+
#include <stdio.h>
19+
1620
namespace __llvm_libc {
1721

18-
static inline void inline_memmove(char *dst, const char *src, size_t count) {
19-
using namespace __llvm_libc::scalar;
22+
[[maybe_unused]] static inline void
23+
inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
24+
if ((count == 0) || (dst == src))
25+
return;
26+
if (dst < src) {
27+
#pragma nounroll
28+
for (size_t offset = 0; offset < count; ++offset)
29+
builtin::Memcpy<1>::block(dst + offset, src + offset);
30+
} else {
31+
#pragma nounroll
32+
for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
33+
builtin::Memcpy<1>::block(dst + offset, src + offset);
34+
}
35+
}
36+
37+
// Size-dispatching memmove: fixed-size strategies cover 0..128 bytes with no
// loop; larger moves align on the source then run a 64-byte loop in the
// overlap-safe direction. MaxSize is the widest register/vector size to use.
template <size_t MaxSize>
[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
                                                           size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return generic::Memmove<1, MaxSize>::block(dst, src);
  // head_tail writes the first and last K bytes, which together cover the
  // whole buffer whenever count <= 2 * K (overlapping writes are harmless).
  if (count <= 4)
    return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
  if (count <= 8)
    return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
  if (count <= 16)
    return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
  if (count <= 32)
    return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
  if (count <= 64)
    return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
  if (count <= 128)
    return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
  // count > 128: loop forward when dst < src, backward otherwise, so bytes
  // are never clobbered before being read.
  if (dst < src) {
    generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
                                                                    count);
    return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
                                                                count);
  } else {
    generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
                                                                     count);
    return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
                                                                 count);
  }
}
3668

37-
using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
38-
if (dst < src)
39-
return move<AlignedMoveLoop>(dst, src, count);
40-
else if (dst > src)
41-
return move_backward<AlignedMoveLoop>(dst, src, count);
69+
// Platform entry point for the inlined memmove implementation.
// On x86/aarch64 it selects the widest available register size and delegates
// to the generic size-dispatching implementation; on ARM embedded targets it
// falls back to the tiny byte-loop version.
static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
#if defined(LLVM_LIBC_ARCH_X86)
  // Widest vector width known at compile time (bytes).
  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
                                     : x86::kAvx   ? 32
                                     : x86::kSse2  ? 16
                                                   : 8;
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
#endif
  // Delegate to the shared dispatcher instead of duplicating its body here
  // (the previous version inlined a verbatim copy of inline_memmove_generic
  // below a commented-out call to it — leftover debug scaffolding).
  return inline_memmove_generic<kMaxSize>(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
  return inline_memmove_embedded_tiny(dst, src, count);
#else
#error "Unsupported platform"
#endif
}
43111

44112
LLVM_LIBC_FUNCTION(void *, memmove,

libc/src/string/memory_utils/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,17 @@
22
add_header_library(
33
memory_utils
44
HDRS
5-
utils.h
6-
elements.h
75
bcmp_implementations.h
86
bzero_implementations.h
97
memcmp_implementations.h
108
memcpy_implementations.h
119
memset_implementations.h
10+
op_aarch64.h
11+
op_higher_order.h
12+
op_builtin.h
13+
op_generic.h
14+
op_x86.h
15+
utils.h
1216
DEPS
1317
libc.src.__support.CPP.bit
1418
)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# The mem* framework
2+
3+
The framework handles the following mem* functions:
4+
- `memcpy`
5+
- `memmove`
6+
- `memset`
7+
- `bzero`
8+
- `bcmp`
9+
- `memcmp`
10+
11+
## Building blocks
12+
13+
These functions can be built out of a set of lower-level operations:
14+
- **`block`** : operates on a block of `SIZE` bytes.
15+
- **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., the half-open range `[dst + count - SIZE, dst + count)`)
16+
- **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
17+
- **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation.
18+
19+
As an illustration, let's take the example of a trivial `memset` implementation:
20+
21+
```C++
22+
extern "C" void memset(char* dst, int value, size_t count) {
23+
if (count == 0) return;
24+
if (count == 1) return Memset<1>::block(dst, value);
25+
if (count == 2) return Memset<2>::block(dst, value);
26+
if (count == 3) return Memset<3>::block(dst, value);
27+
if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice.
28+
if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
29+
return Memset<16>::loop_and_tail(dst, value, count);
30+
}
31+
```
32+
33+
Now let's have a look into the `Memset` structure:
34+
35+
```C++
36+
template <size_t Size>
37+
struct Memset {
38+
static constexpr size_t SIZE = Size;
39+
40+
static inline void block(Ptr dst, uint8_t value) {
41+
// Implement me
42+
}
43+
44+
static inline void tail(Ptr dst, uint8_t value, size_t count) {
45+
block(dst + count - SIZE, value);
46+
}
47+
48+
static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
49+
block(dst, value);
50+
tail(dst, value, count);
51+
}
52+
53+
static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
54+
size_t offset = 0;
55+
do {
56+
block(dst + offset, value);
57+
offset += SIZE;
58+
} while (offset < count - SIZE);
59+
tail(dst, value, count);
60+
}
61+
};
62+
```
63+
64+
As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
65+
In earlier designs we were implementing these higher order functions with templated functions but it appears that it is more readable to have the implementation explicitly stated.
66+
**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
67+
68+
## Scoped specializations
69+
70+
We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation.
71+
72+
In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop.
73+
```C++
74+
extern "C" void memset(char* dst, int value, size_t count) {
75+
if (count == 0) return;
76+
if (count == 1) return generic::Memset<1>::block(dst, value);
77+
if (count == 2) return generic::Memset<2>::block(dst, value);
78+
if (count == 3) return generic::Memset<3>::block(dst, value);
79+
if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
80+
if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
81+
return x86::Memset<16>::loop_and_tail(dst, value, count);
82+
}
83+
```
84+
85+
### The `builtin` scope
86+
87+
Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset))
88+
89+
### The `generic` scope
90+
91+
In this scope we define pure C++ implementations using native integral types and clang vector extensions.
92+
93+
### The arch specific scopes
94+
95+
Then come the implementations that use specific architecture or microarchitecture features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
96+
97+
The purpose here is to rely on builtins as much as possible and to fall back to `asm volatile` only as a last resort.

0 commit comments

Comments
 (0)