|
18 | 18 |
|
19 | 19 | namespace LIBC_NAMESPACE {
|
20 | 20 |
|
21 |
| -LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) { |
| 21 | +LIBC_INLINE bool inline_memmove_small_size_x86(Ptr dst, CPtr src, |
| 22 | + size_t count) { |
22 | 23 | #if defined(__AVX512F__)
|
| 24 | + constexpr size_t vector_size = 64; |
23 | 25 | using uint128_t = generic_v128;
|
24 | 26 | using uint256_t = generic_v256;
|
25 | 27 | using uint512_t = generic_v512;
|
26 | 28 | #elif defined(__AVX__)
|
| 29 | + constexpr size_t vector_size = 32; |
27 | 30 | using uint128_t = generic_v128;
|
28 | 31 | using uint256_t = generic_v256;
|
29 | 32 | using uint512_t = cpp::array<generic_v256, 2>;
|
30 | 33 | #elif defined(__SSE2__)
|
| 34 | + constexpr size_t vector_size = 16; |
31 | 35 | using uint128_t = generic_v128;
|
32 | 36 | using uint256_t = cpp::array<generic_v128, 2>;
|
33 | 37 | using uint512_t = cpp::array<generic_v128, 4>;
|
34 | 38 | #else
|
| 39 | + constexpr size_t vector_size = 8; |
35 | 40 | using uint128_t = cpp::array<uint64_t, 2>;
|
36 | 41 | using uint256_t = cpp::array<uint64_t, 4>;
|
37 | 42 | using uint512_t = cpp::array<uint64_t, 8>;
|
38 | 43 | #endif
|
| 44 | + (void)vector_size; |
39 | 45 | if (count == 0)
|
40 |
| - return; |
41 |
| - if (count == 1) |
42 |
| - return generic::Memmove<uint8_t>::block(dst, src); |
43 |
| - if (count <= 4) |
44 |
| - return generic::Memmove<uint16_t>::head_tail(dst, src, count); |
45 |
| - if (count <= 8) |
46 |
| - return generic::Memmove<uint32_t>::head_tail(dst, src, count); |
47 |
| - if (count <= 16) |
48 |
| - return generic::Memmove<uint64_t>::head_tail(dst, src, count); |
49 |
| - if (count <= 32) |
50 |
| - return generic::Memmove<uint128_t>::head_tail(dst, src, count); |
51 |
| - if (count <= 64) |
52 |
| - return generic::Memmove<uint256_t>::head_tail(dst, src, count); |
53 |
| - if (count <= 128) |
54 |
| - return generic::Memmove<uint512_t>::head_tail(dst, src, count); |
| 46 | + return true; |
| 47 | + if (count == 1) { |
| 48 | + generic::Memmove<uint8_t>::block(dst, src); |
| 49 | + return true; |
| 50 | + } |
| 51 | + if (count == 2) { |
| 52 | + generic::Memmove<uint16_t>::block(dst, src); |
| 53 | + return true; |
| 54 | + } |
| 55 | + if (count == 3) { |
| 56 | + generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src); |
| 57 | + return true; |
| 58 | + } |
| 59 | + if (count == 4) { |
| 60 | + generic::Memmove<uint32_t>::block(dst, src); |
| 61 | + return true; |
| 62 | + } |
| 63 | + if (count < 8) { |
| 64 | + generic::Memmove<uint32_t>::head_tail(dst, src, count); |
| 65 | + return true; |
| 66 | + } |
| 67 | + // If count is equal to a power of 2, we can handle it as head-tail |
| 68 | + // of both smaller size and larger size (head-tail are either |
| 69 | + // non-overlapping for smaller size, or completely collapsed |
| 70 | + // for larger size). It seems to be more profitable to do the copy |
| 71 | + // with the larger size, if it's natively supported (e.g. doing |
| 72 | + // 2 collapsed 32-byte moves for count=32 if AVX2 is supported). |
| 73 | + // But it's not profitable to use larger size if it's not natively |
| 74 | + // supported: we will both use more instructions and handle fewer |
| 75 | + // sizes in earlier branches. |
| 76 | + if (vector_size >= 16 ? count < 16 : count <= 16) { |
| 77 | + generic::Memmove<uint64_t>::head_tail(dst, src, count); |
| 78 | + return true; |
| 79 | + } |
| 80 | + if (vector_size >= 32 ? count < 32 : count <= 32) { |
| 81 | + generic::Memmove<uint128_t>::head_tail(dst, src, count); |
| 82 | + return true; |
| 83 | + } |
| 84 | + if (vector_size >= 64 ? count < 64 : count <= 64) { |
| 85 | + generic::Memmove<uint256_t>::head_tail(dst, src, count); |
| 86 | + return true; |
| 87 | + } |
| 88 | + if (count <= 128) { |
| 89 | + generic::Memmove<uint512_t>::head_tail(dst, src, count); |
| 90 | + return true; |
| 91 | + } |
| 92 | + return false; |
| 93 | +} |
| 94 | + |
| 95 | +LIBC_INLINE void inline_memmove_follow_up_x86(Ptr dst, CPtr src, size_t count) { |
| 96 | +#if defined(__AVX512F__) |
| 97 | + using uint256_t = generic_v256; |
| 98 | + using uint512_t = generic_v512; |
| 99 | +#elif defined(__AVX__) |
| 100 | + using uint256_t = generic_v256; |
| 101 | + using uint512_t = cpp::array<generic_v256, 2>; |
| 102 | +#elif defined(__SSE2__) |
| 103 | + using uint256_t = cpp::array<generic_v128, 2>; |
| 104 | + using uint512_t = cpp::array<generic_v128, 4>; |
| 105 | +#else |
| 106 | + using uint256_t = cpp::array<uint64_t, 4>; |
| 107 | + using uint512_t = cpp::array<uint64_t, 8>; |
| 108 | +#endif |
55 | 109 | if (dst < src) {
|
56 | 110 | generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
|
57 | 111 | return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
|
|
0 commit comments