
Commit 67fe3bd

[libc][mem*] Introduce Sized/Backends for new mem framework
This patch is a subpart of D125768, split out to make the review easier.

The `SizedOp` struct represents operations to be performed on a certain number of bytes. It is responsible for breaking them down into platform types and forwarding them to the `Backend`.

The `Backend` struct is a lower-level abstraction that works only on types (`uint8_t`, `__m128i`, ...). It is similar to instruction selection.

Differential Revision: https://reviews.llvm.org/D126768
1 parent fc655a9 commit 67fe3bd
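To make the division of labor concrete, here is a minimal, self-contained sketch of the idea, not the SizedOp implementation introduced elsewhere in this commit: ToyScalarBackend and ToySizedOp are illustrative names, and the real framework additionally threads Temporality and Aligned parameters through every load and store and supports SIMD types. The sketch only shows how a fixed-size operation can be decomposed into the widest chunks a backend offers.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Stand-in backend: picks the widest scalar type that fits in Size bytes and
// provides load/store for it (the real backends add Temporality/Aligned
// parameters and SIMD types).
struct ToyScalarBackend {
  template <size_t Size>
  using getNextType = std::conditional_t<
      (Size >= 8), uint64_t,
      std::conditional_t<(Size >= 4), uint32_t,
                         std::conditional_t<(Size >= 2), uint16_t, uint8_t>>>;

  template <typename T> static T load(const char *src) {
    T value;
    std::memcpy(&value, src, sizeof(T)); // byte-wise load, alignment-agnostic
    return value;
  }
  template <typename T> static void store(char *dst, T value) {
    std::memcpy(dst, &value, sizeof(T));
  }
};

// Toy SizedOp: copy exactly Size bytes by peeling off the widest chunk the
// backend offers, then recursing on the remainder.
template <typename Backend, size_t Size> struct ToySizedOp {
  static void copy(char *dst, const char *src) {
    using T = typename Backend::template getNextType<Size>;
    Backend::template store<T>(dst, Backend::template load<T>(src));
    constexpr size_t kRemaining = Size - sizeof(T);
    if constexpr (kRemaining > 0)
      ToySizedOp<Backend, kRemaining>::copy(dst + sizeof(T), src + sizeof(T));
  }
};

// Usage: a 7-byte copy becomes one uint32_t, one uint16_t and one uint8_t.
// char dst[7];
// ToySizedOp<ToyScalarBackend, 7>::copy(dst, "abcdefg");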

7 files changed: 831 additions, 0 deletions
src/string/memory_utils/backend_aarch64.h (new file): 71 additions, 0 deletions
@@ -0,0 +1,71 @@
//===-- Elementary operations for aarch64 ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H

#if defined(LLVM_LIBC_ARCH_AARCH64)
#include "src/string/memory_utils/backend_scalar.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

namespace __llvm_libc {

struct Aarch64Backend : public Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src) {
    return Scalar64BitBackend::template load<T, TS, AS>(src);
  }
};

// Implementation of the SizedOp abstraction for the set operation.
struct Zva64 {
  static constexpr size_t SIZE = 64;

  template <typename DstAddrT>
  static inline void set(DstAddrT dst, ubyte value) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }
};

inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4] is zero.
  // BS, bits [3:0] is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and block size
  // is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

namespace aarch64 {
using _1 = SizedOp<Aarch64Backend, 1>;
using _2 = SizedOp<Aarch64Backend, 2>;
using _3 = SizedOp<Aarch64Backend, 3>;
using _4 = SizedOp<Aarch64Backend, 4>;
using _8 = SizedOp<Aarch64Backend, 8>;
using _16 = SizedOp<Aarch64Backend, 16>;
using _32 = SizedOp<Aarch64Backend, 32>;
using _64 = SizedOp<Aarch64Backend, 64>;
using _128 = SizedOp<Aarch64Backend, 128>;
} // namespace aarch64

} // namespace __llvm_libc

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_AARCH64_H
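hasZva() probes dczid_el0 once to confirm that DC ZVA is permitted and that its block size is 64 bytes, and Zva64::set zeroes one whole 64-byte block per instruction. Below is a hedged sketch of a hypothetical caller, not part of this patch: it assumes SizedOp exposes a set(dst, value) matching Zva64::set, that ubyte is the framework's byte type, and that dst is a raw pointer, 64-byte aligned, with size a multiple of 64.

// Hypothetical helper, for illustration only: zero `size` bytes at `dst`,
// where dst is 64-byte aligned and size is a multiple of 64.
static inline void zero_aligned_blocks(char *dst, size_t size) {
  if (hasZva()) {
    // Fast path: one 'dc zva' zeroes a full 64-byte block.
    for (size_t offset = 0; offset < size; offset += Zva64::SIZE)
      Zva64::set(dst + offset, ubyte{0});
  } else {
    // Fallback: regular stores through the generic 64-byte SizedOp.
    for (size_t offset = 0; offset < size; offset += 64)
      aarch64::_64::set(dst + offset, ubyte{0});
  }
}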
src/string/memory_utils/backend_scalar.h (new file): 104 additions, 0 deletions
@@ -0,0 +1,104 @@
//===-- Elementary operations for native scalar types ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H

#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
#include "src/__support/endian.h"

namespace __llvm_libc {

struct Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  template <typename T>
  static constexpr bool IsScalarType =
      cpp::IsSameV<T, uint8_t> || cpp::IsSameV<T, uint16_t> ||
      cpp::IsSameV<T, uint32_t> || cpp::IsSameV<T, uint64_t>;

  template <typename T, Temporality TS, Aligned AS>
  static inline T load(const T *src) {
    static_assert(IsScalarType<T>);
    static_assert(TS == Temporality::TEMPORAL,
                  "Scalar load does not support non-temporal access");
    return *src;
  }

  template <typename T, Temporality TS, Aligned AS>
  static inline void store(T *dst, T value) {
    static_assert(IsScalarType<T>);
    static_assert(TS == Temporality::TEMPORAL,
                  "Scalar store does not support non-temporal access");
    *dst = value;
  }

  template <typename T> static inline T splat(ubyte value) {
    static_assert(IsScalarType<T>);
    return (T(~0ULL) / T(0xFF)) * T(value);
  }

  template <typename T> static inline uint64_t notEquals(T v1, T v2) {
    static_assert(IsScalarType<T>);
    return v1 ^ v2;
  }

  template <typename T> static inline int32_t threeWayCmp(T v1, T v2) {
    DeferredStaticAssert("not implemented");
  }

  // Returns the type to use to consume Size bytes.
  template <size_t Size>
  using getNextType = cpp::ConditionalType<
      Size >= 8, uint64_t,
      cpp::ConditionalType<Size >= 4, uint32_t,
                           cpp::ConditionalType<Size >= 2, uint16_t, uint8_t>>>;
};

template <>
int32_t inline Scalar64BitBackend::threeWayCmp<uint8_t>(uint8_t a, uint8_t b) {
  const int16_t la = Endian::to_big_endian(a);
  const int16_t lb = Endian::to_big_endian(b);
  return la - lb;
}
template <>
int32_t inline Scalar64BitBackend::threeWayCmp<uint16_t>(uint16_t a,
                                                         uint16_t b) {
  const int32_t la = Endian::to_big_endian(a);
  const int32_t lb = Endian::to_big_endian(b);
  return la - lb;
}
template <>
int32_t inline Scalar64BitBackend::threeWayCmp<uint32_t>(uint32_t a,
                                                         uint32_t b) {
  const uint32_t la = Endian::to_big_endian(a);
  const uint32_t lb = Endian::to_big_endian(b);
  return la > lb ? 1 : la < lb ? -1 : 0;
}
template <>
int32_t inline Scalar64BitBackend::threeWayCmp<uint64_t>(uint64_t a,
                                                         uint64_t b) {
  const uint64_t la = Endian::to_big_endian(a);
  const uint64_t lb = Endian::to_big_endian(b);
  return la > lb ? 1 : la < lb ? -1 : 0;
}

namespace scalar {
using _1 = SizedOp<Scalar64BitBackend, 1>;
using _2 = SizedOp<Scalar64BitBackend, 2>;
using _3 = SizedOp<Scalar64BitBackend, 3>;
using _4 = SizedOp<Scalar64BitBackend, 4>;
using _8 = SizedOp<Scalar64BitBackend, 8>;
using _16 = SizedOp<Scalar64BitBackend, 16>;
using _32 = SizedOp<Scalar64BitBackend, 32>;
using _64 = SizedOp<Scalar64BitBackend, 64>;
using _128 = SizedOp<Scalar64BitBackend, 128>;
} // namespace scalar

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_SCALAR_H
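Two of the scalar building blocks above lean on small arithmetic tricks: splat replicates a byte across a wider integer because T(~0ULL) / T(0xFF) is 0x0101...01, and the threeWayCmp specializations convert both operands to big-endian so that ordinary integer comparison reproduces memcmp's byte-wise order. A standalone illustration follows, not part of the patch; it uses the GCC/Clang __builtin_bswap16 builtin in place of Endian::to_big_endian on a little-endian host.

#include <cassert>
#include <cstdint>

// Same formula as Scalar64BitBackend::splat, specialized to uint32_t:
// 0xFFFFFFFF / 0xFF == 0x01010101, so multiplying by the byte replicates it.
static uint32_t splat32(uint8_t value) {
  return (uint32_t(~0ULL) / uint32_t(0xFF)) * uint32_t(value);
}

int main() {
  assert(splat32(0xAB) == 0xABABABABu);

  // Bytes {0x01, 0x02} vs {0x02, 0x01}: memcmp says the first is smaller.
  // Read as little-endian uint16_t they are 0x0201 and 0x0102, which would
  // compare the wrong way; byte-swapping (what to_big_endian does on a
  // little-endian host) restores memcmp order: 0x0102 < 0x0201.
  const uint16_t a = 0x0201, b = 0x0102;           // little-endian views
  const uint16_t swapped_a = __builtin_bswap16(a); // 0x0102
  const uint16_t swapped_b = __builtin_bswap16(b); // 0x0201
  assert(swapped_a < swapped_b);
  return 0;
}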
src/string/memory_utils/backend_x86.h (new file): 221 additions, 0 deletions
@@ -0,0 +1,221 @@
//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H

#if defined(LLVM_LIBC_ARCH_X86)
#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
#include "src/string/memory_utils/backend_scalar.h"

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

#if defined(__SSE2__)
#define HAS_M128 true
#else
#define HAS_M128 false
#endif

#if defined(__AVX2__)
#define HAS_M256 true
#else
#define HAS_M256 false
#endif

#if defined(__AVX512F__) and defined(__AVX512BW__)
#define HAS_M512 true
#else
#define HAS_M512 false
#endif

namespace __llvm_libc {
struct X86Backend : public Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src) {
    return Scalar64BitBackend::template load<T, TS, AS>(src);
  }

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value) {
    Scalar64BitBackend::template store<T, TS, AS>(dst, value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline uint64_t notEquals(T v1, T v2) {
    return Scalar64BitBackend::template notEquals<T>(v1, v2);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T splat(ubyte value) {
    return Scalar64BitBackend::template splat<T>(value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return Scalar64BitBackend::template threeWayCmp<T>(v1, v2);
  }

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src);

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline T splat(ubyte value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline uint64_t notEquals(T v1, T v2);

  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return char_diff(reinterpret_cast<char *>(&v1),
                     reinterpret_cast<char *>(&v2), notEquals(v1, v2));
  }

  // Returns the type to use to consume Size bytes.
  template <size_t Size>
  using getNextType = cpp::ConditionalType<
      (HAS_M512 && Size >= 64), __m512i,
      cpp::ConditionalType<
          (HAS_M256 && Size >= 32), __m256i,
          cpp::ConditionalType<(HAS_M128 && Size >= 16), __m128i,
                               Scalar64BitBackend::getNextType<Size>>>>;

private:
  static inline int32_t char_diff(const char *a, const char *b, uint64_t mask) {
    const size_t diff_index = mask == 0 ? 0 : __builtin_ctzll(mask);
    const int16_t ca = (unsigned char)a[diff_index];
    const int16_t cb = (unsigned char)b[diff_index];
    return ca - cb;
  }
};

static inline void repmovsb(void *dst, const void *src, size_t runtime_size) {
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(runtime_size)
               :
               : "memory");
}

#define SPECIALIZE_LOAD(T, OS, AS, INTRISIC)                                   \
  template <> inline T X86Backend::load<T, OS, AS>(const T *src) {             \
    return INTRISIC(const_cast<T *>(src));                                     \
  }
#define SPECIALIZE_STORE(T, OS, AS, INTRISIC)                                  \
  template <> inline void X86Backend::store<T, OS, AS>(T * dst, T value) {     \
    INTRISIC(dst, value);                                                      \
  }

#if HAS_M128
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128)
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128)
SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm_stream_load_si128)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128)
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128)
SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm_stream_si128)
// X86 non-temporal store needs aligned access
template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) {
  return _mm_set1_epi8(__builtin_bit_cast(char, value));
}
template <>
inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) {
  using T = char __attribute__((__vector_size__(16)));
  return _mm_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M128

#if HAS_M256
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256)
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256)
SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm256_stream_load_si256)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES,
                 _mm256_store_si256)
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO,
                 _mm256_storeu_si256)
SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm256_stream_si256)
// X86 non-temporal store needs aligned access
template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) {
  return _mm256_set1_epi8(__builtin_bit_cast(char, value));
}
template <>
inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) {
  using T = char __attribute__((__vector_size__(32)));
  return _mm256_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M256

#if HAS_M512
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512)
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512)
SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm512_stream_load_si512)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES,
                 _mm512_store_si512)
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO,
                 _mm512_storeu_si512)
SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm512_stream_si512)
// X86 non-temporal store needs aligned access
template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) {
  return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value)));
}
template <>
inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) {
  return _mm512_cmpneq_epi8_mask(a, b);
}
#endif // HAS_M512

namespace x86 {
using _1 = SizedOp<X86Backend, 1>;
using _2 = SizedOp<X86Backend, 2>;
using _3 = SizedOp<X86Backend, 3>;
using _4 = SizedOp<X86Backend, 4>;
using _8 = SizedOp<X86Backend, 8>;
using _16 = SizedOp<X86Backend, 16>;
using _32 = SizedOp<X86Backend, 32>;
using _64 = SizedOp<X86Backend, 64>;
using _128 = SizedOp<X86Backend, 128>;
} // namespace x86

} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
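For the SIMD types, notEquals returns a mask with one bit per differing byte (a movemask for SSE2/AVX2, a compare mask for AVX-512), and char_diff picks the first set bit with __builtin_ctzll to locate the mismatching byte and return its signed difference, matching memcmp's convention. Here is a standalone illustration of the same idea for one 16-byte block, not part of the patch; it assumes an SSE2-capable x86 host and GCC/Clang vector extensions.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <immintrin.h>

// Compare 16 bytes at `a` and `b`; negative/zero/positive like memcmp.
static int32_t three_way_cmp_16(const char *a, const char *b) {
  __m128i va, vb;
  std::memcpy(&va, a, sizeof(va)); // unaligned loads, fine for the example
  std::memcpy(&vb, b, sizeof(vb));
  using V = char __attribute__((__vector_size__(16)));
  // Bit i of `mask` is set when byte i of the two blocks differs.
  const uint32_t mask = _mm_movemask_epi8((__m128i)(V(va) != V(vb)));
  if (mask == 0)
    return 0;
  const int index = __builtin_ctz(mask); // index of the first differing byte
  return (int32_t)(unsigned char)a[index] - (int32_t)(unsigned char)b[index];
}

int main() {
  assert(three_way_cmp_16("0123456789abcdef", "0123456789abcdeg") < 0);
  assert(three_way_cmp_16("0123456789abcdef", "0123456789abcdef") == 0);
  return 0;
}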
