@@ -15,28 +15,25 @@
 #include "src/string/memory_utils/op_aarch64.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/op_x86.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
-  LIBC_LOOP_NOUNROLL
-  for (; offset < count; ++offset)
-    if (p1[offset] != p2[offset])
-      return BcmpReturnType::NONZERO();
-  return BcmpReturnType::ZERO();
+inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
+  return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
 }
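The open-coded byte loop is gone: `offset` becomes a defaulted trailing parameter and the body forwards to generic::Bcmp<uint8_t>::loop_and_tail_offset, so common call sites shrink to (p1, p2, count). A minimal standalone sketch of the behavior the helper is expected to preserve, matching the removed loop (names here are illustrative, not the libc internals):

#include <cstddef>
#include <cstdint>

// Compare bytes in [offset, count); nonzero result means a mismatch.
static int bcmp_byte_per_byte_sketch(const uint8_t *p1, const uint8_t *p2,
                                     size_t count, size_t offset = 0) {
  for (; offset < count; ++offset) // deliberately not unrolled: small code size
    if (p1[offset] != p2[offset])
      return 1; // any nonzero value is a valid bcmp result
  return 0;
}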
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint64_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -55,16 +52,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }
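The aligned-access path keeps its structure: compare bytewise up to the first uint64_t boundary of p1, compare word-sized chunks (the elided middle of this hunk), then finish bytewise; only the argument order of the byte fallback changes. For reference, a sketch of the two alignment helpers under the assumption that they do what their names suggest (power-of-two kAlign; illustrative, not the libc definitions):

#include <cstddef>
#include <cstdint>

// Bytes from p back to the previous kAlign boundary.
template <size_t kAlign>
static size_t distance_to_align_down_sketch(const void *p) {
  static_assert((kAlign & (kAlign - 1)) == 0, "kAlign must be a power of two");
  return reinterpret_cast<uintptr_t>(p) & (kAlign - 1U);
}
// Bytes from p forward to the next kAlign boundary (0 if already aligned).
template <size_t kAlign>
static size_t distance_to_align_up_sketch(const void *p) {
  return (kAlign - distance_to_align_down_sketch<kAlign>(p)) & (kAlign - 1U);
}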
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint32_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -80,89 +77,82 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }
 
 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count < 256)
-    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = generic::Bcmp<64>::block(p1, p2))
-    return value;
-  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+  return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
        // defined(LIBC_TARGET_ARCH_IS_AARCH64)
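loop_and_tail_align_above appears to fold the removed five-line pattern into one call: below the 256-byte threshold, run a plain block loop with an overlapping tail; at or above it, compare one unaligned block, realign p1, then loop. A standalone sketch of that strategy using plain uint64_t blocks (assumes count >= 8, which the gt16 callers guarantee; names are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstring>

static int loop_and_tail_align_above_sketch(size_t threshold, const char *p1,
                                            const char *p2, size_t count) {
  constexpr size_t kBlock = sizeof(uint64_t);
  auto differs = [](const char *a, const char *b) {
    uint64_t x, y;
    std::memcpy(&x, a, sizeof(x)); // memcpy keeps unaligned loads legal
    std::memcpy(&y, b, sizeof(y));
    return x != y;
  };
  size_t offset = 0;
  if (count >= threshold) {
    if (differs(p1, p2)) // one unaligned block first, as in the removed code
      return 1;
    // Skip forward so subsequent p1 loads are kBlock-aligned.
    offset = kBlock - (reinterpret_cast<uintptr_t>(p1) % kBlock);
  }
  // Main loop over whole blocks ...
  for (; offset + kBlock <= count; offset += kBlock)
    if (differs(p1 + offset, p2 + offset))
      return 1;
  // ... then one tail block that may overlap the last loop iteration.
  return differs(p1 + count - kBlock, p2 + count - kBlock) ? 1 : 0;
}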
 
 #if defined(LIBC_TARGET_ARCH_IS_X86)
+#if defined(__SSE4_1__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count < 256)
-    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
-    return value;
-  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __SSE4_1__
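head_tail covers the 17..32-byte range with exactly two 16-byte compares: one anchored at the start, one at the end, overlapping whenever count < 32, so every byte is covered without a loop. The same trick at uint64_t width, as a standalone sketch (valid for 8 <= count <= 16; illustrative names):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Precondition: 8 <= count <= 16; the two loads overlap when count < 16.
static int head_tail_sketch(const char *p1, const char *p2, size_t count) {
  uint64_t a, b, c, d;
  std::memcpy(&a, p1, sizeof(a));             // head block
  std::memcpy(&b, p2, sizeof(b));
  std::memcpy(&c, p1 + count - 8, sizeof(c)); // tail block
  std::memcpy(&d, p2 + count - 8, sizeof(d));
  return (a != b) || (c != d);
}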
 
+#if defined(__AVX__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __AVX__
 
+#if defined(__AVX512BW__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
   if (count <= 128)
-    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __AVX512BW__
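On AVX-512BW a whole 64-byte block compare presumably reduces to a byte-wise mask compare. A hedged sketch with real intrinsics, as an assumption about what generic::Bcmp<__m512i> lowers to rather than a quote of it (requires compiling with -mavx512bw):

#include <immintrin.h>
#include <cstdint>

static uint32_t bcmp_block_avx512bw_sketch(const void *p1, const void *p2) {
  const __m512i a = _mm512_loadu_si512(p1);
  const __m512i b = _mm512_loadu_si512(p2);
  // Byte-wise "not equal" produces a 64-bit mask; any set bit means
  // at least one of the 64 bytes differs.
  const __mmask64 ne = _mm512_cmpneq_epi8_mask(a, b);
  return ne ? 1u : 0u;
}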
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
                                                             size_t count) {
   if (count == 0)
     return BcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   if (count == 2)
-    return generic::Bcmp<2>::block(p1, p2);
-  if (count <= 4)
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
-  if (count <= 8)
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
+  if (count == 3)
+    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+  if (count == 4)
+    return generic::Bcmp<uint32_t>::block(p1, p2);
+  if (count == 5)
+    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
+  if (count == 6)
+    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
+  if (count == 7)
+    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
+  if (count == 8)
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   if (count <= 16)
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
-  if constexpr (x86::kAvx512BW)
-    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-  else if constexpr (x86::kAvx2)
-    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
-  else if constexpr (x86::kSse2)
-    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
-  else
-    return inline_bcmp_generic_gt16(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+#if defined(__AVX512BW__)
+  return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+#elif defined(__AVX__)
+  return inline_bcmp_x86_avx_gt16(p1, p2, count);
+#elif defined(__SSE4_1__)
+  return inline_bcmp_x86_sse41_gt16(p1, p2, count);
+#else
+  return inline_bcmp_generic_gt16(p1, p2, count);
+#endif
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86)
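The new x86 small-size ladder replaces overlapping head_tail reads for counts of 3..7 with BcmpSequence, which presumably concatenates exact-width compares so each byte is loaded exactly once. A standalone sketch of the count == 7 case (illustrative, not the libc template):

#include <cstdint>
#include <cstring>

// Exact-size compare for 7 bytes: 4 + 2 + 1, no overlapping loads.
static int bcmp7_sequence_sketch(const char *p1, const char *p2) {
  uint32_t a, b;
  uint16_t c, d;
  uint8_t e, f;
  std::memcpy(&a, p1, 4);
  std::memcpy(&b, p2, 4);
  std::memcpy(&c, p1 + 4, 2);
  std::memcpy(&d, p2 + 4, 2);
  std::memcpy(&e, p1 + 6, 1);
  std::memcpy(&f, p2 + 6, 1);
  return (a != b) | (c != d) | (e != f);
}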
 
@@ -178,27 +168,27 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   case 0:
     return BcmpReturnType::ZERO();
   case 1:
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   case 2:
-    return generic::Bcmp<2>::block(p1, p2);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
   case 3:
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::head_tail(p1, p2, count);
   case 4:
-    return generic::Bcmp<4>::block(p1, p2);
+    return generic::Bcmp<uint32_t>::block(p1, p2);
   case 5:
   case 6:
   case 7:
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint32_t>::head_tail(p1, p2, count);
   case 8:
-    return generic::Bcmp<8>::block(p1, p2);
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   case 9:
   case 10:
   case 11:
   case 12:
   case 13:
   case 14:
   case 15:
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
   }
 }
 
@@ -225,7 +215,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
 #elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
   return inline_bcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count);
 #endif
 }