Skip to content

Commit 2db7921

Browse files
Dinar Temirbulatov
authored and committed
[AArch64][compiler-rt] Add memcpy, memset, memmove, memchr simple implementation RT builtins.
Add naive implementations of memcpy, memset, memmove and memchr for SME targets. Patch co-authored by David Sherwood <[email protected]>
1 parent 06286a5 commit 2db7921

File tree

3 files changed

+286
-1
lines changed

3 files changed

+286
-1
lines changed

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ set(aarch64_SOURCES
561561
)
562562

563563
if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
564-
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
564+
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
565565
message(STATUS "AArch64 SME ABI routines enabled")
566566
else()
567567
message(STATUS "AArch64 SME ABI routines disabled")
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#include <stdlib.h>
2+
3+
// WARNING: When building the scalar versions of these functions you need to
4+
// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
5+
// from recognising a loop idiom and planting calls to memcpy!
6+
7+
// Byte-wise forward copy: transfers n bytes from src to dest, starting at the
// lowest address and finishing at the highest, then returns dest. Nothing is
// read or written when n is zero.
static void *
__arm_sc_memcpy_fwd(void *dest, const void *src,
                    size_t n) __arm_streaming_compatible __arm_preserves_za {
  unsigned char *out = (unsigned char *)dest;
  const unsigned char *in = (const unsigned char *)src;

  // Walk both cursors upwards together until every byte has been moved.
  for (size_t remaining = n; remaining != 0; remaining--)
    *out++ = *in++;

  return dest;
}
19+
20+
// memcpy entry point. Overlapping dest and src is undefined behaviour for
// memcpy, which is why both pointers can carry the restrict qualifier; this
// also mirrors the libc memcpy prototype shown in the man page. The work is
// delegated to the forward byte copy, whose return value (dest) is passed
// straight back to the caller.
void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
                      size_t n) __arm_streaming_compatible __arm_preserves_za {
  void *const copied_to = __arm_sc_memcpy_fwd(dest, src, n);
  return copied_to;
}
27+
28+
// Stores the value c, converted to unsigned char, into each of the first n
// bytes starting at dest, and returns dest. Nothing is written when n is zero.
void *__arm_sc_memset(void *dest, int c,
                      size_t n) __arm_streaming_compatible __arm_preserves_za {
  const unsigned char fill = (unsigned char)c;
  unsigned char *cursor = (unsigned char *)dest;

  // Advance the write cursor once per byte, lowest address first.
  for (size_t remaining = n; remaining != 0; remaining--)
    *cursor++ = fill;

  return dest;
}
39+
40+
// Byte-wise reverse copy: transfers n bytes from src to dest, starting with
// the byte at offset n - 1 and finishing with the byte at offset 0, then
// returns dest.
static void *
__arm_sc_memcpy_rev(void *dest, const void *src,
                    size_t n) __arm_streaming_compatible __arm_preserves_za {
  unsigned char *out = (unsigned char *)dest;
  const unsigned char *in = (const unsigned char *)src;

  // TODO: Improve performance by copying larger chunks in reverse, or by
  // using SVE.
  for (size_t idx = n; idx != 0;) {
    idx--;
    out[idx] = in[idx];
  }

  return dest;
}
54+
55+
// Copies n bytes from src to dest and returns dest. Unlike memcpy the two
// regions are allowed to overlap. Semantically a memmove is equivalent to the
// following:
//   1. Copy the entire contents of src to a temporary array that does not
//      overlap with src or dest.
//   2. Copy the contents of the temporary array into dest.
void *__arm_sc_memmove(void *dest, const void *src,
                       size_t n) __arm_streaming_compatible __arm_preserves_za {
  unsigned char *destp = (unsigned char *)dest;
  const unsigned char *srcp = (const unsigned char *)src;

  // If src and dest are identical, or no bytes were requested, there is
  // nothing to do!
  if ((destp == srcp) || (n == 0))
    return dest;

  // If src and dest don't overlap then just invoke the forward copy. Two
  // regions that merely touch (one starts exactly where the other ends) share
  // no bytes, so the comparisons are >= rather than >.
  if ((srcp >= (destp + n)) || (destp >= (srcp + n)))
    return __arm_sc_memcpy_fwd(dest, src, n);

  // Overlap case 1:
  //   src:  Low     |   ->   |     High
  //   dest: Low  |   ->   |        High
  // Here src is always ahead of dest at a higher address. If we first read a
  // chunk of data from src we can safely write the same chunk to dest without
  // corrupting future reads of src.
  if (srcp > destp)
    return __arm_sc_memcpy_fwd(dest, src, n);

  // Overlap case 2:
  //   src:  Low  |   ->   |        High
  //   dest: Low     |   ->   |     High
  // While we're in the overlap region we're always corrupting future reads of
  // src when writing to dest. An efficient way to do this is to copy the data
  // in reverse by starting at the highest address.
  return __arm_sc_memcpy_rev(dest, src, n);
}
89+
90+
// Scans the first n bytes starting at src for the value c converted to
// unsigned char. Returns a pointer to the first matching byte, or NULL when
// none of the n bytes match (which includes n == 0).
//
// The return type is a plain `void *`, matching the libc memchr prototype and
// the declaration used by callers of this routine; the constness of src is
// intentionally dropped on the returned pointer, exactly as libc does.
void *__arm_sc_memchr(const void *src, int c,
                      size_t n) __arm_streaming_compatible __arm_preserves_za {
  const unsigned char *srcp = (const unsigned char *)src;
  unsigned char c8 = (unsigned char)c;

  for (size_t i = 0; i < n; i++) {
    if (srcp[i] == c8)
      return (void *)&srcp[i];
  }

  return NULL;
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// REQUIRES: linux, aarch64-target-arch
2+
// RUN: %clang_builtins %s %librt -o %t && %run %t
3+
4+
#include <stdint.h>
5+
#include <stdio.h>
6+
#include <stdlib.h>
7+
#include <string.h>
8+
#include <time.h>
9+
10+
#define N 1024
11+
#define NREPS 1234
12+
13+
static uint8_t dst[N], src[N];
14+
15+
extern void *__arm_sc_memcpy(void *, const void *, size_t);
16+
extern void *__arm_sc_memset(void *, int, size_t);
17+
extern void *__arm_sc_memmove(void *, const void *, size_t);
18+
extern void *__arm_sc_memchr(const void *, int, size_t);
19+
20+
void init(void) {
21+
for (int i = 0; i < N; i++) {
22+
src[i] = i * 2;
23+
dst[i] = i + 1;
24+
}
25+
}
26+
27+
void reinit_dst(int n) {
28+
for (int i = 0; i < n; i++) {
29+
dst[i] = i + 1;
30+
}
31+
}
32+
33+
// Returns the arithmetic sum of the first n bytes at dest, accumulated in an
// int. Returns 0 when n is zero or negative.
int sum(uint8_t *dest, int n) {
  int total = 0;
  const uint8_t *cursor = dest;

  for (int remaining = n; remaining > 0; remaining--)
    total += *cursor++;

  return total;
}
40+
41+
// Returns tv[1] minus tv[0] in microseconds. Each timestamp is first reduced
// to whole microseconds (seconds * 1000000 plus nanoseconds / 1000, using
// integer division) and the two values are then subtracted.
long get_time_diff(struct timespec tv[2]) {
  long micros[2];

  for (int k = 0; k < 2; k++)
    micros[k] = (tv[k].tv_sec * 1000000) + (tv[k].tv_nsec / 1000);

  return micros[1] - micros[0];
}
46+
47+
int main() {
48+
struct timespec tv[2];
49+
50+
init();
51+
52+
// Test correctness of memcpy
53+
for (int i = 0; i < 67; i++) {
54+
int t[2];
55+
if (!__arm_sc_memcpy(dst, src, i)) {
56+
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
57+
abort();
58+
}
59+
t[0] = sum(dst, N);
60+
reinit_dst(i);
61+
memcpy(dst, src, i);
62+
t[1] = sum(dst, N);
63+
reinit_dst(i);
64+
if (t[0] != t[1]) {
65+
fprintf(stderr, "__arm_sc_memcpy doesn't match memcpy behaviour!\n");
66+
abort();
67+
}
68+
}
69+
70+
#ifdef TEST_PERF
71+
// Collect perf data for memcpy
72+
clock_gettime(CLOCK_REALTIME, &tv[0]);
73+
for (int r = 0; r < NREPS; r++) {
74+
for (int i = 0; i < 67; i++) {
75+
int t[2];
76+
if (!__arm_sc_memcpy(dst, src, i)) {
77+
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
78+
abort();
79+
}
80+
}
81+
}
82+
reinit_dst(67);
83+
clock_gettime(CLOCK_REALTIME, &tv[1]);
84+
printf("memcpy time = %ld\n", get_time_diff(tv));
85+
#endif
86+
87+
// Test correctness of memset
88+
for (int i = 0; i < 67; i++) {
89+
int t[2];
90+
if (!__arm_sc_memset(dst, src[i], i)) {
91+
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
92+
abort();
93+
}
94+
t[0] = sum(dst, N);
95+
reinit_dst(i);
96+
memset(dst, src[i], i);
97+
t[1] = sum(dst, N);
98+
reinit_dst(i);
99+
if (t[0] != t[1]) {
100+
fprintf(stderr, "__arm_sc_memcpy doesn't match memset behaviour!\n");
101+
abort();
102+
}
103+
}
104+
105+
#ifdef TEST_PERF
106+
// Collect perf data for memset
107+
clock_gettime(CLOCK_REALTIME, &tv[0]);
108+
for (int r = 0; r < NREPS; r++) {
109+
for (int i = 0; i < 67; i++) {
110+
if (!__arm_sc_memset(dst, src[i], i)) {
111+
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
112+
abort();
113+
}
114+
}
115+
}
116+
reinit_dst(67);
117+
clock_gettime(CLOCK_REALTIME, &tv[1]);
118+
printf("memset time = %ld\n", get_time_diff(tv));
119+
#endif
120+
121+
// Test correctness of memchr
122+
for (int i = 0; i < 67; i++) {
123+
for (int j = 0; j < 67; j++) {
124+
uint8_t *t[2];
125+
t[0] = __arm_sc_memchr(src, src[j], i);
126+
t[1] = memchr(src, src[j], i);
127+
if (t[0] != t[1]) {
128+
fprintf(stderr, "__arm_sc_memchr doesn't match memchr behaviour!\n");
129+
abort();
130+
}
131+
}
132+
}
133+
134+
#ifdef TEST_PERF
135+
// Collect perf data for memchr
136+
clock_gettime(CLOCK_REALTIME, &tv[0]);
137+
for (int r = 0; r < NREPS; r++) {
138+
for (int i = 0; i < 67; i++) {
139+
for (int j = 0; j < 67; j++) {
140+
__arm_sc_memchr(src, src[j], i);
141+
}
142+
}
143+
}
144+
clock_gettime(CLOCK_REALTIME, &tv[1]);
145+
printf("memchr time = %ld\n", get_time_diff(tv));
146+
#endif
147+
148+
// Test correctness for memmove
149+
for (int i = 0; i < 67; i++) {
150+
for (int j = 0; j < 67; j++) {
151+
int t[2];
152+
if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i)) {
153+
fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memmove!\n");
154+
abort();
155+
}
156+
t[0] = sum(dst, N);
157+
reinit_dst(200);
158+
memmove(&dst[66 - j], &dst[j], i);
159+
t[1] = sum(dst, N);
160+
reinit_dst(200);
161+
if (t[0] != t[1]) {
162+
fprintf(stderr, "__arm_sc_memmove doesn't match memmove behaviour!\n");
163+
abort();
164+
}
165+
}
166+
}
167+
168+
#ifdef TEST_PERF
169+
// Collect perf data for memmove
170+
clock_gettime(CLOCK_REALTIME, &tv[0]);
171+
for (int r = 0; r < NREPS; r++) {
172+
for (int i = 0; i < 67; i++) {
173+
for (int j = 0; j < 67; j++) {
174+
__arm_sc_memmove(&dst[66 - j], &dst[j], i);
175+
}
176+
}
177+
}
178+
clock_gettime(CLOCK_REALTIME, &tv[1]);
179+
printf("memmove time = %ld\n", get_time_diff(tv));
180+
#endif
181+
182+
return 0;
183+
}

0 commit comments

Comments
 (0)