Commit 1b80990

Reland "[compiler-rt][AArch64] Allow platform-specific mangling of SME routines. (#119864)"
Avoid issues caused by the `.subsections_via_symbols` directive by using numbered labels instead of named labels for the branch locations. This reverts commit 4032ce3.
1 parent 908e306 commit 1b80990
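
Note on the relanded approach: on Mach-O, the `.subsections_via_symbols` directive tells the assembler and linker that every named symbol starts an atom that may be dead-stripped or moved independently, so a named label inside a routine can end up separated from the branches that target it. Numbered local labels define no symbols and therefore cannot split a routine. A minimal illustrative sketch of the pattern (not taken from the patch itself):

  .subsections_via_symbols        // Mach-O: each named symbol begins a new atom
  .globl _example
_example:
  cbz  x0, 1f                     // branch forward to the numbered local label
  add  x0, x0, #1
1:                                // defines no symbol, so no new atom is created
  ret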

2 files changed: +60 -60 lines
compiler-rt/lib/builtins/aarch64/sme-abi.S

Lines changed: 14 additions & 12 deletions
@@ -40,7 +40,7 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort)
   .cfi_offset w30, -24
   .cfi_offset w29, -32
   .cfi_offset 46, -16
-  bl __arm_sme_state
+  bl SYMBOL_NAME(__arm_sme_state)
   tbz x0, #0, 2f
 1:
   smstop sm
@@ -54,7 +54,7 @@ END_COMPILERRT_FUNCTION(do_abort)
 // __arm_sme_state fills the result registers based on a local
 // that is set as part of the compiler-rt startup code.
 //   __aarch64_has_sme_and_tpidr2_el0
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)
   .variant_pcs __arm_sme_state
   BTI_C
   mov x0, xzr
@@ -70,9 +70,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
   mrs x1, TPIDR2_EL0
 1:
   ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state)
+END_COMPILERRT_FUNCTION(__arm_sme_state)
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
   .variant_pcs __arm_tpidr2_restore
   BTI_C
   // If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific
@@ -106,9 +106,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
   ret
 2:
   b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_save)
   .variant_pcs __arm_tpidr2_save
   BTI_C
   // If the current thread does not have access to TPIDR2_EL0, the subroutine
@@ -147,9 +147,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
   ret
 2:
   b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_save)
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
+DEFINE_COMPILERRT_FUNCTION(__arm_za_disable)
+  .cfi_startproc
   .variant_pcs __arm_za_disable
   BTI_C
   // If the current thread does not have access to SME, the subroutine does
@@ -166,7 +167,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
   .cfi_def_cfa w29, 16
   .cfi_offset w30, -8
   .cfi_offset w29, -16
-  bl __arm_tpidr2_save
+  bl SYMBOL_NAME(__arm_tpidr2_save)
 
   // * Set TPIDR2_EL0 to null.
   msr TPIDR2_EL0, xzr
@@ -181,9 +182,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
   .cfi_restore w29
 0:
   ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable)
+  .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_za_disable)
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
+DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   .variant_pcs __arm_get_current_vg
   BTI_C
 
@@ -200,7 +202,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
 2:
   mov x0, xzr
   ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
+END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
 NO_EXEC_STACK_DIRECTIVE
 
compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S

Lines changed: 46 additions & 48 deletions
@@ -6,8 +6,6 @@
 
 #include "../assembly.h"
 
-#define L(l) .L ## l
-
 //
 // __arm_sc_memcpy / __arm_sc_memmove
 //
@@ -52,26 +50,26 @@
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
   add srcend1, src, count
   add dstend1, dstin, count
   cmp count, 128
-  b.hi L(copy_long)
+  b.hi 7f // copy_long
   cmp count, 32
-  b.hi L(copy32_128)
+  b.hi 4f // copy32_128
 
   /* Small copies: 0..32 bytes. */
   cmp count, 16
-  b.lo L(copy16)
+  b.lo 0f // copy16
   ldp A_l, A_h, [src]
   ldp D_l, D_h, [srcend1, -16]
   stp A_l, A_h, [dstin]
   stp D_l, D_h, [dstend1, -16]
   ret
 
   /* Copy 8-15 bytes. */
-L(copy16):
-  tbz count, 3, L(copy8)
+0: // copy16
+  tbz count, 3, 1f // copy8
   ldr A_l, [src]
   ldr A_h, [srcend1, -8]
   str A_l, [dstin]
@@ -80,36 +78,36 @@ L(copy16):
 
   .p2align 3
   /* Copy 4-7 bytes. */
-L(copy8):
-  tbz count, 2, L(copy4)
+1: // copy8
+  tbz count, 2, 2f // copy4
   ldr A_lw, [src]
   ldr B_lw, [srcend1, -4]
   str A_lw, [dstin]
   str B_lw, [dstend1, -4]
   ret
 
   /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
-  cbz count, L(copy0)
+2: // copy4
+  cbz count, 3f // copy0
   lsr tmp1, count, 1
   ldrb A_lw, [src]
   ldrb C_lw, [srcend1, -1]
   ldrb B_lw, [src, tmp1]
   strb A_lw, [dstin]
   strb B_lw, [dstin, tmp1]
   strb C_lw, [dstend1, -1]
-L(copy0):
+3: // copy0
   ret
 
   .p2align 4
   /* Medium copies: 33..128 bytes. */
-L(copy32_128):
+4: // copy32_128
   ldp A_l, A_h, [src]
   ldp B_l, B_h, [src, 16]
   ldp C_l, C_h, [srcend1, -32]
   ldp D_l, D_h, [srcend1, -16]
   cmp count, 64
-  b.hi L(copy128)
+  b.hi 5f // copy128
   stp A_l, A_h, [dstin]
   stp B_l, B_h, [dstin, 16]
   stp C_l, C_h, [dstend1, -32]
@@ -118,16 +116,16 @@ L(copy32_128):
 
   .p2align 4
   /* Copy 65..128 bytes. */
-L(copy128):
+5: // copy128
   ldp E_l, E_h, [src, 32]
   ldp F_l, F_h, [src, 48]
   cmp count, 96
-  b.ls L(copy96)
+  b.ls 6f // copy96
   ldp G_l, G_h, [srcend1, -64]
   ldp H_l, H_h, [srcend1, -48]
   stp G_l, G_h, [dstend1, -64]
   stp H_l, H_h, [dstend1, -48]
-L(copy96):
+6: // copy96
   stp A_l, A_h, [dstin]
   stp B_l, B_h, [dstin, 16]
   stp E_l, E_h, [dstin, 32]
@@ -138,12 +136,12 @@ L(copy96):
 
   .p2align 4
   /* Copy more than 128 bytes. */
-L(copy_long):
+7: // copy_long
   /* Use backwards copy if there is an overlap. */
   sub tmp1, dstin, src
-  cbz tmp1, L(copy0)
+  cbz tmp1, 3b // copy0
   cmp tmp1, count
-  b.lo L(copy_long_backwards)
+  b.lo 10f //copy_long_backwards
 
   /* Copy 16 bytes and then align dst to 16-byte alignment. */
 
@@ -158,8 +156,8 @@ L(copy_long):
   ldp C_l, C_h, [src, 48]
   ldp D_l, D_h, [src, 64]!
   subs count, count, 128 + 16 /* Test and readjust count. */
-  b.ls L(copy64_from_end)
-L(loop64):
+  b.ls 9f // copy64_from_end
+8: // loop64
   stp A_l, A_h, [dst, 16]
   ldp A_l, A_h, [src, 16]
   stp B_l, B_h, [dst, 32]
@@ -169,10 +167,10 @@ L(loop64):
   stp D_l, D_h, [dst, 64]!
   ldp D_l, D_h, [src, 64]!
   subs count, count, 64
-  b.hi L(loop64)
+  b.hi 8b // loop64
 
   /* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
+9: // copy64_from_end
   ldp E_l, E_h, [srcend1, -64]
   stp A_l, A_h, [dst, 16]
   ldp A_l, A_h, [srcend1, -48]
@@ -191,7 +189,7 @@ L(copy64_from_end):
 
   /* Large backwards copy for overlapping copies.
      Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
+10: // copy_long_backwards
   ldp D_l, D_h, [srcend1, -16]
   and tmp1, dstend1, 15
   sub srcend1, srcend1, tmp1
@@ -203,9 +201,9 @@ L(copy_long_backwards):
   ldp D_l, D_h, [srcend1, -64]!
   sub dstend1, dstend1, tmp1
   subs count, count, 128
-  b.ls L(copy64_from_start)
+  b.ls 12f // copy64_from_start
 
-L(loop64_backwards):
+11: // loop64_backwards
   stp A_l, A_h, [dstend1, -16]
   ldp A_l, A_h, [srcend1, -16]
   stp B_l, B_h, [dstend1, -32]
@@ -215,10 +213,10 @@ L(loop64_backwards):
   stp D_l, D_h, [dstend1, -64]!
   ldp D_l, D_h, [srcend1, -64]!
   subs count, count, 64
-  b.hi L(loop64_backwards)
+  b.hi 11b // loop64_backwards
 
   /* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
+12: // copy64_from_start
   ldp G_l, G_h, [src, 48]
   stp A_l, A_h, [dstend1, -16]
   ldp A_l, A_h, [src, 32]
@@ -232,7 +230,7 @@ L(copy64_from_start):
   stp B_l, B_h, [dstin, 16]
   stp C_l, C_h, [dstin]
   ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
 
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 
@@ -250,7 +248,7 @@ DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
 #define dstend2 x4
 #define zva_val x5
 
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
 #ifdef __ARM_FEATURE_SVE
   mov z0.b, valw
 #else
@@ -263,9 +261,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
   add dstend2, dstin, count
 
   cmp count, 96
-  b.hi L(set_long)
+  b.hi 7f // set_long
   cmp count, 16
-  b.hs L(set_medium)
+  b.hs 4f // set_medium
   mov val, v0.D[0]
 
   /* Set 0..15 bytes. */
@@ -285,38 +283,38 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
 3: ret
 
   /* Set 17..96 bytes. */
-L(set_medium):
+4: // set_medium
   str q0, [dstin]
-  tbnz count, 6, L(set96)
+  tbnz count, 6, 6f // set96
   str q0, [dstend2, -16]
-  tbz count, 5, 1f
+  tbz count, 5, 5f
   str q0, [dstin, 16]
   str q0, [dstend2, -32]
-1: ret
+5: ret
 
   .p2align 4
   /* Set 64..96 bytes. Write 64 bytes from the start and
      32 bytes from the end. */
-L(set96):
+6: // set96
   str q0, [dstin, 16]
   stp q0, q0, [dstin, 32]
   stp q0, q0, [dstend2, -32]
   ret
 
   .p2align 4
-L(set_long):
+7: // set_long
   and valw, valw, 255
   bic dst, dstin, 15
   str q0, [dstin]
   cmp count, 160
   ccmp valw, 0, 0, hs
-  b.ne L(no_zva)
+  b.ne 9f // no_zva
 
 #ifndef SKIP_ZVA_CHECK
   mrs zva_val, dczid_el0
   and zva_val, zva_val, 31
   cmp zva_val, 4 /* ZVA size is 64 bytes. */
-  b.ne L(no_zva)
+  b.ne 9f // no_zva
 #endif
   str q0, [dst, 16]
   stp q0, q0, [dst, 32]
@@ -325,27 +323,27 @@ L(set_long):
   sub count, count, 128 /* Adjust count and bias for loop. */
 
   .p2align 4
-L(zva_loop):
+8: // zva_loop
   add dst, dst, 64
   dc zva, dst
   subs count, count, 64
-  b.hi L(zva_loop)
+  b.hi 8b // zva_loop
   stp q0, q0, [dstend2, -64]
   stp q0, q0, [dstend2, -32]
   ret
 
-L(no_zva):
+9: // no_zva
   sub count, dstend2, dst /* Count is 16 too large. */
   sub dst, dst, 16 /* Dst is biased by -32. */
   sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
+10: // no_zva_loop
   stp q0, q0, [dst, 32]
   stp q0, q0, [dst, 64]!
   subs count, count, 64
-  b.hi L(no_zva_loop)
+  b.hi 10b // no_zva_loop
   stp q0, q0, [dstend2, -64]
   stp q0, q0, [dstend2, -32]
   ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+END_COMPILERRT_FUNCTION(__arm_sc_memset)
 
 #endif // __aarch64__
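
A note on the macro change that runs through both files: `DEFINE_COMPILERRT_FUNCTION` and the `SYMBOL_NAME(...)` call sites route every symbol through the user-label prefix defined in compiler-rt/lib/builtins/assembly.h, which is what makes the mangling platform-specific, whereas the old `*_OUTLINE_FUNCTION_UNMANGLED` variants always emitted the bare name. The explicit `.cfi_startproc`/`.cfi_endproc` added around `__arm_za_disable` presumably keeps that routine's existing CFI directives inside an open CFI region under the new macro. A hedged sketch of the mangling idea (illustrative only; the real macro is built from `__USER_LABEL_PREFIX__` rather than hard-coded prefixes):

#if defined(__APPLE__)
#define SYMBOL_NAME(name) _##name   // Mach-O user labels carry a leading underscore
#else
#define SYMBOL_NAME(name) name      // ELF user labels are emitted as-is
#endif

  bl SYMBOL_NAME(__arm_tpidr2_save) // Darwin: bl ___arm_tpidr2_save; ELF: bl __arm_tpidr2_save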
