Skip to content

Commit 811f2a6

Browse files
[Compiler-rt] Add AArch64 routines for __arm_agnostic("sme_za_state") (#120059)
The specification of these routines can be found here: https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines
1 parent 8177bf5 commit 811f2a6

File tree

3 files changed

+169
-2
lines changed

3 files changed

+169
-2
lines changed

compiler-rt/cmake/builtin-config-ix.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
4343
builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
4444
"
4545
void foo(void) __arm_streaming_compatible {
46-
asm(\".arch armv9-a+sme\");
46+
asm(\".arch armv9-a+sme2\");
4747
asm(\"smstart\");
48+
asm(\"ldr zt0, [sp]\");
4849
}
4950
")
5051

compiler-rt/lib/builtins/aarch64/sme-abi-assert.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
#include "../cpu_model/AArch64CPUFeatures.inc"
99
_Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
1010
_Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
11+
_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");

compiler-rt/lib/builtins/aarch64/sme-abi.S

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
.set FEAT_SVE_BIT, 30
1212
.set FEAT_SME_BIT, 42
13+
.set FEAT_SME2_BIT, 57
14+
.set FEAT_SME2_MASK, 1 << 57
1315
.set SVCR_PSTATE_SM_BIT, 0
1416

1517
#if !defined(__APPLE__)
@@ -22,7 +24,7 @@
2224
#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
2325
#endif
2426

25-
.arch armv9-a+sme
27+
.arch armv9-a+sme2
2628

2729
// Utility function which calls a system's abort() routine. Because the function
2830
// is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,169 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
204206
ret
205207
END_COMPILERRT_FUNCTION(__arm_get_current_vg)
206208

209+
// The diagram below describes the layout used in the following routines:
210+
// * __arm_sme_state_size
211+
// * __arm_sme_save
212+
// * __arm_sme_restore
213+
//
214+
// +---------------------------------+
215+
// | ... |
216+
// | ZA buffer |
217+
// | ... |
218+
// +---------------------------------+ <- @96
219+
// | ZT0 contents |
220+
// +---------------------------------+ <- @32
221+
// | byte 15-10: zero (reserved) |
222+
// | byte 9-8: num_za_save_slices | TPIDR2 block
223+
// | byte 7-0: za_save_buffer |
224+
// +---------------------------------+ <- @16
225+
// | bit 127-1: zero (reserved) | Internal state for __arm_sme_save/restore
226+
// | bit 0: VALID |
227+
// +---------------------------------+ <- @0
228+
229+
// __arm_sme_state_size
//
// Reports how many bytes a buffer must have to be used with __arm_sme_save /
// __arm_sme_restore in the current state.
//
// Out:   x0  = 16 when SME is unavailable, PSTATE.ZA is clear or TPIDR2_EL0 is
//              non-null; otherwise (SME2 ? 96 : 32) + SVLB * SVLB.
// Clobb: x16, x17, flags.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
  .variant_pcs __arm_sme_state_size
  BTI_C

  // Only "SME present, PSTATE.ZA set, TPIDR2_EL0 null" needs the full-size
  // buffer; every other combination takes the minimal path at 1: below.
  adrp  x17, CPU_FEATS_SYMBOL
  ldr   x17, [x17, CPU_FEATS_SYMBOL_OFFSET]   // x17 = CPU feature word
  tbz   x17, #FEAT_SME_BIT, 1f
  mrs   x16, SVCR
  tbz   x16, #1, 1f                           // bit 1 tested as PSTATE.ZA
  mrs   x16, TPIDR2_EL0
  cbnz  x16, 1f

  // x16 = fixed header size: 32 bytes without SME2, 96 bytes with SME2 (the
  // extra 64 bytes hold ZT0). The flags must be set before x17 is reused.
  tst   x17, #FEAT_SME2_MASK
  mov   w16, #32
  mov   w17, #96
  csel  x16, x16, x17, eq                     // eq <=> SME2 bit is clear

  // x0 = header size + SVLB * SVLB (room for the ZA buffer).
  rdsvl x17, #1                               // x17 = SVLB
  madd  x0, x17, x17, x16
  ret

1:
  // Minimal case: 16 bytes, just enough for the internal VALID word while
  // keeping the size a multiple of 16.
  mov   w0, #16
  ret
END_COMPILERRT_FUNCTION(__arm_sme_state_size)
258+
259+
// __arm_sme_save
//
// Saves ZT0 (when SME2 is present) into the buffer at PTR and sets up a lazy
// save of ZA by pointing TPIDR2_EL0 at the TPIDR2 block inside that buffer.
//
// In:    x0  = PTR; must be 16-byte aligned, otherwise the routine aborts.
// Out:   [PTR + 0]   internal state word, bit 0 = VALID
//        [PTR + 16]  TPIDR2 block (za_save_buffer, num_za_save_slices)
//        [PTR + 32]  ZT0 contents (SME2 only), ZA buffer follows at +96;
//                    without SME2 the ZA buffer starts at +32.
// Clobb: x16, x17, flags; x0 is advanced to PTR + 16 when a lazy save is set
//        up. x18 is deliberately not used as scratch: it is the platform
//        register and is reserved on some targets.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
  .variant_pcs __arm_sme_save
  BTI_C

  // If PTR is not 16-byte aligned, abort.
  tst   x0, #0xF
  b.ne  3f

  // Clear internal state bits (VALID = 0 until we know there is state to save).
  stp   xzr, xzr, [x0]

  // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
  adrp  x17, CPU_FEATS_SYMBOL
  ldr   x17, [x17, CPU_FEATS_SYMBOL_OFFSET]   // x17 = CPU feature word
  tbz   x17, #FEAT_SME_BIT, 2f
  mrs   x16, SVCR
  tbz   x16, #1, 2f
  mrs   x16, TPIDR2_EL0
  cbnz  x16, 2f

  // ZA or ZT0 need saving, we can now set internal VALID bit to 1.
  mov   w16, #1
  str   x16, [x0]

  // x16 = address of the ZA save buffer. It starts right after the TPIDR2
  // block unless ZT0 has to be stored first.
  add   x16, x0, #32
  tbz   x17, #FEAT_SME2_BIT, 1f

  // Store ZT0 (64 bytes) and move the ZA buffer past it.
  str   zt0, [x16]
  add   x16, x16, #64

1:
  // Set up lazy-save (x16 = pointer to buffer).
  rdsvl x17, #1                               // x17 = SVLB
  str   x16, [x0, #16]!                       // za_save_buffer; x0 = PTR + 16
  strh  w17, [x0, #8]                         // num_za_save_slices = SVLB
  strh  wzr, [x0, #10]                        // reserved bytes 10-11 = 0
  str   wzr, [x0, #12]                        // reserved bytes 12-15 = 0
  msr   TPIDR2_EL0, x0

2:
  // Do nothing
  ret

3:
  b     SYMBOL_NAME(do_abort)
END_COMPILERRT_FUNCTION(__arm_sme_save)
306+
307+
// __arm_sme_restore
//
// Counterpart of __arm_sme_save: re-enables ZA, reloads ZA through
// __arm_tpidr2_restore when the lazy save was committed, reloads ZT0 when SME2
// is present and clears TPIDR2_EL0.
//
// In:    x0  = PTR previously passed to __arm_sme_save; must be 16-byte
//              aligned, otherwise the routine aborts.
// Clobb: x16, x17, flags, plus whatever __arm_tpidr2_restore clobbers.
// Stack: 16 bytes (x29/x30 frame record), described by the CFI directives.
DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
  .cfi_startproc
  .variant_pcs __arm_sme_restore
  BTI_C

  // Frame record is needed because of the call to __arm_tpidr2_restore.
  stp   x29, x30, [sp, #-16]!
  .cfi_def_cfa_offset 16
  mov   x29, sp
  .cfi_def_cfa w29, 16
  .cfi_offset w30, -8
  .cfi_offset w29, -16

  // A misaligned PTR is a caller error: abort.
  tst   x0, #0xF
  b.ne  2f

  // Internal state word is zero (VALID = 0): nothing was saved, just return.
  ldr   x16, [x0]
  cbz   x16, 1f

  // VALID is set but SME is not available: abort.
  adrp  x17, CPU_FEATS_SYMBOL
  ldr   x17, [x17, CPU_FEATS_SYMBOL_OFFSET]   // x17 = CPU feature word
  tbz   x17, #FEAT_SME_BIT, 2f

  // TPIDR2_EL0 still non-null: the lazy save was never committed, so ZA needs
  // no reload; skip ahead to the ZT0 handling.
  mrs   x16, TPIDR2_EL0
  cbnz  x16, 0f

  // TPIDR2_EL0 is null here. If PSTATE.ZA is nevertheless set (ZA state is
  // 'active'), abort.
  mrs   x16, SVCR
  tbnz  x16, #1, 2f

  // Reload ZA from the TPIDR2 block located at PTR + 16.
  // NOTE(review): the code after the call relies on __arm_tpidr2_restore
  // leaving x0 and x17 intact -- confirm against that routine.
  smstart za
  add   x0, x0, #16
  bl    __arm_tpidr2_restore
  sub   x0, x0, #16

0:
  // Common tail: make sure ZA is on and drop the lazy-save pointer.
  smstart za
  msr   TPIDR2_EL0, xzr

  // Without SME2 there is no ZT0 to reload.
  tbz   x17, #FEAT_SME2_BIT, 1f

  // Reload ZT0 from PTR + 32.
  add   x16, x0, #32
  ldr   zt0, [x16]

1:
  // Epilogue: pop the frame record and return.
  .cfi_def_cfa wsp, 16
  ldp   x29, x30, [sp], #16
  .cfi_def_cfa_offset 0
  .cfi_restore w30
  .cfi_restore w29
  ret

2:
  // Abort path; the frame record is still on the stack at this point.
  b     SYMBOL_NAME(do_abort)
  .cfi_endproc
END_COMPILERRT_FUNCTION(__arm_sme_restore)
371+
207372
NO_EXEC_STACK_DIRECTIVE
208373

209374
// GNU property note for BTI and PAC

0 commit comments

Comments
 (0)