 
 .set FEAT_SVE_BIT, 30
 .set FEAT_SME_BIT, 42
+.set FEAT_SME2_BIT, 57
+.set FEAT_SME2_MASK, 1 << 57
 .set SVCR_PSTATE_SM_BIT, 0
 
 #if !defined(__APPLE__)
 
 #define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
-.arch armv9-a+sme
+.arch armv9-a+sme2
 
 // Utility function which calls a system's abort() routine. Because the function
 // is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,169 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
+// The diagram below describes the layout used in the following routines:
+// * __arm_sme_state_size
+// * __arm_sme_save
+// * __arm_sme_restore
+//
+// +---------------------------------+
+// |               ...               |
+// |            ZA buffer            |
+// |               ...               |
+// +---------------------------------+ <- @96
+// |           ZT0 contents          |
+// +---------------------------------+ <- @32
+// | byte 15-10: zero (reserved)     |
+// | byte   9-8: num_za_save_slices  |  TPIDR2 block
+// | byte   7-0: za_save_buffer      |
+// +---------------------------------+ <- @16
+// | bit  127-1: zero (reserved)     |  Internal state for __arm_sme_save/restore
+// | bit      0: VALID               |
+// +---------------------------------+ <- @0
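+//
+// The internal-state and TPIDR2 blocks are 16 bytes each, the ZT0 area is
+// 64 bytes (present only with SME2), and the ZA buffer holds SVL.B x SVL.B
+// bytes, so the buffer passed to __arm_sme_save/__arm_sme_restore must be
+// 16-byte aligned (both routines abort otherwise).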
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
+  .variant_pcs __arm_sme_state_size
+  BTI_C
+
+  // Test if SME is available and ZA state is 'active'.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 0f
+  mrs x16, SVCR
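+  // Bit 1 of SVCR is PSTATE.ZA.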
+  tbz x16, #1, 0f
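+  // A pending lazy-save (TPIDR2_EL0 != 0) is treated as having nothing to save here.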
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 0f
+
+  // Size = HAS_FEAT_SME2 ? 96 : 32
+  tst x17, #FEAT_SME2_MASK
+  mov w17, #32
+  mov w16, #96
+  csel x16, x17, x16, eq
+
+  // Size = Size + (SVLB * SVLB)
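+  // rdsvl #1 returns the streaming vector length in bytes (SVL.B); ZA occupies SVL.B x SVL.B bytes.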
+  rdsvl x17, #1
+  madd x0, x17, x17, x16
+  ret
+
+0:
+  // Default case: 16 bytes is the minimum (enough to hold the VALID bit while keeping the size a multiple of 16 bytes).
+  mov w0, #16
+  ret
+END_COMPILERRT_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // Clear internal state bits
+  stp xzr, xzr, [x0]
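+  // With VALID still 0, a matching __arm_sme_restore is a no-op.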
+
+  // Return (leaving VALID == 0) if SME is unavailable, PSTATE.ZA is 0, or TPIDR2_EL0 != 0.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 2f
+  mrs x16, SVCR
+  tbz x16, #1, 2f
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 2f
+
+  // ZA or ZT0 needs saving; set the internal VALID bit to 1.
+  mov w16, #1
+  str x16, [x0]
+
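+  // x18 = start of the ZT0/ZA save area at PTR + 32.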
+  add x18, x0, #32
+  tbz x17, #FEAT_SME2_BIT, 1f
+
+  // Store ZT0
+  str zt0, [x18]
+  add x18, x18, #64
+
+1:
+  // Set up lazy-save (x18 = pointer to the ZA save buffer)
+  rdsvl x17, #1
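+  // The pre-index write-back leaves x0 pointing at the TPIDR2 block (PTR + 16),
+  // which is then installed in TPIDR2_EL0; num_za_save_slices is set to SVL.B.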
+  str x18, [x0, #16]!
+  strh w17, [x0, #8]
+  strh wzr, [x0, #10]
+  str wzr, [x0, #12]
+  msr TPIDR2_EL0, x0
+
+2:
+  // Do nothing
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
+  .cfi_startproc
+  .variant_pcs __arm_sme_restore
+  BTI_C
+
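+  // Unlike __arm_sme_save, this routine sets up a frame because it may call __arm_tpidr2_restore.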
+  stp x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+
+  // If PTR is not 16-byte aligned, abort.
+  tst x0, #0xF
+  b.ne 3f
+
+  // If the VALID bit is 0, return early.
+  ldr x16, [x0]
+  cbz x16, 2f
+
+  // If SME is not available, abort.
+  adrp x17, CPU_FEATS_SYMBOL
+  ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz x17, #FEAT_SME_BIT, 3f
+
+  // If TPIDR2_EL0 != nullptr, the lazy-save was never committed (ZA is still
+  // live); just cancel it and reload ZT0.
+  mrs x16, TPIDR2_EL0
+  cbnz x16, 1f
+
+  // If TPIDR2_EL0 == nullptr and PSTATE.ZA == 1 (i.e. ZA state is 'active'), abort.
+  mrs x16, SVCR
+  tbnz x16, #1, 3f
+
+  // Restore ZA.
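+  // __arm_tpidr2_restore expects a pointer to the TPIDR2 block, which lives at PTR + 16.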
+  smstart za
+  add x0, x0, #16
+  bl __arm_tpidr2_restore
+  sub x0, x0, #16
+
+1:
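+  // Enable ZA (harmless if it is already on) and cancel any pending lazy-save.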
+  smstart za
+  msr TPIDR2_EL0, xzr
+
+  // Check if ZT0 needs restoring.
+  tbz x17, #FEAT_SME2_BIT, 2f
+
+  // Restore ZT0.
+  add x16, x0, #32
+  ldr zt0, [x16]
+
+2:
+  // Do nothing
+  .cfi_def_cfa wsp, 16
+  ldp x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+
+3:
+  b SYMBOL_NAME(do_abort)
+  .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_sme_restore)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC