Skip to content

Commit f914e8e

Browse files
[AArch64][SME] Add coalescer barrier for args/results in locally streaming functions. (llvm#85388)
Similar to how we protect FP/fixed-vector arguments and results of calls, we should do the same for the arguments/results of locally-streaming functions, so that they are not spilled/filled as ZPR registers. This may cause a small regression (additional spills/fills), which is addressed by llvm#85386.
1 parent da9ac43 commit f914e8e

File tree

5 files changed

+168
-35
lines changed

5 files changed

+168
-35
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6895,6 +6895,11 @@ AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
68956895
return TPIDR2Obj;
68966896
}
68976897

6898+
/// Returns true if \p VT is either a fixed-length vector type, or a
/// floating-point type that is not a scalable vector. Going by the function
/// name, these are the value types treated as being passed in FP registers;
/// scalable vector types are explicitly excluded by the second clause.
static bool isPassedInFPR(EVT VT) {
6899+
return VT.isFixedLengthVector() ||
6900+
(VT.isFloatingPoint() && !VT.isScalableVector());
6901+
}
6902+
68986903
SDValue AArch64TargetLowering::LowerFormalArguments(
68996904
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
69006905
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -7031,6 +7036,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
70317036
// This will be the new Chain/Root node.
70327037
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
70337038
Glue = ArgValue.getValue(2);
7039+
if (isPassedInFPR(ArgValue.getValueType())) {
7040+
ArgValue =
7041+
DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7042+
DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7043+
{ArgValue, Glue});
7044+
Glue = ArgValue.getValue(1);
7045+
}
70347046
} else
70357047
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
70367048

@@ -7402,11 +7414,6 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
74027414
}
74037415
}
74047416

7405-
static bool isPassedInFPR(EVT VT) {
7406-
return VT.isFixedLengthVector() ||
7407-
(VT.isFloatingPoint() && !VT.isScalableVector());
7408-
}
7409-
74107417
/// LowerCallResult - Lower the result values of a call into the
74117418
/// appropriate copies out of appropriate physical registers.
74127419
SDValue AArch64TargetLowering::LowerCallResult(
@@ -8632,6 +8639,10 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
86328639

86338640
SmallVector<SDValue, 4> RetOps(1, Chain);
86348641
for (auto &RetVal : RetVals) {
8642+
if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8643+
isPassedInFPR(RetVal.second.getValueType()))
8644+
RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8645+
RetVal.second.getValueType(), RetVal.second);
86358646
Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
86368647
Glue = Chain.getValue(1);
86378648
RetOps.push_back(

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
2929
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
3030
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
3131
def AArch64CoalescerBarrier
32-
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, []>;
32+
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
3333

3434
//===----------------------------------------------------------------------===//
3535
// Instruction naming conventions.
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK-COALESCER-BARRIER
3+
; RUN: llc -mattr=+sme -stop-after=virtregrewriter < %s | FileCheck %s --check-prefix=CHECK-REGALLOC
4+
5+
target triple = "aarch64"
6+
7+
define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
8+
; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_args
9+
; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
10+
; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0
11+
; CHECK-COALESCER-BARRIER-NEXT: {{ $}}
12+
; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
13+
; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]]
14+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
15+
; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF
16+
; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub
17+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
18+
; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]]
19+
; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
20+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
21+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
22+
; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR
23+
;
24+
; CHECK-REGALLOC-LABEL: name: dont_coalesce_args
25+
; CHECK-REGALLOC: bb.0 (%ir-block.0):
26+
; CHECK-REGALLOC-NEXT: liveins: $q0
27+
; CHECK-REGALLOC-NEXT: {{ $}}
28+
; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
29+
; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
30+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
31+
; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
32+
; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0
33+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
34+
; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
35+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
36+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
37+
; CHECK-REGALLOC-NEXT: RET_ReallyLR
38+
%sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
39+
call void @scalable_args(<vscale x 2 x i64> %sa)
40+
ret void
41+
}
42+
43+
define <2 x i64> @dont_coalesce_res() "aarch64_pstate_sm_body" nounwind {
44+
; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_res
45+
; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
46+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
47+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
48+
; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
49+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
50+
; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0
51+
; CHECK-COALESCER-BARRIER-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY [[COPY]].zsub
52+
; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY1]]
53+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
54+
; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_]]
55+
; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0
56+
;
57+
; CHECK-REGALLOC-LABEL: name: dont_coalesce_res
58+
; CHECK-REGALLOC: bb.0 (%ir-block.0):
59+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
60+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
61+
; CHECK-REGALLOC-NEXT: BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
62+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
63+
; CHECK-REGALLOC-NEXT: renamable $q0 = KILL renamable $q0, implicit killed $z0
64+
; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
65+
; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
66+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
67+
; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
68+
; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0
69+
%sa = call <vscale x 2 x i64> @scalable_res()
70+
%res = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> %sa, i64 0)
71+
ret <2 x i64> %res
72+
}
73+
74+
define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
75+
; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_arg_that_is_also_res
76+
; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
77+
; CHECK-COALESCER-BARRIER-NEXT: liveins: $q0
78+
; CHECK-COALESCER-BARRIER-NEXT: {{ $}}
79+
; CHECK-COALESCER-BARRIER-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
80+
; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]]
81+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
82+
; CHECK-COALESCER-BARRIER-NEXT: [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF
83+
; CHECK-COALESCER-BARRIER-NEXT: [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub
84+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
85+
; CHECK-COALESCER-BARRIER-NEXT: $z0 = COPY [[INSERT_SUBREG]]
86+
; CHECK-COALESCER-BARRIER-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
87+
; CHECK-COALESCER-BARRIER-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
88+
; CHECK-COALESCER-BARRIER-NEXT: [[COALESCER_BARRIER_FPR128_1:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COALESCER_BARRIER_FPR128_]]
89+
; CHECK-COALESCER-BARRIER-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
90+
; CHECK-COALESCER-BARRIER-NEXT: $q0 = COPY [[COALESCER_BARRIER_FPR128_1]]
91+
; CHECK-COALESCER-BARRIER-NEXT: RET_ReallyLR implicit $q0
92+
;
93+
; CHECK-REGALLOC-LABEL: name: dont_coalesce_arg_that_is_also_res
94+
; CHECK-REGALLOC: bb.0 (%ir-block.0):
95+
; CHECK-REGALLOC-NEXT: liveins: $q0
96+
; CHECK-REGALLOC-NEXT: {{ $}}
97+
; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
98+
; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
99+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
100+
; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
101+
; CHECK-REGALLOC-NEXT: renamable $q0 = KILL killed renamable $q0, implicit-def $z0
102+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
103+
; CHECK-REGALLOC-NEXT: BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
104+
; CHECK-REGALLOC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
105+
; CHECK-REGALLOC-NEXT: renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
106+
; CHECK-REGALLOC-NEXT: renamable $q0 = COALESCER_BARRIER_FPR128 killed renamable $q0
107+
; CHECK-REGALLOC-NEXT: STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
108+
; CHECK-REGALLOC-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
109+
; CHECK-REGALLOC-NEXT: $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
110+
; CHECK-REGALLOC-NEXT: RET_ReallyLR implicit $q0
111+
%sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
112+
call void @scalable_args(<vscale x 2 x i64> %sa)
113+
ret <2 x i64> %a
114+
}
115+
116+
declare void @scalable_args(<vscale x 2 x i64>) "aarch64_pstate_sm_enabled"
117+
declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
118+
119+
declare <vscale x 2 x i64> @scalable_res() "aarch64_pstate_sm_enabled"
120+
declare <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64>, i64)

llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,31 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
88
define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
99
; CHECK-LABEL: sm_body_sm_compatible_simple:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
12-
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
13-
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
14-
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
15-
; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
11+
; CHECK-NEXT: sub sp, sp, #96
12+
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
13+
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
14+
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
15+
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
16+
; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
1617
; CHECK-NEXT: bl __arm_sme_state
1718
; CHECK-NEXT: and x8, x0, #0x1
1819
; CHECK-NEXT: tbnz w8, #0, .LBB0_2
1920
; CHECK-NEXT: // %bb.1:
2021
; CHECK-NEXT: smstart sm
2122
; CHECK-NEXT: .LBB0_2:
23+
; CHECK-NEXT: fmov s0, wzr
24+
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
2225
; CHECK-NEXT: tbnz w8, #0, .LBB0_4
2326
; CHECK-NEXT: // %bb.3:
2427
; CHECK-NEXT: smstop sm
2528
; CHECK-NEXT: .LBB0_4:
26-
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
27-
; CHECK-NEXT: fmov s0, wzr
28-
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
29-
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
30-
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
31-
; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
29+
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
30+
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
31+
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
32+
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
33+
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
34+
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
35+
; CHECK-NEXT: add sp, sp, #96
3236
; CHECK-NEXT: ret
3337
ret float zeroinitializer
3438
}

llvm/test/CodeGen/AArch64/sme-streaming-body.ll

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -87,29 +87,27 @@ if.end:
8787
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
8888
; CHECK-LABEL: locally_streaming_caller_no_callee:
8989
; CHECK: // %bb.0:
90-
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
91-
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
92-
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
93-
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
94-
; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
95-
; CHECK-NEXT: addsvl sp, sp, #-1
96-
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
97-
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
90+
; CHECK-NEXT: sub sp, sp, #80
91+
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
92+
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
93+
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
94+
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
95+
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
9896
; CHECK-NEXT: smstart sm
9997
; CHECK-NEXT: index z0.d, #0, #1
100-
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
98+
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
99+
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
101100
; CHECK-NEXT: add z0.d, z0.d, z1.d
102101
; CHECK-NEXT: add z0.d, z0.d, #41 // =0x29
103-
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
104-
; CHECK-NEXT: smstop sm
105-
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
106102
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
107-
; CHECK-NEXT: addsvl sp, sp, #1
108-
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
109-
; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
110-
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
111-
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
112-
; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
103+
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
104+
; CHECK-NEXT: smstop sm
105+
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
106+
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
107+
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
108+
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
109+
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
110+
; CHECK-NEXT: add sp, sp, #80
113111
; CHECK-NEXT: ret
114112

115113
%add = add <2 x i64> %a, <i64 41, i64 42>;

0 commit comments

Comments
 (0)