Skip to content

Commit 39d15a7

Browse files
authored
[AArch64][SME] Remove implicit-def's on smstart (#69012)
When we lower calls, the sequence of argument copy-to-reg nodes is glued to the smstart. In the InstrEmitter, these glued copies are turned into implicit defs on the smstart, since the actual call instruction uses those physregs. This results in the register allocator adding unnecessary copies of registers that are preserved anyway.
1 parent f58fb8c commit 39d15a7

File tree

6 files changed

+109
-9
lines changed

6 files changed

+109
-9
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7398,6 +7398,22 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
73987398
return ZExtBool;
73997399
}
74007400

7401+
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7402+
SDNode *Node) const {
7403+
// Post-ISel hook for MSRpstatesvcrImm1 / MSRpstatePseudo; Node is not used.
7404+
// Live-in physreg copies that are glued to SMSTART are applied as implicit
7405+
// defs in the InstrEmitter. Here we remove the implicit GPR32/GPR64 defs so
7406+
// the register allocator can pass call args in callee-saved regs without extra
// copies made only to avoid these fake clobbers of actually-preserved GPRs.
7407+
if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7408+
MI.getOpcode() == AArch64::MSRpstatePseudo)
7409+
for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) // last operand down to 1; operand 0 is never examined
7410+
if (MachineOperand &MO = MI.getOperand(I);
7411+
MO.isReg() && MO.isImplicit() && MO.isDef() &&
7412+
(AArch64::GPR32RegClass.contains(MO.getReg()) ||
7413+
AArch64::GPR64RegClass.contains(MO.getReg())))
7414+
MI.removeOperand(I); // only implicit register defs in GPR32/GPR64 are dropped
7415+
}
7416+
74017417
SDValue AArch64TargetLowering::changeStreamingMode(
74027418
SelectionDAG &DAG, SDLoc DL, bool Enable,
74037419
SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -999,6 +999,9 @@ class AArch64TargetLowering : public TargetLowering {
999999
const SDLoc &DL, SelectionDAG &DAG,
10001000
SmallVectorImpl<SDValue> &InVals) const override;
10011001

1002+
void AdjustInstrPostInstrSelection(MachineInstr &MI,
1003+
SDNode *Node) const override;
1004+
10021005
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
10031006
SmallVectorImpl<SDValue> &InVals) const override;
10041007

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,9 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
203203
def MSRpstatePseudo :
204204
Pseudo<(outs),
205205
(ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
206-
Sched<[WriteSys]>;
206+
Sched<[WriteSys]> {
207+
let hasPostISelHook = 1;
208+
}
207209

208210
def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
209211
(MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ def MSRpstatesvcrImm1
222222
let Inst{11-9} = pstatefield;
223223
let Inst{8} = imm;
224224
let Inst{7-5} = 0b011; // op2
225+
let hasPostISelHook = 1;
225226
}
226227

227228
def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;

llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,3 +436,56 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
436436
tail call void @normal_callee();
437437
ret void;
438438
}
439+
440+
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
441+
; CHECK-LABEL: call_to_non_streaming_pass_args:
442+
; CHECK: // %bb.0: // %entry
443+
; CHECK-NEXT: sub sp, sp, #112
444+
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
445+
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
446+
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
447+
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
448+
; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
449+
; CHECK-NEXT: .cfi_def_cfa_offset 112
450+
; CHECK-NEXT: .cfi_offset w19, -8
451+
; CHECK-NEXT: .cfi_offset w30, -16
452+
; CHECK-NEXT: .cfi_offset b8, -24
453+
; CHECK-NEXT: .cfi_offset b9, -32
454+
; CHECK-NEXT: .cfi_offset b10, -40
455+
; CHECK-NEXT: .cfi_offset b11, -48
456+
; CHECK-NEXT: .cfi_offset b12, -56
457+
; CHECK-NEXT: .cfi_offset b13, -64
458+
; CHECK-NEXT: .cfi_offset b14, -72
459+
; CHECK-NEXT: .cfi_offset b15, -80
460+
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
461+
; CHECK-NEXT: mov x8, x1
462+
; CHECK-NEXT: mov x9, x0
463+
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
464+
; CHECK-NEXT: bl __arm_sme_state
465+
; CHECK-NEXT: and x19, x0, #0x1
466+
; CHECK-NEXT: tbz w19, #0, .LBB10_2
467+
; CHECK-NEXT: // %bb.1: // %entry
468+
; CHECK-NEXT: smstop sm
469+
; CHECK-NEXT: .LBB10_2: // %entry
470+
; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
471+
; CHECK-NEXT: mov x0, x9
472+
; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
473+
; CHECK-NEXT: mov x1, x8
474+
; CHECK-NEXT: bl bar
475+
; CHECK-NEXT: tbz w19, #0, .LBB10_4
476+
; CHECK-NEXT: // %bb.3: // %entry
477+
; CHECK-NEXT: smstart sm
478+
; CHECK-NEXT: .LBB10_4: // %entry
479+
; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
480+
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
481+
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
482+
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
483+
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
484+
; CHECK-NEXT: add sp, sp, #112
485+
; CHECK-NEXT: ret
486+
entry:
487+
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
488+
ret void
489+
}
490+
491+
declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)

llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
368368
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
369369
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
370370
; CHECK-NEXT: addvl sp, sp, #-3
371-
; CHECK-NEXT: rdsvl x8, #1
372-
; CHECK-NEXT: addvl x9, sp, #2
373-
; CHECK-NEXT: addvl x10, sp, #1
374-
; CHECK-NEXT: mov x11, sp
371+
; CHECK-NEXT: rdsvl x3, #1
372+
; CHECK-NEXT: addvl x0, sp, #2
373+
; CHECK-NEXT: addvl x1, sp, #1
374+
; CHECK-NEXT: mov x2, sp
375375
; CHECK-NEXT: smstop sm
376-
; CHECK-NEXT: mov x0, x9
377-
; CHECK-NEXT: mov x1, x10
378-
; CHECK-NEXT: mov x2, x11
379-
; CHECK-NEXT: mov x3, x8
380376
; CHECK-NEXT: bl foo
381377
; CHECK-NEXT: smstart sm
382378
; CHECK-NEXT: ptrue p0.b
@@ -400,8 +396,37 @@ entry:
400396
ret i8 %vecext
401397
}
402398

399+
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
400+
; CHECK-LABEL: call_to_non_streaming_pass_args:
401+
; CHECK: // %bb.0: // %entry
402+
; CHECK-NEXT: sub sp, sp, #112
403+
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
404+
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
405+
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
406+
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
407+
; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
408+
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
409+
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
410+
; CHECK-NEXT: smstop sm
411+
; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
412+
; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
413+
; CHECK-NEXT: bl bar
414+
; CHECK-NEXT: smstart sm
415+
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
416+
; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
417+
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
418+
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
419+
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
420+
; CHECK-NEXT: add sp, sp, #112
421+
; CHECK-NEXT: ret
422+
entry:
423+
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
424+
ret void
425+
}
426+
403427
declare i64 @llvm.aarch64.sme.cntsb()
404428

405429
declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
430+
declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
406431

407432
attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }

0 commit comments

Comments
 (0)