Skip to content

Commit 09eb08d

Browse files
committed
[AArch64][SME] Remove implicit-def's on smstart
When we lower calls, the sequence of argument copy-to-reg nodes is glued to the smstart. In the InstrEmitter, these glued copies are turned into implicit-defs on the smstart, since the actual call instruction uses those physregs. As a result, the register allocator adds unnecessary copies of registers that are preserved anyway.
1 parent 50b9930 commit 09eb08d

File tree

6 files changed

+109
-9
lines changed

6 files changed

+109
-9
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7376,6 +7376,22 @@ static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
73767376
return ZExtBool;
73777377
}
73787378

7379+
// Post-instruction-selection hook. For MSRpstatesvcrImm1 and MSRpstatePseudo
// it strips implicit register defs that name a GPR32 or GPR64 register; every
// other opcode is left untouched. `Node` is not used by this implementation.
void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7380+
SDNode *Node) const {
7381+
// Live-in physreg copies that are glued to SMSTART are emitted by the
7382+
// InstrEmitter as implicit-defs on that instruction. Remove them here so the
7383+
// register allocator can pass call args in callee-saved regs without extra
7384+
// copies inserted only to avoid these fake clobbers of actually-preserved GPRs.
7385+
if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7386+
MI.getOpcode() == AArch64::MSRpstatePseudo)
7387+
// Walk from the last operand down to index 1 (operand 0 is never examined).
// Going backwards means removing operand I leaves the lower, not-yet-visited
// indices unchanged.
for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7388+
if (MachineOperand &MO = MI.getOperand(I);
7389+
MO.isReg() && MO.isImplicit() && MO.isDef() &&
7390+
(AArch64::GPR32RegClass.contains(MO.getReg()) ||
7391+
AArch64::GPR64RegClass.contains(MO.getReg())))
7392+
MI.removeOperand(I);
7393+
}
7394+
73797395
SDValue AArch64TargetLowering::changeStreamingMode(
73807396
SelectionDAG &DAG, SDLoc DL, bool Enable,
73817397
SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,9 @@ class AArch64TargetLowering : public TargetLowering {
975975
const SDLoc &DL, SelectionDAG &DAG,
976976
SmallVectorImpl<SDValue> &InVals) const override;
977977

978+
void AdjustInstrPostInstrSelection(MachineInstr &MI,
979+
SDNode *Node) const override;
980+
978981
SDValue LowerCall(CallLoweringInfo & /*CLI*/,
979982
SmallVectorImpl<SDValue> &InVals) const override;
980983

llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,9 @@ def : Pat<(i64 (int_aarch64_sme_get_tpidr2)),
203203
def MSRpstatePseudo :
204204
Pseudo<(outs),
205205
(ins svcr_op:$pstatefield, timm0_1:$imm, GPR64:$rtpstate, timm0_1:$expected_pstate, variable_ops), []>,
206-
Sched<[WriteSys]>;
206+
Sched<[WriteSys]> {
207+
let hasPostISelHook = 1;
208+
}
207209

208210
def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 GPR64:$rtpstate), (i64 timm0_1:$expected_pstate)),
209211
(MSRpstatePseudo svcr_op:$pstate, 0b1, GPR64:$rtpstate, timm0_1:$expected_pstate)>;

llvm/lib/Target/AArch64/SMEInstrFormats.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ def MSRpstatesvcrImm1
221221
let Inst{11-9} = pstatefield;
222222
let Inst{8} = imm;
223223
let Inst{7-5} = 0b011; // op2
224+
let hasPostISelHook = 1;
224225
}
225226

226227
def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>;

llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,3 +436,56 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
436436
tail call void @normal_callee();
437437
ret void;
438438
}
439+
440+
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
441+
; CHECK-LABEL: call_to_non_streaming_pass_args:
442+
; CHECK: // %bb.0: // %entry
443+
; CHECK-NEXT: sub sp, sp, #112
444+
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
445+
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
446+
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
447+
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
448+
; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
449+
; CHECK-NEXT: .cfi_def_cfa_offset 112
450+
; CHECK-NEXT: .cfi_offset w19, -8
451+
; CHECK-NEXT: .cfi_offset w30, -16
452+
; CHECK-NEXT: .cfi_offset b8, -24
453+
; CHECK-NEXT: .cfi_offset b9, -32
454+
; CHECK-NEXT: .cfi_offset b10, -40
455+
; CHECK-NEXT: .cfi_offset b11, -48
456+
; CHECK-NEXT: .cfi_offset b12, -56
457+
; CHECK-NEXT: .cfi_offset b13, -64
458+
; CHECK-NEXT: .cfi_offset b14, -72
459+
; CHECK-NEXT: .cfi_offset b15, -80
460+
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
461+
; CHECK-NEXT: mov x8, x1
462+
; CHECK-NEXT: mov x9, x0
463+
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
464+
; CHECK-NEXT: bl __arm_sme_state
465+
; CHECK-NEXT: and x19, x0, #0x1
466+
; CHECK-NEXT: tbz w19, #0, .LBB10_2
467+
; CHECK-NEXT: // %bb.1: // %entry
468+
; CHECK-NEXT: smstop sm
469+
; CHECK-NEXT: .LBB10_2: // %entry
470+
; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
471+
; CHECK-NEXT: mov x0, x9
472+
; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
473+
; CHECK-NEXT: mov x1, x8
474+
; CHECK-NEXT: bl bar
475+
; CHECK-NEXT: tbz w19, #0, .LBB10_4
476+
; CHECK-NEXT: // %bb.3: // %entry
477+
; CHECK-NEXT: smstart sm
478+
; CHECK-NEXT: .LBB10_4: // %entry
479+
; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
480+
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
481+
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
482+
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
483+
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
484+
; CHECK-NEXT: add sp, sp, #112
485+
; CHECK-NEXT: ret
486+
entry:
487+
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
488+
ret void
489+
}
490+
491+
declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)

llvm/test/CodeGen/AArch64/sme-streaming-interface.ll

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -368,15 +368,11 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
368368
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
369369
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
370370
; CHECK-NEXT: addvl sp, sp, #-3
371-
; CHECK-NEXT: rdsvl x8, #1
372-
; CHECK-NEXT: addvl x9, sp, #2
373-
; CHECK-NEXT: addvl x10, sp, #1
374-
; CHECK-NEXT: mov x11, sp
371+
; CHECK-NEXT: rdsvl x3, #1
372+
; CHECK-NEXT: addvl x0, sp, #2
373+
; CHECK-NEXT: addvl x1, sp, #1
374+
; CHECK-NEXT: mov x2, sp
375375
; CHECK-NEXT: smstop sm
376-
; CHECK-NEXT: mov x0, x9
377-
; CHECK-NEXT: mov x1, x10
378-
; CHECK-NEXT: mov x2, x11
379-
; CHECK-NEXT: mov x3, x8
380376
; CHECK-NEXT: bl foo
381377
; CHECK-NEXT: smstart sm
382378
; CHECK-NEXT: ptrue p0.b
@@ -400,8 +396,37 @@ entry:
400396
ret i8 %vecext
401397
}
402398

399+
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) #0 {
400+
; CHECK-LABEL: call_to_non_streaming_pass_args:
401+
; CHECK: // %bb.0: // %entry
402+
; CHECK-NEXT: sub sp, sp, #112
403+
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
404+
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
405+
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
406+
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
407+
; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
408+
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
409+
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
410+
; CHECK-NEXT: smstop sm
411+
; CHECK-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
412+
; CHECK-NEXT: ldp d2, d3, [sp, #16] // 16-byte Folded Reload
413+
; CHECK-NEXT: bl bar
414+
; CHECK-NEXT: smstart sm
415+
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
416+
; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
417+
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
418+
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
419+
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
420+
; CHECK-NEXT: add sp, sp, #112
421+
; CHECK-NEXT: ret
422+
entry:
423+
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
424+
ret void
425+
}
426+
403427
declare i64 @llvm.aarch64.sme.cntsb()
404428

405429
declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef)
430+
declare void @bar(ptr noundef, i64 noundef, i64 noundef, i32 noundef, i32 noundef, float noundef, float noundef, double noundef, double noundef)
406431

407432
attributes #0 = { nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" }

0 commit comments

Comments
 (0)