[AArch64] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr, x:fpr, i) #89017

Merged 3 commits on May 16, 2024
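
The fold itself is a small post-selection peephole: a lane extract that GlobalISel selects as a DUP with an FPR result, followed by an FPR-to-GPR COPY, is rewritten into a single UMOV. A minimal before/after sketch in schematic MIR (register names are hypothetical, chosen for illustration; they are not taken from the patch or its tests):

```
; Before: lane 1 is extracted into an FPR, then copied to a GPR.
%lane:fpr32 = DUPi32 %vec:fpr128, 1
%dst:gpr32  = COPY %lane:fpr32
; After: UMOV moves lane 1 of the vector register straight into the GPR.
%dst:gpr32  = UMOVvi32 %vec:fpr128, 1
```

The same rewrite is applied for 64-bit lanes using DUPi64/UMOVvi64, as the code below shows.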
68 changes: 67 additions & 1 deletion llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp
@@ -48,6 +48,7 @@ class AArch64PostSelectOptimize : public MachineFunctionPass {
bool doPeepholeOpts(MachineBasicBlock &MBB);
/// Look for cross regclass copies that can be trivially eliminated.
bool foldSimpleCrossClassCopies(MachineInstr &MI);
bool foldCopyDup(MachineInstr &MI);
};
} // end anonymous namespace

@@ -105,7 +106,10 @@ unsigned getNonFlagSettingVariant(unsigned Opc) {
bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
bool Changed = false;
for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
Changed |= foldSimpleCrossClassCopies(MI);
bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
if (!CurrentIterChanged)
CurrentIterChanged |= foldCopyDup(MI);
Changed |= CurrentIterChanged;
}
return Changed;
}
@@ -158,6 +162,68 @@ bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
return true;
}

bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
if (!MI.isCopy())
return false;

auto *MF = MI.getMF();
auto &MRI = MF->getRegInfo();
auto *TII = MF->getSubtarget().getInstrInfo();

// Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
// Here Dst is y and Src is the result of DUP.
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();

if (!Dst.isVirtual() || !Src.isVirtual())
return false;

auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
const TargetRegisterClass *FPRRegClass, unsigned DUP,
unsigned UMOV) {
if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
MRI.getRegClassOrNull(Src) != FPRRegClass)
return false;

// There is a special case when one of the uses is COPY(z:FPR, y:GPR).
// In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
// be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
// not worthwhile in that case.
for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
if (!Use.isCopy())
continue;

Register UseOp0 = Use.getOperand(0).getReg();
Register UseOp1 = Use.getOperand(1).getReg();
if (UseOp0.isPhysical() || UseOp1.isPhysical())
return false;

if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
return false;
}

MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
return false;

Register DupSrc = SrcMI->getOperand(1).getReg();
int64_t DupImm = SrcMI->getOperand(2).getImm();

BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
.addReg(DupSrc)
.addImm(DupImm);
SrcMI->eraseFromParent();
MI.eraseFromParent();
return true;
};

return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
AArch64::DUPi32, AArch64::UMOVvi32) ||
TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
AArch64::DUPi64, AArch64::UMOVvi64);
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
// If we find a dead NZCV implicit-def, we
// - try to convert the operation to a non-flag-setting equivalent
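
The bail-out in foldCopyDup above guards the case where the extracted value is copied straight back into an FPR. A hedged sketch of that situation (again schematic MIR with hypothetical register names):

```
; Chain before any folding:
%lane:fpr32 = DUPi32 %vec:fpr128, 1
%y:gpr32    = COPY %lane:fpr32      ; candidate for the UMOV fold
%z:fpr32    = COPY %y:gpr32         ; FPR use of %y
; The existing copy-folding peephole can collapse this whole chain into:
%z:fpr32    = DUPi32 %vec:fpr128, 1
; Rewriting the middle COPY into a UMOV first would block that, so
; foldCopyDup gives up when it sees such a use.
```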
117 changes: 39 additions & 78 deletions llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)

define i8 @mulv_v2i8(<2 x i8> %a) {
; CHECK-SD-LABEL: mulv_v2i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v2i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
; CHECK-LABEL: mulv_v2i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
ret i8 %arg1
@@ -230,22 +221,13 @@ entry:
}

define i16 @mulv_v2i16(<2 x i16> %a) {
; CHECK-SD-LABEL: mulv_v2i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v2i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
; CHECK-LABEL: mulv_v2i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
ret i16 %arg1
@@ -372,22 +354,13 @@ entry:
}

define i32 @mulv_v2i32(<2 x i32> %a) {
; CHECK-SD-LABEL: mulv_v2i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v2i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
; CHECK-LABEL: mulv_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
ret i32 %arg1
@@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
@@ -452,31 +424,22 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: mov s1, v0.s[1]
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
ret i32 %arg1
}

define i64 @mulv_v2i64(<2 x i64> %a) {
; CHECK-SD-LABEL: mulv_v2i64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, v0.d[1]
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: mul x0, x9, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mulv_v2i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mul x0, x8, x9
; CHECK-GI-NEXT: ret
; CHECK-LABEL: mulv_v2i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: mul x0, x9, x8
; CHECK-NEXT: ret
entry:
%arg1 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %a)
ret i64 %arg1
@@ -522,14 +485,12 @@ define i64 @mulv_v4i64(<4 x i64> %a) {
;
; CHECK-GI-LABEL: mulv_v4i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: fmov x10, d3
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: mov x8, v0.d[1]
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: mov x9, v1.d[1]
; CHECK-GI-NEXT: mul x8, x10, x8
; CHECK-GI-NEXT: fmov x10, d1
; CHECK-GI-NEXT: mul x9, x10, x9
; CHECK-GI-NEXT: mul x0, x8, x9
; CHECK-GI-NEXT: ret
entry: