[AArch64] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr, x:fpr, i) #89017
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Dhruv Chawla (dc03-work)

Changes

This patch adds a peephole optimization for poor codegen caused by RegBankSelect limiting G_EXTRACT_VECTOR_ELT to FPR registers for both its input and output operands. This can cause a COPY from FPR to GPR to be generated when, for example, the output register of the G_EXTRACT_VECTOR_ELT is used in a branch condition. This was noticed when looking at codegen differences between SDAG and GISel for the s1279 kernel in the TSVC benchmark.

Full diff: https://github.com/llvm/llvm-project/pull/89017.diff

7 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 22da7ddef98a2a..ef1ec90d739a80 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -61,6 +61,12 @@
// %6:fpr128 = IMPLICIT_DEF
// %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
//
+// 8. %129:fpr32 = DUPi32 %167:fpr128, 3
+// %173:gpr32 = COPY %129:fpr32
+// ==>
+// %173:gpr32 = UMOVvi32 %167:fpr128, 3
+// Similar peephole for 64-bit moves.
+//
//===----------------------------------------------------------------------===//
#include "AArch64ExpandImm.h"
@@ -128,6 +134,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
bool visitINSvi64lane(MachineInstr &MI);
bool visitFMOVDr(MachineInstr &MI);
+ bool visitCOPY(MachineInstr &MI);
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -690,6 +697,42 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
return true;
}
+bool AArch64MIPeepholeOpt::visitCOPY(MachineInstr &MI) {
+ // Optimize COPY of FPR extract into GPR regbank to UMOV
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ if (!Dst.isVirtual() || !Src.isVirtual())
+ return false;
+
+ auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
+ const TargetRegisterClass *FPRRegClass, unsigned DUP,
+ unsigned UMOV) {
+ if (MRI->getRegClassOrNull(Dst) != GPRRegClass ||
+ MRI->getRegClassOrNull(Src) != FPRRegClass)
+ return false;
+
+ MachineInstr *SrcMI = MRI->getUniqueVRegDef(Src);
+ if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI->hasOneUse(Src))
+ return false;
+
+ Register DupSrc = SrcMI->getOperand(1).getReg();
+ int64_t DupImm = SrcMI->getOperand(2).getImm();
+
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
+ .addReg(DupSrc)
+ .addImm(DupImm);
+ SrcMI->eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+ };
+
+ return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
+ AArch64::DUPi32, AArch64::UMOVvi32) ||
+ TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
+ AArch64::DUPi64, AArch64::UMOVvi64);
+}
+
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -771,6 +814,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
case AArch64::FMOVDr:
Changed |= visitFMOVDr(MI);
break;
+ case AArch64::COPY:
+ Changed |= visitCOPY(MI);
+ break;
}
}
}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index 7b7ca9d8ffc2db..b324b896529fd9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)
define i8 @mulv_v2i8(<2 x i8> %a) {
-; CHECK-SD-LABEL: mulv_v2i8:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i8:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
ret i8 %arg1
@@ -230,22 +221,13 @@ entry:
}
define i16 @mulv_v2i16(<2 x i16> %a) {
-; CHECK-SD-LABEL: mulv_v2i16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
ret i16 %arg1
@@ -372,22 +354,13 @@ entry:
}
define i32 @mulv_v2i32(<2 x i32> %a) {
-; CHECK-SD-LABEL: mulv_v2i32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov w8, v0.s[1]
-; CHECK-SD-NEXT: fmov w9, s0
-; CHECK-SD-NEXT: mul w0, w9, w8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: mulv_v2i32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: mulv_v2i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
+; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
ret i32 %arg1
@@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
@@ -452,10 +424,9 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: mul w0, w8, w9
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: mul w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 749d6071c98d7c..bdc39a214ec851 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1488,8 +1488,7 @@ define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
; CHECK-GI-LABEL: test_dup_v2i32_v4i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: ret
entry:
@@ -1510,8 +1509,7 @@ define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
;
; CHECK-GI-LABEL: test_dup_v4i32_v8i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s0, v0.s[3]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[3]
; CHECK-GI-NEXT: dup v0.8h, w8
; CHECK-GI-NEXT: ret
entry:
@@ -1626,8 +1624,7 @@ define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
;
; CHECK-GI-LABEL: test_dup_v4i32_v4i16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s0, v0.s[1]
-; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index c6b2d07231bf86..8b82004388b095 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -983,13 +983,12 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
;
; CHECK-GI-LABEL: insert_v3i32_0:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov s1, v0.s[1]
-; CHECK-GI-NEXT: mov s2, v0.s[2]
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: fmov w8, s1
-; CHECK-GI-NEXT: mov v0.s[1], w8
-; CHECK-GI-NEXT: fmov w8, s2
-; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov w8, v0.s[1]
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: mov w9, v0.s[2]
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[2], w9
+; CHECK-GI-NEXT: mov v0.16b, v1.16b
; CHECK-GI-NEXT: ret
entry:
%d = insertelement <3 x i32> %a, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 62ad45b212967a..31502e452efebc 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -30,10 +30,9 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redand_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
@@ -457,10 +456,9 @@ define i32 @test_redand_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redand_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
ret i32 %and_result
@@ -480,10 +478,9 @@ define i32 @test_redand_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
ret i32 %and_result
@@ -505,10 +502,9 @@ define i32 @test_redand_v8i32(<8 x i32> %a) {
; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: and w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: and w0, w9, w8
; GISEL-NEXT: ret
%and_result = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
ret i32 %and_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 20c498d36fdea4..708668959f9fbf 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -30,10 +30,9 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redor_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
@@ -459,10 +458,9 @@ define i32 @test_redor_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redor_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
ret i32 %or_result
@@ -482,10 +480,9 @@ define i32 @test_redor_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
ret i32 %or_result
@@ -507,10 +504,9 @@ define i32 @test_redor_v8i32(<8 x i32> %a) {
; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: orr w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: orr w0, w9, w8
; GISEL-NEXT: ret
%or_result = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
ret i32 %or_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index b8ca99e003b627..a902c06711f2c5 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -27,10 +27,9 @@ define i1 @test_redxor_v2i1(<2 x i1> %a) {
; GISEL-LABEL: test_redxor_v2i1:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w8, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w8, w9, w8
; GISEL-NEXT: and w0, w8, #0x1
; GISEL-NEXT: ret
%or_result = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
@@ -448,10 +447,9 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) {
; GISEL-LABEL: test_redxor_v2i32:
; GISEL: // %bb.0:
; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
ret i32 %xor_result
@@ -471,10 +469,9 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) {
; GISEL: // %bb.0:
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
ret i32 %xor_result
@@ -496,10 +493,9 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) {
; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b
; GISEL-NEXT: mov d1, v0.d[1]
; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: mov s1, v0.s[1]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
-; GISEL-NEXT: eor w0, w8, w9
+; GISEL-NEXT: mov w8, v0.s[1]
+; GISEL-NEXT: fmov w9, s0
+; GISEL-NEXT: eor w0, w9, w8
; GISEL-NEXT: ret
%xor_result = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
ret i32 %xor_result
✅ With the latest revision this PR passed the C/C++ code formatter.
Hello. Does this come up in selection-dag anywhere? If not, I think I would expect it to be optimized during GISel.
No, this does not come up with SDAG. However, I could not really find any other pass where this would fit in better.
@davemgreen Would this patch be better put in …?
Morning - I was looking into https://godbolt.org/z/njh7rs44G. Are most of the changes due to G_EXTRACT_VECTOR_ELT requiring the result to be an FPR? Is it possible to change that?

I recently managed to commit #81453, which made some changes to G_INSERT_VECTOR_ELT. I have a similar patch for extract, but it looks like it is mostly handling legalization, not changing selection/regbank. (And it could do with some cleanup). The regbank handling for insert in https://github.com/llvm/llvm-project/pull/81453/files#diff-c2b6149d4ce96115e46b03ba19778fe7dd911412078c302ace57ac14364f5227 does two things - it re-uses FPR/GPR if the input is already known to be a given type, and extends small i8/i16 values to i32. The second part is something different that we can do separately, but is the first part possible to use for extracts too? The selection might change, which might mean fixing the i32 issue too.

I might try and upload the patch for extract I have, but I don't think it is making these selection parts better yet. (And it still looks quite messy). The ideal in the end is that we can share the same tablegen patterns between SDAG and GISel.
The problem is unfortunately with the output operand, not the input operand. The element that gets extracted gets used in a place where only a GPR is accepted, so a copy must be generated. Because RegBankSelect works top-down, there is no way for us to know how the output operand is going to be used, which is why I implemented this as a peephole.
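As a hypothetical illustration of that situation (not taken from the PR or the TSVC kernel), an extracted lane feeding a scalar compare and branch is enough to force the FPR-to-GPR copy this peephole targets:

#include <arm_neon.h>

// The lane extract becomes a G_EXTRACT_VECTOR_ELT whose result lands on the
// FPR bank (selected as DUPi32), while the compare and branch below want the
// value in a GPR, so a cross-bank COPY is emitted - the COPY this patch folds
// into UMOV.
int lane_feeds_branch(int32x4_t v, int x) {
  int e = vgetq_lane_s32(v, 3);
  if (e > x)
    return e + 1;
  return x;
}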
Ah, I somehow missed this. Yes, it is because the result has to be an FPR. No, I don't think it's possible to change this without being able to schedule the RegBank for the extract after the RegBank for the result has been selected.
@aemerson Any opinions on making reg-bank-select a little better? My understanding from a while ago was that the greedy mode did not work very well (maybe?), but I feel we will eventually need something like it to produce optimal code for all the cases that could be using gpr or fpr.
I don't even know if the greedy mode works at all for AArch64; we've always used the default mode. I think we can revisit it after we rework RBS once we have FP types.
@aemerson Until then, do you feel that it is viable to implement this as a fold in AArch64PostSelectOptimize? Unfortunately, this pattern has real-world performance implications that we are running into, and I would like to have it fixed...
I'm pragmatic about incremental solutions, but first, have you looked at examining users of instructions in RBS? Even if it works top-down, there's nothing stopping you from peeking at uses.
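For reference, a minimal sketch of what peeking at uses during RegBankSelect could look like. This is an assumption about one possible approach, not code from the PR; the helper name anyUserWantsGPR and the opcode choices (G_BRCOND, G_ICMP) are illustrative:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"

// Returns true if some (non-debug) user of Reg clearly wants the value on the
// GPR bank. Ambiguous users are ignored, which is exactly the weakness
// discussed below.
static bool anyUserWantsGPR(llvm::Register Reg,
                            const llvm::MachineRegisterInfo &MRI) {
  for (const llvm::MachineInstr &UseMI : MRI.use_nodbg_instructions(Reg)) {
    switch (UseMI.getOpcode()) {
    case llvm::TargetOpcode::G_BRCOND: // branch condition lives in a GPR
    case llvm::TargetOpcode::G_ICMP:   // scalar compare is selected on GPRs
      return true;
    default:
      break; // users like G_ADD could map to either bank
    }
  }
  return false;
}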
Hmm, so I did more digging into this - I don't think it is possible to do this by examining users at all, because there is no good way to predict whether a user will require GPR or not. For example, if the user is a … There are instructions that only accept GPR, however that would limit this fold too much. The only way I can see for implementing this is if the user is selected before the definition...

Anyways, I also tried moving this to AArch64PostSelectOptimize. While it does fold the way it is doing for the current test cases, it also causes regressions in other tests somehow, where copy-folding appears to be occurring (…). I think fixing the issue in RBS may also cause these issues to crop up! As an example, consider the following diff:

diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54..32f88588b9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -373,7 +373,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
;
; CHECK-GI-LABEL: test_build_illegal:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov s0, v0[3]
+; CHECK-GI-NEXT: mov.s w8, v0[3]
+; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: mov.h v0[3], v0[0]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret

The code before comes from the following IR:

# Machine code for function test_build_illegal: IsSSA, TracksLiveness, Legalized, RegBankSelected, Selected
Function Live Ins: $q0
bb.1 (%ir-block.0):
liveins: $q0
%0:fpr128 = COPY $q0
%1:fpr32 = DUPi32 %0:fpr128, 3
%7:gpr32 = COPY %1:fpr32
%6:gpr32 = IMPLICIT_DEF
%20:fpr32 = COPY %6:gpr32
%21:fpr16 = COPY %20.hsub:fpr32
%18:fpr32 = COPY %7:gpr32
%19:fpr16 = COPY %18.hsub:fpr32
%12:fpr128 = IMPLICIT_DEF
%13:fpr128 = INSERT_SUBREG %12:fpr128(tied-def 0), %21:fpr16, %subreg.hsub
%15:fpr128 = IMPLICIT_DEF
%16:fpr128 = INSERT_SUBREG %15:fpr128(tied-def 0), %19:fpr16, %subreg.hsub
%14:fpr128 = INSvi16lane %13:fpr128(tied-def 0), 3, %16:fpr128, 0
%4:fpr64 = COPY %14.dsub:fpr128
$d0 = COPY %4:fpr64
RET_ReallyLR implicit $d0
# End machine code for function test_build_illegal.

After the MIR peephole optimizer runs, … At this point, I'm starting to feel like this pass really is the best place for this fold, if there were a good way to gate this fold to only run when GlobalISel is being used.
Ping @aemerson
AArch64MIPeepholeOpt was added as a last resort for things we didn't know where else to put. In the long run I feel we really need reg-bank-select to be doing its job properly, even if it means selecting regbanks more lazily. AArch64PostSelectOptimize sounds like a decent compromise. Could it be made to fix the regressions you see at the same time?
Hmm, the problem is …
Could the combine for UMOV check if the input is another COPY?
It needs to check the user, not the input, which I guess is also doable?
@davemgreen I was able to do this. Please have a look.
Thanks.
@@ -107,6 +108,8 @@ bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
    Changed |= foldSimpleCrossClassCopies(MI);
  }
  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end())))
Can these be part of the same loop, to save it iterating every instruction twice?
Unfortunately not. The problem is, if foldSimpleCrossClassCopies triggers, then MI gets invalidated because it gets deleted, and calling foldCopyDup then causes a crash.
It is possible to check the return value of foldSimpleCrossClassCopies and call foldCopyDup based on that, however that can cause a situation where we miss the new instruction that foldSimpleCrossClassCopies generates.

Nevermind, it appears that it doesn't generate a new instruction. However, this can still be the case for other folds that get implemented.
We should try to avoid iterating over the instructions multiple times wherever possible. We might, in the long run, want to have this loop switch on the opcode. Having it continue if foldSimpleCrossClassCopies returns true is Ok for the time being though.
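A minimal sketch of that combined loop in doPeepholeOpts, assuming (as noted above) that foldSimpleCrossClassCopies erases MI when it fires, so MI must not be touched again in the same iteration:

  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
    if (foldSimpleCrossClassCopies(MI)) {
      Changed = true;
      continue; // MI may have been erased; don't run any further folds on it.
    }
    Changed |= foldCopyDup(MI);
  }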
Okay, that works. Awesome.
Thanks. LGTM