
[AArch64] Fold COPY(y:gpr, DUP(x:fpr, i)) -> UMOV(y:gpr, x:fpr, i) #89017


Merged
merged 3 commits into llvm:main
May 16, 2024

Conversation

dc03-work
Contributor

This patch adds a peephole optimization for suboptimal codegen caused by RegBankSelect limiting G_EXTRACT_VECTOR_ELT to FPR registers for both its input and output operands. This can cause generation of a COPY from FPR to GPR when, for example, the output register of the G_EXTRACT_VECTOR_ELT is used in a branch condition.

This was noticed when looking at codegen differences between SDAG and GI for the s1279 kernel in the TSVC benchmark.

@llvmbot
Member

llvmbot commented Apr 17, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Dhruv Chawla (dc03-work)

Changes

This patch adds a peephole optimization for suboptimal codegen caused by RegBankSelect limiting G_EXTRACT_VECTOR_ELT to FPR registers for both its input and output operands. This can cause generation of a COPY from FPR to GPR when, for example, the output register of the G_EXTRACT_VECTOR_ELT is used in a branch condition.

This was noticed when looking at codegen differences between SDAG and GI for the s1279 kernel in the TSVC benchmark.


Full diff: https://github.com/llvm/llvm-project/pull/89017.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp (+46)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-mulv.ll (+27-56)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-copy.ll (+3-6)
  • (modified) llvm/test/CodeGen/AArch64/insertextract.ll (+6-7)
  • (modified) llvm/test/CodeGen/AArch64/reduce-and.ll (+12-16)
  • (modified) llvm/test/CodeGen/AArch64/reduce-or.ll (+12-16)
  • (modified) llvm/test/CodeGen/AArch64/reduce-xor.ll (+12-16)
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 22da7ddef98a2a..ef1ec90d739a80 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -61,6 +61,12 @@
 //   %6:fpr128 = IMPLICIT_DEF
 //   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
 //
+// 8.   %129:fpr32 = DUPi32 %167:fpr128, 3
+//      %173:gpr32 = COPY %129:fpr32
+//   ==>
+//      %173:gpr32 = UMOVvi32 %167:fpr128, 3
+//   Similar peephole for 64-bit moves.
+//
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -128,6 +134,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
   bool visitINSvi64lane(MachineInstr &MI);
   bool visitFMOVDr(MachineInstr &MI);
+  bool visitCOPY(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -690,6 +697,42 @@ bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) {
   return true;
 }
 
+bool AArch64MIPeepholeOpt::visitCOPY(MachineInstr &MI) {
+  // Optimize COPY of FPR extract into GPR regbank to UMOV
+  Register Dst = MI.getOperand(0).getReg();
+  Register Src = MI.getOperand(1).getReg();
+
+  if (!Dst.isVirtual() || !Src.isVirtual())
+    return false;
+
+  auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
+                         const TargetRegisterClass *FPRRegClass, unsigned DUP,
+                         unsigned UMOV) {
+    if (MRI->getRegClassOrNull(Dst) != GPRRegClass ||
+        MRI->getRegClassOrNull(Src) != FPRRegClass)
+      return false;
+
+    MachineInstr *SrcMI = MRI->getUniqueVRegDef(Src);
+    if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI->hasOneUse(Src))
+      return false;
+
+    Register DupSrc = SrcMI->getOperand(1).getReg();
+    int64_t DupImm = SrcMI->getOperand(2).getImm();
+
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
+        .addReg(DupSrc)
+        .addImm(DupImm);
+    SrcMI->eraseFromParent();
+    MI.eraseFromParent();
+    return true;
+  };
+
+  return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
+                     AArch64::DUPi32, AArch64::UMOVvi32) ||
+         TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
+                     AArch64::DUPi64, AArch64::UMOVvi64);
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -771,6 +814,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
       case AArch64::FMOVDr:
         Changed |= visitFMOVDr(MI);
         break;
+      case AArch64::COPY:
+        Changed |= visitCOPY(MI);
+        break;
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
index 7b7ca9d8ffc2db..b324b896529fd9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll
@@ -25,22 +25,13 @@ declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
 declare i128 @llvm.vector.reduce.mul.v2i128(<2 x i128>)
 
 define i8 @mulv_v2i8(<2 x i8> %a) {
-; CHECK-SD-LABEL: mulv_v2i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    mov w8, v0.s[1]
-; CHECK-SD-NEXT:    fmov w9, s0
-; CHECK-SD-NEXT:    mul w0, w9, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: mulv_v2i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    mul w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: mulv_v2i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mul w0, w9, w8
+; CHECK-NEXT:    ret
 entry:
   %arg1 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %a)
   ret i8 %arg1
@@ -230,22 +221,13 @@ entry:
 }
 
 define i16 @mulv_v2i16(<2 x i16> %a) {
-; CHECK-SD-LABEL: mulv_v2i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    mov w8, v0.s[1]
-; CHECK-SD-NEXT:    fmov w9, s0
-; CHECK-SD-NEXT:    mul w0, w9, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: mulv_v2i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    mul w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: mulv_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mul w0, w9, w8
+; CHECK-NEXT:    ret
 entry:
   %arg1 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %a)
   ret i16 %arg1
@@ -372,22 +354,13 @@ entry:
 }
 
 define i32 @mulv_v2i32(<2 x i32> %a) {
-; CHECK-SD-LABEL: mulv_v2i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    mov w8, v0.s[1]
-; CHECK-SD-NEXT:    fmov w9, s0
-; CHECK-SD-NEXT:    mul w0, w9, w8
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: mulv_v2i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    mul w0, w8, w9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: mulv_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov w8, v0.s[1]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    mul w0, w9, w8
+; CHECK-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %a)
   ret i32 %arg1
@@ -424,10 +397,9 @@ define i32 @mulv_v4i32(<4 x i32> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    mul w0, w8, w9
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mul w0, w9, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
@@ -452,10 +424,9 @@ define i32 @mulv_v8i32(<8 x i32> %a) {
 ; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    fmov w9, s1
-; CHECK-GI-NEXT:    mul w0, w8, w9
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    mul w0, w9, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %a)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 749d6071c98d7c..bdc39a214ec851 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1488,8 +1488,7 @@ define <4 x i16> @test_dup_v2i32_v4i16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: test_dup_v2i32_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s0, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    dup v0.4h, w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1510,8 +1509,7 @@ define <8 x i16> @test_dup_v4i32_v8i16(<4 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: test_dup_v4i32_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s0, v0.s[3]
-; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov w8, v0.s[3]
 ; CHECK-GI-NEXT:    dup v0.8h, w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1626,8 +1624,7 @@ define <4 x i16> @test_dup_v4i32_v4i16(<4 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: test_dup_v4i32_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s0, v0.s[1]
-; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    dup v0.4h, w8
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index c6b2d07231bf86..8b82004388b095 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -983,13 +983,12 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
 ;
 ; CHECK-GI-LABEL: insert_v3i32_0:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    mov v0.s[1], w8
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w9
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 62ad45b212967a..31502e452efebc 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -30,10 +30,9 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
 ; GISEL-LABEL: test_redand_v2i1:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    and w8, w9, w8
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
   %or_result = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %a)
@@ -457,10 +456,9 @@ define i32 @test_redand_v2i32(<2 x i32> %a) {
 ; GISEL-LABEL: test_redand_v2i32:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    and w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and_result = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
   ret i32 %and_result
@@ -480,10 +478,9 @@ define i32 @test_redand_v4i32(<4 x i32> %a) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    and w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and_result = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
   ret i32 %and_result
@@ -505,10 +502,9 @@ define i32 @test_redand_v8i32(<8 x i32> %a) {
 ; GISEL-NEXT:    and v0.16b, v0.16b, v1.16b
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    and w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    and w0, w9, w8
 ; GISEL-NEXT:    ret
   %and_result = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %a)
   ret i32 %and_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 20c498d36fdea4..708668959f9fbf 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -30,10 +30,9 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
 ; GISEL-LABEL: test_redor_v2i1:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    orr w8, w9, w8
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
   %or_result = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %a)
@@ -459,10 +458,9 @@ define i32 @test_redor_v2i32(<2 x i32> %a) {
 ; GISEL-LABEL: test_redor_v2i32:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    orr w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    orr w0, w9, w8
 ; GISEL-NEXT:    ret
   %or_result = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
   ret i32 %or_result
@@ -482,10 +480,9 @@ define i32 @test_redor_v4i32(<4 x i32> %a) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    orr w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    orr w0, w9, w8
 ; GISEL-NEXT:    ret
   %or_result = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
   ret i32 %or_result
@@ -507,10 +504,9 @@ define i32 @test_redor_v8i32(<8 x i32> %a) {
 ; GISEL-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    orr v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    orr w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    orr w0, w9, w8
 ; GISEL-NEXT:    ret
   %or_result = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a)
   ret i32 %or_result
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index b8ca99e003b627..a902c06711f2c5 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -27,10 +27,9 @@ define i1 @test_redxor_v2i1(<2 x i1> %a) {
 ; GISEL-LABEL: test_redxor_v2i1:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    eor w8, w9, w8
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
   %or_result = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
@@ -448,10 +447,9 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) {
 ; GISEL-LABEL: test_redxor_v2i32:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    eor w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    eor w0, w9, w8
 ; GISEL-NEXT:    ret
   %xor_result = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
   ret i32 %xor_result
@@ -471,10 +469,9 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) {
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    eor w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    eor w0, w9, w8
 ; GISEL-NEXT:    ret
   %xor_result = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
   ret i32 %xor_result
@@ -496,10 +493,9 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) {
 ; GISEL-NEXT:    eor v0.16b, v0.16b, v1.16b
 ; GISEL-NEXT:    mov d1, v0.d[1]
 ; GISEL-NEXT:    eor v0.8b, v0.8b, v1.8b
-; GISEL-NEXT:    mov s1, v0.s[1]
-; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    eor w0, w8, w9
+; GISEL-NEXT:    mov w8, v0.s[1]
+; GISEL-NEXT:    fmov w9, s0
+; GISEL-NEXT:    eor w0, w9, w8
 ; GISEL-NEXT:    ret
   %xor_result = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %a)
   ret i32 %xor_result


github-actions bot commented Apr 17, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Collaborator

@davemgreen davemgreen left a comment


Hello. Does this come up in selection-dag anywhere? If not, I think I would expect it to be optimized during GISel.

@dc03-work
Contributor Author

Hello. Does this come up in selection-dag anywhere? If not, I think I would expect it to be optimized during GISel.

No, this does not come up with SDAG. However, I could not really find any other pass where this would fit better.

@dc03-work
Contributor Author

@davemgreen Would this patch be better put in AArch64PostSelectOptimize.cpp?

@davemgreen
Collaborator

Morning - I was looking into https://godbolt.org/z/njh7rs44G. Are most of the changes due to G_EXTRACT_VECTOR_ELT requiring the result to be an FPR? Is it possible to change that?

I recently managed to commit #81453, which made some changes to G_INSERT_VECTOR_ELT. I have a similar patch for extract, but it looks like it is mostly handling legalization, not changing selection/regbank. (And it could do with some cleanup.)

The regbank for insert in https://github.com/llvm/llvm-project/pull/81453/files#diff-c2b6149d4ce96115e46b03ba19778fe7dd911412078c302ace57ac14364f5227 does two things - it re-uses FPR/GPR if the input is already known to be a given type, and extends small i8/i16 values to i32. The second part is something different that we can do separately, but is the first part possible to use for extracts too? The selection might change, which might mean fixing the i32 issue too.

I might try and upload the patch for extract I have, but I don't think it is making these selection parts better yet. (And it still looks quite messy). The ideal in the end is that we can share the same tablegen patterns between SDAG and GISEL.

@dc03-work
Contributor Author

The regbank for insert in https://github.com/llvm/llvm-project/pull/81453/files#diff-c2b6149d4ce96115e46b03ba19778fe7dd911412078c302ace57ac14364f5227 does two things - it re-uses FPR/GPR if the input is already known to be a given type, and extends small i8/i16 values to i32. The second part is something different that we can do separately, but is the first part possible to use for extracts too?

The problem is unfortunately with the output operand, not the input operand. The element that gets extracted is used in a place where only a GPR is accepted, so a copy must be generated. Because RegBankSelect works top-down, there is no way for us to know how the output operand is going to be used, which is why I implemented this as a peephole.

@dc03-work
Contributor Author

dc03-work commented Apr 23, 2024

Morning - I was looking into https://godbolt.org/z/njh7rs44G. Are most of the changes due to G_EXTRACT_VECTOR_ELT requiring the result to be an FPR? Is it possible to change that?

Ah, I somehow missed this. Yes, it is because the result has to be an FPR. No, I don't think it's possible to change this without being able to schedule the RegBank for the extract after the RegBank for the result has been selected.

@davemgreen
Collaborator

@aemerson Any opinions on making reg-bank-select a little better? My understanding from a while ago was that the greedy mode did not work very well (maybe?), but I feel we will eventually need something like it to produce optimal code for all the cases that could be using gpr or fpr.

@aemerson
Contributor

@aemerson Any opinions on making reg-bank-select a little better? My understanding from a while ago was that the greedy mode did not work very well (maybe?), but I feel we will eventually need something like it to produce optimal code for all the cases that could be using gpr or fpr.

I don't even know if the greedy mode works at all for AArch64; we've always used the default mode. I think we can revisit it after we rework RBS once we have FP types.

@dc03-work
Contributor Author

@aemerson Any opinions on making reg-bank-select a little better? My understanding from a while ago was that the greedy mode did not work very well (maybe?), but I feel we will eventually need something like it to produce optimal code for all the cases that could be using gpr or fpr.

I don't even know if the greedy mode works at all for AArch64; we've always used the default mode. I think we can revisit it after we rework RBS once we have FP types.

@aemerson Until then, do you feel that it is viable to implement this as a fold in AArch64PostSelectOptimize? Unfortunately, this pattern has real-world performance implications that we are running into, and I would like to have it fixed...

@aemerson
Contributor

I'm pragmatic about incremental solutions, but first, have you looked at examining users of instructions in RBS? Even if it works top-down, there's nothing stopping you from peeking at uses.

@dc03-work
Contributor Author

I'm pragmatic about incremental solutions, but first, have you looked at examining users of instructions in RBS? Even if it works top-down, there's nothing stopping you from peeking at uses.

Hmm, so I did more digging into this - I don't think it is possible to do this by examining users at all, because there is no good way to predict if a user will require GPR or not. For example, if the user is a G_ADD, there's no way to predict that the G_ADD will definitely be a GPR add.

There are instructions that only accept GPR; however, restricting the fold to those would limit it too much. The only way I can see to implement this is if the user is selected before the definition...

Anyways, I also tried moving this to AArch64PostSelectOptimize. While it performs the fold the same way for the current test cases, it also somehow causes regressions in other tests, where copy folding appears to be occurring (copy fpr -> gpr, then copy gpr -> fpr) that gets inhibited by folding the copy in AArch64PostSelectOptimize.

I think fixing the issue in RBS may also cause these issues to crop up! As an example, consider the following diff:

diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54..32f88588b9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -373,7 +373,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ;
 ; CHECK-GI-LABEL: test_build_illegal:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov s0, v0[3]
+; CHECK-GI-NEXT:    mov.s w8, v0[3]
+; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    mov.h v0[3], v0[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret

The code before comes from the following MIR:

  # Machine code for function test_build_illegal: IsSSA, TracksLiveness, Legalized, RegBankSelected, Selected             
  Function Live Ins: $q0                                                                                                  
                                                                                                                          
  bb.1 (%ir-block.0):                                                                                                     
    liveins: $q0                                                                                                          
    %0:fpr128 = COPY $q0                                                                                                  
    %1:fpr32 = DUPi32 %0:fpr128, 3                                                                                        
    %7:gpr32 = COPY %1:fpr32                                                                                              
    %6:gpr32 = IMPLICIT_DEF                                                                                               
    %20:fpr32 = COPY %6:gpr32                                                                                             
    %21:fpr16 = COPY %20.hsub:fpr32                                                                                       
    %18:fpr32 = COPY %7:gpr32                                                                                             
    %19:fpr16 = COPY %18.hsub:fpr32                                                                                       
    %12:fpr128 = IMPLICIT_DEF                                                                                             
    %13:fpr128 = INSERT_SUBREG %12:fpr128(tied-def 0), %21:fpr16, %subreg.hsub                                            
    %15:fpr128 = IMPLICIT_DEF                                                                                             
    %16:fpr128 = INSERT_SUBREG %15:fpr128(tied-def 0), %19:fpr16, %subreg.hsub                                            
    %14:fpr128 = INSvi16lane %13:fpr128(tied-def 0), 3, %16:fpr128, 0                                                     
    %4:fpr64 = COPY %14.dsub:fpr128                                                                                       
    $d0 = COPY %4:fpr64                                                                                                   
    RET_ReallyLR implicit $d0                                                                                             
                                                                                                                          
  # End machine code for function test_build_illegal.

After the MIR peephole optimizer runs, %7 gets folded into %18 and then gets deleted. When %7 is folded before this pass runs, %1 gets deleted and a cross regbank copy gets emitted.

At this point, I'm starting to feel that this pass really is the best place for this fold, if only there were a good way to gate it to run only when GlobalISel is being used.

@dc03-work
Contributor Author

Ping @aemerson

@davemgreen
Collaborator

AArch64MIPeepholeOpt was added as a last resort for things we didn't know where else to put. In the long run I feel we really need reg-bank select to be doing its job properly, even if it means selecting regbanks more lazily.

AArch64PostSelectOptimize sounds like a decent compromise. Could it be made to fix the regressions you see at the same time?

@dc03-work
Contributor Author

AArch64PostSelectOptimize sounds like a decent compromise. Could it be made to fix the regressions you see at the same time?

Hmm, the problem is that a COPY(FPR->GPR) -> COPY(GPR->FPR) chain gets converted to UMOV(FPR->GPR) -> COPY(GPR->FPR). Fixing this would mean effectively undoing the optimization when one of the users is such a COPY(GPR->FPR), but I'm not sure where that would go?

@davemgreen
Collaborator

Could the combine for UMOV check if the input is another COPY?

@dc03-work
Contributor Author

It needs to check the user, not the input, which I guess is also doable?
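
To make the idea concrete, here is a rough sketch of what checking the user could look like inside the TryMatchDUP helper from the diff above. This is my own illustration, not code from this PR; it reuses the Dst, MRI, and FPRRegClass names from the earlier snippet:

  // Sketch only: skip the UMOV fold when the GPR result is immediately copied
  // back into an FPR, so the existing cross-class copy folding can remove the
  // FPR->GPR->FPR chain instead of leaving a UMOV + FMOV pair behind.
  for (const MachineInstr &UseMI : MRI->use_nodbg_instructions(Dst)) {
    if (UseMI.isCopy()) {
      Register UseDst = UseMI.getOperand(0).getReg();
      if (UseDst.isVirtual() && MRI->getRegClassOrNull(UseDst) == FPRRegClass)
        return false;
    }
  }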

dc03-work added 2 commits May 14, 2024 14:48
This patch adds a peephole optimization for suboptimal codegen caused by
RegBankSelect limiting G_EXTRACT_VECTOR_ELT to FPR registers for both its
input and output operands. This can cause generation of a COPY from FPR
to GPR when, for example, the output register of the G_EXTRACT_VECTOR_ELT
is used in a branch condition.

This was noticed when looking at codegen differences between SDAG and GI
for the s1279 kernel in the TSVC benchmark.
@dc03-work
Contributor Author

AArch64MIPeepholeOpt was added as a last resort for things we didn't know where else to put. In the long run I feel we really need reg-bank select to be doing its job properly, even if it means selecting regbanks more lazily.

AArch64PostSelectOptimize sounds like a decent compromise. Could it be made to fix the regressions you see at the same time?

@davemgreen I was able to do this. Please have a look.

Collaborator

@davemgreen davemgreen left a comment


Thanks.

@@ -107,6 +108,8 @@ bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
    Changed |= foldSimpleCrossClassCopies(MI);
  }
  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end())))
Collaborator


Can these be part of the same loop, to avoid iterating over every instruction twice?

Contributor Author


Unfortunately not. The problem is that if foldSimpleCrossClassCopies triggers, MI gets invalidated because it gets deleted, and calling foldCopyDup on it then causes a crash.

Contributor Author

@dc03-work dc03-work May 14, 2024


It is possible to check the return value of foldSimpleCrossClassCopies and call foldCopyDup based on that; however, that could cause a situation where we miss a new instruction that foldSimpleCrossClassCopies generates. Never mind, it appears that it doesn't generate a new instruction. However, this could still be the case for other folds that get implemented.

Collaborator


We should try to avoid iterating over the instructions multiple times wherever possible. We might, in the long run, want to have this loop switch on the opcode. Having it continue if foldSimpleCrossClassCopies returns true is OK for the time being, though.

Contributor Author


Okay, that works. Awesome.
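
For reference, a minimal sketch of the single-loop shape being agreed on here (my paraphrase of the discussion, not the exact code that landed): when foldSimpleCrossClassCopies fires, MI has already been erased, so the loop moves on rather than handing the dead instruction to foldCopyDup.

  for (auto &MI : make_early_inc_range(make_range(MBB.begin(), MBB.end()))) {
    if (foldSimpleCrossClassCopies(MI)) {
      Changed = true;
      continue; // MI was erased by the fold above; don't touch it again.
    }
    Changed |= foldCopyDup(MI);
  }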

Collaborator

@davemgreen davemgreen left a comment


Thanks. LGTM

@dc03-work dc03-work merged commit 1dd0d3c into llvm:main May 16, 2024
4 checks passed