[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare #113158

Status: Closed (wants to merge 2 commits)
26 changes: 5 additions & 21 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -497,10 +497,10 @@ class TargetLoweringBase {
return true;
}

/// Return true if multiple condition registers are available.
bool hasMultipleConditionRegisters() const {
return HasMultipleConditionRegisters;
}
/// Return true if multiple (allocatable) predicate registers are available
/// for \p VT. If there is only a single register, the code generator will
/// sink comparisons into the blocks of their users.
virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }

/// Return true if the target has BitExtract instructions.
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
EVT VT) const {
// If a target has multiple condition registers, then it likely has logical
// operations on those registers.
if (hasMultipleConditionRegisters())
if (hasMultiplePredicateRegisters(MVT::i1))
return false;
// Only do the transform if the value won't be split into multiple
// registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
StackPointerRegisterToSaveRestore = R;
}

/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
HasMultipleConditionRegisters = hasManyRegs;
}

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
@@ -3473,13 +3464,6 @@
private:
const TargetMachine &TM;

/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
bool HasMultipleConditionRegisters;

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
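For a target that previously called the removed setter, the migration is a one-line override. A minimal sketch, assuming a hypothetical out-of-tree MyTargetLowering (the class name is illustrative and not part of this patch):

```cpp
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {

// Hypothetical downstream target, shown only to illustrate the migration.
class MyTargetLowering : public TargetLowering {
public:
  using TargetLowering::TargetLowering;

  // Before this patch the constructor called
  // setHasMultipleConditionRegisters(true); the same all-or-nothing answer
  // is now returned per value type through the virtual hook.
  bool hasMultiplePredicateRegisters(EVT VT) const override { return true; }
};

} // namespace llvm
```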
8 changes: 5 additions & 3 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1770,8 +1770,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
/// lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
if (TLI.hasMultipleConditionRegisters())
static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
const TargetLowering &TLI) {
EVT ResVT = TLI.getValueType(DL, Cmp->getType());
if (TLI.hasMultiplePredicateRegisters(ResVT))
return false;

// Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2176,7 +2178,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
}

bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
if (sinkCmpExpression(*DL, Cmp, *TLI))
return true;

if (combineToUAddWithOverflow(Cmp, ModifiedDT))
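Since the hook is now keyed on the comparison's result type, the decision differs between scalar and vector compares on a target whose override returns VT.isVector(), as AArch64's does below. A rough sketch of the types involved; the helper function is illustrative, not code from the patch:

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Sketch: mirrors the per-type decision sinkCmpExpression now makes.
static void traceSinkDecision(const TargetLowering &TLI, const DataLayout &DL,
                              LLVMContext &Ctx) {
  // A scalar icmp produces i1; a 4-wide vector icmp produces <4 x i1>.
  EVT ScalarResVT = TLI.getValueType(DL, Type::getInt1Ty(Ctx));
  EVT VectorResVT =
      TLI.getValueType(DL, FixedVectorType::get(Type::getInt1Ty(Ctx), 4));

  // With an override that returns VT.isVector():
  bool SinkScalar = !TLI.hasMultiplePredicateRegisters(ScalarResVT); // true
  bool SinkVector = !TLI.hasMultiplePredicateRegisters(VectorResVT); // false
  (void)SinkScalar;
  (void)SinkVector;
}
```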
1 change: 0 additions & 1 deletion llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
MaxGluedStoresPerMemcpy = 0;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1346,6 +1346,10 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMinimumJumpTableEntries() const override;

bool softPromoteHalfType() const override { return true; }

virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
return VT.isVector();
}
};

namespace AArch64 {
8 changes: 0 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);

// FIXME: This is only partially true. If we have to do vector compares, any
// SGPR pair can be a condition register. If we have a uniform condition, we
// are better off doing SALU operations, where there is only one SCC. For now,
// we don't have a way of knowing during instruction selection if a condition
// will be uniform and we always use vector compares. Assume we are using
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);

setMinCmpXchgSizeInBits(32);
setSupportsUnalignedAtomics(false);

10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}

virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
// FIXME: This is only partially true. If we have to do vector compares,
// any SGPR pair can be a condition register. If we have a uniform
// condition, we are better off doing SALU operations, where there is only
// one SCC. For now, we don't have a way of knowing during instruction
// selection if a condition will be uniform and we always use vector
// compares. Assume we are using vector compares until that is fixed.
return true;
}
};

namespace AMDGPUISD {

Contributor review comment: AMDGPU part looks fine. In future we will probably want to address this FIXME by passing in a node, not just a type.
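The review suggestion above would mean passing the comparison itself, not just its type, so AMDGPU could answer per instruction instead of pessimistically assuming a vector compare. A purely speculative sketch of such a variant; neither this signature nor the isUniform() query exist in this patch:

```cpp
// Speculative future shape of the hook: with the comparison available, a
// uniformity query (e.g. UniformityInfo) could distinguish the two cases
// described in the FIXME. isUniform() is a stand-in, not an existing API.
bool hasMultiplePredicateRegisters(const Instruction *Cmp, EVT VT) const {
  if (isUniform(Cmp))
    return false; // uniform condition: SALU compare, only the single SCC
  return true;    // divergent condition: any SGPR pair can hold the result
}
```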
10 changes: 7 additions & 3 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1461,10 +1461,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,

// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
setHasMultipleConditionRegisters();
if (Subtarget.useCRBits())
setJumpIsExpensive();
}

// TODO: The default entry number is set to 64. This stops most jump table
// generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19137,3 +19135,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}

bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
return Subtarget.useCRBits();
}
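For PowerPC the change is behaviour-preserving: the constructor no longer latches a flag, and the same subtarget predicate is evaluated on demand, independent of the value type since all 32 CR bits are allocatable either way. A side-by-side sketch (the commented-out lines show the pre-patch code):

```cpp
// Before: a one-time flag, latched while constructing the lowering object.
//   if (Subtarget.useCRBits()) {
//     setHasMultipleConditionRegisters();
//     setJumpIsExpensive();
//   }

// After: the same answer, computed on demand for any value type.
bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
  return Subtarget.useCRBits();
}
```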
2 changes: 2 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1492,6 +1492,8 @@ namespace llvm {
/// through to determine the optimal load/store instruction format.
unsigned computeMOFlags(const SDNode *Parent, SDValue N,
SelectionDAG &DAG) const;

virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
}; // end class PPCTargetLowering

namespace PPC {
151 changes: 151 additions & 0 deletions llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -0,0 +1,151 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
; CHECK-LABEL: no_sink_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v2.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v2.4h, v2.4s
; CHECK-NEXT: tbz w0, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %s
; CHECK-NEXT: sshll v1.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %t
; CHECK-NEXT: sshll v0.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
%d = icmp slt <4 x i32> %a, %b
br i1 %c, label %s, label %t

s:
%s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
store <4 x i32> %s1, ptr %p
ret <4 x i32> %s1

t:
%s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
ret <4 x i32> %s2
}

define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
; CHECK-LABEL: vector_loop_with_icmp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w9, #15 // =0xf
; CHECK-NEXT: mov w10, #4 // =0x4
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: adrp x11, .LCPI1_1
; CHECK-NEXT: dup v0.2d, x9
; CHECK-NEXT: dup v1.2d, x10
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI1_1]
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: mov w10, #1 // =0x1
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_1: // %pred.store.continue18
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
; CHECK-NEXT: subs x9, x9, #4
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.eq .LBB1_10
; CHECK-NEXT: .LBB1_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
; CHECK-NEXT: xtn v4.4h, v4.4s
; CHECK-NEXT: umov w11, v4.h[0]
; CHECK-NEXT: tbnz w11, #0, .LBB1_6
; CHECK-NEXT: // %bb.3: // %pred.store.continue
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbnz w11, #0, .LBB1_7
; CHECK-NEXT: .LBB1_4: // %pred.store.continue6
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbnz w11, #0, .LBB1_8
; CHECK-NEXT: .LBB1_5: // %pred.store.continue8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB1_1
; CHECK-NEXT: b .LBB1_9
; CHECK-NEXT: .LBB1_6: // %pred.store.if
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: stur w10, [x8, #-8]
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbz w11, #0, .LBB1_4
; CHECK-NEXT: .LBB1_7: // %pred.store.if5
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: stur w10, [x8, #-4]
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbz w11, #0, .LBB1_5
; CHECK-NEXT: .LBB1_8: // %pred.store.if7
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB1_1
; CHECK-NEXT: .LBB1_9: // %pred.store.if9
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: str w10, [x8, #4]
; CHECK-NEXT: b .LBB1_1
; CHECK-NEXT: .LBB1_10: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
%vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
%0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
%1 = extractelement <4 x i1> %0, i64 0
br i1 %1, label %pred.store.if, label %pred.store.continue

pred.store.if:
%2 = getelementptr inbounds i32, ptr %dest, i64 %index
store i32 1, ptr %2, align 4
br label %pred.store.continue

pred.store.continue:
%3 = extractelement <4 x i1> %0, i64 1
br i1 %3, label %pred.store.if5, label %pred.store.continue6

pred.store.if5:
%4 = or disjoint i64 %index, 1
%5 = getelementptr inbounds i32, ptr %dest, i64 %4
store i32 1, ptr %5, align 4
br label %pred.store.continue6

pred.store.continue6:
%6 = extractelement <4 x i1> %0, i64 2
br i1 %6, label %pred.store.if7, label %pred.store.continue8

pred.store.if7:
%7 = or disjoint i64 %index, 2
%8 = getelementptr inbounds i32, ptr %dest, i64 %7
store i32 1, ptr %8, align 4
br label %pred.store.continue8

pred.store.continue8:
%9 = extractelement <4 x i1> %0, i64 3
br i1 %9, label %pred.store.if9, label %pred.store.continue18

pred.store.if9:
%10 = or disjoint i64 %index, 3
%11 = getelementptr inbounds i32, ptr %dest, i64 %10
store i32 1, ptr %11, align 4
br label %pred.store.continue18

pred.store.continue18:
%index.next = add i64 %index, 4
%vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
%24 = icmp eq i64 %index.next, 16
br i1 %24, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}