per comments:

vpykhtin · vpykhtin · commit 50ab57303ac3 · 2023-10-25T16:00:48.000+02:00
* supported negated ballot
 * improved comments
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/R600MCTargetDesc.h"
 #include "R600RegisterInfo.h"
+#include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -2259,26 +2260,30 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
   return false;
 }
 
-bool isBoolSGPR(SDValue V);
-
-static SDValue combineBallotPattern(SDValue VCMP) {
+static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
   assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
   // Special case for amdgcn.ballot:
   // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
-  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne ; lowered ballot
+  // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
   // =>
   // Use i1 %Cond value instead of i(WaveSize) %VCMP.
   // This is possible because divergent ISD::SETCC is selected as V_CMP and
   // Cond becomes a i(WaveSize) full mask value.
+  // Note that ballot doesn't use SETEQ condition but its easy to support it
+  // here for completeness, so in this case Negate is set true on return.
   auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
   auto *VCMP_CRHS = dyn_cast<ConstantSDNode>(VCMP.getOperand(1));
-  if (VCMP_CC == ISD::SETNE && VCMP_CRHS && VCMP_CRHS->isZero()) {
+  if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && VCMP_CRHS &&
+      VCMP_CRHS->isZero()) {
+
     auto Cond = VCMP.getOperand(0);
     if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
       Cond = Cond.getOperand(0);
 
-    if (isBoolSGPR(Cond))
+    if (isBoolSGPR(Cond)) {
+      Negate = VCMP_CC == ISD::SETEQ;
       return Cond;
+    }
   }
   return SDValue();
 }
@@ -2306,14 +2311,18 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
     if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero()) {
       // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
       // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
+      // BRCOND i1 %C, %BB
       // =>
-      // Use "i(WaveSize) %VCMP value in VCC register ne/eq zero" as the branch
-      // condition.
+      // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
+      // VCC = COPY i(WaveSize) %VCMP
+      // S_CBRANCH_VCCNZ/VCCZ %BB
       Negate = CC == ISD::SETEQ;
       auto VCMP = Cond->getOperand(0);
-      if (auto BallotCond = combineBallotPattern(VCMP)) {
+      bool NegatedBallot = false;
+      if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
         Cond = BallotCond;
         UseSCCBr = !BallotCond->isDivergent();
+        Negate = Negate ^ NegatedBallot;
       } else {
         // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
         // selected as V_CMP, but this may change for uniform condition.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10499,9 +10499,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
   return SDValue();
 }
 
-// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cndmask_b32 to be deserialized.
-bool isBoolSGPR(SDValue V) {
+bool llvm::isBoolSGPR(SDValue V) {
   if (V.getValueType() != MVT::i1)
     return false;
   switch (V.getOpcode()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -586,6 +586,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   getTargetMMOFlags(const Instruction &I) const override;
 };
 
+// Returns true if argument is a boolean value which is not serialized into
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
+bool isBoolSGPR(SDValue V);
+
 } // End namespace llvm
 
 #endif
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -396,3 +396,115 @@ true:
 false:
   ret i32 33
 }
+
+declare i32 @llvm.amdgcn.icmp.i32(i1, i1, i32)
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT:    s_cbranch_vccnz .LBB20_2
+; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB20_3
+; CHECK-NEXT:  .LBB20_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB20_3
+; CHECK-NEXT:  .LBB20_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_ne_zero = icmp ne i32 %ballot, 0
+  br i1 %ballot_ne_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, s1
+; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT:    s_cbranch_scc1 .LBB21_2
+; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB21_3
+; CHECK-NEXT:  .LBB21_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB21_3
+; CHECK-NEXT:  .LBB21_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_ne_zero = icmp ne i32 %ballot, 0
+  br i1 %ballot_ne_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 12, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s0, 34, v1
+; CHECK-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
+; CHECK-NEXT:    s_cbranch_vccnz .LBB22_2
+; CHECK-NEXT:  ; %bb.1: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB22_3
+; CHECK-NEXT:  .LBB22_2: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB22_3
+; CHECK-NEXT:  .LBB22_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_eq_zero = icmp eq i32 %ballot, 0
+  br i1 %ballot_eq_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cselect_b32 s0, -1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cselect_b32 s1, -1, 0
+; CHECK-NEXT:    s_and_b32 s0, s0, s1
+; CHECK-NEXT:    s_and_b32 s0, s0, exec_lo
+; CHECK-NEXT:    s_cbranch_scc1 .LBB23_2
+; CHECK-NEXT:  ; %bb.1: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB23_3
+; CHECK-NEXT:  .LBB23_2: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB23_3
+; CHECK-NEXT:  .LBB23_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i32 @llvm.amdgcn.icmp.i32(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_eq_zero = icmp eq i32 %ballot, 0
+  br i1 %ballot_eq_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -399,3 +399,115 @@ true:
 false:
   ret i32 33
 }
+
+declare i64 @llvm.amdgcn.icmp.i64(i1, i1, i32)
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_ne_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_ne_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB0_3
+; CHECK-NEXT:  .LBB0_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_ne_zero = icmp ne i64 %ballot, 0
+  br i1 %ballot_ne_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_ne_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_ne_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_cbranch_scc1 .LBB1_2
+; CHECK-NEXT:  ; %bb.1: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB1_3
+; CHECK-NEXT:  .LBB1_2: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB1_3
+; CHECK-NEXT:  .LBB1_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_ne_zero = icmp ne i64 %ballot, 0
+  br i1 %ballot_ne_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_divergent_simulated_negated_ballot_eq_zero_and(i32 %v1, i32 %v2) {
+; CHECK-LABEL: branch_divergent_simulated_negated_ballot_eq_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 12, v0
+; CHECK-NEXT:    v_cmp_lt_u32_e64 s[0:1], 34, v1
+; CHECK-NEXT:    s_and_b64 vcc, vcc, s[0:1]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB2_2
+; CHECK-NEXT:  ; %bb.1: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB2_3
+; CHECK-NEXT:  .LBB2_2: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB2_3
+; CHECK-NEXT:  .LBB2_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_eq_zero = icmp eq i64 %ballot, 0
+  br i1 %ballot_eq_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}
+
+define amdgpu_cs i32 @branch_uniform_simulated_negated_ballot_eq_zero_and(i32 inreg %v1, i32 inreg %v2) {
+; CHECK-LABEL: branch_uniform_simulated_negated_ballot_eq_zero_and:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_cmp_lt_u32 s0, 12
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    s_cmp_gt_u32 s1, 34
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; CHECK-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_cbranch_scc1 .LBB3_2
+; CHECK-NEXT:  ; %bb.1: ; %false
+; CHECK-NEXT:    s_mov_b32 s0, 33
+; CHECK-NEXT:    s_branch .LBB3_3
+; CHECK-NEXT:  .LBB3_2: ; %true
+; CHECK-NEXT:    s_mov_b32 s0, 42
+; CHECK-NEXT:    s_branch .LBB3_3
+; CHECK-NEXT:  .LBB3_3:
+  %v1c = icmp ult i32 %v1, 12
+  %v2c = icmp ugt i32 %v2, 34
+  %c = and i1 %v1c, %v2c
+  %ballot = call i64 @llvm.amdgcn.icmp.i64(i1 %c, i1 0, i32 32) ; ICMP_EQ == 32
+  %ballot_eq_zero = icmp eq i64 %ballot, 0
+  br i1 %ballot_eq_zero, label %true, label %false
+true:
+  ret i32 42
+false:
+  ret i32 33
+}