-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU/GlobalISel: Fix inst-selection of ballot #109986
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-amdgpu Author: Petar Avramovic (petar-avramovic) ChangesBoth input and output of ballot are lane-masks: Patch is 321.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109986.diff 64 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index febf0711c7d574..af6081f34303a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1429,34 +1429,57 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
- const auto BuildCopy = [&](Register SrcReg) {
- if (Size == STI.getWavefrontSize()) {
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
- .addReg(SrcReg);
- return;
- }
+ const auto BuildAnd = [&](unsigned Opcode, Register Dst, Register Src,
+ Register Exec) {
+ auto And = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
+ .addReg(Src)
+ .addReg(Exec)
+ .setOperandDead(3); // Dead scc
+ constrainSelectedInstRegOperands(*And, TII, TRI, RBI);
+ };
- // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
- Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(SrcReg)
+ const auto BuildREG_SEQUENCE = [&](Register Dst, Register Lo, Register Hi) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(Lo)
.addImm(AMDGPU::sub0)
- .addReg(HiReg)
+ .addReg(Hi)
.addImm(AMDGPU::sub1);
};
if (Arg) {
- const int64_t Value = Arg->Value.getSExtValue();
+ const int64_t Value = Arg->Value.getZExtValue();
if (Value == 0) {
+ // DstReg(32or64) = S_MOV 0
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
- } else if (Value == -1) // all ones
- BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
- else
+ } else if (Value == 1) {
+ if (Size == STI.getWavefrontSize()) {
+ // DstReg(32or64) = COPY EXEC
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TRI.getExec());
+ } else {
+ // DstReg(64) = REG_SEQUENCE EXEC_LO, 0
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildREG_SEQUENCE(DstReg, TRI.getExec(), HiReg);
+ }
+ } else
return false;
- } else
- BuildCopy(I.getOperand(2).getReg());
+ } else {
+ Register SrcReg = I.getOperand(2).getReg();
+ if (Size == STI.getWavefrontSize()) {
+ // DstReg(32or64) = AND SrcReg, EXEC
+ unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildAnd(AndOpc, DstReg, SrcReg, TRI.getExec());
+ } else {
+ // DstReg(64) = REG_SEQUENCE (AND SrcReg(32), EXEC_LO), 0
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildAnd(AMDGPU::S_AND_B32, LoReg, SrcReg, AMDGPU::EXEC_LO);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildREG_SEQUENCE(DstReg, LoReg, HiReg);
+ }
+ }
I.eraseFromParent();
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index de973481f82308..3d7b715a6dbc19 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -138,6 +138,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2]
; CHECK-NEXT: s_and_b32 s4, s4, s5
+; CHECK-NEXT: s_and_b32 s4, s4, exec_lo
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
; CHECK-NEXT: v_writelane_b32 v0, s4, 13
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 96cab200b61cdb..f893ac8177b4a3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc)
@@ -45,7 +46,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) {
define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -55,7 +57,8 @@ define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) {
define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
+; CHECK-NEXT: v_cmp_le_i32_e32 vcc_lo, 0x63, v0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -65,7 +68,8 @@ define amdgpu_cs i32 @compare_int_with_constant(i32 %x) {
define amdgpu_cs i32 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -76,7 +80,8 @@ define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1
-; CHECK-NEXT: s_bcnt1_i32_b32 s0, vcc_lo
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
@@ -89,7 +94,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -113,6 +119,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -137,7 +144,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -161,6 +169,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -184,7 +193,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -210,6 +220,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -233,7 +244,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
-; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0
+; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
+; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -259,6 +271,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -284,6 +297,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -315,6 +329,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -342,6 +357,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -373,6 +389,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -401,6 +418,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
+; CHECK-NEXT: s_and_b32 s0, s0, exec_lo
; CHECK-NEXT: s_cmp_le_i32 s0, 22
; CHECK-NEXT: s_cbranch_scc1 .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index a18f843440445c..72b65f087bdad5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
; CHECK-LABEL: non_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%trunc = trunc i32 %x to i1
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
@@ -46,7 +47,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
; CHECK-LABEL: compare_ints:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp eq i32 %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -57,7 +59,8 @@ define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
; CHECK-LABEL: compare_int_with_constant:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_mov_b32_e32 v1, 0x63
-; CHECK-NEXT: v_cmp_ge_i32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = icmp sge i32 %x, 99
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -67,7 +70,8 @@ define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
define amdgpu_cs i64 @compare_floats(float %x, float %y) {
; CHECK-LABEL: compare_floats:
; CHECK: ; %bb.0:
-; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
@@ -78,7 +82,8 @@ define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
; CHECK-LABEL: ctpop_of_ballot:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
-; CHECK-NEXT: s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ; return to shader part epilog
%cmp = fcmp ogt float %x, %y
@@ -92,7 +97,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -116,6 +122,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB8_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -140,7 +147,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -164,6 +172,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_non_compare(i32 inreg %v) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB10_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -187,7 +196,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_ne_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_eq_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB11_2
; CHECK-NEXT: ; %bb.1: ; %true
; CHECK-NEXT: s_mov_b32 s0, 42
@@ -213,6 +223,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB12_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -236,7 +247,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_compare(i32 %v) {
; CHECK-LABEL: branch_divergent_ballot_eq_zero_compare:
; CHECK: ; %bb.0:
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
-; CHECK-NEXT: s_cmp_lg_u64 vcc, 0
+; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB13_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -262,6 +274,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB14_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -287,6 +300,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB15_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -318,6 +332,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_ne_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc1 .LBB16_2
; CHECK-NEXT: ; %bb.1: ; %true
@@ -345,6 +360,7 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -376,6 +392,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_eq_zero_and(i32 inreg %v1, i32 inreg
; CHECK-NEXT: s_and_b32 s0, s0, s1
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB18_2
; CHECK-NEXT: ; %bb.1: ; %false
@@ -404,6 +421,7 @@ define amdgpu_cs i32 @branch_uniform_ballot_sgt_N_compare(i32 inreg %v) {
; CHECK-NEXT: s_cselect_b32 s0, 1, 0
; CHECK-NEXT: s_and_b32 s0, 1, s0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
+; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
; CHECK-NEXT: v_cmp_le_i64_e64 vcc, s[0:1], 22
; CHECK-NEXT: s_cbranch_vccnz .LBB19_2
; CHECK-NEXT: ; %bb.1: ; %true
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 614f59c564df64..4593607112ba45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -166,6 +166,7 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[19:20]
; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX1030-NEXT: s_and...
[truncated]
|
.addReg(Src) | ||
.addReg(Exec) | ||
.setOperandDead(3); // Dead scc | ||
constrainSelectedInstRegOperands(*And, TII, TRI, RBI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should check return value
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 | ||
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v_cmp already treats inactive lanes as 0. SelectionDAG does not introduce the extra instruction
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code is worse at the moment but correct. I don't see a simple way to fix it.
Maybe if we lower ballot earlier — but most probably we need the new regbankselect first, and then we can remove the extra AND in the regbankcombiner.
; GFX10-NEXT: s_or_b32 s0, s0, s2 | ||
; GFX10-NEXT: ; %bb.4: ; %exit | ||
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for context this is what sdag does.
non-constant ballot
tsrc: i1 = ...
tballot: i32 = llvm.amdgcn.ballot tsrc
is selected as (by default, when combine does not kick in)
tzext: i32 = zero_extend tsrc
tballot: i32 = SETCC tzext, Constant:i32<0>, setne:ch
Note the uppercase SETCC and i32 type, regular compare is "i1 = setcc"
most common case is with regular compare input that gets folded(*)
tsrc: i1 = setcc t_op0, t_op1
tballot: i32 = llvm.amdgcn.ballot tsrc
tballot: i32 = SETCC t_op0, t_op1
Case that is missing on GlobalISel is when inactive lanes need to be zeroed by ballot.
SDag does this by selecting tzext: i32 = zero_extend tsrc into select
and then comparing it with zero - compare result in vcc is set to 0 for inactive lanes.
(*) In theory this might be wrong in case folded compare was from another block with different exec mask but might not be possible since sdag works block by block?
GlobalISel could avoid adding 'AND with exec' when input is compare result that was calculated with same exec mask (compared to current exec - used by ballot)
e12ef91
to
4bcf7e7
Compare
Maybe it is possible to not make AND in some cases. Will look into it. |
For completeness |
4bcf7e7
to
3c6efe1
Compare
3c1d2bd
to
c0fc9c7
Compare
Ping. Fixed most of the cases where the compare was not folded; there is a comment that justifies why I think it is correct to fold. |
Register Op0 = CmpMI->getOperand(2).getReg(); | ||
Register Op1 = CmpMI->getOperand(3).getReg(); | ||
unsigned OpSize = MRI->getType(Op0).getSizeInBits(); | ||
const TargetRegisterClass *VgprRC = TRI.getVGPRClassForBitWidth(OpSize); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sink this to where it's needed? You also don't need it if you use constrainSelectedInstRegOperands
// is in a block that merged control flow. Using the compare directly in the | ||
// ballot implies that active lanes for the ballot are a subset of active | ||
// lanes for the compare. | ||
auto Pred = (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cast<GAnyCmp>
and use getCond
Op0Idx = 1; | ||
Op1Idx = 2; | ||
} else { | ||
// fcmp compares have modifiers |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So try to match them?
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 | ||
; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 | ||
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output was a real compare, so this shouldn't be needed. The DAG doesn't emit this and
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is really not obvious. The input is a "vgpr to vcc trunc" that is selected as a compare. Will become better after the new reg bank select.
return constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, *Cmp, *VgprRC, | ||
Cmp->getOperand(Op0Idx)) && | ||
constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, *Cmp, *VgprRC, | ||
Cmp->getOperand(Op1Idx)) && | ||
constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The constrainSelectedInstRegOperands is redundant with the two constrainOperandRegClass calls. Either directly constrain the result register, or just use the one constrainSelectedInstRegOperands
}; | ||
|
||
const auto FoldCmp = [&](Register Dst, MachineInstr *CmpMI) -> bool { | ||
// Fold ballot of a compare. Active lanes when the ballot is executed need |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you check that the block parents are the same? It's only broken in the mid block exec mask change, which we should move away from having
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can do better. There is no need to check anything if compare is used directly.
For example:
%bb.entry
%cmp = ...
s_and_saveexec_b32 ...
%bb.divergent_block
%res = ballot(%cmp)
Works because active lanes in %bb.divergent_block
are subset of active lanes in %bb.entry
, so "sinking compare" to other block works according to ballot description (put 0 in inactive lanes).
the only potential problem is when
%bb.entry
s_and_saveexec_b32 ...
%bb.divergent_block
%cmp = ...
; %bb.merge.control.flow
s_or_b32 %exec.from.bb.divergent_block, ...
%res = ballot(%cmp)
Lane could be inactive in %bb.divergent_block
but active in %bb.merge.control.flow,
and ballot result could be 1 instead of 0.
However, that case is not written correctly: ballot can't use such a compare directly; it has to use a phi.
%bb.entry
s_and_saveexec_b32 ...
%bb.divergent_block
**%cmp** = ...
...
; %bb.merge.control.flow
s_or_b32 %exec.from.bb.divergent_block, ...
%phi = phi i1 [ **%cmp**, %bb.divergent_block ], [ ..., ...]
%res = ballot(%phi)
So in summary we do the same thing as sdag if the input is a compare (sink it). Otherwise we select an AND with exec.
This is better than what SelectionDAG does when the input is not a compare (sdag does select 0, 1 (lane mask to vgpr) + compare vgpr with 0)
c0fc9c7
to
a0a2f8f
Compare
; CHECK-NEXT: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_1]], $exec, implicit-def dead $scc | ||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nearly all of the test changes are regressing the waterfall loop case, for the ballot from regbankselect. Can we do anything about that? Intuitively we should be able to use the raw value
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will rewrite the patch next week; need to look through AND and OR and see if some of the values were made with the same exec (going back to your suggestion to just check for the same block). I was thinking to simply do a 'copy' instead of sinking the compare or 'AND with exec'.
auto [Src0, Src0Mods] = selectVOP3ModsImpl(Src0Reg); | ||
auto [Src1, Src1Mods] = selectVOP3ModsImpl(Src1Reg); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need tests with source modifiers?
// Try to fold sgpr compare. | ||
if (SrcMI->getOpcode() == AMDGPU::G_TRUNC) | ||
SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This pattern probably shouldn't reach selection
a0a2f8f
to
0c0e21b
Compare
; GFX11-NEXT: ; %bb.4: ; %exit | ||
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 | ||
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo | ||
; GFX11-NEXT: v_mov_b32_e32 v2, s0 | ||
; GFX11-NEXT: global_store_b32 v[0:1], v2, off |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is ballot source that is not compare (source is phi), compared to what sdag does:
https://github.com/llvm/llvm-project/pull/109986/files/0c0e21bf407bb4616e7283befec8ac0aec361ee3#diff-8048303f4c9f4c844e010c047a71b7bfcfb6c612f0faf5d9a89104acb11ee39fR571-R576
Reg = MI->getOperand(1).getReg(); | ||
if (!Reg.isVirtual()) | ||
return false; | ||
MI = MRI.getVRegDef(Reg); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe should break if the block changed
// Ballot has to zero bits in input lane-mask that are zero in current exec, | ||
// Done as AND with exec. For inputs that are results of instruction that | ||
// implicitly use same exec, for example compares in same basic block, use copy. | ||
bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
static
bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI, | ||
MachineBasicBlock *MBB) { | ||
MachineInstr *MI = MRI.getVRegDef(Reg); | ||
// Look through copies, truncs and anyext. TODO: just copies |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What situations are there redundant copies that need to be looked through? I would assume any trivial copies to be optimized out
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
non-trivial cross bank copies scc to vcc for example.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That would be at most one copy?
Also thinking about whether such a trivially uniform ballot should have folded out already
// Ballot has to zero bits in input lane-mask that are zero in current exec, | ||
// Done as AND with exec. For inputs that are results of instruction that | ||
// implicitly use same exec, for example compares in same basic block, use copy. | ||
bool isBallotCopy(Register Reg, MachineRegisterInfo &MRI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Name is misleading, this is looking for a vector compare
5185602
to
1420d35
Compare
Both input and output of ballot are lane-masks: result is lane-mask with 'S32/S64 LLT and SGPR bank' input is lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in result for inactive lanes for non-constant input.
} | ||
|
||
// Lane mask generated using compare with same exec. | ||
if (isa<GAnyCmp>(MI)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is now missing a check that MI is in MBB.
if (MI->getParent() != MBB) | ||
return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same MBB check is here. Deleted the loop
Both input and output of ballot are lane-masks: result is lane-mask with 'S32/S64 LLT and SGPR bank' input is lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in result for inactive lanes for non-constant input.
Both input and output of ballot are lane-masks:
result is lane-mask with 'S32/S64 LLT and SGPR bank'
input is lane-mask with 'S1 LLT and VCC reg bank'.
Ballot copies bits from input lane-mask for
all active lanes and puts 0 for inactive lanes.
GlobalISel did not set 0 in result for inactive lanes
for non-constant input.