Skip to content

Commit 2258bc4

Browse files
authored
[AMDGPU] Simplify, fix and improve known bits for mbcnt (#104768)
Simplify the known-bits computation by using KnownBits::add. Fix the GlobalISel path, which was ignoring the known bits of src1. Improve the analysis of mbcnt.hi, which adds at most 31 even in wave64.
1 parent 340fb65 commit 2258bc4

File tree

2 files changed

+34
-20
lines changed

2 files changed

+34
-20
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -15758,16 +15758,12 @@ void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
1575815758
case Intrinsic::amdgcn_mbcnt_hi: {
1575915759
const GCNSubtarget &ST =
1576015760
DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
15761-
// These return at most the (wavefront size - 1) + src1
15762-
// As long as src1 is an immediate we can calc known bits
15763-
KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15764-
unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15765-
unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15766-
// Cater for potential carry
15767-
MaxActiveBits += Src1ValBits ? 1 : 0;
15768-
unsigned Size = Op.getValueType().getSizeInBits();
15769-
if (MaxActiveBits < Size)
15770-
Known.Zero.setHighBits(Size - MaxActiveBits);
15761+
// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
15762+
// most 31 + src1.
15763+
Known.Zero.setBitsFrom(
15764+
IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
15765+
KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15766+
Known = KnownBits::add(Known, Known2);
1577115767
return;
1577215768
}
1577315769
}
@@ -15802,7 +15798,8 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
1580215798
switch (MI->getOpcode()) {
1580315799
case AMDGPU::G_INTRINSIC:
1580415800
case AMDGPU::G_INTRINSIC_CONVERGENT: {
15805-
switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15801+
Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
15802+
switch (IID) {
1580615803
case Intrinsic::amdgcn_workitem_id_x:
1580715804
knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
1580815805
break;
@@ -15814,9 +15811,15 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
1581415811
break;
1581515812
case Intrinsic::amdgcn_mbcnt_lo:
1581615813
case Intrinsic::amdgcn_mbcnt_hi: {
15817-
// These return at most the wavefront size - 1.
15818-
unsigned Size = MRI.getType(R).getSizeInBits();
15819-
Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15814+
// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
15815+
// most 31 + src1.
15816+
Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
15817+
? getSubtarget()->getWavefrontSizeLog2()
15818+
: 5);
15819+
KnownBits Known2;
15820+
KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
15821+
Depth + 1);
15822+
Known = KnownBits::add(Known, Known2);
1582015823
break;
1582115824
}
1582215825
case Intrinsic::amdgcn_groupstaticsize: {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,9 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GCN %s
33

44
; GCN-LABEL: {{^}}mbcnt_intrinsics:
5-
; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0
6-
; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
7-
; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]]
5+
; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
6+
; GCN: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
87
define amdgpu_ps void @mbcnt_intrinsics(ptr addrspace(4) inreg %arg, ptr addrspace(4) inreg %arg1, ptr addrspace(4) inreg %arg2, i32 inreg %arg3) {
98
main_body:
109
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
@@ -80,13 +79,25 @@ define i32 @mbcnt_hi_known_bits_3(i32 %x) #0 {
8079

8180
; GCN-LABEL: {{^}}mbcnt_hi_known_bits_4:
8281
; GCN: v_mbcnt_hi_u32_b32
83-
; GCN: v_and_b32_e32
82+
; GCN-NOT: v_and_b32_e32
8483
define i32 @mbcnt_hi_known_bits_4(i32 %x) #0 {
8584
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 15)
8685
%mask = and i32 %hi, 63
8786
ret i32 %mask
8887
}
8988

89+
; TODO: Special case mbcnt.lo feeding into mbcnt.hi to remove this AND.
90+
; GCN-LABEL: {{^}}mbcnt_lo_hi_known_bits_1:
91+
; GCN: v_mbcnt_lo_u32_b32
92+
; GCN: v_mbcnt_hi_u32_b32
93+
; GCN: v_and_b32_e32
94+
define i32 @mbcnt_lo_hi_known_bits_1(i32 %x) #0 {
95+
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 %x, i32 0)
96+
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 %x, i32 %lo)
97+
%mask = and i32 %hi, 63
98+
ret i32 %mask
99+
}
100+
90101
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
91102
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
92103
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1

0 commit comments

Comments
 (0)