Skip to content

Commit c657a6f

Browse files
authored
[AMDGPU] Fix selection of s_load_b96 on GFX11 (#108029)
Fix a bug that resulted in the selection of s_load_b96 on GFX11, even though that instruction only exists on GFX12. The root cause was a mismatch between legalization and selection. During legalization (SITargetLowering::LowerLOAD), a load was treated as uniform if "!Op->isDivergent()". During selection (AMDGPUDAGToDAGISel::isUniformLoad()), a load was treated as non-uniform if "N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO)". The two conditions disagree when IR uniformity analysis (UA) has more information than SDAG's built-in analysis: the load node is divergent according to SDAG, but its memory operand is still known to be uniform. In the test case this happens because IR UA reports everything as uniform when isSingleLaneExecution() returns true, e.g. when the specified maximum flat workgroup size is 1, whereas SDAG does not have this optimization. The immediate fix is to use the same condition to detect uniform loads in both legalization and selection. In the future, SDAG should learn about isSingleLaneExecution(), at which point it could probably stop relying on IR metadata to detect uniform loads.
1 parent 36adf8e commit c657a6f

File tree

2 files changed

+65
-18
lines changed

2 files changed

+65
-18
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10262,6 +10262,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1026210262
LoadSDNode *Load = cast<LoadSDNode>(Op);
1026310263
ISD::LoadExtType ExtType = Load->getExtensionType();
1026410264
EVT MemVT = Load->getMemoryVT();
10265+
MachineMemOperand *MMO = Load->getMemOperand();
1026510266

1026610267
if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
1026710268
if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
@@ -10272,7 +10273,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1027210273

1027310274
SDValue Chain = Load->getChain();
1027410275
SDValue BasePtr = Load->getBasePtr();
10275-
MachineMemOperand *MMO = Load->getMemOperand();
1027610276

1027710277
EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
1027810278

@@ -10328,25 +10328,12 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1032810328

1032910329
unsigned NumElements = MemVT.getVectorNumElements();
1033010330

10331-
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10332-
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10333-
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10334-
if (MemVT.isPow2VectorType() ||
10335-
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10336-
return SDValue();
10337-
return WidenOrSplitVectorLoad(Op, DAG);
10338-
}
10339-
// Non-uniform loads will be selected to MUBUF instructions, so they
10340-
// have the same legalization requirements as global and private
10341-
// loads.
10342-
//
10343-
}
10344-
1034510331
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1034610332
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10347-
AS == AMDGPUAS::GLOBAL_ADDRESS) {
10348-
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10349-
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10333+
(AS == AMDGPUAS::GLOBAL_ADDRESS &&
10334+
Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10335+
isMemOpHasNoClobberedMemOperand(Load))) {
10336+
if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
1035010337
Alignment >= Align(4) && NumElements < 32) {
1035110338
if (MemVT.isPow2VectorType() ||
1035210339
(Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
4+
5+
define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "amdgpu-flat-work-group-size"="1,1" {
6+
; GFX11-LABEL: test_uniform_load_b96:
7+
; GFX11: ; %bb.0: ; %bb
8+
; GFX11-NEXT: v_mov_b32_e32 v3, 0
9+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10+
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3]
11+
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
12+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
13+
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
14+
; GFX11-NEXT: v_readfirstlane_b32 s0, v2
15+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
16+
; GFX11-NEXT: v_readfirstlane_b32 s1, v3
17+
; GFX11-NEXT: s_clause 0x1
18+
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
19+
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8
20+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
21+
; GFX11-NEXT: v_mov_b32_e32 v2, s3
22+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
23+
; GFX11-NEXT: v_or3_b32 v2, s2, v2, s0
24+
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
25+
; GFX11-NEXT: s_nop 0
26+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
27+
; GFX11-NEXT: s_endpgm
28+
;
29+
; GFX12-LABEL: test_uniform_load_b96:
30+
; GFX12: ; %bb.0: ; %bb
31+
; GFX12-NEXT: v_mov_b32_e32 v3, 0
32+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
33+
; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
34+
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
35+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
36+
; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
37+
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
38+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
39+
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
40+
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
41+
; GFX12-NEXT: s_wait_kmcnt 0x0
42+
; GFX12-NEXT: v_mov_b32_e32 v2, s0
43+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
44+
; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
45+
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
46+
; GFX12-NEXT: s_nop 0
47+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
48+
; GFX12-NEXT: s_endpgm
49+
bb:
50+
%i = zext i32 %arg to i64
51+
%i1 = getelementptr i32, ptr addrspace(1) %ptr, i64 %i
52+
%i2 = load <3 x i32>, ptr addrspace(1) %i1, align 4
53+
%i3 = extractelement <3 x i32> %i2, i32 0
54+
%i4 = extractelement <3 x i32> %i2, i32 1
55+
%i5 = extractelement <3 x i32> %i2, i32 2
56+
%i6 = or i32 %i3, %i4
57+
%i7 = or i32 %i5, %i6
58+
store i32 %i7, ptr addrspace(1) %ptr, align 4
59+
ret void
60+
}

0 commit comments

Comments
 (0)