Skip to content

Commit 79769a4

Browse files
committed
[InstCombine][AMDGPU] Fix crash with v3i16/v3f16 buffer intrinsics
Summary: This is something of a workaround to avoid a crash later on in the type legalizer (WidenVectorResult()). Also added some f16 tests, including a non-working v3f16 case with a FIXME.
Reviewers: arsenm, tpr, nhaehnle
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D68865
llvm-svn: 374993
1 parent 5a13188 commit 79769a4

File tree

2 files changed

+52
-0
lines changed

2 files changed

+52
-0
lines changed

llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
971971
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
972972
APInt DemandedElts,
973973
int DMaskIdx) {
974+
975+
// FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
976+
if (DMaskIdx < 0 &&
977+
II->getType()->getScalarSizeInBits() != 32 &&
978+
DemandedElts.getActiveBits() == 3)
979+
return nullptr;
980+
974981
unsigned VWidth = II->getType()->getVectorNumElements();
975982
if (VWidth == 1)
976983
return nullptr;

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,6 +1474,51 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32
14741474

14751475
declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
14761476

1477+
; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
1478+
; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1479+
; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3
1480+
; CHECK-NEXT: ret half %elt1
1481+
define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
1482+
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1483+
%elt1 = extractelement <4 x half> %data, i32 3
1484+
ret half %elt1
1485+
}
1486+
1487+
; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16).
1488+
; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
1489+
; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1490+
; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 2
1491+
; CHECK-NEXT: ret half %elt1
1492+
define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
1493+
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1494+
%elt1 = extractelement <4 x half> %data, i32 2
1495+
ret half %elt1
1496+
}
1497+
1498+
; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
1499+
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1500+
; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1
1501+
; CHECK-NEXT: ret half %elt1
1502+
define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
1503+
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1504+
%elt1 = extractelement <4 x half> %data, i32 1
1505+
ret half %elt1
1506+
}
1507+
1508+
; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
1509+
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1510+
; CHECK-NEXT: ret half %data
1511+
define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
1512+
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
1513+
%elt1 = extractelement <4 x half> %data, i32 0
1514+
ret half %elt1
1515+
}
1516+
1517+
declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1
1518+
declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1
1519+
declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
1520+
declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
1521+
14771522
; --------------------------------------------------------------------
14781523
; llvm.amdgcn.struct.tbuffer.load
14791524
; --------------------------------------------------------------------

0 commit comments

Comments
 (0)