
Commit 88421ea
Author: Mateja Marjanovic
[AMDGPU] Trim zero components from buffer and image stores

For image and buffer stores, the default behaviour on GFX11 and older is to
set all unset components to zero. Passing only the X component is therefore
the same as passing X000, and XY the same as XY00.

This patch simplifies the vector of components passed to such stores in
InstCombine by removing zero components from the end. For image stores it
also trims the DMask when necessary.

Reviewed By: arsenm, foad, nhaehnle, piotr

1 parent 9912bcc
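To see the combine end to end, here is a hand-written sketch in the spirit of the tests added below (the function name @example_trim_xy00 is made up for illustration): a four-component image store whose two trailing lanes are provably +0.0 shrinks to a two-component store with a matching dmask.

define amdgpu_ps void @example_trim_xy00(<8 x i32> inreg %rsrc, float %x, float %y, i32 %s) {
  ; Build <x, y, +0.0, +0.0>; the two trailing lanes are known positive zero.
  %v1 = insertelement <4 x float> undef, float %x, i32 0
  %v2 = insertelement <4 x float> %v1, float %y, i32 1
  %v3 = insertelement <4 x float> %v2, float 0.0, i32 2
  %v4 = insertelement <4 x float> %v3, float 0.0, i32 3
  ; Before InstCombine: 4-component store with dmask = 15 (0b1111).
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %v4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

; After InstCombine (sketch) the call becomes a 2-component store with
; dmask = 3 (0b0011); the hardware zero-fills the Z and W components:
;   call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %xy, i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)

declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32)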

File tree: 3 files changed, +219 −25 lines

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 2 deletions
@@ -872,10 +872,12 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-             [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
+             [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
+             AMDGPUImageDMaskIntrinsic;
   defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-             [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;
+             [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
+             AMDGPUImageDMaskIntrinsic;

 //////////////////////////////////////////////////////////////////////////
 // MSAA intrinsics
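Making the store intrinsics members of AMDGPUImageDMaskIntrinsic puts them in the generated dmask table, so the InstCombine change below can query getAMDGPUImageDMaskIntrinsic() and rewrite operand 1 (the dmask). A minimal sketch of the effect, with a made-up function name:

define amdgpu_ps void @dmask_shrinks_with_vector(<8 x i32> inreg %rsrc, float %x, i32 %s) {
  ; Only the X lane carries data; the Y lane is known +0.0.
  %v1 = insertelement <2 x float> undef, float %x, i32 0
  %v2 = insertelement <2 x float> %v1, float 0.0, i32 1
  ; dmask = 3 enables the X and Y components.
  call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %v2, i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

; Expected rewrite (sketch): a scalar store with dmask shrunk to 1:
;   call void @llvm.amdgcn.image.store.1d.f32.i32(float %x, i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)

declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32)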

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 105 additions & 23 deletions
@@ -376,6 +376,36 @@ static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
   return false;
 }

+// Trim all zero components from the end of the vector \p UseV and return
+// an appropriate bitset with known elements.
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
+                                       Instruction *I) {
+  auto *VTy = cast<FixedVectorType>(UseV->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+
+  for (int i = VWidth - 1; i > 0; --i) {
+    APInt DemandOneElt = APInt::getOneBitSet(VWidth, i);
+    KnownFPClass KnownFPClass =
+        computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(),
+                            /*InterestedClasses=*/fcAllFlags,
+                            /*Depth=*/0, &IC.getTargetLibraryInfo(),
+                            &IC.getAssumptionCache(), I,
+                            &IC.getDominatorTree(),
+                            &IC.getOptimizationRemarkEmitter());
+    if (KnownFPClass.KnownFPClasses != fcPosZero)
+      break;
+    DemandedElts.clearBit(i);
+  }
+  return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+                                                    IntrinsicInst &II,
+                                                    APInt DemandedElts,
+                                                    int DMaskIdx = -1,
+                                                    bool IsLoad = true);
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1035,26 +1065,62 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
-  default: {
-    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
-            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
-      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_struct_buffer_store_format:
+  case Intrinsic::amdgcn_raw_tbuffer_store:
+  case Intrinsic::amdgcn_struct_tbuffer_store:
+  case Intrinsic::amdgcn_tbuffer_store:
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    APInt DemandedElts =
+        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+
+    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
+                                              false)) {
+      return IC.eraseInstFromFunction(II);
     }
+
+    break;
+  }
   }
+  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
   }
   return std::nullopt;
 }

 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
+/// The result of simplifying amdgcn image and buffer store intrinsics is updating
+/// definitions of the intrinsics vector argument, not Uses of the result like
+/// image and buffer loads.
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 /// struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx = -1) {
+                                                    int DMaskIdx, bool IsLoad) {

-  auto *IIVTy = cast<FixedVectorType>(II.getType());
+  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+                                             : II.getOperand(0)->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1125,13 +1191,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
   DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

   unsigned NewDMaskVal = 0;
-  unsigned OrigLoadIdx = 0;
+  unsigned OrigLdStIdx = 0;
   for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
     const unsigned Bit = 1 << SrcIdx;
     if (!!(DMaskVal & Bit)) {
-      if (!!DemandedElts[OrigLoadIdx])
+      if (!!DemandedElts[OrigLdStIdx])
         NewDMaskVal |= Bit;
-      OrigLoadIdx++;
+      OrigLdStIdx++;
     }
   }
@@ -1159,29 +1225,45 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;

+  if (!IsLoad) {
+    SmallVector<int, 8> EltMask;
+    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
+      if (DemandedElts[OrigStoreIdx])
+        EltMask.push_back(OrigStoreIdx);
+
+    if (NewNumElts == 1)
+      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+    else
+      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+  }
+
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);

-  if (NewNumElts == 1) {
-    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                          DemandedElts.countr_zero());
-  }
+  if (IsLoad) {
+    if (NewNumElts == 1) {
+      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                            DemandedElts.countr_zero());
+    }

-  SmallVector<int, 8> EltMask;
-  unsigned NewLoadIdx = 0;
-  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-    if (!!DemandedElts[OrigLoadIdx])
-      EltMask.push_back(NewLoadIdx++);
-    else
-      EltMask.push_back(NewNumElts);
-  }
+    SmallVector<int, 8> EltMask;
+    unsigned NewLoadIdx = 0;
+    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+      if (!!DemandedElts[OrigLoadIdx])
+        EltMask.push_back(NewLoadIdx++);
+      else
+        EltMask.push_back(NewNumElts);
+    }
+
+    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

-  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+    return Shuffle;
+  }

-  return Shuffle;
+  return NewCall;
 }

 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
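The buffer-style stores take the no-dmask path (DMaskIdx stays -1): only the data operand is shuffled down and the call is re-declared with a narrower overload. A minimal sketch assuming a hypothetical shader @struct_tbuffer_trim_tail (mirroring, not reproducing, the committed tests):

define amdgpu_ps void @struct_tbuffer_trim_tail(<4 x i32> inreg %rsrc, float %x, float %y, i32 %idx) {
  ; <x, y, +0.0, +0.0>: the two trailing lanes are provably positive zero.
  %v1 = insertelement <4 x float> undef, float %x, i32 0
  %v2 = insertelement <4 x float> %v1, float %y, i32 1
  %v3 = insertelement <4 x float> %v2, float 0.0, i32 2
  %v4 = insertelement <4 x float> %v3, float 0.0, i32 3
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %v4, <4 x i32> %rsrc, i32 %idx, i32 0, i32 42, i32 0, i32 15)
  ret void
}

; Expected rewrite (sketch): no dmask to adjust, just a narrower overload:
;   call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %xy, <4 x i32> %rsrc, i32 %idx, i32 0, i32 42, i32 0, i32 15)

declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32)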
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mcpu=gfx900 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s

define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32 %s) #0 {
; GCN-LABEL: @image_store_1d_store_all_zeros(
; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float 0.000000e+00, i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GCN-NEXT:    ret void
;
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, i32 %s) #0 {
; GCN-LABEL: @image_store_1d_store_insert_zeros_at_end(
; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GCN-NEXT:    ret void
;
  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, float %vdata2, i32 %s, i32 %mip) #0 {
; GCN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float 0.000000e+00, float poison, float poison>, float [[VDATA1:%.*]], i64 1
; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
; GCN-NEXT:    ret void
;
  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata2, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %newvdata4, i32 7, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}

define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
; GCN-LABEL: @buffer_store_format_insert_zeros_at_end(
; GCN-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
; GCN-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; GCN-NEXT:    call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
; GCN-NEXT:    ret void
;
  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
  ret void
}

define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
; GCN-LABEL: @struct_buffer_store_format_insert_zeros(
; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
; GCN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
; GCN-NEXT:    ret void
;
  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata1, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0)
  ret void
}

define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> inreg %a, float %vdata1, i32 %b) {
; GCN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
; GCN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 3
; GCN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
; GCN-NEXT:    ret void
;
  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float %vdata1, i32 3
  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
  ret void
}

declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind writeonly }
attributes #2 = { nounwind }
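A subtlety the code above encodes but the tests do not spell out: the hardware pads with positive zero, and trimTrailingZerosInVector stops at the first lane whose known FP class is not exactly fcPosZero, so a trailing -0.0 blocks the trim. A hand-written negative example (hypothetical, not part of this commit):

define amdgpu_ps void @no_trim_negative_zero(<8 x i32> inreg %rsrc, float %x, i32 %s) {
  %v1 = insertelement <4 x float> undef, float %x, i32 0
  %v2 = insertelement <4 x float> %v1, float 0.0, i32 1
  %v3 = insertelement <4 x float> %v2, float 0.0, i32 2
  ; -0.0 is not bit-identical to the hardware's +0.0 fill, so no lane may be
  ; dropped and the store must stay a full 4-component v4f32 with dmask 15.
  %v4 = insertelement <4 x float> %v3, float -0.0, i32 3
  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %v4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
  ret void
}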
