Skip to content

Commit f71f5f3

Browse files
committed
[AMDGPU] Implement hardware bug workaround for image instructions
Summary: This implements a workaround for a hardware bug in gfx8 and gfx9, where register usage is not estimated correctly for image_store and image_gather4 instructions when D16 is used. Change-Id: I4e30744da6796acac53a9b5ad37ac1c2035c8899 Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D81172
1 parent dce03e3 commit f71f5f3

12 files changed

+377
-123
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,18 @@ def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
234234
"Branch offset of 3f hardware bug"
235235
>;
236236

237+
def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
238+
"HasImageStoreD16Bug",
239+
"true",
240+
"Image Store D16 hardware bug"
241+
>;
242+
243+
def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
244+
"HasImageGather4D16Bug",
245+
"true",
246+
"Image Gather4 D16 hardware bug"
247+
>;
248+
237249
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
238250
"ldsbankcount"#Value,
239251
"LDSBankCount",
@@ -810,31 +822,36 @@ def FeatureISAVersion8_1_0 : FeatureSet<
810822
[FeatureVolcanicIslands,
811823
FeatureLDSBankCount16,
812824
FeatureXNACK,
813-
FeatureCodeObjectV3]>;
825+
FeatureCodeObjectV3,
826+
FeatureImageStoreD16Bug,
827+
FeatureImageGather4D16Bug]>;
814828

815829
def FeatureISAVersion9_0_0 : FeatureSet<
816830
[FeatureGFX9,
817831
FeatureMadMixInsts,
818832
FeatureLDSBankCount32,
819833
FeatureCodeObjectV3,
820834
FeatureDoesNotSupportXNACK,
821-
FeatureDoesNotSupportSRAMECC]>;
835+
FeatureDoesNotSupportSRAMECC,
836+
FeatureImageGather4D16Bug]>;
822837

823838
def FeatureISAVersion9_0_2 : FeatureSet<
824839
[FeatureGFX9,
825840
FeatureMadMixInsts,
826841
FeatureLDSBankCount32,
827842
FeatureXNACK,
828843
FeatureDoesNotSupportSRAMECC,
829-
FeatureCodeObjectV3]>;
844+
FeatureCodeObjectV3,
845+
FeatureImageGather4D16Bug]>;
830846

831847
def FeatureISAVersion9_0_4 : FeatureSet<
832848
[FeatureGFX9,
833849
FeatureLDSBankCount32,
834850
FeatureFmaMixInsts,
835851
FeatureDoesNotSupportXNACK,
836852
FeatureDoesNotSupportSRAMECC,
837-
FeatureCodeObjectV3]>;
853+
FeatureCodeObjectV3,
854+
FeatureImageGather4D16Bug]>;
838855

839856
def FeatureISAVersion9_0_6 : FeatureSet<
840857
[FeatureGFX9,
@@ -845,7 +862,8 @@ def FeatureISAVersion9_0_6 : FeatureSet<
845862
FeatureDot1Insts,
846863
FeatureDot2Insts,
847864
FeatureDoesNotSupportXNACK,
848-
FeatureCodeObjectV3]>;
865+
FeatureCodeObjectV3,
866+
FeatureImageGather4D16Bug]>;
849867

850868
def FeatureISAVersion9_0_8 : FeatureSet<
851869
[FeatureGFX9,
@@ -864,14 +882,16 @@ def FeatureISAVersion9_0_8 : FeatureSet<
864882
FeatureAtomicFaddInsts,
865883
FeatureSRAMECC,
866884
FeatureMFMAInlineLiteralBug,
867-
FeatureCodeObjectV3]>;
885+
FeatureCodeObjectV3,
886+
FeatureImageGather4D16Bug]>;
868887

869888
def FeatureISAVersion9_0_9 : FeatureSet<
870889
[FeatureGFX9,
871890
FeatureMadMixInsts,
872891
FeatureLDSBankCount32,
873892
FeatureXNACK,
874-
FeatureCodeObjectV3]>;
893+
FeatureCodeObjectV3,
894+
FeatureImageGather4D16Bug]>;
875895

876896
// TODO: Organize more features into groups.
877897
def FeatureGroup {

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1539,6 +1539,16 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
15391539
DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
15401540
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
15411541

1542+
// One memoperand is mandatory, except for getresinfo.
1543+
// FIXME: Check this in verifier.
1544+
if (!MI.memoperands_empty()) {
1545+
const MachineMemOperand *MMO = *MI.memoperands_begin();
1546+
1547+
// Infer d16 from the memory size, as the register type will be mangled by
1548+
// unpacked subtargets, or by TFE.
1549+
IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1550+
}
1551+
15421552
if (BaseOpcode->Store) {
15431553
VDataIn = MI.getOperand(1).getReg();
15441554
VDataTy = MRI->getType(VDataIn);
@@ -1548,18 +1558,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
15481558
VDataTy = MRI->getType(VDataOut);
15491559
NumVDataDwords = DMaskLanes;
15501560

1551-
// One memoperand is mandatory, except for getresinfo.
1552-
// FIXME: Check this in verifier.
1553-
if (!MI.memoperands_empty()) {
1554-
const MachineMemOperand *MMO = *MI.memoperands_begin();
1555-
1556-
// Infer d16 from the memory size, as the register type will be mangled by
1557-
// unpacked subtargets, or by TFE.
1558-
IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1559-
1560-
if (IsD16 && !STI.hasUnpackedD16VMem())
1561-
NumVDataDwords = (DMaskLanes + 1) / 2;
1562-
}
1561+
if (IsD16 && !STI.hasUnpackedD16VMem())
1562+
NumVDataDwords = (DMaskLanes + 1) / 2;
15631563
}
15641564
}
15651565

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3528,24 +3528,58 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
35283528
/// Handle register layout difference for f16 images for some subtargets.
35293529
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
35303530
MachineRegisterInfo &MRI,
3531-
Register Reg) const {
3532-
if (!ST.hasUnpackedD16VMem())
3533-
return Reg;
3534-
3531+
Register Reg,
3532+
bool ImageStore) const {
35353533
const LLT S16 = LLT::scalar(16);
35363534
const LLT S32 = LLT::scalar(32);
35373535
LLT StoreVT = MRI.getType(Reg);
35383536
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
35393537

3540-
auto Unmerge = B.buildUnmerge(S16, Reg);
3538+
if (ST.hasUnpackedD16VMem()) {
3539+
auto Unmerge = B.buildUnmerge(S16, Reg);
3540+
3541+
SmallVector<Register, 4> WideRegs;
3542+
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3543+
WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3544+
3545+
int NumElts = StoreVT.getNumElements();
3546+
3547+
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3548+
}
3549+
3550+
if (ImageStore && ST.hasImageStoreD16Bug()) {
3551+
if (StoreVT.getNumElements() == 2) {
3552+
SmallVector<Register, 4> PackedRegs;
3553+
Reg = B.buildBitcast(S32, Reg).getReg(0);
3554+
PackedRegs.push_back(Reg);
3555+
PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3556+
return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3557+
}
3558+
3559+
if (StoreVT.getNumElements() == 3) {
3560+
SmallVector<Register, 4> PackedRegs;
3561+
auto Unmerge = B.buildUnmerge(S16, Reg);
3562+
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3563+
PackedRegs.push_back(Unmerge.getReg(I));
3564+
PackedRegs.resize(8, B.buildUndef(S16).getReg(0));
3565+
Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0);
3566+
return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0);
3567+
}
35413568

3542-
SmallVector<Register, 4> WideRegs;
3543-
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3544-
WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3569+
if (StoreVT.getNumElements() == 4) {
3570+
SmallVector<Register, 4> PackedRegs;
3571+
Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3572+
auto Unmerge = B.buildUnmerge(S32, Reg);
3573+
for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3574+
PackedRegs.push_back(Unmerge.getReg(I));
3575+
PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3576+
return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3577+
}
35453578

3546-
int NumElts = StoreVT.getNumElements();
3579+
llvm_unreachable("invalid data type");
3580+
}
35473581

3548-
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3582+
return Reg;
35493583
}
35503584

35513585
Register AMDGPULegalizerInfo::fixStoreSourceType(
@@ -4215,7 +4249,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
42154249
if (!Ty.isVector() || Ty.getElementType() != S16)
42164250
return true;
42174251

4218-
Register RepackedReg = handleD16VData(B, *MRI, VData);
4252+
Register RepackedReg = handleD16VData(B, *MRI, VData, true);
42194253
if (RepackedReg != VData) {
42204254
MI.getOperand(1).setReg(RepackedReg);
42214255
}

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
146146
splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
147147

148148
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
149-
Register Reg) const;
149+
Register Reg, bool ImageStore = false) const;
150150
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
151151
MachineIRBuilder &B, bool IsFormat) const;
152152
bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
271271
HasNSAtoVMEMBug(false),
272272
HasOffset3fBug(false),
273273
HasFlatSegmentOffsetBug(false),
274+
HasImageStoreD16Bug(false),
275+
HasImageGather4D16Bug(false),
274276

275277
FeatureDisable(false),
276278
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,8 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
411411
bool HasNSAtoVMEMBug;
412412
bool HasOffset3fBug;
413413
bool HasFlatSegmentOffsetBug;
414+
bool HasImageStoreD16Bug;
415+
bool HasImageGather4D16Bug;
414416

415417
// Dummy feature to use for assembler in tablegen.
416418
bool FeatureDisable;
@@ -1025,9 +1027,11 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
10251027
return HasOffset3fBug;
10261028
}
10271029

1028-
bool hasNSAEncoding() const {
1029-
return HasNSAEncoding;
1030-
}
1030+
bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1031+
1032+
bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1033+
1034+
bool hasNSAEncoding() const { return HasNSAEncoding; }
10311035

10321036
bool hasGFX10_BEncoding() const {
10331037
return GFX10_BEncoding;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5851,7 +5851,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
58515851
SDValue Data(Result, 0);
58525852
SDValue TexFail;
58535853

5854-
if (IsTexFail) {
5854+
if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
58555855
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
58565856
if (MaskPopVT.isVector()) {
58575857
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
@@ -5860,10 +5860,6 @@ static SDValue constructRetValue(SelectionDAG &DAG,
58605860
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
58615861
SDValue(Result, 0), ZeroIdx);
58625862
}
5863-
5864-
TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5865-
SDValue(Result, 0),
5866-
DAG.getConstant(MaskPopDwords, DL, MVT::i32));
58675863
}
58685864

58695865
if (DataDwordVT.isVector())
@@ -5887,8 +5883,13 @@ static SDValue constructRetValue(SelectionDAG &DAG,
58875883
}
58885884
Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
58895885

5890-
if (TexFail)
5886+
if (IsTexFail) {
5887+
TexFail =
5888+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
5889+
DAG.getConstant(MaskPopDwords, DL, MVT::i32));
5890+
58915891
return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
5892+
}
58925893

58935894
if (Result->getNumValues() == 1)
58945895
return Data;
@@ -6007,7 +6008,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
60076008
return Op; // D16 is unsupported for this instruction
60086009

60096010
IsD16 = true;
6010-
VData = handleD16VData(VData, DAG);
6011+
VData = handleD16VData(VData, DAG, true);
60116012
}
60126013

60136014
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
@@ -6027,7 +6028,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
60276028
(!LoadVT.isVector() && DMaskLanes > 1))
60286029
return Op;
60296030

6030-
if (IsD16 && !Subtarget->hasUnpackedD16VMem())
6031+
// The sq block of gfx8 and gfx9 do not estimate register use correctly
6032+
// for d16 image_gather4, image_gather4_l, and image_gather4_lz
6033+
// instructions.
6034+
if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
6035+
!(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
60316036
NumVDataDwords = (DMaskLanes + 1) / 2;
60326037
else
60336038
NumVDataDwords = DMaskLanes;
@@ -7401,8 +7406,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
74017406
return NewOp;
74027407
}
74037408

7404-
SDValue SITargetLowering::handleD16VData(SDValue VData,
7405-
SelectionDAG &DAG) const {
7409+
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
7410+
bool ImageStore) const {
74067411
EVT StoreVT = VData.getValueType();
74077412

74087413
// No change for f16 and legal vector D16 types.
@@ -7434,6 +7439,36 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
74347439
return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
74357440
}
74367441

7442+
// The sq block of gfx8.1 does not estimate register use correctly for d16
7443+
// image store instructions. The data operand is computed as if it were not a
7444+
// d16 image instruction.
7445+
if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
7446+
// Bitcast to i16
7447+
EVT IntStoreVT = StoreVT.changeTypeToInteger();
7448+
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7449+
7450+
// Decompose into scalars
7451+
SmallVector<SDValue, 4> Elts;
7452+
DAG.ExtractVectorElements(IntVData, Elts);
7453+
7454+
// Group pairs of i16 into v2i16 and bitcast to i32
7455+
SmallVector<SDValue, 4> PackedElts;
7456+
for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
7457+
SDValue Pair =
7458+
DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
7459+
SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
7460+
PackedElts.push_back(IntPair);
7461+
}
7462+
7463+
// Pad using UNDEF
7464+
PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));
7465+
7466+
// Build final vector
7467+
EVT VecVT =
7468+
EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
7469+
return DAG.getBuildVector(VecVT, DL, PackedElts);
7470+
}
7471+
74377472
assert(isTypeLegal(StoreVT));
74387473
return VData;
74397474
}

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
108108
ArrayRef<SDValue> Ops, EVT MemVT,
109109
MachineMemOperand *MMO, SelectionDAG &DAG) const;
110110

111-
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
111+
SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
112+
bool ImageStore = false) const;
112113

113114
/// Converts \p Op, which must be of floating point type, to the
114115
/// floating point type \p VT, by either extending or truncating it.

0 commit comments

Comments
 (0)