Skip to content

Commit 15428e0

Browse files
authored
[AMDGPU] Add support for point sample accel out of order returns (#127991)
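Add a target feature for point sample acceleration and enable it for the relevant targets. Also insert waitcnts where required when point sample acceleration may have occurred: an instruction that is accelerated effectively becomes a VMEM_NOSAMPLER access, which has implications for out-of-order returns, and this is why the extra waitcnts are required. To detect these cases, also set the VMEM_NOSAMPLER bit in the per-register VMEM type masks for instructions eligible for point sample acceleration, and use it to determine when a waitcnt is required.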
1 parent f989db5 commit 15428e0

File tree

7 files changed

+334
-29
lines changed

7 files changed

+334
-29
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,12 @@ def FeatureBVHDualAndBVH8Insts : SubtargetFeature<"bvh-dual-bvh-8-insts",
11001100
"Has image_bvh_dual_intersect_ray and image_bvh8_intersect_ray instructions"
11011101
>;
11021102

1103+
// Subtarget feature "point-sample-accel": when enabled, the subtarget field
// HasPointSampleAccel is set to "true".
def FeaturePointSampleAccel : SubtargetFeature<"point-sample-accel",
1104+
"HasPointSampleAccel",
1105+
"true",
1106+
"Has point sample acceleration feature"
1107+
>;
1108+
11031109
//===------------------------------------------------------------===//
11041110
// Subtarget Features (options and debugging)
11051111
//===------------------------------------------------------------===//
@@ -1811,20 +1817,23 @@ def FeatureISAVersion11_5_0 : FeatureSet<
18111817
!listconcat(FeatureISAVersion11_Common.Features,
18121818
[FeatureSALUFloatInsts,
18131819
FeatureDPPSrc1SGPR,
1814-
FeatureRequiredExportPriority])>;
1820+
FeatureRequiredExportPriority,
1821+
FeaturePointSampleAccel])>;
18151822

18161823
def FeatureISAVersion11_5_1 : FeatureSet<
18171824
!listconcat(FeatureISAVersion11_Common.Features,
18181825
[FeatureSALUFloatInsts,
18191826
FeatureDPPSrc1SGPR,
18201827
Feature1_5xVGPRs,
1821-
FeatureRequiredExportPriority])>;
1828+
FeatureRequiredExportPriority,
1829+
FeaturePointSampleAccel])>;
18221830

18231831
def FeatureISAVersion11_5_2 : FeatureSet<
18241832
!listconcat(FeatureISAVersion11_Common.Features,
18251833
[FeatureSALUFloatInsts,
18261834
FeatureDPPSrc1SGPR,
1827-
FeatureRequiredExportPriority])>;
1835+
FeatureRequiredExportPriority,
1836+
FeaturePointSampleAccel])>;
18281837

18291838
def FeatureISAVersion11_5_3 : FeatureSet<
18301839
!listconcat(FeatureISAVersion11_Common.Features,

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
259259
bool HasMinimum3Maximum3F16 = false;
260260
bool HasMinimum3Maximum3PKF16 = false;
261261
bool HasLshlAddU64Inst = false;
262+
bool HasPointSampleAccel = false;
262263

263264
bool RequiresCOV6 = false;
264265

@@ -1363,6 +1364,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13631364
return HasMinimum3Maximum3PKF16;
13641365
}
13651366

1367+
/// \returns true if the subtarget has the point sample acceleration feature.
bool hasPointSampleAccel() const { return HasPointSampleAccel; }
1368+
13661369
/// \returns The maximum number of instructions that can be enclosed in an
13671370
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
13681371
/// instruction.

llvm/lib/Target/AMDGPU/MIMGInstructions.td

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class MIMGBaseOpcode : PredicateControl {
5252
bit BVH = 0;
5353
bit A16 = 0;
5454
bit NoReturn = 0;
55+
bit PointSampleAccel = 0; // Opcode eligible for gfx11.5 point sample acceleration
5556
}
5657

5758
def MIMGBaseOpcode : GenericEnum {
@@ -63,7 +64,8 @@ def MIMGBaseOpcodesTable : GenericTable {
6364
let CppTypeName = "MIMGBaseOpcodeInfo";
6465
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
6566
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
66-
"LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn"];
67+
"LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16", "NoReturn",
68+
"PointSampleAccel"];
6769
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
6870

6971
let PrimaryKey = ["BaseOpcode"];
@@ -1458,13 +1460,14 @@ multiclass MIMG_Sampler_NoReturn <mimgopc op, AMDGPUSampleVariant sample, bit wq
14581460
}
14591461
}
14601462

1461-
multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
1462-
bit isG16 = 0, bit isGetLod = 0,
1463+
multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0,
1464+
bit wqm = 0, bit isG16 = 0, bit isGetLod = 0,
14631465
string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""),
14641466
bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> {
14651467
def "" : MIMG_Sampler_BaseOpcode<sample> {
14661468
let HasD16 = !not(isGetLod);
14671469
let G16 = isG16;
1470+
let PointSampleAccel = isPointSampleAccel;
14681471
}
14691472

14701473
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -1485,8 +1488,8 @@ multiclass MIMG_Sampler <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
14851488
defm "_nortn" : MIMG_Sampler_NoReturn <op, sample, wqm, isG16, asm>;
14861489
}
14871490

1488-
multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample>
1489-
: MIMG_Sampler<op, sample, 1>;
1491+
// MIMG_Sampler with wqm fixed to 1; isPointSampleAccel is forwarded unchanged.
multiclass MIMG_Sampler_WQM <mimgopc op, AMDGPUSampleVariant sample, bit isPointSampleAccel = 0>
1492+
: MIMG_Sampler<op, sample, isPointSampleAccel, 1>;
14901493

14911494
multiclass MIMG_Gather <mimgopc op, AMDGPUSampleVariant sample, bit wqm = 0,
14921495
string asm = "image_gather4"#sample.LowerCaseMod> {
@@ -1684,15 +1687,15 @@ let AssemblerPredicate = isGFX12Plus in {
16841687
def : AMDGPUMnemonicAlias<"image_atomic_fmax", "image_atomic_max_flt">;
16851688
}
16861689

1687-
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
1690+
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample, 1>;
16881691
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts] in {
16891692
defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>;
16901693
defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>;
16911694
defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x41, 0x23>, AMDGPUSample_d_cl>;
1692-
defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l>;
1695+
defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l, 1>;
16931696
defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x1e, 0x25>, AMDGPUSample_b>;
16941697
defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x42, 0x26>, AMDGPUSample_b_cl>;
1695-
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz>;
1698+
defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz, 1>;
16961699
defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x20, 0x28>, AMDGPUSample_c>;
16971700
defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x43, 0x29>, AMDGPUSample_c_cl>;
16981701
defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x21, 0x2a>, AMDGPUSample_c_d>;
@@ -1745,7 +1748,7 @@ defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x37, 0x5f>, AMDGPU
17451748
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, isGFX9Plus] in
17461749
defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">;
17471750

1748-
defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
1751+
defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 0, 1, 0, 1, "image_get_lod">;
17491752

17501753
defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x68>, AMDGPUSample_cd>;
17511754
defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
@@ -1758,22 +1761,22 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6f
17581761
} // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts]
17591762

17601763
let OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16] in {
1761-
defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
1762-
defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
1763-
defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
1764-
defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
1765-
defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
1766-
defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
1767-
defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
1768-
defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
1769-
defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
1770-
defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
1771-
defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
1772-
defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
1773-
defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
1774-
defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
1775-
defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
1776-
defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
1764+
defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 0, 1>;
1765+
defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 0, 1>;
1766+
defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 0, 1>;
1767+
defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 0, 1>;
1768+
defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 0, 1>;
1769+
defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 0, 1>;
1770+
defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 0, 1>;
1771+
defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 0, 1>;
1772+
defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 0, 1>;
1773+
defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 0, 1>;
1774+
defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 0, 1>;
1775+
defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 0, 1>;
1776+
defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 0, 1>;
1777+
defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 0, 1>;
1778+
defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 0, 1>;
1779+
defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 0, 1>;
17771780
} // End OtherPredicates = [HasImageInsts, HasExtendedImageInsts, HasG16]
17781781

17791782
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,10 @@ class WaitcntBrackets {
387387
return LDSDMAStores;
388388
}
389389

390+
bool hasPointSampleAccel(const MachineInstr &MI) const;
391+
bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
392+
RegInterval Interval) const;
393+
390394
void print(raw_ostream &) const;
391395
void dump() const { print(dbgs()); }
392396

@@ -826,6 +830,34 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
826830
setScoreByInterval(Interval, CntTy, Score);
827831
}
828832

833+
// Return true if the subtarget enables Point Sample Acceleration and MI is an
834+
// instruction to which it might be applied, i.e. a MIMG instruction whose base
835+
// opcode has the PointSampleAccel bit set. The hardware decides whether to
836+
// apply it based on several factors that can't be determined at compile time,
837+
// so we have to assume it is applied whenever the instruction supports it.
838+
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
839+
// Only MIMG instructions on subtargets with the feature are candidates.
if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
840+
return false;
841+
842+
// Eligibility is recorded per MIMG base opcode, so map the opcode to its base
// opcode info and read the flag from there.
// NOTE(review): Info and BaseInfo are dereferenced without a null check;
// presumably every MIMG opcode has both table entries - confirm.
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
843+
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
844+
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
845+
return BaseInfo->PointSampleAccel;
846+
}
847+
848+
// Return true if the subtarget enables Point Sample Acceleration, the supplied
849+
// MachineInstr is one to which it might be applied, and the supplied interval
850+
// has outstanding writes of a vmem-type other than VMEM_NOSAMPLER (the type
851+
// that a point sample accelerated instruction effectively becomes). Returns
852+
// false for any instruction that is not eligible for the acceleration.
853+
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
854+
const MachineInstr &MI, RegInterval Interval) const {
855+
// Instructions that cannot be accelerated need no extra check here.
if (!hasPointSampleAccel(MI))
856+
return false;
857+
858+
// Treat MI as if it were a VMEM_NOSAMPLER access and look for pending writes
// of any other type in the interval.
return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
859+
}
860+
829861
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
830862
const SIRegisterInfo *TRI,
831863
const MachineRegisterInfo *MRI,
@@ -942,8 +974,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
942974
// defs. That's required for a sane index into `VgprMemTypes` below
943975
assert(TRI->isVectorRegister(*MRI, Op.getReg()));
944976
VmemType V = getVmemType(Inst);
977+
unsigned char TypesMask = 1 << V;
978+
// If the instruction can have Point Sample Accel applied, also record
979+
// VMEM_NOSAMPLER as a potential pending type for the registers it defines.
980+
if (hasPointSampleAccel(Inst))
981+
TypesMask |= 1 << VMEM_NOSAMPLER;
945982
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
946-
VgprVmemTypes[RegNo] |= 1 << V;
983+
VgprVmemTypes[RegNo] |= TypesMask;
947984
}
948985
}
949986
setScoreByInterval(Interval, T, CurrScore);
@@ -1813,9 +1850,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18131850
// previous write and this write are the same type of VMEM
18141851
// instruction, in which case they are (in some architectures)
18151852
// guaranteed to write their results in order anyway.
1853+
// Additionally check instructions where Point Sample Acceleration
1854+
// might be applied.
18161855
if (Op.isUse() || !updateVMCntOnly(MI) ||
18171856
ScoreBrackets.hasOtherPendingVmemTypes(Interval,
18181857
getVmemType(MI)) ||
1858+
ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
18191859
!ST->hasVmemWriteVgprInOrder()) {
18201860
ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
18211861
ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ struct MIMGBaseOpcodeInfo {
420420
bool BVH;
421421
bool A16;
422422
bool NoReturn;
423+
bool PointSampleAccel;
423424
};
424425

425426
LLVM_READONLY

0 commit comments

Comments
 (0)