Skip to content

Commit 8887178

Browse files
authored
[AMDGPU] Allow buffer intrinsics to be marked volatile at the IR level (#77847)
In order to ensure the correctness of ptr addrspace(7) lowering, we need a backwards-compatible way to flag buffer intrinsics as volatile that can't be dropped (unlike metadata). To achieve this in a backwards-compatible way, we use bit 31 of the auxiliary immediates of buffer intrinsics as the volatile flag. When this bit is set, the MachineMemOperand for said intrinsic is marked volatile. Existing code will ensure that this results in the appropriate use of flags like glc and dlc. This commit also harmonizes the handling of the auxiliary immediate for atomic intrinsics, which now go through extract_cpol like loads and stores, which masks off the volatile bit.
1 parent 9d8e538 commit 8887178

12 files changed

+194
-58
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 52 additions & 27 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,8 +379,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
379379
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
380380
GISDNodeXFormEquiv<extract_swz>;
381381

382-
def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
383-
GISDNodeXFormEquiv<set_glc>;
382+
def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">,
383+
GISDNodeXFormEquiv<extract_cpol_set_glc>;
384384

385385
def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
386386
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
19171917
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
19181918
if (BaseOpcode->Atomic)
19191919
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1920-
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
1920+
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1921+
AMDGPU::CPol::VOLATILE))
19211922
return false;
19221923

19231924
int NumVAddrRegs = 0;
@@ -5496,11 +5497,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
54965497
MIB.addImm(Swizzle);
54975498
}
54985499

5499-
void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
5500-
const MachineInstr &MI,
5501-
int OpIdx) const {
5500+
void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5501+
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
55025502
assert(OpIdx >= 0 && "expected to match an immediate operand");
5503-
MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
5503+
const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5504+
(AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5505+
: AMDGPU::CPol::ALL_pregfx12);
5506+
MIB.addImm(Cpol | AMDGPU::CPol::GLC);
55045507
}
55055508

55065509
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,8 +331,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
331331
int OpIdx) const;
332332
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
333333
int OpIdx) const;
334-
void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
335-
int OpIdx) const;
334+
void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
335+
int OpIdx) const;
336336

337337
void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
338338
int OpIdx) const;

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,36 +1628,36 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
16281628

16291629
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
16301630
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
1631-
(set_glc $cachepolicy), (timm:$cachepolicy));
1631+
(extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary));
16321632

16331633
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
16341634
def : GCNPat<
16351635
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
1636-
timm:$offset, timm:$cachepolicy, 0)),
1636+
timm:$offset, timm:$auxiliary, 0)),
16371637
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
16381638
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
16391639
timm:$offset, CachePolicy)
16401640
>;
16411641

16421642
def : GCNPat<
16431643
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
1644-
timm:$offset, timm:$cachepolicy, timm)),
1644+
timm:$offset, timm:$auxiliary, timm)),
16451645
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
16461646
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
16471647
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
16481648
>;
16491649

16501650
def : GCNPat<
16511651
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
1652-
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)),
1652+
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)),
16531653
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
16541654
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
16551655
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
16561656
>;
16571657

16581658
def : GCNPat<
16591659
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
1660-
(BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
1660+
(BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)),
16611661
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
16621662
getVregSrcForVT<vt>.ret:$vdata_in,
16631663
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1726,35 +1726,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType
17261726
def : GCNPat<
17271727
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
17281728
0, (BUFSOffset i32:$soffset), timm:$offset,
1729-
timm:$cachepolicy, 0),
1729+
timm:$auxiliary, 0),
17301730
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
1731-
timm:$offset, timm:$cachepolicy)
1731+
timm:$offset, (extract_cpol $auxiliary))
17321732
>;
17331733

17341734
def : GCNPat<
17351735
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
17361736
0, (BUFSOffset i32:$soffset), timm:$offset,
1737-
timm:$cachepolicy, timm),
1737+
timm:$auxiliary, timm),
17381738
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
1739-
timm:$offset, timm:$cachepolicy)
1739+
timm:$offset, (extract_cpol $auxiliary))
17401740
>;
17411741

17421742
def : GCNPat<
17431743
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
17441744
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
1745-
timm:$cachepolicy, 0),
1745+
timm:$auxiliary, 0),
17461746
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
1747-
timm:$offset, timm:$cachepolicy)
1747+
timm:$offset, (extract_cpol $auxiliary))
17481748
>;
17491749

17501750
def : GCNPat<
17511751
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
17521752
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
1753-
timm:$cachepolicy, timm),
1753+
timm:$auxiliary, timm),
17541754
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
17551755
getVregSrcForVT<vt>.ret:$vdata_in,
17561756
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
1757-
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
1757+
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary))
17581758
>;
17591759
}
17601760

@@ -1791,8 +1791,9 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
17911791
defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
17921792
# !if(!eq(RtnMode, "ret"), "", "_noret"));
17931793
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
1794-
defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
1795-
(timm:$cachepolicy));
1794+
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
1795+
(extract_cpol_set_glc $auxiliary),
1796+
(extract_cpol $auxiliary));
17961797
defvar SrcRC = getVregSrcForVT<vt>.ret;
17971798
defvar DataRC = getVregSrcForVT<data_vt>.ret;
17981799
defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
@@ -1804,7 +1805,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
18041805
def : GCNPat<
18051806
(vt (Op
18061807
vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
1807-
timm:$offset, timm:$cachepolicy, 0)),
1808+
timm:$offset, timm:$auxiliary, 0)),
18081809
!if(!eq(RtnMode, "ret"),
18091810
(EXTRACT_SUBREG OffsetResDag, SubLo),
18101811
OffsetResDag)
@@ -1818,7 +1819,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
18181819
(vt (Op
18191820
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
18201821
0, (BUFSOffset i32:$soffset), timm:$offset,
1821-
timm:$cachepolicy, timm)),
1822+
timm:$auxiliary, timm)),
18221823
!if(!eq(RtnMode, "ret"),
18231824
(EXTRACT_SUBREG IdxenResDag, SubLo),
18241825
IdxenResDag)
@@ -1832,7 +1833,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
18321833
(vt (Op
18331834
vt:$data, vt:$cmp, v4i32:$rsrc, 0,
18341835
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
1835-
timm:$cachepolicy, 0)),
1836+
timm:$auxiliary, 0)),
18361837
!if(!eq(RtnMode, "ret"),
18371838
(EXTRACT_SUBREG OffenResDag, SubLo),
18381839
OffenResDag)
@@ -1846,7 +1847,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
18461847
(vt (Op
18471848
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
18481849
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
1849-
timm:$cachepolicy, timm)),
1850+
timm:$auxiliary, timm)),
18501851
!if(!eq(RtnMode, "ret"),
18511852
(EXTRACT_SUBREG BothenResDag, SubLo),
18521853
BothenResDag)

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,10 @@ enum CPol {
400400
TH_TYPE_STORE = 1 << 8, // TH_STORE policy
401401
TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
402402
TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not
403+
404+
// Volatile (used to preserve/signal operation volatility for buffer
405+
// operations not a real instruction bit)
406+
VOLATILE = 1 << 31,
403407
};
404408

405409
} // namespace CPol

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
11831183
Info.ptrVal = RsrcArg;
11841184
}
11851185

1186+
auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1187+
if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1188+
Info.flags |= MachineMemOperand::MOVolatile;
11861189
Info.flags |= MachineMemOperand::MODereferenceable;
11871190
if (ME.onlyReadsMemory()) {
11881191
unsigned MaxNumLanes = 4;
@@ -7639,7 +7642,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
76397642
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
76407643
if (BaseOpcode->Atomic)
76417644
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7642-
if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
7645+
if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
7646+
AMDGPU::CPol::VOLATILE))
76437647
return Op;
76447648

76457649
SmallVector<SDValue, 26> Ops;
@@ -8005,6 +8009,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
80058009
SDLoc(Op), MVT::i32);
80068010
case Intrinsic::amdgcn_s_buffer_load: {
80078011
unsigned CPol = Op.getConstantOperandVal(3);
8012+
// s_buffer_load, because of how it's optimized, can't be volatile
8013+
// so reject ones with the volatile bit set.
80088014
if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
80098015
? AMDGPU::CPol::ALL
80108016
: AMDGPU::CPol::ALL_pregfx12))

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -892,8 +892,11 @@ def extract_swz : SDNodeXForm<timm, [{
892892
return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
893893
}]>;
894894

895-
def set_glc : SDNodeXForm<timm, [{
896-
return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
895+
def extract_cpol_set_glc : SDNodeXForm<timm, [{
896+
const uint32_t cpol = N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
897+
? AMDGPU::CPol::ALL
898+
: AMDGPU::CPol::ALL_pregfx12);
899+
return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
897900
}]>;
898901

899902
//===----------------------------------------------------------------------===//

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,25 @@ define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_s
270270
ret float %val
271271
}
272272

273+
define amdgpu_ps float @raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
274+
; CHECK-LABEL: name: raw_ptr_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset_volatile
275+
; CHECK: bb.1 (%ir-block.0):
276+
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0
277+
; CHECK-NEXT: {{ $}}
278+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
279+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
280+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
281+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
282+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
283+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
284+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
285+
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (volatile dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
286+
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
287+
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
288+
%val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
289+
ret float %val
290+
}
291+
273292
; Natural mapping
274293
define amdgpu_ps <2 x float> @raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
275294
; CHECK-LABEL: name: raw_ptr_buffer_load_v2f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,25 @@ define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__
327327
ret void
328328
}
329329

330+
define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile(ptr addrspace(8) inreg %rsrc, float %val, i32 %voffset, i32 inreg %soffset) {
331+
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_volatile
332+
; CHECK: bb.1 (%ir-block.0):
333+
; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1
334+
; CHECK-NEXT: {{ $}}
335+
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
336+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
337+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4
338+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
339+
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
340+
; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
341+
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
342+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
343+
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (volatile dereferenceable store (s32) into %ir.rsrc, align 1, addrspace 8)
344+
; CHECK-NEXT: S_ENDPGM 0
345+
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 -2147483648)
346+
ret void
347+
}
348+
330349
define amdgpu_ps void @raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32(ptr addrspace(8) inreg %rsrc, <2 x float> %val, i32 %voffset, i32 inreg %soffset) {
331350
; CHECK-LABEL: name: raw_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f32
332351
; CHECK: bb.1 (%ir-block.0):

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,26 @@ main_body:
129129
ret float %out
130130
}
131131

132+
;CHECK-LABEL: {{^}}test_volatile:
133+
;CHECK-NOT: s_waitcnt
134+
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}}
135+
;CHECK-DAG: s_waitcnt vmcnt(0)
136+
define amdgpu_ps float @test_volatile(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
137+
main_body:
138+
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
139+
%out = bitcast i32 %t1 to float
140+
ret float %out
141+
}
142+
143+
;CHECK-LABEL: {{^}}test_volatile_noret:
144+
;CHECK-NOT: s_waitcnt
145+
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen{{$}}
146+
define amdgpu_ps void @test_volatile_noret(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %voffset) {
147+
main_body:
148+
%t1 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %voffset, i32 0, i32 -2147483648)
149+
ret void
150+
}
151+
132152
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32) #0
133153
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.swap.f32(float, ptr addrspace(8), i32, i32, i32) #0
134154
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32, ptr addrspace(8), i32, i32, i32) #0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,42 @@ main_body:
7676
ret {<4 x float>, <4 x float>, <4 x float>} %r2
7777
}
7878

79+
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) {
80+
; PREGFX10-LABEL: buffer_load_volatile:
81+
; PREGFX10: ; %bb.0: ; %main_body
82+
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
83+
; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
84+
; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
85+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
86+
; PREGFX10-NEXT: ; return to shader part epilog
87+
;
88+
; GFX10-LABEL: buffer_load_volatile:
89+
; GFX10: ; %bb.0: ; %main_body
90+
; GFX10-NEXT: s_clause 0x2
91+
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
92+
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
93+
; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
94+
; GFX10-NEXT: s_waitcnt vmcnt(0)
95+
; GFX10-NEXT: ; return to shader part epilog
96+
;
97+
; GFX11-LABEL: buffer_load_volatile:
98+
; GFX11: ; %bb.0: ; %main_body
99+
; GFX11-NEXT: s_clause 0x2
100+
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
101+
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
102+
; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
103+
; GFX11-NEXT: s_waitcnt vmcnt(0)
104+
; GFX11-NEXT: ; return to shader part epilog
105+
main_body:
106+
%data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648)
107+
%data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647)
108+
%data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646)
109+
%r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
110+
%r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
111+
%r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
112+
ret {<4 x float>, <4 x float>, <4 x float>} %r2
113+
}
114+
79115
define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
80116
; PREGFX10-LABEL: buffer_load_immoffs:
81117
; PREGFX10: ; %bb.0: ; %main_body

0 commit comments

Comments
 (0)