Skip to content

Commit a96ec01

Browse files
authored
[AMDGPU] Optimize out s_barrier_signal/_wait (#116993)
Extend the optimization that converts s_barrier to wave_barrier (nop) when the number of work items in the workgroup is not larger than the wave size. The optimization now also handles the "split barrier" form of s_barrier, where the barrier is represented by separate intrinsics (s_barrier_signal/s_barrier_wait): when the workgroup fits in a wave, s_barrier_signal is removed and s_barrier_wait is lowered to wave_barrier. Note: on gfx12, a plain s_barrier (which is later split into signal/wait) already receives this optimization, but some front-ends may prefer to emit the split intrinsics directly, and this patch addresses that case.
1 parent bb8bf85 commit a96ec01

File tree

3 files changed

+92
-12
lines changed

3 files changed

+92
-12
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1878,19 +1878,25 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
18781878
}
18791879

18801880
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1881+
Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
18811882
if (TM.getOptLevel() > CodeGenOptLevel::None) {
18821883
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
18831884
if (WGSize <= STI.getWavefrontSize()) {
1884-
MachineBasicBlock *MBB = MI.getParent();
1885-
const DebugLoc &DL = MI.getDebugLoc();
1886-
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1885+
// If the workgroup fits in a wave, remove s_barrier_signal and lower
1886+
// s_barrier/s_barrier_wait to wave_barrier.
1887+
if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1888+
IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1889+
MachineBasicBlock *MBB = MI.getParent();
1890+
const DebugLoc &DL = MI.getDebugLoc();
1891+
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1892+
}
18871893
MI.eraseFromParent();
18881894
return true;
18891895
}
18901896
}
18911897

1892-
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1893-
if (STI.hasSplitBarriers()) {
1898+
if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
1899+
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
18941900
MachineBasicBlock *MBB = MI.getParent();
18951901
const DebugLoc &DL = MI.getDebugLoc();
18961902
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
@@ -2207,6 +2213,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
22072213
case Intrinsic::amdgcn_init_whole_wave:
22082214
return selectInitWholeWave(I);
22092215
case Intrinsic::amdgcn_s_barrier:
2216+
case Intrinsic::amdgcn_s_barrier_signal:
2217+
case Intrinsic::amdgcn_s_barrier_wait:
22102218
return selectSBarrier(I);
22112219
case Intrinsic::amdgcn_raw_buffer_load_lds:
22122220
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9614,18 +9614,26 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
96149614
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
96159615
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
96169616
}
9617-
case Intrinsic::amdgcn_s_barrier: {
9617+
case Intrinsic::amdgcn_s_barrier:
9618+
case Intrinsic::amdgcn_s_barrier_signal:
9619+
case Intrinsic::amdgcn_s_barrier_wait: {
96189620
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
96199621
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
96209622
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9621-
if (WGSize <= ST.getWavefrontSize())
9622-
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9623-
Op.getOperand(0)),
9624-
0);
9623+
if (WGSize <= ST.getWavefrontSize()) {
9624+
// If the workgroup fits in a wave, remove s_barrier_signal and lower
9625+
// s_barrier/s_barrier_wait to wave_barrier.
9626+
if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
9627+
return Op.getOperand(0);
9628+
else
9629+
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
9630+
MVT::Other, Op.getOperand(0)),
9631+
0);
9632+
}
96259633
}
96269634

9627-
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9628-
if (ST.hasSplitBarriers()) {
9635+
if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
9636+
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
96299637
SDValue K =
96309638
DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
96319639
SDValue BarSignal =
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
4+
5+
define amdgpu_kernel void @signal_unknown_wgs() {
6+
; CHECK-LABEL: signal_unknown_wgs:
7+
; CHECK: ; %bb.0:
8+
; CHECK-NEXT: s_barrier_signal -1
9+
; CHECK-NEXT: s_endpgm
10+
tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
11+
ret void
12+
}
13+
14+
define amdgpu_kernel void @signal_flat_wgs_attr_32_128() #1 {
15+
; CHECK-LABEL: signal_flat_wgs_attr_32_128:
16+
; CHECK: ; %bb.0:
17+
; CHECK-NEXT: s_barrier_signal -1
18+
; CHECK-NEXT: s_endpgm
19+
tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
20+
ret void
21+
}
22+
23+
define amdgpu_kernel void @signal_flat_wgs_attr_16_32() #2 {
24+
; CHECK-LABEL: signal_flat_wgs_attr_16_32:
25+
; CHECK: ; %bb.0:
26+
; CHECK-NEXT: s_endpgm
27+
tail call void @llvm.amdgcn.s.barrier.signal(i32 -1)
28+
ret void
29+
}
30+
31+
32+
define amdgpu_kernel void @wait_unknown_wgs() {
33+
; CHECK-LABEL: wait_unknown_wgs:
34+
; CHECK: ; %bb.0:
35+
; CHECK-NEXT: s_barrier_wait -1
36+
; CHECK-NEXT: s_endpgm
37+
tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
38+
ret void
39+
}
40+
41+
define amdgpu_kernel void @wait_flat_wgs_attr_32_128() #1 {
42+
; CHECK-LABEL: wait_flat_wgs_attr_32_128:
43+
; CHECK: ; %bb.0:
44+
; CHECK-NEXT: s_barrier_wait -1
45+
; CHECK-NEXT: s_endpgm
46+
tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
47+
ret void
48+
}
49+
50+
define amdgpu_kernel void @wait_flat_wgs_attr_16_32() #2 {
51+
; CHECK-LABEL: wait_flat_wgs_attr_16_32:
52+
; CHECK: ; %bb.0:
53+
; CHECK-NEXT: ; wave barrier
54+
; CHECK-NEXT: s_endpgm
55+
tail call void @llvm.amdgcn.s.barrier.wait(i16 -1)
56+
ret void
57+
}
58+
59+
declare void @llvm.amdgcn.s.barrier.signal(i32 immarg) #0
60+
declare void @llvm.amdgcn.s.barrier.wait(i16 immarg) #0
61+
62+
attributes #0 = { convergent nounwind }
63+
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
64+
attributes #2 = { nounwind "amdgpu-flat-work-group-size"="16,32" }

0 commit comments

Comments
 (0)