[AMDGPU][AtomicOptimizer] Fix DT update for divergent values with Iterative strategy #87605


Merged · 2 commits · Apr 18, 2024
29 changes: 20 additions & 9 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -743,7 +743,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,

   Function *F = I.getFunction();
   LLVMContext &C = F->getContext();
 
   // For atomic sub, perform scan with add operation and allow one lane to
   // subtract the reduced value later.
   AtomicRMWInst::BinOp ScanOp = Op;
@@ -855,7 +855,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
 
   // Store I's original basic block before we split the block.
-  BasicBlock *const EntryBB = I.getParent();
+  BasicBlock *const OriginalBB = I.getParent();

   // We need to introduce some new control flow to force a single lane to be
   // active. We do this by splitting I's basic block at I, and introducing the
@@ -876,25 +876,36 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   BasicBlock *Predecessor = nullptr;
   if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
     // Move terminator from I's block to ComputeEnd block.
-    Instruction *Terminator = EntryBB->getTerminator();
+    //
+    // OriginalBB is known to have a branch as terminator because
+    // SplitBlockAndInsertIfThen will have inserted one.
+    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
     B.SetInsertPoint(ComputeEnd);
     Terminator->removeFromParent();
     B.Insert(Terminator);
 
     // Branch to ComputeLoop Block unconditionally from the I's block for
     // iterative approach.
-    B.SetInsertPoint(EntryBB);
+    B.SetInsertPoint(OriginalBB);
     B.CreateBr(ComputeLoop);
 
     // Update the dominator tree for new control flow.
-    DTU.applyUpdates(
-        {{DominatorTree::Insert, EntryBB, ComputeLoop},
-         {DominatorTree::Insert, ComputeLoop, ComputeEnd},
-         {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});
+    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
+        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
+         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});
+
+    // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
+    // the DT edges as well.
+    for (auto *Succ : Terminator->successors()) {
Review comment (Contributor Author): This loops twice at most (for a conditional branch). The loop may be a bit overkill, but I think it just makes more sense; see the comment.

Review comment (Contributor): Nit: could you use ComputeEnd->successors() here? Then you would not have to explain how Terminator is known to be a BranchInst.
+      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
+      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
+    }
+
+    DTU.applyUpdates(DomTreeUpdates);
 
     Predecessor = ComputeEnd;
   } else {
-    Predecessor = EntryBB;
+    Predecessor = OriginalBB;
   }
   // Move the IR builder into single_lane next.
   B.SetInsertPoint(SingleLaneTerminator);
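For reference, the loop the reviewer's nit targets could be written without the BranchInst cast by walking the successors of ComputeEnd itself, since the moved terminator already lives in ComputeEnd by the time the loop runs. A minimal sketch of that variant, assuming the free successors() helper from llvm/IR/CFG.h (an illustration only, not code from the patch):

    // Hypothetical variant per the review nit: ComputeEnd owns the moved
    // terminator at this point, so its successors are the terminator's.
    for (BasicBlock *Succ : successors(ComputeEnd)) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }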
88 changes: 88 additions & 0 deletions llvm/test/CodeGen/AMDGPU/atomic_optimization_split_dt_update.ll
@@ -0,0 +1,88 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx908 -passes="amdgpu-atomic-optimizer,verify<domtree>" %s -S -o - | FileCheck %s

; Check we're properly adding an edge from ComputeEnd to the "End" block added by
; SplitBlockAndInsertIfThen
;
; If the edge is not added, domtree verification will fail.

Review comment (Contributor): Can you merge the test with the existing ones?

declare i32 @quux()

define amdgpu_kernel void @ham(ptr addrspace(4) %arg) {
; CHECK-LABEL: define amdgpu_kernel void @ham(
; CHECK-SAME: ptr addrspace(4) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @quux()
; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 [[CALL]], 0
; CHECK-NEXT: br i1 [[ICMP]], label [[BB1:%.*]], label [[BB3:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[CALL2:%.*]] = tail call i32 @quux()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[CALL2]], [[BB1]] ], [ [[CALL]], [[BB:%.*]] ]
; CHECK-NEXT: br label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: [[CALL5:%.*]] = tail call i32 @quux()
; CHECK-NEXT: [[ICMP6:%.*]] = icmp eq i32 [[CALL5]], 0
; CHECK-NEXT: br i1 [[ICMP6]], label [[BB8:%.*]], label [[BB7:%.*]]
; CHECK: bb7:
; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[ARG]], align 8
; CHECK-NEXT: [[ADDRSPACECAST:%.*]] = addrspacecast ptr [[LOAD]] to ptr addrspace(1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; CHECK-NEXT: br label [[COMPUTELOOP:%.*]]
; CHECK: 7:
; CHECK-NEXT: [[TMP8:%.*]] = atomicrmw add ptr addrspace(1) [[ADDRSPACECAST]], i32 [[TMP13:%.*]] syncscope("agent-one-as") monotonic, align 4
; CHECK-NEXT: br label [[TMP9:%.*]]
; CHECK: 9:
; CHECK-NEXT: br label [[BB8]]
; CHECK: bb8:
; CHECK-NEXT: br label [[BB4]]
; CHECK: ComputeLoop:
; CHECK-NEXT: [[ACCUMULATOR:%.*]] = phi i32 [ 0, [[BB7]] ], [ [[TMP13]], [[COMPUTELOOP]] ]
; CHECK-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP6]], [[BB7]] ], [ [[TMP16:%.*]], [[COMPUTELOOP]] ]
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[PHI]], i32 [[TMP11]])
; CHECK-NEXT: [[TMP13]] = add i32 [[ACCUMULATOR]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = shl i64 1, [[TMP10]]
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], -1
; CHECK-NEXT: [[TMP16]] = and i64 [[ACTIVEBITS]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP16]], 0
; CHECK-NEXT: br i1 [[TMP17]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
; CHECK: ComputeEnd:
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: br i1 [[TMP18]], label [[TMP7:%.*]], label [[TMP9]]
;
bb:
%call = tail call i32 @quux()
%icmp = icmp eq i32 %call, 0
br i1 %icmp, label %bb1, label %bb3

bb1: ; preds = %bb
%call2 = tail call i32 @quux()
br label %bb3

bb3: ; preds = %bb1, %bb
%phi = phi i32 [ %call2, %bb1 ], [ %call, %bb ]
br label %bb4

bb4: ; preds = %bb8, %bb3
%call5 = tail call i32 @quux()
%icmp6 = icmp eq i32 %call5, 0
br i1 %icmp6, label %bb8, label %bb7

bb7: ; preds = %bb4
%load = load ptr, ptr addrspace(4) %arg, align 8
%addrspacecast = addrspacecast ptr %load to ptr addrspace(1)
%atomicrmw = atomicrmw add ptr addrspace(1) %addrspacecast, i32 %phi syncscope("agent-one-as") monotonic, align 4
br label %bb8

bb8: ; preds = %bb7, %bb4
br label %bb4
}
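
The invariant this test pins down generalizes beyond this pass: whenever a terminator moves from one block to another, every CFG edge it defines moves with it, and both the edge insertions and deletions must be replayed into the dominator tree. A self-contained sketch of that pattern, using only APIs from DomTreeUpdater and CFG.h (a hypothetical helper for illustration, not part of the patch):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Dominators.h"

    using namespace llvm;

    // Hypothetical helper: after a terminator has been moved from `From` to
    // `To`, mirror the CFG edge changes in the dominator tree. Omitting the
    // Insert updates is exactly the kind of mistake that makes
    // verify<domtree> fail, as the test above demonstrates.
    static void moveTerminatorDTEdges(DomTreeUpdater &DTU, BasicBlock *From,
                                      BasicBlock *To) {
      SmallVector<DominatorTree::UpdateType, 4> Updates;
      for (BasicBlock *Succ : successors(To)) {
        Updates.push_back({DominatorTree::Insert, To, Succ});
        Updates.push_back({DominatorTree::Delete, From, Succ});
      }
      DTU.applyUpdates(Updates);
    }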