
Commit 5bc37d0

[AMDGPU] Cleanup bitcast spam in atomic optimizer
1 parent aa3589f commit 5bc37d0

14 files changed: +1296 -1563 lines changed
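The cleanup removes the pattern where each value was bitcast to an equal-width integer type (IntNTy) before every cross-lane intrinsic call and bitcast back to the atomic type afterwards; the intrinsics are now requested directly for the atomic's own type. Below is a minimal before/after sketch adapted from the buildReduction hunk further down. The standalone helpers, their names, and the fadd used as a stand-in for buildNonAtomicBinOp are illustrative only, and a 32-bit floating-point atomic type is assumed; this is not the literal source of the pass.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Before: round-trip the value through an iN integer type around each readlane.
// Assumes AtomicTy is a 32-bit float type so IntNTy is i32.
static Value *reduceHalvesOld(IRBuilder<> &B, Value *V, Type *AtomicTy) {
  Module *M = B.GetInsertBlock()->getModule();
  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
  V = B.CreateBitCast(V, IntNTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  // FAdd stands in for buildNonAtomicBinOp(B, Op, ...).
  return B.CreateFAdd(B.CreateBitCast(Lane0, AtomicTy),
                      B.CreateBitCast(Lane32, AtomicTy));
}

// After: request the readlane declaration for AtomicTy itself and skip the casts.
static Value *reduceHalvesNew(IRBuilder<> &B, Value *V, Type *AtomicTy) {
  Module *M = B.GetInsertBlock()->getModule();
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return B.CreateFAdd(Lane0, Lane32);
}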

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 27 additions & 80 deletions
@@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
@@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -470,29 +461,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
@@ -584,24 +564,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
@@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
 
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+  if (TyBitWidth == 32 || TyBitWidth == 64) {
+    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
   } else {
     llvm_unreachable("Unhandled atomic bit width");
   }
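The last hunk above collapses the separate 32-bit and 64-bit broadcast paths into a single typed readfirstlane call. A self-contained sketch of the resulting shape follows, under the assumption that amdgcn_readfirstlane can be built directly on the atomic's type for both widths; the helper name and its free-standing form are illustrative, not part of the pass.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

// Read the combined value back from the first active lane. Because the
// intrinsic is built on PHI's own type, float, i32, double and i64 all take
// the same path; the old code split 64-bit values into two i32 readfirstlanes.
static Value *broadcastFirstLane(IRBuilder<> &B, Value *PHI,
                                 const DataLayout &DL) {
  Type *Ty = PHI->getType();
  const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);
  if (TyBitWidth == 32 || TyBitWidth == 64)
    return B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
  llvm_unreachable("Unhandled atomic bit width");
}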

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll

Lines changed: 11 additions & 12 deletions
@@ -169,38 +169,37 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
 ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
 ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
 ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
-; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
-; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
 ; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
 ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
 ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
 ; GFX90A_GFX940-NEXT: {{ $}}
-; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
 ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
 ; GFX90A_GFX940-NEXT: {{ $}}
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
 ; GFX90A_GFX940-NEXT: {{ $}}
 ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: {{ $}}
-; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
 ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: S_ENDPGM 0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
