Commit 5d5a92a

[AMDGPU] Cleanup bitcast spam in atomic optimizer (llvm#96933)
Change-Id: I48b78d265985d38dbe270791c8c394a6a60199b3
1 parent: 670289e

14 files changed: +1527 -1567 lines
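What the cleanup removes: the optimizer used to bitcast every value to an equivalent iN integer type before calling the cross-lane intrinsics (readlane, writelane, readfirstlane, permlanex16, permlane64, set_inactive) and bitcast the result back, a leftover from when those intrinsics only accepted i32. Since the intrinsics are now overloaded on the value type, the calls are emitted directly on the atomic type. A minimal sketch of the IR-level effect for an f32 value; the value names and overload suffixes below are illustrative, not taken from the commit:

  ; Before: round-trip through i32 to reach the lane intrinsic.
  %cast = bitcast float %v to i32
  %read = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %cast)
  %res  = bitcast i32 %read to float

  ; After: the overloaded intrinsic is called on the float directly.
  %res = call float @llvm.amdgcn.readfirstlane.f32(float %v)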

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 26 additions & 84 deletions
@@ -395,7 +395,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -411,34 +410,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,

   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }

   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }

   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }

 // Use the builder to create an inclusive scan of V across the wavefront, with
@@ -447,8 +440,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -479,29 +470,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -512,8 +499,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -523,10 +508,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -536,24 +521,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                          B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }

@@ -593,24 +573,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);

   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }

@@ -709,10 +683,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,

   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);

   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -767,13 +739,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     if (ScanImpl == ScanOptions::DPP) {
       // First we need to set all inactive invocations to the identity value, so
       // that they can correctly contribute to the final result.
-      V = B.CreateBitCast(V, IntNTy);
-      Identity = B.CreateBitCast(Identity, IntNTy);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                               {V, Identity});
-      NewV = B.CreateBitCast(NewV, Ty);
-      V = B.CreateBitCast(V, Ty);
-      Identity = B.CreateBitCast(Identity, Ty);
+      NewV =
+          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
       if (!NeedResult && ST->hasPermLaneX16()) {
         // On GFX10 the permlanex16 instruction helps us build a reduction
         // without too many readlanes and writelanes, which are generally bad
@@ -788,10 +755,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         // which we will provide to the atomic operation.
         Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
         assert(TyBitWidth == 32);
-        NewV = B.CreateBitCast(NewV, IntNTy);
-        NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                  {NewV, LastLaneIdx});
-        NewV = B.CreateBitCast(NewV, Ty);
       }
       // Finally mark the readlanes in the WWM section.
       NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -930,30 +895,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
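The final hunk above is the largest simplification: the hand-rolled 64-bit broadcast (two 32-bit readfirstlane calls stitched back together through a <2 x i32>) and the separate 32-bit path both collapse into one readfirstlane on the atomic type, relying on the overloaded intrinsic to accept the wider type. A rough IR sketch of the 64-bit case, with illustrative value names and overload suffixes:

  ; Before: split the 64-bit value, broadcast the halves, reassemble.
  %cast  = bitcast double %phi to i64
  %lo    = trunc i64 %cast to i32
  %shift = lshr i64 %cast, 32
  %hi    = trunc i64 %shift to i32
  %lo.rl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %lo)
  %hi.rl = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %hi)
  %v.lo  = insertelement <2 x i32> poison, i32 %lo.rl, i32 0
  %v     = insertelement <2 x i32> %v.lo, i32 %hi.rl, i32 1
  %bcast = bitcast <2 x i32> %v to double

  ; After: a single overloaded readfirstlane on the original type.
  %bcast = call double @llvm.amdgcn.readfirstlane.f64(double %phi)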

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll

Lines changed: 11 additions & 12 deletions
@@ -169,38 +169,37 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
   ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
   ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
-  ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+  ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
-  ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+  ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
   ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
   ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
-  ; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
-  ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+  ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+  ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
   ; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
   ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
   ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
   ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
   ; GFX90A_GFX940-NEXT: {{ $}}
-  ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+  ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
   ; GFX90A_GFX940-NEXT:   successors: %bb.4(0x80000000)
   ; GFX90A_GFX940-NEXT: {{ $}}
   ; GFX90A_GFX940-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -212,7 +211,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT: {{ $}}
   ; GFX90A_GFX940-NEXT:   SI_WAVE_RECONVERGE [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A_GFX940-NEXT: {{ $}}
-  ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+  ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
   ; GFX90A_GFX940-NEXT:   S_ENDPGM 0
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
   ret void
