Commit 2a96071

[AMDGPU] Cleanup bitcast spam in atomic optimizer (#96933)
1 parent e603451 commit 2a96071

14 files changed: +1421, -1683 lines
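
The cleanup removes the integer round-trips the pass used to build around every cross-lane intrinsic: values were bitcast to an equivalent-width integer type, fed to readlane/writelane/permlane/set_inactive/readfirstlane, and bitcast back, because those intrinsics were only instantiated on integer types. With the intrinsics overloaded on the value type, the pass now builds them directly on the atomic's type. A minimal sketch of the IR-level effect for a float readlane (mangled intrinsic names and operands are illustrative, not taken from this commit's tests):

  ; before: round-trip through i32 just to feed the lane intrinsic
  %cast = bitcast float %v to i32
  %lane = call i32 @llvm.amdgcn.readlane(i32 %cast, i32 0)
  %res  = bitcast i32 %lane to float

  ; after: the overloaded intrinsic takes the value type directly
  %res = call float @llvm.amdgcn.readlane.f32(float %v, i32 0)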

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 26 additions & 84 deletions
@@ -393,7 +393,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -409,34 +408,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
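
With ReadLane now declared on AtomicTy, the final cross-row combine above no longer needs any casts. For a wave64 float add reduction, the tail of buildReduction corresponds to IR roughly like this (a sketch assuming an fadd reduction; intrinsic mangling is illustrative):

  %lane0  = call float @llvm.amdgcn.readlane.f32(float %v, i32 0)
  %lane32 = call float @llvm.amdgcn.readlane.f32(float %v, i32 32)
  %sum    = fadd float %lane0, %lane32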
@@ -445,8 +438,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -477,29 +468,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -510,8 +497,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -521,10 +506,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
         {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
          B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -534,24 +519,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
@@ -591,24 +571,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
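The iterative scan follows the same pattern: the lane index stays a plain i32 (the truncated cttz result), while readlane and writelane are built directly on the value's own type. A rough IR sketch for one float scan step (value names and the wave64 mask width are assumptions for illustration):

  %idx  = trunc i64 %ff1 to i32
  %lane = call float @llvm.amdgcn.readlane.f32(float %v, i32 %idx)
  %old  = call float @llvm.amdgcn.writelane.f32(float %acc, i32 %idx, float %oldphi)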
@@ -710,10 +684,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -768,13 +740,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     if (ScanImpl == ScanOptions::DPP) {
       // First we need to set all inactive invocations to the identity value, so
       // that they can correctly contribute to the final result.
-      V = B.CreateBitCast(V, IntNTy);
-      Identity = B.CreateBitCast(Identity, IntNTy);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                               {V, Identity});
-      NewV = B.CreateBitCast(NewV, Ty);
-      V = B.CreateBitCast(V, Ty);
-      Identity = B.CreateBitCast(Identity, Ty);
+      NewV =
+          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
       if (!NeedResult && ST->hasPermLaneX16()) {
         // On GFX10 the permlanex16 instruction helps us build a reduction
         // without too many readlanes and writelanes, which are generally bad
@@ -789,10 +756,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
         // which we will provide to the atomic operation.
         Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
         assert(TyBitWidth == 32);
-        NewV = B.CreateBitCast(NewV, IntNTy);
-        NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                  {NewV, LastLaneIdx});
-        NewV = B.CreateBitCast(NewV, Ty);
       }
       // Finally mark the readlanes in the WWM section.
       NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -931,30 +896,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     // lane) to all other lanes in the wavefront. We use an intrinsic for this,
     // but have to handle 64-bit broadcasts with two calls to this intrinsic.
     Value *BroadcastI = nullptr;
-
-    if (TyBitWidth == 64) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-      Value *const PartialInsert = B.CreateInsertElement(
-          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-      BroadcastI = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      BroadcastI =
-          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-      BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
+    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
 
     // Now that we have the result of our single atomic operation, we need to
     // get our individual lane's slice into the result. We use the lane offset
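
The broadcast is the largest simplification: a 64-bit result no longer has to be split into two i32 halves, read back with two readfirstlanes, and reassembled through an insertelement/bitcast chain; readfirstlane is simply built on the result type. A sketch of the old versus new IR for a hypothetical double atomic (value names and intrinsic mangling are illustrative):

  ; before
  %cast   = bitcast double %phi to i64
  %lo     = trunc i64 %cast to i32
  %hi.shr = lshr i64 %cast, 32
  %hi     = trunc i64 %hi.shr to i32
  %lo.rfl = call i32 @llvm.amdgcn.readfirstlane(i32 %lo)
  %hi.rfl = call i32 @llvm.amdgcn.readfirstlane(i32 %hi)
  %v0     = insertelement <2 x i32> poison, i32 %lo.rfl, i32 0
  %v1     = insertelement <2 x i32> %v0, i32 %hi.rfl, i32 1
  %bcast  = bitcast <2 x i32> %v1 to double

  ; after
  %bcast  = call double @llvm.amdgcn.readfirstlane.f64(double %phi)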

llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll

Lines changed: 11 additions & 12 deletions
@@ -169,38 +169,37 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
 ; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
 ; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
 ; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
-; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
 ; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
-; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
-; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
 ; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
 ; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
 ; GFX90A_GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
 ; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
 ; GFX90A_GFX940-NEXT: {{ $}}
-; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
 ; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
 ; GFX90A_GFX940-NEXT: {{ $}}
 ; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
 ; GFX90A_GFX940-NEXT: {{ $}}
 ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: {{ $}}
-; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
 ; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX90A_GFX940-NEXT: S_ENDPGM 0
 %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
