@@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
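The lane and DPP intrinsics used here are overloaded on the value type, so a value such as `float` or `double` can be passed to them directly; the equally sized integer shadow type (`IntNTy`) and the bitcasts around every call become unnecessary. A compile-only sketch of the two builder shapes (the helper names are illustrative, not part of this file; it assumes an `IRBuilder` already positioned inside an AMDGPU function):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Old shape: round-trip the value through an iN shadow type on every call.
static Value *permLaneX16ViaIntN(IRBuilder<> &B, Value *V) {
  Type *IntNTy = B.getIntNTy(V->getType()->getPrimitiveSizeInBits());
  Value *Cast = B.CreateBitCast(V, IntNTy);
  Value *Perm = B.CreateIntrinsic(
      IntNTy, Intrinsic::amdgcn_permlanex16,
      {Cast, Cast, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  return B.CreateBitCast(Perm, V->getType());
}

// New shape: mangle the intrinsic on V's own type and pass it straight through.
static Value *permLaneX16Direct(IRBuilder<> &B, Value *V) {
  return B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
}
```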
@@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
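In the wave64 fallback of `buildReduction`, `llvm.amdgcn.readlane` is declared on the atomic type itself, so the two sampled lanes feed `buildNonAtomicBinOp` without casts. A minimal sketch under that assumption; the helper name is hypothetical and the `fadd` merely stands in for whatever binary op `buildNonAtomicBinOp` would emit for a floating-point atomic:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Read one lane from each 32-lane half of a wave64 value and combine them.
static Value *combineHalves(IRBuilder<> &B, Value *V) {
  Module *M = B.GetInsertBlock()->getModule();
  Type *Ty = V->getType();
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, Ty);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return B.CreateFAdd(Lane0, Lane32); // stands in for buildNonAtomicBinOp
}
```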
@@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -470,29 +461,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                       B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
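With `ReadLane` and `WriteLane` declared on `AtomicTy`, each cross-row copy in `buildShiftRight` reads a lane of the old value and writes it into the shifted value directly. A minimal sketch of one such copy, assuming the same overloads (the helper name is hypothetical):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Copy whatever the previous shift left in lane 15 of Old into lane 16 of V.
static Value *copyLane15To16(IRBuilder<> &B, Value *Old, Value *V) {
  Module *M = B.GetInsertBlock()->getModule();
  Type *Ty = V->getType();
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, Ty);
  Function *WriteLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, Ty);
  Value *Lane15 = B.CreateCall(ReadLane, {Old, B.getInt32(15)});
  // writelane arguments: (value to write, lane index, value for all other lanes)
  return B.CreateCall(WriteLane, {Lane15, B.getInt32(16), V});
}
```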
@@ -584,24 +564,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
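In the iterative scan only the lane index needs an integer type: it is the position of the lowest set bit in the mask of still-active lanes, truncated to `i32`, while the scanned value stays in its own type for `readlane`/`writelane`. A compile-only sketch under those assumptions (the helper name is hypothetical; `ActiveBits` is that lane mask):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Read V from the lowest lane still marked active in ActiveBits, keeping V in
// its original type; only the lane index is an i32.
static Value *readLowestActiveLane(IRBuilder<> &B, Value *V, Value *ActiveBits) {
  Value *FF1 = B.CreateIntrinsic(Intrinsic::cttz, ActiveBits->getType(),
                                 {ActiveBits, B.getTrue()});
  Value *LaneIdx = B.CreateTrunc(FF1, B.getInt32Ty());
  return B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                           {V, LaneIdx});
}
```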
@@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
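`llvm.amdgcn.set_inactive` is likewise overloaded, so inactive lanes can be primed with the operation's identity in a single call on the original type. A minimal sketch under that assumption (hypothetical helper):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Force every inactive lane of V to the operation's identity value so those
// lanes cannot perturb the reduction.
static Value *primeInactiveLanes(IRBuilder<> &B, Value *V, Value *Identity) {
  return B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, V->getType(),
                           {V, Identity});
}
```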
@@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     // but have to handle 64-bit broadcasts with two calls to this intrinsic.
     Value *BroadcastI = nullptr;
 
-    if (TyBitWidth == 64) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-      Value *const PartialInsert = B.CreateInsertElement(
-          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-      BroadcastI = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      BroadcastI =
-          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-      BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+    if (TyBitWidth == 32 || TyBitWidth == 64) {
+      BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
     } else {
       llvm_unreachable("Unhandled atomic bit width");
     }
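`llvm.amdgcn.readfirstlane` accepts the 32- and 64-bit cases alike here, so the broadcast is a single type-mangled call instead of two `i32` reads reassembled through a `<2 x i32>` vector. A minimal sketch under that assumption (hypothetical helper):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

// Broadcast the first active lane's value to all lanes in one call, whatever
// 32- or 64-bit type PHI has.
static Value *broadcastFirstLane(IRBuilder<> &B, Value *PHI) {
  return B.CreateIntrinsic(PHI->getType(), Intrinsic::amdgcn_readfirstlane, PHI);
}
```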