@@ -393,7 +393,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -409,34 +408,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
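
Why the bitcast round-trips can simply be dropped: the lane-crossing intrinsics used here (readlane, permlanex16, permlane64, update.dpp) are overloaded on the value type, so requesting the declaration with AtomicTy instead of B.getInt32Ty() yields a variant that takes and returns the atomic's own type. Below is a minimal sketch of the resulting pattern for a 32-bit float reduction tail; the helper name combineHalvesF32 is mine and not part of the patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch only: combine lane 0 and lane 32 of a float value with fadd,
// mirroring the wave64 fallback at the end of buildReduction.
static Value *combineHalvesF32(IRBuilder<> &B, Value *V) {
  Module *M = B.GetInsertBlock()->getModule();
  // Overloading the declaration on V's type selects the float-typed
  // llvm.amdgcn.readlane, so no bitcast to i32 is needed around the calls.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, V->getType());
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return B.CreateFAdd(Lane0, Lane32);
}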
@@ -445,8 +438,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -477,29 +468,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
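
For readers of the combine above: the operands of llvm.amdgcn.update.dpp are (old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl). The fragment below merely restates the new call with each operand labelled; the comments are mine, and it reuses the values defined in the hunk above rather than adding code to the patch.

// Sketch only: same call as in the patch, with operand roles spelled out.
Value *Combined = B.CreateCall(
    UpdateDPP, {Identity,                      // old: filler for masked-off lanes
                PermX,                         // src: the cross-row permuted value
                B.getInt32(DPP::QUAD_PERM_ID), // dpp_ctrl: identity permutation
                B.getInt32(0xa),               // row_mask 0b1010: rows 1 and 3 only
                B.getInt32(0xf),               // bank_mask: all banks
                B.getFalse()});                // bound_ctrl: off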
@@ -510,8 +497,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -521,10 +506,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -534,24 +519,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                       B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
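
The three write-backs above share one shape: read a single lane of the pre-shift value Old and write it into one lane of V. A hypothetical helper (name and factoring are mine, not part of the patch), assuming the AtomicTy-overloaded ReadLane/WriteLane declarations from this function and the headers from the earlier sketch:

// Sketch only: copy lane SrcLane of Old into lane DstLane of V, leaving all
// other lanes of V untouched.
static Value *copyLane(IRBuilder<> &B, Function *ReadLane, Function *WriteLane,
                       Value *Old, Value *V, unsigned SrcLane,
                       unsigned DstLane) {
  Value *Scalar = B.CreateCall(ReadLane, {Old, B.getInt32(SrcLane)});
  return B.CreateCall(WriteLane, {Scalar, B.getInt32(DstLane), V});
}
// e.g. V = copyLane(B, ReadLane, WriteLane, Old, V, 15, 16);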
@@ -591,24 +571,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
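
Two details worth noting in the new form: the lane-index operand of readlane/writelane stays i32 whatever the value type, which is why FF1 (the cttz over the wave-sized ActiveBits mask) is still truncated, while the value operands now keep the atomic's own type. Restating the writelane call with its operand roles labelled (the comments are mine):

// Sketch only: llvm.amdgcn.writelane takes (value, lane index, passthrough).
OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                             {Accumulator,   // value written into the chosen lane
                              LaneIdxInt,    // i32 lane index (trunc of cttz)
                              OldValuePhi}); // value kept in every other lane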
@@ -710,10 +684,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -768,13 +740,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -789,10 +756,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
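
Note that the patch leans on both IRBuilder::CreateIntrinsic overloads: for set_inactive it passes Ty as the explicit overload-type list, while for readlane here (and readfirstlane further below) it passes the desired result type and lets the builder deduce the overloaded signature from it and the arguments. A side-by-side sketch using the values from the two hunks above:

// Overload-type form: the second argument is the list of overloaded types.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
// Result-type form: the overload is inferred from the requested return type.
NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane, {NewV, LastLaneIdx});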
@@ -931,30 +896,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
 
   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
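
The 64-bit split-and-reassemble sequence (trunc/lshr into two i32 readfirstlanes, then insertelement and bitcast) and the 32-bit cast dance both collapse into a single call because readfirstlane is likewise overloaded on the result type; the patch relies on 64-bit and floating-point readfirstlane being handled directly. A self-contained sketch of the new broadcast (the helper name is mine):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Sketch only: broadcast the value produced by the single atomic to every
// lane, whatever its type (i32, i64, float, double, ...).
static Value *broadcastFirstLane(IRBuilder<> &B, Value *AtomicResult) {
  return B.CreateIntrinsic(AtomicResult->getType(),
                           Intrinsic::amdgcn_readfirstlane, AtomicResult);
}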