@@ -395,7 +395,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -411,34 +410,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,

   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }

   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }

   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }

 // Use the builder to create an inclusive scan of V across the wavefront, with
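A note on the cross-lane dataflow that the hunk above simplifies: after the per-row DPP steps earlier in buildReduction, permlanex16 exchanges the two 16-lane halves of each 32-lane group, and permlane64 (or a readlane of lanes 0 and 32 on targets without it) folds the lower and upper halves of a wave64 together. Below is a minimal host-side sketch of that dataflow, modelling the wave as a plain array; reduceWave64 and its lane bookkeeping are illustrative assumptions, not compiler or driver API, and the permlanex16 step is only a rough model of the intrinsic's lane mapping.

#include <array>
#include <cstdio>
#include <functional>

// Assumes the earlier DPP stage already left every lane of a 16-lane row
// holding that row's partial result.
template <typename T, typename Op>
T reduceWave64(std::array<T, 64> v, Op op) {
  std::array<T, 64> swapped;
  for (int lane = 0; lane < 64; ++lane)
    swapped[lane] = v[lane ^ 16];           // rough model of permlanex16
  for (int lane = 0; lane < 64; ++lane)
    v[lane] = op(v[lane], swapped[lane]);   // buildNonAtomicBinOp
  // Model of the readlane(0)/readlane(32) fallback used when the target is
  // neither wave32 nor has permlane64.
  return op(v[0], v[32]);
}

int main() {
  std::array<float, 64> lanes;
  for (int row = 0; row < 4; ++row)         // pretend the row sums are 1, 2, 3, 4
    for (int i = 0; i < 16; ++i)
      lanes[row * 16 + i] = float(row + 1);
  std::printf("wave result = %g\n", reduceWave64(lanes, std::plus<float>()));
  // Prints 10, the reduction over all four rows.
}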
@@ -447,8 +440,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -479,29 +470,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -512,8 +499,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -523,10 +508,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -536,24 +521,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }

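For context on the writelane patches above: buildShiftRight gives every lane the value its predecessor held, with Identity entering at lane 0. Without wavefront-wide DPP shifts the shift only happens within each 16-lane row, so old lane 15 has to be copied into lane 16, 31 into 32, and 47 into 48 by hand. The plain C++ model below illustrates that behaviour; shiftRightByLane is a made-up name for illustration, and the real work is done by update_dpp plus readlane/writelane as in the diff.

#include <array>

// Model of the GFX10 path: row_shr:1 shifts within each 16-lane row and fills
// each row head with Identity; the three writelanes then stitch the row
// boundaries so the result is a whole-wave shift by one lane.
template <typename T>
std::array<T, 64> shiftRightByLane(const std::array<T, 64> &old, T identity) {
  std::array<T, 64> v;
  for (int lane = 0; lane < 64; ++lane)
    v[lane] = (lane % 16 == 0) ? identity : old[lane - 1]; // update_dpp row_shr:1
  v[16] = old[15]; // writelane(readlane(Old, 15), 16, V)
  v[32] = old[31]; // writelane(readlane(Old, 31), 32, V)
  v[48] = old[47]; // writelane(readlane(Old, 47), 48, V)
  return v;
}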
@@ -593,24 +573,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);

   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }

@@ -709,10 +683,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,

   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);

   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -767,13 +739,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -788,10 +755,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
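Background for the set_inactive call in this region: lanes that are inactive at the atomic would otherwise contribute garbage to the whole-wave (strict WWM) reduction, so they are first forced to the identity value. Roughly, per lane, it behaves like the sketch below; setInactive64 is a hypothetical helper used only to illustrate the semantics.

#include <array>
#include <cstdint>

// Active lanes keep their own value; inactive lanes contribute the identity,
// so the WWM reduction that follows is unaffected by them.
template <typename T>
std::array<T, 64> setInactive64(const std::array<T, 64> &v, uint64_t exec,
                                T identity) {
  std::array<T, 64> out;
  for (int lane = 0; lane < 64; ++lane)
    out[lane] = ((exec >> lane) & 1) ? v[lane] : identity;
  return out;
}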
@@ -930,30 +895,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
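The single readfirstlane that replaces the deleted block relies on the intrinsic being overloaded on the result type, so 64-bit and floating-point values no longer have to be split into two 32-bit halves, broadcast separately, and reassembled through a <2 x i32> vector. For reference, the removed 64-bit path was doing roughly the following, shown here in host C++; broadcastOldStyle and readFirstLane32 are illustrative stand-ins, not real API.

#include <cstdint>
#include <cstring>

// Stand-in for a 32-bit readfirstlane: on hardware this returns the value held
// by the first active lane; here it is just an identity placeholder.
static uint32_t readFirstLane32(uint32_t v) { return v; }

// What the deleted TyBitWidth == 64 branch amounted to: bitcast to i64, split
// into halves, broadcast each half, then rebuild the original type.
double broadcastOldStyle(double phi) {
  uint64_t bits;
  std::memcpy(&bits, &phi, sizeof bits);               // bitcast double -> i64
  uint32_t lo = readFirstLane32(uint32_t(bits));       // readfirstlane(lo)
  uint32_t hi = readFirstLane32(uint32_t(bits >> 32)); // readfirstlane(hi)
  uint64_t joined = (uint64_t(hi) << 32) | lo;         // two insertelements + bitcast
  double out;
  std::memcpy(&out, &joined, sizeof out);
  return out;
}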