[LLVM][AMDGPU] AMDGPUInstCombineIntrinsic for *lane intrinsics #99878
Conversation
This patch adds UniformityAnalysis to the InstCombine pass, enabling folds to rely on UniformityAnalysis instead of only checking for trivially uniform constants.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu
Author: Acim Maravic (Acim-Maravic)
Changes: For AMDGCN it would be nice to have UniformityAnalysis in the InstCombine pass, to allow folding of more lane* intrinsics and other ones in the future. Is this acceptable for other targets?
Patch is 26.72 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/99878.diff
14 Files Affected:
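As an illustration of the intended fold (a sketch consistent with the tests added in this patch, not additional patch content): once uniformity information is available, a cross-lane read of a value that is already uniform, such as a kernel SGPR argument, can be replaced by the value itself.

```llvm
; Before: %src0 is a kernel argument, hence uniform across the wave.
define amdgpu_kernel void @fold_readlane(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
  %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
  store i32 %v, ptr addrspace(1) %out
  ret void
}

; After instcombine with UniformityAnalysis: the readlane is folded away.
define amdgpu_kernel void @fold_readlane(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
  store i32 %src0, ptr addrspace(1) %out
  ret void
}
```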
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index ebcbd5d9e8880..1b949729bf90e 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -21,6 +21,7 @@
#include "llvm/Analysis/DomConditionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
@@ -79,6 +80,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
BranchProbabilityInfo *BPI;
ProfileSummaryInfo *PSI;
DomConditionCache DC;
+ UniformityInfo &UI;
// Optional analyses. When non-null, these can both be used to do better
// combining and will be updated to reflect any changes.
@@ -98,12 +100,13 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
- ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
+ ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI,
+ UniformityInfo &UI)
: TTI(TTI), Builder(Builder), Worklist(Worklist),
MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL),
SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true,
/*CanUseUndef*/ true, &DC),
- ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), LI(LI) {}
+ ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), UI(UI), LI(LI) {}
virtual ~InstCombiner() = default;
@@ -345,6 +348,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
}
BlockFrequencyInfo *getBlockFrequencyInfo() const { return BFI; }
ProfileSummaryInfo *getProfileSummaryInfo() const { return PSI; }
+ UniformityInfo &getUniformityInfo() const { return UI; }
LoopInfo *getLoopInfo() const { return LI; }
// Call target specific combiners
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 9197404309663..5da4ba62a08a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1059,17 +1059,20 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
- case Intrinsic::amdgcn_permlane64:
- // A constant value is trivially uniform.
- if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
- return IC.replaceInstUsesWith(II, C);
+ case Intrinsic::amdgcn_permlane64: {
+ UniformityInfo &UI = IC.getUniformityInfo();
+ Value *Src = II.getOperand(0);
+ if (UI.isUniform(Src)) {
+ return IC.replaceInstUsesWith(II, Src);
}
break;
+ }
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
- // A constant value is trivially uniform.
- if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
- return IC.replaceInstUsesWith(II, C);
+ UniformityInfo &UI = IC.getUniformityInfo();
+ Value *Srcv = II.getOperand(0);
+ if (UI.isUniform(Srcv)) {
+ return IC.replaceInstUsesWith(II, Srcv);
}
// The rest of these may not be safe if the exec may not be the same between
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 64fbcc80e0edf..7926783827330 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -68,9 +68,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
- ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
+ ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI,
+ UniformityInfo &UI)
: InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
- BFI, BPI, PSI, DL, LI) {}
+ BFI, BPI, PSI, DL, LI, UI) {}
virtual ~InstCombinerImpl() = default;
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 0d8e7e92c5c8e..e402906b6c064 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -55,6 +55,7 @@
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -5380,7 +5381,7 @@ static bool combineInstructionsOverFunction(
AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, LoopInfo *LI,
- const InstCombineOptions &Opts) {
+ UniformityInfo &UI, const InstCombineOptions &Opts) {
auto &DL = F.getDataLayout();
/// Builder - This is an IRBuilder that automatically inserts new
@@ -5418,7 +5419,7 @@ static bool combineInstructionsOverFunction(
<< F.getName() << "\n");
InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
- ORE, BFI, BPI, PSI, DL, LI);
+ ORE, BFI, BPI, PSI, DL, LI, UI);
IC.MaxArraySizeForCombine = MaxArraySize;
bool MadeChangeInThisIteration = IC.prepareWorklist(F, RPOT);
MadeChangeInThisIteration |= IC.run();
@@ -5466,6 +5467,7 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &UI = AM.getResult<UniformityInfoAnalysis>(F);
// TODO: Only use LoopInfo when the option is set. This requires that the
// callers in the pass pipeline explicitly set the option.
@@ -5482,7 +5484,7 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *BPI = AM.getCachedResult<BranchProbabilityAnalysis>(F);
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
- BFI, BPI, PSI, LI, Options))
+ BFI, BPI, PSI, LI, UI, Options))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@@ -5505,6 +5507,7 @@ void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
@@ -5519,6 +5522,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &UI = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
// Optional analyses.
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
@@ -5535,7 +5539,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
BPI = &WrapperPass->getBPI();
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
- BFI, BPI, PSI, LI,
+ BFI, BPI, PSI, LI, UI,
InstCombineOptions());
}
@@ -5556,6 +5560,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 588337c15625e..5c9001355ba0c 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -119,6 +119,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index d451d2897f673..99014bd48f167 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -67,6 +67,8 @@
; CHECK-O23SZ-NEXT: Running pass: ConstantMergePass
; CHECK-O23SZ-NEXT: Running pass: DeadArgumentEliminationPass
; CHECK-O23SZ-NEXT: Running pass: InstCombinePass
+; CHECK-O23SZ-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: CycleAnalysis
; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O23SZ-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362eabbf83..d08865291e381 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -55,6 +55,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a44867e434a..920708564a101 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -40,6 +40,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index e5aebc4850e6d..617873f1a5d83 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -48,6 +48,8 @@
; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager on foo
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 42ef49f8f7c7e..6218c793c782a 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -87,6 +87,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index e74f88c1a3bf9..79ff01d9df9b5 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -51,6 +51,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
@@ -121,6 +123,8 @@
; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis
+; CHECK-O-NEXT: Running analysis: CycleAnalysis
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 0bb26330d000a..d8dd7b6a50468 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -54,6 +54,8 @@
; CHECK-O-NEXT: Running pass: PromotePass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis on foo
+; CHECK-O-NEXT: Running analysis: UniformityInfoAnalysis on foo
+; CHECK-O-NEXT: Running analysis: CycleAnalysis on foo
; CHECK-O-NEXT: Running analysis: AAManager on foo
; CHECK-O-NEXT: Running analysis: BasicAA
; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-*lane-intrinsic-combine.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-*lane-intrinsic-combine.ll
new file mode 100644
index 0000000000000..f33356ab88be0
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-*lane-intrinsic-combine.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=instcombine -S < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=instcombine -S < %s | FileCheck %s --check-prefixes=GFX,GFX11
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=instcombine -S < %s | FileCheck %s --check-prefixes=GFX,GFX12
+
+define amdgpu_kernel void @permlane64_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: @permlane64_constant(
+; GFX-NEXT: store i32 77, ptr addrspace(1) [[OUT:%.*]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 77)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: @permlane64_undef(
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_sgpr(ptr addrspace(1) %out, i32 %src) {
+; GFX-LABEL: @permlane64_sgpr(
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.permlane64(i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: @permlane64_vgpr(
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @permlane64_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: @permlane64_vgpr_expression(
+; GFX-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TID2:%.*]] = add i32 [[TID]], 1
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[TID2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TID]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid2 = add i32 %tid, 1
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_constant(ptr addrspace(1) %out) {
+; GFX-LABEL: @readlane_constant(
+; GFX-NEXT: store i32 7, ptr addrspace(1) [[OUT:%.*]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 7, i32 5)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_undef(ptr addrspace(1) %out) {
+; GFX-LABEL: @readlane_undef(
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 undef, i32 undef)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_sgpr(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
+; GFX-LABEL: @readlane_sgpr(
+; GFX-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(1) [[OUT:%.*]], align 4
+; GFX-NEXT: ret void
+;
+ %v = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
+ store i32 %v, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr(i32 addrspace(1)* %out) {
+; GFX-LABEL: @readlane_vgpr(
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
+ %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tidx
+ store i32 %v, i32 addrspace(1)* %out_ptr
+ ret void
+}
+
+define amdgpu_kernel void @readlane_vgpr_expression(i32 addrspace(1)* %out) {
+; GFX-LABEL: @readlane_vgpr_expression(
+; GFX-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; GFX-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; GFX-NEXT: [[TIDX2:%.*]] = add i32 [[TIDX]], 1
+; GFX-NEXT: [[TIDY2:%.*]] = add i32 [[TIDY]], 2
+; GFX-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX2]], i32 [[TIDY2]])
+; GFX-NEXT: [[TMP1:%.*]] = sext i32 [[TIDX]] to i64
+; GFX-NEXT: [[OUT_PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP1]]
+; GFX-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT_PTR]], align 4
+; GFX-NEXT: ret void
+;
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %tidx2 = add i32 %tidx, 1
+ %t...
[truncated]
Pretty sure we do not want a UniformityAnalysis dependency in InstCombine.
It's also possible for transforms based on uniformity to become incorrect later, after code motion, until we have convergence tokens wired up to prevent that.
Why not? Because of compile time? Isn't UA supposed to be a no-op on targets that don't care about divergence? |
I don't understand this. Surely that would just be a bug in the code motion? Do you have an example? |
I wonder if the slow down is due to running CycleAnalysis, and whether we could somehow avoid that on targets that don't care about divergence. |
Very likely. And yes, you should be able to easily avoid that in the new pass manager by delaying the analysis fetch until after the hasBranchDivergence() check. |
UA and convergence tokens should not have this dependency. The code motion is probably doing something wrong. |
I had once proposed a TTI query to make UA a no-op on targets that don't care about it. That can solve a handful of really sticky issues, but people didn't like this extra dependence on TTI. But currently, there is no formal framework to incrementally update UA when the program changes. So for targets that care about it, UA has to be recomputed on every change. Every time InstCombine changes an instruction, it will have to do one of three things:
I think a combination of 2 and 3 is completely feasible, but needs careful work.
For AMDGCN, having UniformityAnalysis inside InstCombine is a huge deal and totally worth pursuing. A good example is that it will allow us to eliminate trivial waterfall loops where we "know" that all threads finish in the first iteration. That's especially good for the implementation of the HIP cross-lane builtins: https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h
Nevertheless it was done: https://reviews.llvm.org/D151986
Good points. I had not thought about that. Does UA have a decent |
If UA requires any non-trivial amount of analysis updates/invalidation (basically, anything that goes significantly beyond "remove from the analysis if the instruction is removed") then we cannot use it inside InstCombine.
Does it have to be inside InstCombine proper, rather than a separate pass that does uniformity-based optimizations and knows how to keep the analysis up to date through transforms?
I think such optimizations do belong in a separate pass (and there aren't too many of them?). We also need to be careful, since we do not currently have a way to incrementally update UA.
That could work, but would probably yield a smaller improvement, unless we can arrange for this pass to be followed by a run of InstCombine. Also, does it need to be specific to AMDGPU? It's worth investigating whether we can improve LLVM IR to provide information to such a pass in a target-neutral way.
Both look like useful improvements to UA. |
#100185 is yet another reason to have some form of incremental updates to UniformityAnalysis. And that PR is only about identifying divergent branches, with or without convergent operations. Both branch divergence and convergent operations separately force us to step back from some optimization or the other, and the inability to update UA is in the centre of all that. |
Is this PR still active? The general direction looks okay to me, but every place that uses UI needs to show why it is safe to continue using UI after each local change to the code. |
We probably shouldn't be doing this in InstCombine, if we're going to do it anywhere. It's going to trigger infrequently.
@Acim-Maravic are you still working on this? I would like to initiate work on a separate pass. Besides just writing the pass, I think some effort will be spent in evaluating the benefit and figuring out where to run it in the pipeline. |