llvm · PankajDwivedi-25 · Nov 21, 2024 · Nov 22, 2024 · Nov 22, 2024 · Dec 5, 2024
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -491,6 +491,11 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+struct AMDGPUUniformIntrinsicCombinePass
+    : public PassInfoMixin<AMDGPUUniformIntrinsicCombinePass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
 MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("amdgpu-uniform-intrinsic-combine", AMDGPUUniformIntrinsicCombinePass())
 #undef MODULE_PASS
 
 #ifndef MODULE_PASS_WITH_PARAMS

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -478,6 +478,11 @@ static cl::opt<bool> HasClosedWorldAssumption(
     cl::desc("Whether has closed-world assumption at link time"),
     cl::init(false), cl::Hidden);
 
+static cl::opt<bool> EnableUniformIntrinsicCombine(
+    "amdgpu-enable-uniform-intrinsic-combine",
+    cl::desc("Enable/Disable the Uniform Intrinsic Combine Pass"),
+    cl::init(true), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -827,6 +832,9 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
 
         if (EarlyInlineAll && !EnableFunctionCalls)
           PM.addPass(AMDGPUAlwaysInlinePass());
+
+        if (EnableUniformIntrinsicCombine)
+          PM.addPass(AMDGPUUniformIntrinsicCombinePass());
       });
 
   PB.registerPeepholeEPCallback(

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp
@@ -0,0 +1,138 @@
+//===-- AMDGPUUniformIntrinsicCombine.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass simplifies certain intrinsic calls when the arguments are uniform.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-uniform-intrinsic-combine"
+
+using namespace llvm;
+using namespace llvm::AMDGPU;
+using namespace llvm::PatternMatch;
+
+/// Optimizes uniform intrinsics.
+static bool optimizeUniformIntrinsic(IntrinsicInst &II,
+                                     const UniformityInfo &UI) {
+  llvm::Intrinsic::ID IID = II.getIntrinsicID();
+
+  switch (IID) {
+  case Intrinsic::amdgcn_permlane64:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_readlane: {
+    Value *Src = II.getArgOperand(0);
+    // Check if the argument use is divergent
+    if (UI.isDivergentUse(II.getOperandUse(0)))
+      return false;
+    LLVM_DEBUG(dbgs() << "Replacing " << II << " with " << *Src << '\n');
+    II.replaceAllUsesWith(Src);
+    II.eraseFromParent();
+    return true;
+  }
+  case Intrinsic::amdgcn_ballot: {
+    Value *Src = II.getArgOperand(0);
+    if (UI.isDivergentUse(II.getOperandUse(0)))
+      return false;
+    LLVM_DEBUG(dbgs() << "Found uniform ballot intrinsic: " << II << '\n');
+
+    // If there are no ICmp users, return early.
+    if (none_of(II.users(), [](User *U) { return isa<ICmpInst>(U); }))
+      return false;
+
+    bool Changed = false;
+    for (User *U : make_early_inc_range(II.users())) {
+      if (auto *ICmp = dyn_cast<ICmpInst>(U)) {
+        Value *Op0 = ICmp->getOperand(0);
+        Value *Op1 = ICmp->getOperand(1);
+        ICmpInst::Predicate Pred = ICmp->getPredicate();
+        Value *OtherOp = Op0 == &II ? Op1 : Op0;
+
+        if (Pred == ICmpInst::ICMP_EQ && match(OtherOp, m_Zero())) {
+          // Case (icmp eq %ballot, 0) -->  xor %ballot_arg, 1
+          Instruction *NotOp =
+              BinaryOperator::CreateNot(Src, "", ICmp->getIterator());
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_EQ: " << *NotOp << '\n');
+          ICmp->replaceAllUsesWith(NotOp);
+          ICmp->eraseFromParent();
+          Changed = true;
+        } else if (Pred == ICmpInst::ICMP_NE && match(OtherOp, m_Zero())) {
+          // (icmp ne %ballot, 0)  -->  %ballot_arg
+          LLVM_DEBUG(dbgs() << "Replacing ICMP_NE with ballot argument: "
+                            << *Src << '\n');
+          ICmp->replaceAllUsesWith(Src);
+          ICmp->eraseFromParent();
+          Changed = true;
+        }
+      }
+    }
+    // Erase the intrinsic if it has no remaining uses.
+    if (II.use_empty())
+      II.eraseFromParent();
+    return Changed;
+  }
+  default:
+    llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
+  }
+  return false;
+}
+
+/// Iterate over the Intrinsics use in the Module to optimise.
+static bool runUniformIntrinsicCombine(Module &M, ModuleAnalysisManager &AM) {
+  bool IsChanged = false;
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M) {
+    switch (F.getIntrinsicID()) {
+    case Intrinsic::amdgcn_permlane64:
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
+    case Intrinsic::amdgcn_ballot:
+      break;
+    default:
+      continue;
+    }
+
+    for (User *U : F.users()) {
+      auto *II = cast<IntrinsicInst>(U);
+      Function *ParentF = II->getFunction();
+      if (ParentF->isDeclaration())
+        continue;
+
+      const auto &UI = FAM.getResult<UniformityInfoAnalysis>(*ParentF);
+      IsChanged |= optimizeUniformIntrinsic(*II, UI);
+    }
+  }
+  return IsChanged;
+}
+
+PreservedAnalyses
+AMDGPUUniformIntrinsicCombinePass::run(Module &M, ModuleAnalysisManager &AM) {
+  if (!runUniformIntrinsicCombine(M, AM))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<UniformityInfoAnalysis>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -62,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
   AMDGPUInstCombineIntrinsic.cpp
+  AMDGPUUniformIntrinsicCombine.cpp
   AMDGPUInstrInfo.cpp
   AMDGPUInstructionSelector.cpp
   AMDGPUISelDAGToDAG.cpp