AMDGPU: Don't try to fold wavefrontsize intrinsic in libcall simplify

arsenm · arsenm · commit 5dfdd3494bf2 · 2023-08-01T18:20:50.000-04:00
It's not a libcall so doesn't really belong here to begin
with. Relying on checking the target name and explicit features isn't
particularly sound either. The library doesn't use the intrinsic
anymore, so it doesn't matter anyway.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -48,7 +48,7 @@ FunctionPass *createSIPreAllocateWWMRegsPass();
 FunctionPass *createSIFormMemoryClausesPass();
 
 FunctionPass *createSIPostRABundlerPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
+FunctionPass *createAMDGPUSimplifyLibCallsPass();
 FunctionPass *createAMDGPUUseNativeCallsPass();
 ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
@@ -60,11 +60,8 @@ FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
-  AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
+  AMDGPUSimplifyLibCallsPass() {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-
-private:
-  TargetMachine &TM;
 };
 
 struct AMDGPUUseNativeCallsPass : PassInfoMixin<AMDGPUUseNativeCallsPass> {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -20,7 +20,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/InitializePasses.h"
-#include "llvm/Target/TargetMachine.h"
 #include <cmath>
 
 #define DEBUG_TYPE "amdgpu-simplifylib"
@@ -49,8 +48,6 @@ class AMDGPULibCalls {
 
   typedef llvm::AMDGPULibFunc FuncInfo;
 
-  const TargetMachine *TM;
-
   bool UnsafeFPMath = false;
 
   // -fuse-native.
@@ -101,13 +98,11 @@ class AMDGPULibCalls {
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                             const FuncInfo &FInfo);
 
-  // llvm.amdgcn.wavefrontsize
-  bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
-
   // Get insertion point at entry.
   BasicBlock::iterator getEntryIns(CallInst * UI);
   // Insert an Alloc instruction.
   AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
+
   // Get a scalar native builtin single argument FP function
   FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
 
@@ -126,7 +121,7 @@ class AMDGPULibCalls {
   }
 
 public:
-  AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+  AMDGPULibCalls() {}
 
   bool fold(CallInst *CI);
 
@@ -148,8 +143,7 @@ namespace {
   public:
     static char ID; // Pass identification
 
-    AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), Simplifier(TM) {
+    AMDGPUSimplifyLibCalls() : FunctionPass(ID) {
       initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
 
@@ -602,18 +596,8 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
 bool AMDGPULibCalls::fold(CallInst *CI) {
   Function *Callee = CI->getCalledFunction();
   // Ignore indirect calls.
-  if (!Callee || CI->isNoBuiltin())
-    return false;
-
-  IRBuilder<> B(CI);
-  switch (Callee->getIntrinsicID()) {
-  case Intrinsic::not_intrinsic:
-    break;
-  case Intrinsic::amdgcn_wavefrontsize:
-    return !EnablePreLink && fold_wavefrontsize(CI, B);
-  default:
+  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
     return false;
-  }
 
   FuncInfo FInfo;
   if (!parseFunctionName(Callee->getName(), FInfo))
@@ -629,6 +613,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   if (TDOFold(CI, FInfo))
     return true;
 
+  IRBuilder<> B(CI);
+
   if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
     // Under unsafe-math, evaluate calls if possible.
     // According to Brian Sumner, we can do this for all f32 function calls
@@ -1310,28 +1296,6 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
   return true;
 }
 
-bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
-  if (!TM)
-    return false;
-
-  StringRef CPU = TM->getTargetCPU();
-  StringRef Features = TM->getTargetFeatureString();
-  if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
-      (Features.empty() || !Features.contains_insensitive("wavefrontsize")))
-    return false;
-
-  Function *F = CI->getParent()->getParent();
-  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
-  unsigned N = ST.getWavefrontSize();
-
-  LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
-               << N << "\n");
-
-  CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
-  CI->eraseFromParent();
-  return true;
-}
-
 // Get insertion point at entry.
 BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
   Function * Func = UI->getParent()->getParent();
@@ -1642,8 +1606,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
 }
 
 // Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
-  return new AMDGPUSimplifyLibCalls(TM);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass() {
+  return new AMDGPUSimplifyLibCalls();
 }
 
 FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
@@ -1677,7 +1641,7 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
 
 PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                   FunctionAnalysisManager &AM) {
-  AMDGPULibCalls Simplifier(&TM);
+  AMDGPULibCalls Simplifier;
   Simplifier.initNativeFuncs();
   Simplifier.initFunction(F);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -631,7 +631,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
       [this](StringRef PassName, FunctionPassManager &PM,
              ArrayRef<PassBuilder::PipelineElement>) {
         if (PassName == "amdgpu-simplifylib") {
-          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+          PM.addPass(AMDGPUSimplifyLibCallsPass());
           return true;
         }
         if (PassName == "amdgpu-usenative") {
@@ -683,11 +683,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   });
 
   PB.registerPipelineStartEPCallback(
-      [this](ModulePassManager &PM, OptimizationLevel Level) {
+      [](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
         FPM.addPass(AMDGPUUseNativeCallsPass());
         if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
-          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+          FPM.addPass(AMDGPUSimplifyLibCallsPass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
       });
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,16 +4,16 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s
+; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
 ; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
@@ -22,10 +22,8 @@
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT-W32:   store i32 32, ptr addrspace(1) %arg, align 4
-; OPT-W64:   store i32 64, ptr addrspace(1) %arg, align 4
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   store i32 %tmp, ptr addrspace(1) %arg, align 4
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   store i32 %tmp, ptr addrspace(1) %arg, align 4
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
@@ -43,12 +41,10 @@ bb:
 ; GCN-NOT:   cndmask
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT-W32:   store i32 1, ptr addrspace(1) %arg, align 4
-; OPT-W64:   store i32 2, ptr addrspace(1) %arg, align 4
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT-WXX:   %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT-WXX:   store i32 %tmp2, ptr addrspace(1) %arg
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT:   %tmp2 = select i1 %tmp1, i32 2, i32 1
+; OPT:   store i32 %tmp2, ptr addrspace(1) %arg
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
@@ -64,10 +60,9 @@ bb:
 ; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
 
 ; OPT:       bb:
-; OPT-WXX:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT-WXX:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT-WXX:   bb3:
-; OPT-W64:   store i32 1, ptr addrspace(1) %arg, align 4
+; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
+; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
+; OPT:   bb3:
 ; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {