AMDGPU: Mark grid size loads with range metadata #113019

Merged · 7 commits · Dec 9, 2024
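
In brief: this change makes the AMDGPU attributor propagate the amdgpu-max-num-workgroups attribute through the call graph (via a new AAAMDMaxNumWorkgroups abstract attribute) and teaches amdgpu-lower-kernel-attributes to turn that bound into !range metadata on 32-bit loads of the hidden block-count (grid size in workgroups) implicit arguments. A before/after sketch, distilled from the test file at the end of the diff:

; Before, in a function with "amdgpu-max-num-workgroups"="36,42,89":
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4

; After amdgpu-lower-kernel-attributes, the x block-count load is bounded:
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
!0 = !{i32 1, i32 37} ; at least 1 workgroup, at most 36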
149 changes: 147 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -179,6 +179,11 @@ class AMDGPUInformationCache : public InformationCache {
return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
}

SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getMaxNumWorkGroups(F);
}

/// Get code object version.
unsigned getCodeObjectVersion() const { return CodeObjectVersion; }

@@ -821,6 +826,145 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
"AAAMDFlatWorkGroupSize is only valid for function position");
}

struct TupleDecIntegerRangeState : public AbstractState {
DecIntegerState<uint32_t> X, Y, Z;

bool isValidState() const override {
return X.isValidState() && Y.isValidState() && Z.isValidState();
}

bool isAtFixpoint() const override {
return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
}

ChangeStatus indicateOptimisticFixpoint() override {
return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
Z.indicateOptimisticFixpoint();
}

ChangeStatus indicatePessimisticFixpoint() override {
return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
Z.indicatePessimisticFixpoint();
}

TupleDecIntegerRangeState operator^=(const TupleDecIntegerRangeState &Other) {
X ^= Other.X;
Y ^= Other.Y;
Z ^= Other.Z;
return *this;
}

bool operator==(const TupleDecIntegerRangeState &Other) const {
return X == Other.X && Y == Other.Y && Z == Other.Z;
}

TupleDecIntegerRangeState &getAssumed() { return *this; }
const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};

using AAAMDMaxNumWorkgroupsState =
StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;

/// Propagate amdgpu-max-num-workgroups attribute.
struct AAAMDMaxNumWorkgroups
: public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

X.takeKnownMinimum(MaxNumWorkgroups[0]);
Y.takeKnownMinimum(MaxNumWorkgroups[1]);
Z.takeKnownMinimum(MaxNumWorkgroups[2]);

if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
indicatePessimisticFixpoint();
}

ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Change = ChangeStatus::UNCHANGED;

auto CheckCallSite = [&](AbstractCallSite CS) {
Function *Caller = CS.getInstruction()->getFunction();
LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
<< "->" << getAssociatedFunction()->getName() << '\n');

const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
if (!CallerInfo || !CallerInfo->isValidState())
return false;

Change |=
clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
return true;
};

bool AllCallSitesKnown = true;
if (!A.checkForAllCallSites(CheckCallSite, *this,
/*RequireAllCallSites=*/true,
AllCallSitesKnown))
return indicatePessimisticFixpoint();

return Change;
}

/// Create an abstract attribute view for the position \p IRP.
static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
Attributor &A);

ChangeStatus manifest(Attributor &A) override {
Function *F = getAssociatedFunction();
LLVMContext &Ctx = F->getContext();
SmallString<32> Buffer;
raw_svector_ostream OS(Buffer);
OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

// TODO: Should annotate loads of the group size for this to do anything
// useful.
return A.manifestAttrs(
getIRPosition(),
{Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
/* ForceReplace= */ true);
}

const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }

const std::string getAsStr(Attributor *) const override {
std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
raw_string_ostream OS(Buffer);
OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
<< ']';
return OS.str();
}

const char *getIdAddr() const override { return &ID; }

/// This function should return true if the type of the \p AA is
/// AAAMDMaxNumWorkgroups
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}

void trackStatistics() const override {}

/// Unique ID (due to the unique address)
static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;

AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
llvm_unreachable("AAAMDMaxNumWorkgroups is only valid for function position");
}

/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
@@ -1046,8 +1190,8 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
&AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
&AAInstanceInfo::ID});

@@ -1071,6 +1215,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
for (auto *F : Functions) {
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
CallingConv::ID CC = F->getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CC)) {
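
The attributor piece works caller-to-callee: each function starts from its own subtarget/attribute bound, entry functions (kernels) are pinned at a pessimistic fixpoint so their explicit attribute stays authoritative, and every other function takes the meet (^=) of all its callers' states, falling back to the pessimistic fixpoint when some call site is unknown. A minimal sketch of the net effect (function names here are illustrative, not from this PR):

define amdgpu_kernel void @kernel() "amdgpu-max-num-workgroups"="16,8,4" {
  call void @helper()
  ret void
}

define internal void @helper() {
  ret void
}

; After amdgpu-attributor, @helper inherits the bound of its only caller:
;   define internal void @helper() ... "amdgpu-max-num-workgroups"="16,8,4"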
33 changes: 29 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"

@@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {

} // end anonymous namespace

static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
uint32_t MaxNumGroups) {
if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
return;

if (!Load->getType()->isIntegerTy(32))
return;

// TODO: If there is existing range metadata, preserve it if it is stricter.
MDBuilder MDB(Load->getContext());
MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
Load->setMetadata(LLVMContext::MD_range, Range);
}

static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Function *F = CI->getParent()->getParent();

@@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
const bool HasUniformWorkGroupSize =
F->getFnAttribute("uniform-work-group-size").getValueAsBool();

if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
SmallVector<unsigned> MaxNumWorkgroups =
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);

if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;

Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (IsV5OrAbove) { // Base is ImplicitArgPtr.
switch (Offset) {
case HIDDEN_BLOCK_COUNT_X:
if (LoadSize == 4)
if (LoadSize == 4) {
BlockCounts[0] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
}
break;
case HIDDEN_BLOCK_COUNT_Y:
if (LoadSize == 4)
if (LoadSize == 4) {
BlockCounts[1] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
}
break;
case HIDDEN_BLOCK_COUNT_Z:
if (LoadSize == 4)
if (LoadSize == 4) {
BlockCounts[2] = Load;
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
}
break;
case HIDDEN_GROUP_SIZE_X:
if (LoadSize == 2)
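
The metadata encodes the half-open interval [1, MaxNumGroups + 1): a dispatch always launches at least one workgroup per dimension, and at most the attribute's bound for that dimension. With "amdgpu-max-num-workgroups"="36,42,89", the three annotated loads therefore carry:

!range !{i32 1, i32 37} ; x block count in [1, 36]
!range !{i32 1, i32 43} ; y block count in [1, 42]
!range !{i32 1, i32 90} ; z block count in [1, 89]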
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
TM.getSubtarget<R600Subtarget>(F));
}

// FIXME: This has no reason to be in subtarget
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
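
For reference, getIntegerVecAttribute parses the comma-separated attribute string into a fixed-length vector of unsigned values, e.g. "amdgpu-max-num-workgroups"="36,42,89" becomes {36, 42, 89}; the FIXME records that this parsing has no real subtarget dependence and could live elsewhere.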
124 changes: 124 additions & 0 deletions llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -0,0 +1,124 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s

define i32 @use_grid_size_x_max_num_workgroups() #0 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
ret i32 %grid.size.x
}

define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
ret i32 %grid.size.x
}

define i32 @use_grid_size_y_max_num_workgroups() #0 {
; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
%grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
ret i32 %grid.size.y
}

define i32 @use_grid_size_z_max_num_workgroups() #0 {
; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
%grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
ret i32 %grid.size.z
}

define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
ret <2 x i16> %grid.size.x
}

define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
ret i32 %grid.size.x
}

define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
ret i32 %grid.size.x
}

define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
;
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
ret i32 %grid.size.x
}

declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3

attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }

!0 = !{i32 0, i32 -1}

;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[RNG0]] = !{i32 1, i32 37}
; CHECK: [[RNG1]] = !{i32 1, i32 43}
; CHECK: [[RNG2]] = !{i32 1, i32 90}
; CHECK: [[RNG3]] = !{i32 1, i32 -1}
;.
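
Worth noting from the checks above: a per-dimension bound of 0 (unset) or 4294967295 (UINT32_MAX, for which MaxNumGroups + 1 would wrap to 0) makes annotateGridSizeLoadWithRangeMD bail out, so those loads stay unannotated; 4294967294 is still annotated, and its exclusive upper bound 4294967295 prints as the signed constant i32 -1 in [[RNG3]]. A load of the wrong type, such as the <2 x i16> case, is likewise left alone.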