Support 2 attributes: one for min and one for max number of work groups.

Jun Wang · Jun Wang · commit c4e460b39c5a · 2024-02-04T18:23:06.000-06:00
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -2031,10 +2031,17 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
-def AMDGPUNumWorkGroups : InheritableAttr {
-  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
-  let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
-  let Documentation = [AMDGPUNumWorkGroupsDocs];
+def AMDGPUMinNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMinNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
@@ -2705,14 +2705,38 @@ An error will be given if:
   }];
 }
 
-def AMDGPUNumWorkGroupsDocs : Documentation {
+def AMDGPUMinNumWorkGroupsDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-The number of work groups specifies the number of work groups when the kernel
+This attribute specifies the minimum number of work groups used when the kernel
 is dispatched.
 
 Clang supports the
-``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<x>`` parameter specifies the minimum number of work groups in the x dimension.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+This attribute specifies the maximum number of work groups used when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` attribute for the
 AMDGPU target. This attribute may be attached to a kernel function definition
 and is an optimization hint.
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -357,18 +357,55 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
 
-  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
-    uint32_t X = Attr->getNumWorkGroupsX();
-    uint32_t Y = Attr->getNumWorkGroupsY();
-    uint32_t Z = Attr->getNumWorkGroupsZ();
-
-    if (X != 0 && Y != 0 && Z != 0) {
-      std::string AttrVal = llvm::utostr(X) + std::string(", ") +
-                            llvm::utostr(Y) + std::string(", ") +
-                            llvm::utostr(Z);
-      F->addFnAttr("amdgpu-num-work-groups", AttrVal);
+  uint32_t MinWGX = 0;
+  uint32_t MinWGY = 0;
+  uint32_t MinWGZ = 0;
+
+  uint32_t MaxWGX = 0;
+  uint32_t MaxWGY = 0;
+  uint32_t MaxWGZ = 0;
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
+    MinWGX = Attr->getMinNumWorkGroupsX();
+    MinWGY = Attr->getMinNumWorkGroupsY();
+    MinWGZ = Attr->getMinNumWorkGroupsZ();
+
+    if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
+      IsMinNumWGValid = true;
+  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    MaxWGX = Attr->getMaxNumWorkGroupsX();
+    MaxWGY = Attr->getMaxNumWorkGroupsY();
+    MaxWGZ = Attr->getMaxNumWorkGroupsZ();
+
+    if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
+      IsMaxNumWGValid = true;
+  }
+
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
     }
   }
+
+  if (IsMinNumWGValid) {
+    std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
+                          llvm::utostr(MinWGY) + std::string(", ") +
+                          llvm::utostr(MinWGZ);
+    F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
+  }
+
+  if (IsMaxNumWGValid) {
+    std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
+                          llvm::utostr(MaxWGY) + std::string(", ") +
+                          llvm::utostr(MaxWGZ);
+    F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8069,23 +8069,42 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
-static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
-                                          const ParsedAttr &AL) {
-  uint32_t NumWGX = 0;
-  uint32_t NumWGY = 0;
-  uint32_t NumWGZ = 0;
-  Expr *NumWGXExpr = AL.getArgAsExpr(0);
-  Expr *NumWGYExpr = AL.getArgAsExpr(1);
-  Expr *NumWGZExpr = AL.getArgAsExpr(2);
-  if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
+static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MinNumWGX = 0;
+  uint32_t MinNumWGY = 0;
+  uint32_t MinNumWGZ = 0;
+  Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
+    return;
+  if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
+      S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
+}
+
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  uint32_t MaxNumWGX = 0;
+  uint32_t MaxNumWGY = 0;
+  uint32_t MaxNumWGZ = 0;
+  Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
+  Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
+  Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
+  if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
+  if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
     return;
-  if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
+  if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
     return;
 
-  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
-                                                       NumWGY, NumWGZ));
+  D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
+      S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
 }
 
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9211,11 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
-  case ParsedAttr::AT_AMDGPUNumWorkGroups:
-    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+  case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
+    handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
+    break;
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
     break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,9 +4,10 @@
 
 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
-// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,13 +494,39 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
-  unsigned NumWGX = MFI.getNumWorkGroupsX();
-  unsigned NumWGY = MFI.getNumWorkGroupsY();
-  unsigned NumWGZ = MFI.getNumWorkGroupsZ();
-  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
-    Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
-    Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
-    Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
+
+  unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
+  unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
+  unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
+
+  unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
+
+  bool IsMinNumWGValid = false;
+  bool IsMaxNumWGValid = false;
+  if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
+    IsMinNumWGValid = true;
+  if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
+    IsMaxNumWGValid = true;
+  if (IsMinNumWGValid && IsMaxNumWGValid) {
+    if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
+        MaxNumWGZ < MinNumWGZ) {
+      IsMinNumWGValid = false;
+      IsMaxNumWGValid = false;
+    }
+  }
+
+  if (IsMinNumWGValid) {
+    Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
+    Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
+    Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
+  }
+
+  if (IsMaxNumWGValid) {
+    Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
+    Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
+    Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
   }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1110,6 +1110,10 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
 }
 
 SmallVector<unsigned>
-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
-  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
+AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
+}
+SmallVector<unsigned>
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,8 +288,11 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
-  /// Return the number of work groups for the function.
-  SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
+  /// Return the min number of work groups for the function.
+  SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
+
+  /// Return the max number of work groups for the function.
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
 
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
-  NumWorkGroups = ST.getNumWorkGroups(F);
-  assert(NumWorkGroups.size() == 3);
+  MinNumWorkGroups = ST.getMinNumWorkGroups(F);
+  assert(MinNumWorkGroups.size() == 3);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -427,7 +427,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
   // Default/requested number of work groups for the function.
-  SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
 
 private:
   unsigned NumUserSGPRs = 0;
@@ -1077,11 +1078,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool usesAGPRs(const MachineFunction &MF) const;
 
   /// \returns Default/requested number of work groups for this function.
-  SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
+  SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
 
-  unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
-  unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
-  unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
+  unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
+  unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
+  unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
+
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll

Original file line number	Diff line number	Diff line change
`@@ -1110,6 +1110,10 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {`
`1110`	`1110`	`}`
`1111`	`1111`
`1112`	`1112`	`SmallVector<unsigned>`
`1113`		`-AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {`
`1114`		`- return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);`
	`1113`	`+AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {`
	`1114`	`+ return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);`
	`1115`	`+}`
	`1116`	`+SmallVector<unsigned>`
	`1117`	`+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {`
	`1118`	`+ return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);`
`1115`	`1119`	`}`