llvm · jwanggit86 · Dec 15, 2023 · Dec 15, 2023
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -2011,6 +2011,13 @@ def AMDGPUNumVGPR : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
+  let Args = [UnsignedArgument<"NumWorkGroups">];
+  let Documentation = [AMDGPUNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
   let Spellings = [Clang<"amdgpu_kernel">];
   let Documentation = [Undocumented];

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
@@ -2693,6 +2693,29 @@ An error will be given if:
   }];
 }
 
+def AMDGPUNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+The number of work groups specifies the number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_num_work_groups(<num>)))`` attribute for the
+AMDGPU target. This attribute may be attached to a kernel function definition
+and is an optimization hint.
+
+``<num>`` parameter specifies the number of work groups.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,13 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
+    uint32_t NumWG = Attr->getNumWorkGroups();
+
+    if (NumWG != 0)
+      F->addFnAttr("amdgpu-num-work-groups", llvm::utostr(NumWG));
+  }
 }
 
 /// Emits control constants used to change per-architecture behaviour in the

diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8051,6 +8051,16 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }
 
+static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
+                                          const ParsedAttr &AL) {
+  uint32_t NumWG = 0;
+  Expr *NumWGExpr = AL.getArgAsExpr(0);
+  if (!checkUInt32Argument(S, AL, NumWGExpr, NumWG))
+    return;
+
+  D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWG));
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                               const ParsedAttr &AL) {
   // If we try to apply it to a function pointer, don't warn, but don't
@@ -9058,6 +9068,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     handleAMDGPUNumVGPRAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUNumWorkGroups:
+    handleAMDGPUNumWorkGroupsAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_AVRSignal:
     handleAVRSignalAttr(S, D, AL);
     break;

diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -6,6 +6,7 @@
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,10 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
 
   Kern[".max_flat_workgroup_size"] =
       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWG = MFI.getNumWorkGroups();
+  if (NumWG != 0) {
+    Kern[".num_work_groups"] = Kern.getDocument()->getNode(NumWG);
+  }
   Kern[".sgpr_spill_count"] =
       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
   Kern[".vgpr_spill_count"] =

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1108,3 +1108,9 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
   return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+unsigned AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
+  const unsigned Default = 0;
+  return AMDGPU::getUnsignedIntegerAttribute(F, "amdgpu-num-work-groups",
+                                             Default);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
   /// 2) dimension.
   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
 
+  /// Return the number of work groups for the function.
+  unsigned getNumWorkGroups(const Function &F) const;
+
   /// Return true if only a single workitem can be active in a wave.
   bool isSingleLaneExecution(const Function &Kernel) const;
 

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
   WavesPerEU = ST.getWavesPerEU(F);
+  NumWorkGroups = ST.getNumWorkGroups(F);
 
   Occupancy = ST.computeOccupancy(F, getLDSSize());
   CallingConv::ID CC = F.getCallingConv();

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
 
+  // Default/requested number of work groups for the function.
+  unsigned NumWorkGroups = 0;
+
 private:
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
@@ -1094,6 +1097,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  unsigned getNumWorkGroups() const { return NumWorkGroups; }
 };
 
 } // end namespace llvm

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1221,6 +1221,22 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
   return Ints;
 }
 
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  LLVMContext &Ctx = F.getContext();
+  unsigned IntVal = Default;
+  StringRef Str = A.getValueAsString();
+  if (Str.trim().getAsInteger(0, IntVal)) {
+    Ctx.emitError("can't parse integer attribute " + Name);
+    return Default;
+  }
+  return IntVal;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -818,6 +818,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
 /// to integer.
 int getIntegerAttribute(const Function &F, StringRef Name, int Default);
 
+/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
+                                     unsigned Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups is 0.
+; CHECK-LABEL: {{^}}empty_num_work_groups_0:
+define amdgpu_kernel void @empty_num_work_groups_0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-num-work-groups"="0"}
+
+; Exactly 1 work group.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1:
+define amdgpu_kernel void @empty_num_work_groups_1() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-num-work-groups"="1"}
+
+; Exactly 5 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_5:
+define amdgpu_kernel void @empty_num_work_groups_5() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-num-work-groups"="5"}
+
+; Exactly 32 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_32:
+define amdgpu_kernel void @empty_num_work_groups_32() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-num-work-groups"="32"}
+
+; Exactly 50 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_50:
+define amdgpu_kernel void @empty_num_work_groups_50() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-num-work-groups"="50"}
+
+; Exactly 256 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_256:
+define amdgpu_kernel void @empty_num_work_groups_256() #5 {
+entry:
+  ret void
+}
+attributes #5 = {"amdgpu-num-work-groups"="256"}
+
+; Exactly 1024 work groups.
+; CHECK-LABEL: {{^}}empty_num_work_groups_1024:
+define amdgpu_kernel void @empty_num_work_groups_1024() #6 {
+entry:
+  ret void
+}
+attributes #6 = {"amdgpu-num-work-groups"="1024"}
+
+; CHECK: .amdgpu_metadata
+; CHECK:        .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+; CHECK:        .name:           empty_num_work_groups_1
+; CHECK-NEXT:   .num_work_groups: 1
+; CHECK:        .name:           empty_num_work_groups_5
+; CHECK-NEXT:   .num_work_groups: 5
+; CHECK:        .name:           empty_num_work_groups_32
+; CHECK-NEXT:   .num_work_groups: 32
+; CHECK:        .name:           empty_num_work_groups_50
+; CHECK-NEXT:   .num_work_groups: 50
+; CHECK:        .name:           empty_num_work_groups_256
+; CHECK-NEXT:   .num_work_groups: 256
+; CHECK:        .name:           empty_num_work_groups_1024
+; CHECK-NEXT:   .num_work_groups: 1024