AMDGPU: Mark grid size loads with range metadata #113019

arsenm · 2024-10-19T03:43:29Z

Only handles the v5 case.

arsenm · 2024-10-19T03:43:40Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2024-10-19T03:44:27Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Only handles the v5 case.

Full diff: https://github.com/llvm/llvm-project/pull/113019.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp (+29-4)
(modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+1)
(added) llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll (+124)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 1bb5e794da7dd6..5fc0c36359b6f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 
@@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
 
 } // end anonymous namespace
 
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+                                            uint32_t MaxNumGroups) {
+  if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+    return;
+
+  if (!Load->getType()->isIntegerTy(32))
+    return;
+
+  // TODO: If there is existing range metadata, preserve it if it is stricter.
+  MDBuilder MDB(Load->getContext());
+  MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+  Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
 static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   Function *F = CI->getParent()->getParent();
 
@@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
   const bool HasUniformWorkGroupSize =
     F->getFnAttribute("uniform-work-group-size").getValueAsBool();
 
-  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
+  SmallVector<unsigned> MaxNumWorkgroups =
+      AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);
+
+  if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+      none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
     return false;
 
   Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
     if (IsV5OrAbove) { // Base is ImplicitArgPtr.
       switch (Offset) {
       case HIDDEN_BLOCK_COUNT_X:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[0] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+        }
         break;
       case HIDDEN_BLOCK_COUNT_Y:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[1] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+        }
         break;
       case HIDDEN_BLOCK_COUNT_Z:
-        if (LoadSize == 4)
+        if (LoadSize == 4) {
           BlockCounts[2] = Load;
+          annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+        }
         break;
       case HIDDEN_GROUP_SIZE_X:
         if (LoadSize == 2)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 961a9220b48d6b..5a899b755419ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
       TM.getSubtarget<R600Subtarget>(F));
 }
 
+// FIXME: This has no reason to be in subtarget
 SmallVector<unsigned>
 AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
   return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
new file mode 100644
index 00000000000000..9064292129928f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+
+define i32 @use_grid_size_x_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_y_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
+; CHECK-NEXT:    [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_Y]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
+  %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
+  ret i32 %grid.size.y
+}
+
+define i32 @use_grid_size_z_max_num_workgroups() #0 {
+; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
+; CHECK-NEXT:    [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_Z]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
+  %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
+  ret i32 %grid.size.z
+}
+
+define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
+; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret <2 x i16> [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret <2 x i16> %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
+; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+  %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
+  ret i32 %grid.size.x
+}
+
+declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
+
+attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
+attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
+
+!0 = !{i32 0, i32 -1}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[RNG0]] = !{i32 1, i32 37}
+; CHECK: [[RNG1]] = !{i32 1, i32 43}
+; CHECK: [[RNG2]] = !{i32 1, i32 90}
+; CHECK: [[RNG3]] = !{i32 1, i32 -1}
+;.

I'm not sure what the interpretation of 0 is supposed to be, AMDGPUUsage doesn't say.

Only handles the v5 case.

shiltian · 2024-11-12T17:22:47Z

How are we gonna use the three sizes after we annotate them? What is the impact of using an inaccurate number?

arsenm · 2024-11-12T17:30:20Z

How are we gonna use the three sizes after we annotate them? What is the impact of using an inaccurate number?

It enables known bits style optimizations. I think we're introducing these a bit late though, eventually we should probably move this directly intro the attributor. The big win would be enabling shrinking 64-bit work item calculations to 32-bit

arsenm · 2024-12-09T15:58:37Z

Merge activity

Dec 9, 10:58 AM EST: A user started a stack merge that includes this pull request via Graphite.
Dec 9, 11:01 AM EST: A user merged this pull request with Graphite.

arsenm mentioned this pull request Oct 19, 2024

AMDGPU: Propagate amdgpu-max-num-workgroups attribute #113018

Merged

arsenm added the backend:AMDGPU label Oct 19, 2024 — with Graphite App

arsenm requested review from jdoerfert, jhuber6, jwanggit86, saiislam and shiltian October 19, 2024 03:44

arsenm marked this pull request as ready for review October 19, 2024 03:44

arsenm mentioned this pull request Oct 19, 2024

clang/AMDGPU: Emit grid size builtins with range metadata #113038

Merged

shiltian approved these changes Oct 19, 2024

View reviewed changes

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from 2005b26 to 58a330c Compare October 26, 2024 04:13

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 297173f to 5336b21 Compare October 26, 2024 04:14

arsenm mentioned this pull request Oct 26, 2024

AMDGPU: Treat uint32_max as the default value for amdgpu-max-num-workgroups #113751

Merged

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from 58a330c to b0cdf6f Compare October 28, 2024 22:19

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 5336b21 to 0bd73d5 Compare October 28, 2024 22:19

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from b0cdf6f to d26f42b Compare November 5, 2024 16:06

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 0bd73d5 to 8c4d57e Compare November 5, 2024 16:06

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from d26f42b to 8272a93 Compare November 5, 2024 20:52

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 8c4d57e to 64d530b Compare November 5, 2024 20:53

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from 8272a93 to b5edcd1 Compare November 5, 2024 21:10

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 64d530b to 63a1993 Compare November 5, 2024 21:10

arsenm added 7 commits November 6, 2024 15:42

AMDGPU: Propagate amdgpu-max-num-workgroups attribute

fd3ae0b

I'm not sure what the interpretation of 0 is supposed to be, AMDGPUUsage doesn't say.

Comment parameter name

41bb72b

Add a shader entry test

dfe5c82

Check isValidState

7994c83

Remove fixup of 0

eaf4a26

Remove todo

2ad2f35

AMDGPU: Mark grid size loads with range metadata

cc4a772

Only handles the v5 case.

arsenm force-pushed the users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups branch from b5edcd1 to 2ad2f35 Compare November 6, 2024 23:43

arsenm force-pushed the users/arsenm/amdgpu-annotate-grid-size-loads branch from 63a1993 to cc4a772 Compare November 6, 2024 23:43

Base automatically changed from users/arsenm/amdgpu-attributor-propagate-amdgpu-max-num-workgroups to main December 9, 2024 15:57

arsenm merged commit 009368f into main Dec 9, 2024
8 checks passed

arsenm deleted the users/arsenm/amdgpu-annotate-grid-size-loads branch December 9, 2024 16:01

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Mark grid size loads with range metadata #113019

AMDGPU: Mark grid size loads with range metadata #113019

Uh oh!

arsenm commented Oct 19, 2024

Uh oh!

arsenm commented Oct 19, 2024 •

edited

Loading

Uh oh!

llvmbot commented Oct 19, 2024

Uh oh!

shiltian commented Nov 12, 2024 •

edited

Loading

Uh oh!

arsenm commented Nov 12, 2024 •

edited

Loading

Uh oh!

arsenm commented Dec 9, 2024 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

AMDGPU: Mark grid size loads with range metadata #113019

AMDGPU: Mark grid size loads with range metadata #113019

Uh oh!

Conversation

arsenm commented Oct 19, 2024

Uh oh!

arsenm commented Oct 19, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 19, 2024

Uh oh!

shiltian commented Nov 12, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm commented Nov 12, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm commented Dec 9, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Merge activity

Uh oh!

Uh oh!

Uh oh!

arsenm commented Oct 19, 2024 •

edited

Loading

shiltian commented Nov 12, 2024 •

edited

Loading

arsenm commented Nov 12, 2024 •

edited

Loading

arsenm commented Dec 9, 2024 •

edited

Loading