Skip to content

Commit d880f5a

Browse files
authored
[AMDGPU][Attributor] Remove uniformity check in the indirect call specialization callback (#106177)
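This patch removes the conservative uniformity check in the indirect call specialization callback, because whether the function pointer is uniform does not matter much. Instead, we add a command-line option, `-amdgpu-indirect-call-specialization-threshold` (default 3), to control specialization: an indirect call to a non-entry function is specialized only when its number of assumed callees does not exceed the threshold.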
1 parent 2bf2468 commit d880f5a

File tree

3 files changed

+78
-20
lines changed

3 files changed

+78
-20
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17-
#include "llvm/Analysis/TargetTransformInfo.h"
1817
#include "llvm/CodeGen/TargetPassConfig.h"
1918
#include "llvm/IR/IntrinsicsAMDGPU.h"
2019
#include "llvm/IR/IntrinsicsR600.h"
@@ -33,6 +32,12 @@ static cl::opt<unsigned> KernargPreloadCount(
3332
"amdgpu-kernarg-preload-count",
3433
cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
3534

35+
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
36+
"amdgpu-indirect-call-specialization-threshold",
37+
cl::desc(
38+
"A threshold controls whether an indirect call will be specialized"),
39+
cl::init(3));
40+
3641
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
3742

3843
enum ImplicitArgumentPositions {
@@ -1049,16 +1054,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
10491054
AC.IsModulePass = true;
10501055
AC.DefaultInitializeLiveInternals = false;
10511056
AC.IndirectCalleeSpecializationCallback =
1052-
[&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1053-
Function &Callee, unsigned NumAssumedCallees) {
1054-
if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
1055-
return false;
1056-
// Singleton functions can be specialized.
1057-
if (NumAssumedCallees == 1)
1058-
return true;
1059-
// Otherwise specialize uniform values.
1060-
const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
1061-
return TTI.isAlwaysUniform(CB.getCalledOperand());
1057+
[](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1058+
Function &Callee, unsigned NumAssumedCallees) {
1059+
return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
1060+
(NumAssumedCallees <= IndirectCallSpecializationThreshold);
10621061
};
10631062
AC.IPOAmendableCB = [](const Function &F) {
10641063
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,19 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
231231
; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
232232
; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
233233
; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
234-
; CHECK-NEXT: call void [[FPTR]]()
234+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
235+
; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
236+
; CHECK: 2:
237+
; CHECK-NEXT: call void @also_empty()
238+
; CHECK-NEXT: br label [[TMP6:%.*]]
239+
; CHECK: 3:
240+
; CHECK-NEXT: br i1 true, label [[TMP4:%.*]], label [[TMP5:%.*]]
241+
; CHECK: 4:
242+
; CHECK-NEXT: call void @empty()
243+
; CHECK-NEXT: br label [[TMP6]]
244+
; CHECK: 5:
245+
; CHECK-NEXT: unreachable
246+
; CHECK: 6:
235247
; CHECK-NEXT: ret void
236248
;
237249
%fptr = select i1 %cond, ptr @empty, ptr @also_empty

llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck --check-prefixes=CHECK,OW %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor<closed-world>' %s | FileCheck --check-prefixes=CHECK,CW %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='amdgpu-attributor<closed-world>' -amdgpu-indirect-call-specialization-threshold=0 %s | FileCheck --check-prefixes=CHECK,NO %s
45

56
target datalayout = "A5"
67

@@ -9,8 +10,8 @@ target datalayout = "A5"
910
;.
1011
; CHECK: @G = global i32 0, align 4
1112
;.
12-
define void @bar() {
13-
; CHECK-LABEL: define {{[^@]+}}@bar
13+
define void @bar1() {
14+
; CHECK-LABEL: define {{[^@]+}}@bar1
1415
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
1516
; CHECK-NEXT: entry:
1617
; CHECK-NEXT: store i32 1, ptr @G, align 4
@@ -21,14 +22,36 @@ entry:
2122
ret void
2223
}
2324

24-
define ptr @helper() {
25-
; CHECK-LABEL: define {{[^@]+}}@helper
25+
define void @bar2() {
26+
; CHECK-LABEL: define {{[^@]+}}@bar2
2627
; CHECK-SAME: () #[[ATTR0]] {
2728
; CHECK-NEXT: entry:
28-
; CHECK-NEXT: ret ptr @bar
29+
; CHECK-NEXT: store i32 2, ptr @G, align 4
30+
; CHECK-NEXT: ret void
2931
;
3032
entry:
31-
ret ptr @bar
33+
store i32 2, ptr @G, align 4
34+
ret void
35+
}
36+
37+
define ptr @helper1() {
38+
; CHECK-LABEL: define {{[^@]+}}@helper1
39+
; CHECK-SAME: () #[[ATTR0]] {
40+
; CHECK-NEXT: entry:
41+
; CHECK-NEXT: ret ptr @bar1
42+
;
43+
entry:
44+
ret ptr @bar1
45+
}
46+
47+
define ptr @helper2() {
48+
; CHECK-LABEL: define {{[^@]+}}@helper2
49+
; CHECK-SAME: () #[[ATTR0]] {
50+
; CHECK-NEXT: entry:
51+
; CHECK-NEXT: ret ptr @bar2
52+
;
53+
entry:
54+
ret ptr @bar2
3255
}
3356

3457
define amdgpu_kernel void @foo(ptr noundef %fp) {
@@ -45,10 +68,29 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
4568
; CW-NEXT: entry:
4669
; CW-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
4770
; CW-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
48-
; CW-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
49-
; CW-NEXT: call void @bar()
71+
; CW-NEXT: [[TMP0:%.*]] = icmp eq ptr [[FP]], @bar1
72+
; CW-NEXT: br i1 [[TMP0]], label [[TMP1:%.*]], label [[TMP2:%.*]]
73+
; CW: 1:
74+
; CW-NEXT: call void @bar1()
75+
; CW-NEXT: br label [[TMP5:%.*]]
76+
; CW: 2:
77+
; CW-NEXT: br i1 true, label [[TMP3:%.*]], label [[TMP4:%.*]]
78+
; CW: 3:
79+
; CW-NEXT: call void @bar2()
80+
; CW-NEXT: br label [[TMP5]]
81+
; CW: 4:
82+
; CW-NEXT: unreachable
83+
; CW: 5:
5084
; CW-NEXT: ret void
5185
;
86+
; NO-LABEL: define {{[^@]+}}@foo
87+
; NO-SAME: (ptr noundef [[FP:%.*]]) #[[ATTR1:[0-9]+]] {
88+
; NO-NEXT: entry:
89+
; NO-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
90+
; NO-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
91+
; NO-NEXT: call void [[FP]](), !callees [[META0:![0-9]+]]
92+
; NO-NEXT: ret void
93+
;
5294
entry:
5395
%fp.addr = alloca ptr, addrspace(5)
5496
store ptr %fp, ptr addrspace(5) %fp.addr
@@ -57,10 +99,15 @@ entry:
5799
ret void
58100
}
59101

102+
;.
103+
; NO: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
104+
; NO: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
60105
;.
61106
; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
62107
; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
63108
;.
64109
; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
65110
; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
66111
;.
112+
; NO: [[META0]] = !{ptr @bar1, ptr @bar2}
113+
;.

0 commit comments

Comments
 (0)