Skip to content

Commit 8d4b74a

Browse files
committed
AMDGPU: Don't consider whether amdgpu-flat-work-group-size was set
It should be semantically identical if it was set to the same value as the default. Also improve the documentation.
1 parent cd824f9 commit 8d4b74a

10 files changed

+132
-51
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,8 @@ The AMDGPU backend supports the following LLVM IR attributes.
856856
"amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that
857857
will be specified when the kernel is dispatched. Generated
858858
by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
859+
The implied default value is 1,1024.
860+
859861
"amdgpu-implicitarg-num-bytes"="n" Number of kernel argument bytes to add to the kernel
860862
argument block size for the implicit arguments. This
861863
varies by OS and language (for OpenCL see
@@ -866,7 +868,12 @@ The AMDGPU backend supports the following LLVM IR attributes.
866868
``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
867869
"amdgpu-waves-per-eu"="m,n" Specify the minimum and maximum number of waves per
868870
execution unit. Generated by the ``amdgpu_waves_per_eu``
869-
CLANG attribute [CLANG-ATTR]_.
871+
CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
872+
and the backend may not be able to satisfy the request. If
873+
the specified range is incompatible with the function's
874+
"amdgpu-flat-work-group-size" value, the implied occupancy
875+
bounds by the workgroup size takes precedence.
876+
870877
"amdgpu-ieee" true/false. Specify whether the function expects the IEEE field of the
871878
mode register to be set on entry. Overrides the default for
872879
the calling convention.

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -544,8 +544,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
544544
unsigned MinImpliedByFlatWorkGroupSize =
545545
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
546546
Default.first = MinImpliedByFlatWorkGroupSize;
547-
bool RequestedFlatWorkGroupSize =
548-
F.hasFnAttribute("amdgpu-flat-work-group-size");
549547

550548
// Requested minimum/maximum number of waves per execution unit.
551549
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -562,8 +560,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
562560

563561
// Make sure requested values are compatible with values implied by requested
564562
// minimum/maximum flat work group sizes.
565-
if (RequestedFlatWorkGroupSize &&
566-
Requested.first < MinImpliedByFlatWorkGroupSize)
563+
if (Requested.first < MinImpliedByFlatWorkGroupSize)
567564
return Default;
568565

569566
return Requested;

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
1616
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
1717
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
1818
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
19-
; GCN-NEXT: v_mov_b32_e32 v64, 0
2019
; GCN-NEXT: s_waitcnt lgkmcnt(0)
2120
; GCN-NEXT: v_mov_b32_e32 v0, s36
2221
; GCN-NEXT: v_mov_b32_e32 v1, s37
@@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
158157
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:260
159158
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:264
160159
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:268
161-
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:272
162-
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:276
163-
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:280
164-
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:284
160+
; GCN-NEXT: s_waitcnt vmcnt(0)
161+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill
162+
; GCN-NEXT: s_waitcnt vmcnt(0)
163+
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill
164+
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill
165+
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill
166+
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:272
167+
; GCN-NEXT: s_nop 0
168+
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:276
169+
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:280
170+
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:284
171+
; GCN-NEXT: s_waitcnt vmcnt(0)
172+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill
173+
; GCN-NEXT: s_waitcnt vmcnt(0)
174+
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill
175+
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill
176+
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill
165177
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:288
166178
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:292
167179
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:296
@@ -218,43 +230,45 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
218230
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:500
219231
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504
220232
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508
221-
; GCN-NEXT: s_waitcnt vmcnt(60)
222-
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
223-
; GCN-NEXT: s_waitcnt vmcnt(57)
224-
; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
225-
; GCN-NEXT: s_waitcnt vmcnt(54)
226-
; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
227-
; GCN-NEXT: s_waitcnt vmcnt(51)
228-
; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
229-
; GCN-NEXT: s_waitcnt vmcnt(48)
230-
; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
231-
; GCN-NEXT: s_waitcnt vmcnt(45)
232-
; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
233-
; GCN-NEXT: s_waitcnt vmcnt(42)
234-
; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
235-
; GCN-NEXT: s_waitcnt vmcnt(39)
236-
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
237-
; GCN-NEXT: s_waitcnt vmcnt(36)
238-
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
239-
; GCN-NEXT: s_waitcnt vmcnt(33)
240-
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
241-
; GCN-NEXT: s_waitcnt vmcnt(30)
242-
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
243-
; GCN-NEXT: s_waitcnt vmcnt(27)
244-
; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
245-
; GCN-NEXT: s_waitcnt vmcnt(24)
246-
; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
247-
; GCN-NEXT: s_waitcnt vmcnt(21)
248-
; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
249-
; GCN-NEXT: s_waitcnt vmcnt(18)
250-
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
251-
; GCN-NEXT: s_waitcnt vmcnt(15)
252-
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
233+
; GCN-NEXT: s_nop 0
234+
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload
235+
; GCN-NEXT: s_nop 0
236+
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload
237+
; GCN-NEXT: s_nop 0
238+
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload
239+
; GCN-NEXT: s_nop 0
240+
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload
241+
; GCN-NEXT: v_mov_b32_e32 v0, 0
242+
; GCN-NEXT: s_waitcnt vmcnt(0)
243+
; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21]
244+
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload
245+
; GCN-NEXT: s_nop 0
246+
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload
247+
; GCN-NEXT: s_nop 0
248+
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload
249+
; GCN-NEXT: s_nop 0
250+
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload
251+
; GCN-NEXT: s_waitcnt vmcnt(0)
252+
; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[20:21] offset:16
253+
; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[20:21] offset:32
254+
; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[20:21] offset:48
255+
; GCN-NEXT: global_store_dwordx4 v0, v[16:19], s[20:21] offset:64
256+
; GCN-NEXT: global_store_dwordx4 v0, v[20:23], s[20:21] offset:80
257+
; GCN-NEXT: global_store_dwordx4 v0, v[24:27], s[20:21] offset:96
258+
; GCN-NEXT: global_store_dwordx4 v0, v[28:31], s[20:21] offset:112
259+
; GCN-NEXT: global_store_dwordx4 v0, v[32:35], s[20:21] offset:128
260+
; GCN-NEXT: global_store_dwordx4 v0, v[36:39], s[20:21] offset:144
261+
; GCN-NEXT: global_store_dwordx4 v0, v[40:43], s[20:21] offset:160
262+
; GCN-NEXT: global_store_dwordx4 v0, v[44:47], s[20:21] offset:176
263+
; GCN-NEXT: global_store_dwordx4 v0, v[48:51], s[20:21] offset:192
264+
; GCN-NEXT: global_store_dwordx4 v0, v[52:55], s[20:21] offset:208
265+
; GCN-NEXT: global_store_dwordx4 v0, v[56:59], s[20:21] offset:224
266+
; GCN-NEXT: global_store_dwordx4 v0, v[60:63], s[20:21] offset:240
253267
; GCN-NEXT: s_endpgm
254268
%vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
255269
%insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
256270
store <64 x i32> %insert, <64 x i32> addrspace(1)* %out.ptr
257271
ret void
258272
}
259273

260-
attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
274+
attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" }

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,5 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
108108

109109
declare i32 @llvm.amdgcn.workitem.id.x() #1
110110

111-
attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
111+
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
112112
attributes #1 = { nounwind readnone speculatable willreturn }

llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
1010
entry:
1111
ret void
1212
}
13-
attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
13+
attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }
1414

1515
; Exactly 5 waves per execution unit.
1616
; CHECK-LABEL: {{^}}empty_exactly_5:
@@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
8484
entry:
8585
ret void
8686
}
87-
attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
87+
attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}
8888

8989
; At most 10 waves per execution unit.
9090
; CHECK-LABEL: {{^}}empty_at_most_10:
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
2+
3+
; Both of these kernels have the same value for
4+
; amdgpu-flat-work-group-size, except one explicitly sets it. This is
5+
; a program visible property which should always take precedence over
6+
; the amdgpu-waves-per-eu optimization hint.
7+
;
8+
; The range is incompatible with the amdgpu-waves-per-eu value, so the
9+
; flat work group size should take precedence implying a requirement
10+
; to support 1024 size workgroups (which exceeds the available LDS
11+
; amount).
12+
13+
; CHECK-NOT: @no_flat_workgroup_size.stack
14+
; CHECK-NOT: @explicit_default_workgroup_size.stack
15+
16+
; CHECK-LABEL: @no_flat_workgroup_size(
17+
; CHECK: alloca [5 x i32]
18+
; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
19+
define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
20+
entry:
21+
%stack = alloca [5 x i32], align 4, addrspace(5)
22+
%0 = load i32, i32 addrspace(1)* %in, align 4
23+
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
24+
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
25+
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
26+
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
27+
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
28+
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
29+
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
30+
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
31+
store i32 %2, i32 addrspace(1)* %out, align 4
32+
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
33+
%3 = load i32, i32 addrspace(5)* %arrayidx12
34+
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
35+
store i32 %3, i32 addrspace(1)* %arrayidx13
36+
ret void
37+
}
38+
39+
; CHECK-LABEL: @explicit_default_workgroup_size(
40+
; CHECK: alloca [5 x i32]
41+
; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
42+
define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
43+
entry:
44+
%stack = alloca [5 x i32], align 4, addrspace(5)
45+
%0 = load i32, i32 addrspace(1)* %in, align 4
46+
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
47+
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
48+
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
49+
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
50+
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
51+
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
52+
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
53+
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
54+
store i32 %2, i32 addrspace(1)* %out, align 4
55+
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
56+
%3 = load i32, i32 addrspace(5)* %arrayidx12
57+
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
58+
store i32 %3, i32 addrspace(1)* %arrayidx13
59+
ret void
60+
}
61+
62+
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
63+
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }

llvm/test/CodeGen/AMDGPU/occupancy-levels.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
392392
ret void
393393
}
394394

395-
attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
395+
attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
396396
attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
397397
attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
398398
attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }

llvm/test/CodeGen/AMDGPU/schedule-ilp.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,5 +585,5 @@ bb:
585585
; Function Attrs: nounwind readnone
586586
declare float @llvm.fmuladd.f32(float, float, float) #1
587587

588-
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
588+
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
589589
attributes #1 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,4 +588,4 @@ bb:
588588
declare float @llvm.fmuladd.f32(float, float, float) #0
589589

590590
attributes #0 = { nounwind readnone }
591-
attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
591+
attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }

llvm/test/CodeGen/AMDGPU/target-cpu.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,5 +107,5 @@ attributes #1 = { nounwind readnone }
107107
attributes #2 = { nounwind "target-cpu"="tahiti" }
108108
attributes #3 = { nounwind "target-cpu"="bonaire" }
109109
attributes #4 = { nounwind "target-cpu"="fiji" }
110-
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
111-
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }
110+
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }
111+
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }

0 commit comments

Comments
 (0)