AMDGPU: Don't consider whether amdgpu-flat-work-group-size was set

arsenm · arsenm · commit 8d4b74ac3f1f · 2021-10-22T16:23:50.000-04:00
It should be semantically identical if it was set to the same value as
the default. Also improve the documentation.
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
@@ -856,6 +856,8 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-flat-work-group-size"="min,max" Specify the minimum and maximum flat work group sizes that
                                              will be specified when the kernel is dispatched. Generated
                                              by the ``amdgpu_flat_work_group_size`` CLANG attribute [CLANG-ATTR]_.
+                                             The implied default value is 1,1024.
+
      "amdgpu-implicitarg-num-bytes"="n"      Number of kernel argument bytes to add to the kernel
                                              argument block size for the implicit arguments. This
                                              varies by OS and language (for OpenCL see
@@ -866,7 +868,12 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                              ``amdgpu_num_vgpr`` CLANG attribute [CLANG-ATTR]_.
      "amdgpu-waves-per-eu"="m,n"             Specify the minimum and maximum number of waves per
                                              execution unit. Generated by the ``amdgpu_waves_per_eu``
-                                             CLANG attribute [CLANG-ATTR]_.
+                                             CLANG attribute [CLANG-ATTR]_. This is an optimization hint,
+                                             and the backend may not be able to satisfy the request. If
+                                             the specified range is incompatible with the function's
+                                             "amdgpu-flat-work-group-size" value, the implied occupancy
+                                             bounds by the workgroup size takes precedence.
+
      "amdgpu-ieee" true/false.               Specify whether the function expects the IEEE field of the
                                              mode register to be set on entry. Overrides the default for
                                              the calling convention.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -544,8 +544,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
   unsigned MinImpliedByFlatWorkGroupSize =
     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
   Default.first = MinImpliedByFlatWorkGroupSize;
-  bool RequestedFlatWorkGroupSize =
-      F.hasFnAttribute("amdgpu-flat-work-group-size");
 
   // Requested minimum/maximum number of waves per execution unit.
   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -562,8 +560,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
 
   // Make sure requested values are compatible with values implied by requested
   // minimum/maximum flat work group sizes.
-  if (RequestedFlatWorkGroupSize &&
-      Requested.first < MinImpliedByFlatWorkGroupSize)
+  if (Requested.first < MinImpliedByFlatWorkGroupSize)
     return Default;
 
   return Requested;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -16,7 +16,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[52:67], s[22:23], 0x40
 ; GCN-NEXT:    s_load_dwordx16 s[4:19], s[22:23], 0x80
-; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NEXT:    v_mov_b32_e32 v1, s37
@@ -158,10 +157,23 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:260
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:264
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:268
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:272
-; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:276
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:280
-; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:284
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:272
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:276
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:280
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:284
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:540 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:288
 ; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:292
 ; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:296
@@ -218,43 +230,45 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
 ; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], 0 offset:500
 ; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], 0 offset:504
 ; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], 0 offset:508
-; GCN-NEXT:    s_waitcnt vmcnt(60)
-; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[20:21]
-; GCN-NEXT:    s_waitcnt vmcnt(57)
-; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
-; GCN-NEXT:    s_waitcnt vmcnt(54)
-; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
-; GCN-NEXT:    s_waitcnt vmcnt(51)
-; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(48)
-; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
-; GCN-NEXT:    s_waitcnt vmcnt(45)
-; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
-; GCN-NEXT:    s_waitcnt vmcnt(42)
-; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
-; GCN-NEXT:    s_waitcnt vmcnt(39)
-; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
-; GCN-NEXT:    s_waitcnt vmcnt(36)
-; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(33)
-; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
-; GCN-NEXT:    s_waitcnt vmcnt(30)
-; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
-; GCN-NEXT:    s_waitcnt vmcnt(27)
-; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
-; GCN-NEXT:    s_waitcnt vmcnt(24)
-; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
-; GCN-NEXT:    s_waitcnt vmcnt(21)
-; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
-; GCN-NEXT:    s_waitcnt vmcnt(18)
-; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
-; GCN-NEXT:    s_waitcnt vmcnt(15)
-; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[20:21]
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[20:21] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, v[8:11], s[20:21] offset:32
+; GCN-NEXT:    global_store_dwordx4 v0, v[12:15], s[20:21] offset:48
+; GCN-NEXT:    global_store_dwordx4 v0, v[16:19], s[20:21] offset:64
+; GCN-NEXT:    global_store_dwordx4 v0, v[20:23], s[20:21] offset:80
+; GCN-NEXT:    global_store_dwordx4 v0, v[24:27], s[20:21] offset:96
+; GCN-NEXT:    global_store_dwordx4 v0, v[28:31], s[20:21] offset:112
+; GCN-NEXT:    global_store_dwordx4 v0, v[32:35], s[20:21] offset:128
+; GCN-NEXT:    global_store_dwordx4 v0, v[36:39], s[20:21] offset:144
+; GCN-NEXT:    global_store_dwordx4 v0, v[40:43], s[20:21] offset:160
+; GCN-NEXT:    global_store_dwordx4 v0, v[44:47], s[20:21] offset:176
+; GCN-NEXT:    global_store_dwordx4 v0, v[48:51], s[20:21] offset:192
+; GCN-NEXT:    global_store_dwordx4 v0, v[52:55], s[20:21] offset:208
+; GCN-NEXT:    global_store_dwordx4 v0, v[56:59], s[20:21] offset:224
+; GCN-NEXT:    global_store_dwordx4 v0, v[60:63], s[20:21] offset:240
 ; GCN-NEXT:    s_endpgm
   %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
   %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
   store <64 x i32> %insert, <64 x i32> addrspace(1)* %out.ptr
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
+attributes #0 = { "amdgpu-flat-workgroup-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
@@ -108,5 +108,5 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }
 attributes #1 = { nounwind readnone speculatable willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -10,7 +10,7 @@ define amdgpu_kernel void @empty_exactly_1() #0 {
 entry:
   ret void
 }
-attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
+attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,64" }
 
 ; Exactly 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_5:
@@ -84,7 +84,7 @@ define amdgpu_kernel void @empty_at_most_5() #6 {
 entry:
   ret void
 }
-attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
+attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64"}
 
 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=tahiti -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
+
+; Both of these kernels have the same value for
+; amdgpu-flat-work-group-size, except one explicitly sets it. This is
+; a program visible property which should always take precedence over
+; the amdgpu-waves-per-eu optimization hint.
+;
+; The range is incompatible with the amdgpu-waves-per-eu value, so the
+; flat work group size should take precedence implying a requirement
+; to support 1024 size workgroups (which exceeds the available LDS
+; amount).
+
+; CHECK-NOT: @no_flat_workgroup_size.stack
+; CHECK-NOT: @explicit_default_workgroup_size.stack
+
+; CHECK-LABEL: @no_flat_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+define amdgpu_kernel void @no_flat_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4, addrspace(5)
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %3 = load i32, i32 addrspace(5)* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+; CHECK-LABEL: @explicit_default_workgroup_size(
+; CHECK: alloca [5 x i32]
+; CHECK: store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+define amdgpu_kernel void @explicit_default_workgroup_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+entry:
+  %stack = alloca [5 x i32], align 4, addrspace(5)
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %3 = load i32, i32 addrspace(5)* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {
   ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="2,3" }
+attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }
 attributes #1 = { "amdgpu-waves-per-eu"="18,18" }
 attributes #2 = { "amdgpu-waves-per-eu"="19,19" }
 attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll b/llvm/test/CodeGen/AMDGPU/schedule-ilp.ll
@@ -585,5 +585,5 @@ bb:
 ; Function Attrs: nounwind readnone
 declare float @llvm.fmuladd.f32(float, float, float) #1
 
-attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit3.ll
@@ -588,4 +588,4 @@ bb:
 declare float @llvm.fmuladd.f32(float, float, float) #0
 
 attributes #0 = { nounwind readnone }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll
@@ -107,5 +107,5 @@ attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "target-cpu"="tahiti" }
 attributes #3 = { nounwind "target-cpu"="bonaire" }
 attributes #4 = { nounwind "target-cpu"="fiji" }
-attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
-attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="1,256" }

Original file line number	Diff line number	Diff line change
`@@ -392,7 +392,7 @@ define amdgpu_kernel void @used_lds_8252_max_group_size_32() #10 {`
`392`	`392`	`ret void`
`393`	`393`	`}`
`394`	`394`
`395`		`-attributes #0 = { "amdgpu-waves-per-eu"="2,3" }`
	`395`	`+attributes #0 = { "amdgpu-waves-per-eu"="2,3" "amdgpu-flat-work-group-size"="1,64" }`
`396`	`396`	`attributes #1 = { "amdgpu-waves-per-eu"="18,18" }`
`397`	`397`	`attributes #2 = { "amdgpu-waves-per-eu"="19,19" }`
`398`	`398`	`attributes #3 = { "amdgpu-flat-work-group-size"="1,64" }`