[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early (#114481)

AlexVlx · web-flow · commit 48ec59c234ce · 2024-11-25T10:29:50.000Z
Fold `llvm.amdgcn.wavefrontsize` early, during InstCombine, so that it's
concrete value is used throughout subsequent optimisation passes.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,6 +1,6 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s
 
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
@@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
 // CHECK-LABEL test_wavefrontsize(
 unsigned test_wavefrontsize() {
 
-  // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
+  // CHECK-AMDGCN: ret i32 {{[0-9]+}}
+  // CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
   return __builtin_amdgcn_wavefrontsize();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
     break;
   }
+  case Intrinsic::amdgcn_wavefrontsize: {
+    if (ST->isWaveSizeKnown())
+      return IC.replaceInstUsesWith(
+          II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
+    break;
+  }
   case Intrinsic::amdgcn_wqm_vote: {
     // wqm_vote is identity when the argument is constant.
     if (!isa<Constant>(II.getArgOperand(0)))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -4,48 +4,28 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s
 
-; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
-
 ; GCN-LABEL: {{^}}fold_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 32
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 64
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   store i32 %tmp, ptr addrspace(1) %arg, align 4
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+
 bb:
   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
   store i32 %tmp, ptr addrspace(1) %arg, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
 
 ; W32:       v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
 ; W64:       v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
 ; GCN-NOT:   cndmask
 ; GCN:       store_{{dword|b32}} v{{.+}}, [[V]]
 
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   %tmp2 = select i1 %tmp1, i32 2, i32 1
-; OPT:   store i32 %tmp2, ptr addrspace(1) %arg
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
@@ -57,13 +37,6 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
-; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
-
-; OPT:       bb:
-; OPT:   %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
-; OPT:   %tmp1 = icmp ugt i32 %tmp, 32
-; OPT:   bb3:
-; OPT-NEXT:  ret void
 
 define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
 bb:
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
+
+define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
+; OPT-NEXT:    store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 32, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 64, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  store i32 %tmp, ptr addrspace(1) %arg, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
+; OPT-NEXT:    store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    store i32 2, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  %tmp2 = select i1 %tmp1, i32 2, i32 1
+  store i32 %tmp2, ptr addrspace(1) %arg
+  ret void
+}
+
+define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
+; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
+; OPT-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
+; OPT-NEXT:    br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT:       [[BB2]]:
+; OPT-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-NEXT:    br label %[[BB3]]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    ret void
+;
+; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W32-NEXT:  [[BB:.*:]]
+; OPT-W32-NEXT:    br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W32:       [[BB2]]:
+; OPT-W32-NEXT:    br label %[[BB3]]
+; OPT-W32:       [[BB3]]:
+; OPT-W32-NEXT:    ret void
+;
+; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
+; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
+; OPT-W64-NEXT:  [[BB:.*:]]
+; OPT-W64-NEXT:    br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
+; OPT-W64:       [[BB2]]:
+; OPT-W64-NEXT:    store i32 1, ptr addrspace(1) [[ARG]], align 4
+; OPT-W64-NEXT:    br label %[[BB3]]
+; OPT-W64:       [[BB3]]:
+; OPT-W64-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
+  %tmp1 = icmp ugt i32 %tmp, 32
+  br i1 %tmp1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  store i32 1, ptr addrspace(1) %arg, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  ret void
+}
+
+declare i32 @llvm.amdgcn.wavefrontsize() #0
+
+attributes #0 = { nounwind readnone speculatable }

Original file line number	Diff line number	Diff line change
`@@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {`
`1024`	`1024`	`}`
`1025`	`1025`	`break;`
`1026`	`1026`	`}`
	`1027`	`+ case Intrinsic::amdgcn_wavefrontsize: {`
	`1028`	`+ if (ST->isWaveSizeKnown())`
	`1029`	`+ return IC.replaceInstUsesWith(`
	`1030`	`+ II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));`
	`1031`	`+ break;`
	`1032`	`+ }`
`1027`	`1033`	`case Intrinsic::amdgcn_wqm_vote: {`
`1028`	`1034`	`// wqm_vote is identity when the argument is constant.`
`1029`	`1035`	`if (!isa<Constant>(II.getArgOperand(0)))`