[AMDGPU][Verifier] Check address space of alloca instruction #135820


Merged: 1 commit merged on Apr 26, 2025

Conversation

@shiltian shiltian (Contributor) commented Apr 15, 2025

This PR updates the Verifier to enforce that alloca instructions on AMDGPU are in address space 5 (AS5), the target's alloca address space. This prevents a misleading backend error like "unable to select FrameIndex", which makes an IR-level issue look like a backend bug.
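
For illustration, here is a minimal module (hypothetical, not taken from the patch) showing both the form the new check rejects and the accepted AS5 pattern used throughout the updated tests:

target datalayout = "A5"
target triple = "amdgcn-amd-amdhsa"

define void @bad() {
entry:
  ; This alloca defaults to addrspace(0); with this patch the Verifier
  ; reports "alloca on amdgpu must be in addrspace(5)" instead of letting
  ; llc crash later during instruction selection.
  %x = alloca i32, align 4
  ret void
}

define void @good() {
entry:
  ; Accepted form: allocate in AS5, then addrspacecast to a flat pointer
  ; for uses that expect plain ptr.
  %x = alloca i32, align 4, addrspace(5)
  %x.cast = addrspacecast ptr addrspace(5) %x to ptr
  ret void
}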

@llvmbot llvmbot added backend:AMDGPU llvm:instcombine Covers the InstCombine, InstSimplify and AggressiveInstCombine passes llvm:ir labels Apr 15, 2025
@shiltian (Contributor, Author) commented:

This stack of pull requests is managed by Graphite.

@llvmbot llvmbot added llvm:transforms clang:openmp OpenMP related changes to Clang labels Apr 15, 2025
@llvmbot (Member) commented Apr 15, 2025

@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-mlir-llvm

@llvm/pr-subscribers-llvm-transforms

Author: Shilei Tian (shiltian)

Changes

Patch is 944.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135820.diff

11 Files Affected:

  • (modified) llvm/lib/IR/Verifier.cpp (+5)
  • (removed) llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll (-16)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll (+3-2)
  • (removed) llvm/test/Transforms/InstCombine/alloca-in-non-alloca-as.ll (-66)
  • (modified) llvm/test/Transforms/OpenMP/custom_state_machines.ll (+1202-947)
  • (modified) llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll (+1582-1284)
  • (modified) llvm/test/Transforms/OpenMP/spmdization.ll (+776-555)
  • (modified) llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll (+124-129)
  • (modified) llvm/test/Transforms/OpenMP/spmdization_indirect.ll (+291-279)
  • (added) llvm/test/Verifier/AMDGPU/alloca.bc ()
  • (added) llvm/test/Verifier/AMDGPU/alloca.ll (+14)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index e3f6c1ad5a65b..b180349ecebb7 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4395,6 +4395,11 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
     verifySwiftErrorValue(&AI);
   }
 
+  if (TT.isAMDGPU()) {
+    Check(AI.getAddressSpace() == DL.getAllocaAddrSpace(),
+          "alloca on amdgpu must be in addrspace(5)", &AI);
+  }
+
   visitInstruction(AI);
 }
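
A quick way to exercise the new check is to run the verifier pass over the rejected module above. The lines below are a lit-style sketch; the actual RUN lines of the added llvm/test/Verifier/AMDGPU/alloca.ll are truncated out of this patch view and may differ:

; RUN: not opt -passes=verify -disable-output %s 2>&1 | FileCheck %s
; CHECK: alloca on amdgpu must be in addrspace(5)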
 
diff --git a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll b/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
deleted file mode 100644
index 1e72e679e83c0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/assert-wrong-alloca-addrspace.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-
-; The alloca has the wrong address space and is passed to a call. The
-; FrameIndex was created with the natural 32-bit pointer type instead
-; of the declared 64-bit. Make sure we don't assert.
-
-; CHECK: LLVM ERROR: Cannot select: {{.*}}: i64 = FrameIndex<0>
-
-declare void @func(ptr)
-
-define void @main() {
-bb:
-  %alloca = alloca i32, align 4
-  call void @func(ptr %alloca)
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
index 1b0c8d66d3ebc..4309dacc9da2b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-indirect-lds-references.ll
@@ -16,8 +16,9 @@ define amdgpu_kernel void @offloading_kernel() {
 }
 
 define void @call_unknown() {
-  %1 = alloca ptr, align 8
-  %2 = call i32 %1()
+  %alloca = alloca ptr, align 8, addrspace(5)
+  %alloca.cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  %ret = call i32 %alloca.cast()
   ret void
 }
 
diff --git a/llvm/test/Transforms/InstCombine/alloca-in-non-alloca-as.ll b/llvm/test/Transforms/InstCombine/alloca-in-non-alloca-as.ll
deleted file mode 100644
index 9a2bfac0feb02..0000000000000
--- a/llvm/test/Transforms/InstCombine/alloca-in-non-alloca-as.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-; Gracefully handle the alloca that is not in the alloca AS (=5)
-
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-target triple = "amdgcn-amd-amdhsa"
-
-declare void @use(ptr)
-declare void @use2(ptr, ptr)
-
-define weak amdgpu_kernel void @__omp_offloading_802_ea0109_main_l8(ptr %a) {
-; CHECK-LABEL: @__omp_offloading_802_ea0109_main_l8(
-; CHECK-NEXT:  .master:
-; CHECK-NEXT:    [[TMP0:%.*]] = alloca [8 x i8], align 1
-; CHECK-NEXT:    call void @use2(ptr nonnull [[TMP0]], ptr nonnull [[TMP0]])
-; CHECK-NEXT:    ret void
-;
-.master:
-  %0 = alloca i8, i64 8, align 1
-  store ptr undef, ptr %0, align 8
-  call void @use2(ptr %0, ptr %0)
-  ret void
-}
-
-%struct.widget = type { [8 x i8] }
-
-define void @spam(ptr %arg1) {
-; CHECK-LABEL: @spam(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[ALLOCA1:%.*]] = alloca [0 x [30 x %struct.widget]], align 16
-; CHECK-NEXT:    call void @zot(ptr nonnull [[ALLOCA1]])
-; CHECK-NEXT:    ret void
-;
-bb:
-  %alloca = alloca [30 x %struct.widget], i32 0, align 16
-  call void @zot(ptr %alloca)
-  ret void
-}
-
-define i1 @alloca_addrspace_0_nonnull() {
-; CHECK-LABEL: @alloca_addrspace_0_nonnull(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i8, align 1
-; CHECK-NEXT:    call void @use(ptr nonnull [[ALLOCA]])
-; CHECK-NEXT:    ret i1 true
-;
-  %alloca = alloca i8
-  call void @use(ptr %alloca)
-  %cmp = icmp ne ptr %alloca, null
-  ret i1 %cmp
-}
-
-define i1 @alloca_addrspace_5_nonnull() {
-; CHECK-LABEL: @alloca_addrspace_5_nonnull(
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i8, align 1, addrspace(5)
-; CHECK-NEXT:    call void @use(ptr addrspace(5) [[ALLOCA]])
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ne ptr addrspace(5) [[ALLOCA]], null
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %alloca = alloca i8, addrspace(5)
-  call void @use(ptr addrspace(5) %alloca)
-  %cmp = icmp ne ptr addrspace(5) %alloca, null
-  ret i1 %cmp
-}
-
-declare hidden void @zot(ptr)
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 10e521bbfcc10..2fe28daf304a6 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -128,7 +128,6 @@
 @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @0 }, align 8
 @G = external global i32, align 4
 @3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 322, i32 2, i32 0, ptr @0 }, align 8
-
 @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
 @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
 @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
@@ -138,19 +137,22 @@
 @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
 @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 {
 entry:
-  %.zero.addr = alloca i32, align 4
-  %.threadid_temp. = alloca i32, align 4
-  store i32 0, ptr %.zero.addr, align 4
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %.threadid_temp. = alloca ptr, align 8, addrspace(5)
+  %.threadid_temp..cast = addrspacecast ptr addrspace(5) %.threadid_temp. to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
   %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14_kernel_environment, ptr %dyn)
   %exec_user_code = icmp eq i32 %0, -1
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(ptr @1)
-  store i32 %1, ptr %.threadid_temp., align 4
-  call void @__omp_outlined__(ptr %.threadid_temp., ptr %.zero.addr) #3
+  store i32 %1, ptr %.threadid_temp..cast, align 4
+  call void @__omp_outlined__(ptr %.threadid_temp..cast, ptr %.zero.addr.cast) #3
   call void @__kmpc_target_deinit()
   ret void
 
@@ -158,22 +160,25 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
-; Make it a weak definition so we will apply custom state machine rewriting but can't use the body in the reasoning.
-define weak i32 @__kmpc_target_init(ptr, ptr) {
+define weak i32 @__kmpc_target_init(ptr %0, ptr %1) {
   ret i32 0
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @no_parallel_region_in_here() #7
   call void @unknown_no_openmp() #8
   ret void
 }
 
+; Function Attrs: convergent noinline nounwind
 define hidden void @no_parallel_region_in_here() #1 {
 entry:
   %0 = call i32 @__kmpc_global_thread_num(ptr @2)
@@ -191,25 +196,30 @@ omp_if.end:                                       ; preds = %omp_if.then, %entry
   ret void
 }
 
+; Function Attrs: convergent
 declare void @unknown_no_openmp() #2
 
+; Function Attrs: nounwind
 declare i32 @__kmpc_global_thread_num(ptr) #3
 
 declare void @__kmpc_target_deinit()
 
+; Function Attrs: convergent noinline norecurse nounwind
 define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 {
 entry:
-  %.zero.addr = alloca i32, align 4
-  %.threadid_temp. = alloca i32, align 4
-  store i32 0, ptr %.zero.addr, align 4
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %.threadid_temp. = alloca ptr, align 8, addrspace(5)
+  %.threadid_temp..cast = addrspacecast ptr addrspace(5) %.threadid_temp. to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
   %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_l22_kernel_environment, ptr %dyn)
   %exec_user_code = icmp eq i32 %0, -1
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(ptr @1)
-  store i32 %1, ptr %.threadid_temp., align 4
-  call void @__omp_outlined__1(ptr %.threadid_temp., ptr %.zero.addr) #3
+  store i32 %1, ptr %.threadid_temp..cast, align 4
+  call void @__omp_outlined__1(ptr %.threadid_temp..cast, ptr %.zero.addr.cast) #3
   call void @__kmpc_target_deinit()
   ret void
 
@@ -217,46 +227,60 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  %captured_vars_addrs = alloca [0 x ptr], align 8
-  %captured_vars_addrs1 = alloca [0 x ptr], align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  %captured_vars_addrs = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs.cast = addrspacecast ptr addrspace(5) %captured_vars_addrs to ptr
+  %captured_vars_addrs1 = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs1.cast = addrspacecast ptr addrspace(5) %captured_vars_addrs1 to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @unknown_no_openmp() #8
-  %0 = load ptr, ptr %.global_tid..addr, align 8
+  %0 = load ptr, ptr %.global_tid..addr.cast, align 8
   %1 = load i32, ptr %0, align 4
-  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr %captured_vars_addrs, i64 0)
+  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__2, ptr @__omp_outlined__2_wrapper, ptr %captured_vars_addrs.cast, i64 0)
   call void @no_parallel_region_in_here() #7
-  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr %captured_vars_addrs1, i64 0)
+  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr %captured_vars_addrs1.cast, i64 0)
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @p0() #7
   ret void
 }
 
+; Function Attrs: convergent
 declare void @p0() #4
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
-  %.addr = alloca i16, align 2
-  %.addr1 = alloca i32, align 4
-  %.zero.addr = alloca i32, align 4
-  %global_args = alloca ptr, align 8
-  store i32 0, ptr %.zero.addr, align 4
-  store i16 %0, ptr %.addr, align 2
-  store i32 %1, ptr %.addr1, align 4
-  call void @__kmpc_get_shared_variables(ptr %global_args)
-  call void @__omp_outlined__2(ptr %.addr1, ptr %.zero.addr) #3
+  %.addr = alloca ptr, align 8, addrspace(5)
+  %.addr.cast = addrspacecast ptr addrspace(5) %.addr to ptr
+  %.addr1 = alloca ptr, align 8, addrspace(5)
+  %.addr1.cast = addrspacecast ptr addrspace(5) %.addr1 to ptr
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %global_args = alloca ptr, align 8, addrspace(5)
+  %global_args.cast = addrspacecast ptr addrspace(5) %global_args to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
+  store i16 %0, ptr %.addr.cast, align 2
+  store i32 %1, ptr %.addr1.cast, align 4
+  call void @__kmpc_get_shared_variables(ptr %global_args.cast)
+  call void @__omp_outlined__2(ptr %.addr1.cast, ptr %.zero.addr.cast) #3
   ret void
 }
 
@@ -264,45 +288,57 @@ declare void @__kmpc_get_shared_variables(ptr)
 
 declare void @__kmpc_parallel_51(ptr, i32, i32, i32, i32, ptr, ptr, ptr, i64)
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @p1() #7
   ret void
 }
 
+; Function Attrs: convergent
 declare void @p1() #4
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
-  %.addr = alloca i16, align 2
-  %.addr1 = alloca i32, align 4
-  %.zero.addr = alloca i32, align 4
-  %global_args = alloca ptr, align 8
-  store i32 0, ptr %.zero.addr, align 4
-  store i16 %0, ptr %.addr, align 2
-  store i32 %1, ptr %.addr1, align 4
-  call void @__kmpc_get_shared_variables(ptr %global_args)
-  call void @__omp_outlined__3(ptr %.addr1, ptr %.zero.addr) #3
+  %.addr = alloca ptr, align 8, addrspace(5)
+  %.addr.cast = addrspacecast ptr addrspace(5) %.addr to ptr
+  %.addr1 = alloca ptr, align 8, addrspace(5)
+  %.addr1.cast = addrspacecast ptr addrspace(5) %.addr1 to ptr
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %global_args = alloca ptr, align 8, addrspace(5)
+  %global_args.cast = addrspacecast ptr addrspace(5) %global_args to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
+  store i16 %0, ptr %.addr.cast, align 2
+  store i32 %1, ptr %.addr1.cast, align 4
+  call void @__kmpc_get_shared_variables(ptr %global_args.cast)
+  call void @__omp_outlined__3(ptr %.addr1.cast, ptr %.zero.addr.cast) #3
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 {
 entry:
-  %.zero.addr = alloca i32, align 4
-  %.threadid_temp. = alloca i32, align 4
-  store i32 0, ptr %.zero.addr, align 4
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %.threadid_temp. = alloca ptr, align 8, addrspace(5)
+  %.threadid_temp..cast = addrspacecast ptr addrspace(5) %.threadid_temp. to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
   %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr %dyn)
   %exec_user_code = icmp eq i32 %0, -1
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(ptr @1)
-  store i32 %1, ptr %.threadid_temp., align 4
-  call void @__omp_outlined__4(ptr %.threadid_temp., ptr %.zero.addr) #3
+  store i32 %1, ptr %.threadid_temp..cast, align 4
+  call void @__omp_outlined__4(ptr %.threadid_temp..cast, ptr %.zero.addr.cast) #3
   call void @__kmpc_target_deinit()
   ret void
 
@@ -310,76 +346,95 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  %captured_vars_addrs = alloca [0 x ptr], align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  %captured_vars_addrs = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs.cast = addrspacecast ptr addrspace(5) %captured_vars_addrs to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @unknown_no_openmp() #8
   call void @simple_state_machine_interprocedural_before() #7
   call void @no_parallel_region_in_here() #7
-  %0 = load ptr, ptr %.global_tid..addr, align 8
+  %0 = load ptr, ptr %.global_tid..addr.cast, align 8
   %1 = load i32, ptr %0, align 4
-  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs, i64 0)
+  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs.cast, i64 0)
   call void @simple_state_machine_interprocedural_after() #7
   ret void
 }
 
+; Function Attrs: convergent noinline nounwind
 define hidden void @simple_state_machine_interprocedural_before() #1 {
 entry:
-  %captured_vars_addrs = alloca [0 x ptr], align 8
+  %captured_vars_addrs = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs.cast = addrspa...
[truncated]

@llvmbot (Member) commented Apr 15, 2025

@llvm/pr-subscribers-backend-amdgpu


@llvmbot (Member) commented Apr 15, 2025

@llvm/pr-subscribers-llvm-ir

 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @p1() #7
   ret void
 }
 
+; Function Attrs: convergent
 declare void @p1() #4
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #0 {
 entry:
-  %.addr = alloca i16, align 2
-  %.addr1 = alloca i32, align 4
-  %.zero.addr = alloca i32, align 4
-  %global_args = alloca ptr, align 8
-  store i32 0, ptr %.zero.addr, align 4
-  store i16 %0, ptr %.addr, align 2
-  store i32 %1, ptr %.addr1, align 4
-  call void @__kmpc_get_shared_variables(ptr %global_args)
-  call void @__omp_outlined__3(ptr %.addr1, ptr %.zero.addr) #3
+  %.addr = alloca ptr, align 8, addrspace(5)
+  %.addr.cast = addrspacecast ptr addrspace(5) %.addr to ptr
+  %.addr1 = alloca ptr, align 8, addrspace(5)
+  %.addr1.cast = addrspacecast ptr addrspace(5) %.addr1 to ptr
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %global_args = alloca ptr, align 8, addrspace(5)
+  %global_args.cast = addrspacecast ptr addrspace(5) %global_args to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
+  store i16 %0, ptr %.addr.cast, align 2
+  store i32 %1, ptr %.addr1.cast, align 4
+  call void @__kmpc_get_shared_variables(ptr %global_args.cast)
+  call void @__omp_outlined__3(ptr %.addr1.cast, ptr %.zero.addr.cast) #3
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 {
 entry:
-  %.zero.addr = alloca i32, align 4
-  %.threadid_temp. = alloca i32, align 4
-  store i32 0, ptr %.zero.addr, align 4
+  %.zero.addr = alloca ptr, align 8, addrspace(5)
+  %.zero.addr.cast = addrspacecast ptr addrspace(5) %.zero.addr to ptr
+  %.threadid_temp. = alloca ptr, align 8, addrspace(5)
+  %.threadid_temp..cast = addrspacecast ptr addrspace(5) %.threadid_temp. to ptr
+  store i32 0, ptr %.zero.addr.cast, align 4
   %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39_kernel_environment, ptr %dyn)
   %exec_user_code = icmp eq i32 %0, -1
   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
 
 user_code.entry:                                  ; preds = %entry
   %1 = call i32 @__kmpc_global_thread_num(ptr @1)
-  store i32 %1, ptr %.threadid_temp., align 4
-  call void @__omp_outlined__4(ptr %.threadid_temp., ptr %.zero.addr) #3
+  store i32 %1, ptr %.threadid_temp..cast, align 4
+  call void @__omp_outlined__4(ptr %.threadid_temp..cast, ptr %.zero.addr.cast) #3
   call void @__kmpc_target_deinit()
   ret void
 
@@ -310,76 +346,95 @@ worker.exit:                                      ; preds = %entry
   ret void
 }
 
+; Function Attrs: convergent noinline norecurse nounwind
 define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias %.bound_tid.) #0 {
 entry:
-  %.global_tid..addr = alloca ptr, align 8
-  %.bound_tid..addr = alloca ptr, align 8
-  %captured_vars_addrs = alloca [0 x ptr], align 8
-  store ptr %.global_tid., ptr %.global_tid..addr, align 8
-  store ptr %.bound_tid., ptr %.bound_tid..addr, align 8
+  %.global_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.global_tid..addr.cast = addrspacecast ptr addrspace(5) %.global_tid..addr to ptr
+  %.bound_tid..addr = alloca ptr, align 8, addrspace(5)
+  %.bound_tid..addr.cast = addrspacecast ptr addrspace(5) %.bound_tid..addr to ptr
+  %captured_vars_addrs = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs.cast = addrspacecast ptr addrspace(5) %captured_vars_addrs to ptr
+  store ptr %.global_tid., ptr %.global_tid..addr.cast, align 8
+  store ptr %.bound_tid., ptr %.bound_tid..addr.cast, align 8
   call void @unknown_no_openmp() #8
   call void @simple_state_machine_interprocedural_before() #7
   call void @no_parallel_region_in_here() #7
-  %0 = load ptr, ptr %.global_tid..addr, align 8
+  %0 = load ptr, ptr %.global_tid..addr.cast, align 8
   %1 = load i32, ptr %0, align 4
-  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs, i64 0)
+  call void @__kmpc_parallel_51(ptr @1, i32 %1, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr %captured_vars_addrs.cast, i64 0)
   call void @simple_state_machine_interprocedural_after() #7
   ret void
 }
 
+; Function Attrs: convergent noinline nounwind
 define hidden void @simple_state_machine_interprocedural_before() #1 {
 entry:
-  %captured_vars_addrs = alloca [0 x ptr], align 8
+  %captured_vars_addrs = alloca ptr, align 8, addrspace(5)
+  %captured_vars_addrs.cast = addrspa...
[truncated]

@arsenm arsenm left a comment

I have another implementation of this, but I was actually thinking we could relax this restriction and just lower an addrspace(0) alloca as alloca + addrspacecast. It would avoid a common bug report and isn't that horrible. We could use a nicer version of the PTX hack, where infer-address-spaces can assume the addrspace, but it wouldn't be a hard requirement.

@shiltian shiltian commented Apr 15, 2025

I have another implementation of this, but I was actually thinking we could relax this restriction and just lower an addrspace(0) alloca as alloca + addrspacecast. It would avoid a common bug report and isn't that horrible. We could use a nicer version of the PTX hack, where infer-address-spaces can assume the addrspace, but it wouldn't be a hard requirement.

Doesn't it violate LLVM IR semantics, since we'd treat an AS 0 type as AS 5? And also, what's the point of supporting an AS 0 alloca?

@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch 2 times, most recently from 47f6fb1 to c9c1eef Compare April 20, 2025 22:52
@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch from c9c1eef to 96a89a4 Compare April 21, 2025 03:57
@arsenm arsenm commented Apr 21, 2025

Doesn't it violate LLVM IR semantics, since we'd treat an AS 0 type as AS 5? And also, what's the point of supporting an AS 0 alloca?

No, especially since the address space interpretation is target-specific. We can interpret an alloca with a 0 address space as allocating in addrspace(5), with an implicit addrspacecast to generic. This is what NVPTX does (except there it's a hard requirement, and there's a late lowering pass).

The point is to avoid bug reports like the one you've presumably run into. This is a common one: people continually try to shove CPU IR through the backend, and it fails here. It is an IR-producer bug, but we can also just deal with it, demoting it from a hard failure to suboptimal frontend IR that gives the backend more work to do.
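An editorial sketch of that interpretation (hypothetical; this PR takes the stricter route instead, and @before/@after are made-up names for illustration): the addrspace(0) alloca is treated as a private allocation plus an implicit cast back to the generic pointer the rest of the function uses.

; IR that frontends commonly emit, with the alloca in addrspace(0):
define void @before() {
  %x = alloca i32, align 4
  store i32 0, ptr %x
  ret void
}

; What an implicit AMDGPU lowering could treat it as:
define void @after() {
  %x.priv = alloca i32, align 4, addrspace(5)
  %x = addrspacecast ptr addrspace(5) %x.priv to ptr
  store i32 0, ptr %x
  ret void
}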

@shiltian

#136584

@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch 3 times, most recently from 80447ff to 76d2304 Compare April 22, 2025 04:24
@@ -4392,6 +4392,12 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
    verifySwiftErrorValue(&AI);
  }

  if (TT.isAMDGPU()) {
Member

Why does this check the triple? This should check the default alloca AS in the DL. Same for globals. Basically everything that has a dedicated AS in the DL should only be created in that AS.

Contributor Author

It is because of this folded comment: #135820 (comment). There are some test cases that have allocas in basically every AS. Also, based on the discussion in #136865, specifically #136865 (comment) and #136865 (comment), we can't rely on the DL for this purpose.

@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch from 81d3d1b to 16a290c Compare April 25, 2025 14:31
@shiltian shiltian requested a review from nikic April 25, 2025 14:34
@@ -4395,6 +4395,11 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
    verifySwiftErrorValue(&AI);
  }

  if (TT.isAMDGPU()) {
    Check(AI.getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS,
          "alloca on amdgpu must be in addrspace(5)", &AI);
Contributor

Might be better to check it against the alloca addrspace from the data layout instead of hardcoding 5?

My thinking is that this way we could extend this check to most other targets as well (mainly excluding wasm and any other that may be using multiple alloca address spaces).

(Another more involved alternative would be to allow specifying multiple alloca address spaces in DL, and making the first the preferred one. Then this could be a target-independent check.)

Contributor

The problem with that is the datalayout isn't always right. Even if you have the triple, the right datalayout doesn't get pulled in, e.g.

; RUN: llvm-as < %s
target triple = "amdgcn-amd-amdhsa"

define void @foo() {
  %alloca = alloca i32
  store volatile i32 0, ptr %alloca
  ret void
}

In this case DL.getAllocaAddrSpace() is 0
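(Editorial note: the "A5" data layout component arsenm mentions below is what would fix this; a module that declares it makes DL.getAllocaAddrSpace() return 5. A minimal sketch, not taken from the PR:)

target datalayout = "A5"
target triple = "amdgcn-amd-amdhsa"

define void @foo() {
  %alloca = alloca i32, addrspace(5)   ; now matches DL.getAllocaAddrSpace()
  store volatile i32 0, ptr addrspace(5) %alloca
  ret void
}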

Contributor

I think llc and opt do default to the correct data layout based on the triple nowadays, but it's possible llvm-as doesn't do this, as a low-level tool.

Contributor

llc yes, but not opt, I think. There are a bunch of tests with a spurious datalayout = "A5" to compensate.

Contributor

Actually does seem to work for opt now

Contributor Author

@nikic I'm thinking about how to update test cases after we extend the DL. There are many test cases without a triple or a data layout, but they have allocas in multiple ASs. After we extend the DL and assert that allocas must be in those ASs, how are we going to deal with those test cases? Explicitly add AX, AY, AZ? Also, targets such as NVPTX actually require allocas to be in AS 5, but their data layout doesn't say that; they do have a fix-up to convert allocas to AS 5. However, if we enforce alloca ASs via the DL, that also requires updating the data layout string for NVPTX, which is quite intrusive. I can't speak for NVPTX, but it might be quite challenging.

Contributor

@nikic I'm thinking about how to update test cases after we extend the DL. There are many test cases without a triple or a data layout, but they have allocas in multiple ASs. After we extend the DL and assert that allocas must be in those ASs, how are we going to deal with those test cases? Explicitly add AX, AY, AZ?

Yes, that's what I'd expect to happen.

Also, targets such as NVPTX actually require allocas to be in AS 5, but their data layout doesn't say that; they do have a fix-up to convert allocas to AS 5. However, if we enforce alloca ASs via the DL, that also requires updating the data layout string for NVPTX, which is quite intrusive. I can't speak for NVPTX, but it might be quite challenging.

Target data layouts change all the time; this is not a problem in itself.


Something I'm uncertain about with this approach is that this basically adds some information to the data layout that is only used for verification purposes. I don't think we have any other properties like that. Maybe that's a sign that it's not the right approach?

Contributor Author

Do we want it to be for verification only, though? The point @arsenm made was that the DL doesn't assert anything. If we can make it assert, wouldn't it be more useful? Also, if we enforce that allocas be in specific AS(s), that is a form of assertion, right? Then we can use it in the middle end for optimization. Of course, if a target accepts multiple ASs for alloca, we can't assert that it must be in exactly one, but for targets that only have one, we can.

Contributor

-1 to making this a list in the DL. It's overly limiting, and it's a lot of work for something we do not care about at all. KISS and just check == 5 for AMDGPU.

Contributor

I'm fine with leaving it at that for now.
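For reference, with the check as merged, IR along the following lines is rejected up front with "alloca on amdgpu must be in addrspace(5)" rather than crashing later in instruction selection (an editorial sketch with made-up function names; the PR's new llvm/test/Verifier/AMDGPU/alloca.ll is the authoritative test):

target triple = "amdgcn-amd-amdhsa"

define void @bad() {
  %a = alloca i32, align 4                 ; addrspace(0): rejected by the verifier
  ret void
}

define void @good() {
  %a = alloca i32, align 4, addrspace(5)   ; private stack object: accepted
  %a.cast = addrspacecast ptr addrspace(5) %a to ptr
  store i32 0, ptr %a.cast
  ret void
}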

@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch from 16a290c to 3cf8783 Compare April 25, 2025 15:59
@shiltian

Is anything else needed for this PR? Wider adoption would need a series of other PRs, which is beyond the scope of this one.

@nikic nikic left a comment

LGTM

@shiltian shiltian force-pushed the users/shiltian/amdgpu-alloca-as branch from 3cf8783 to 48f7e8a Compare April 26, 2025 02:41
@shiltian shiltian merged commit 3bc1254 into main Apr 26, 2025
11 checks passed
@shiltian shiltian deleted the users/shiltian/amdgpu-alloca-as branch April 26, 2025 04:54
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Apr 26, 2025
jyli0116 pushed a commit to jyli0116/llvm-project that referenced this pull request Apr 28, 2025
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Apr 29, 2025
searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Apr 29, 2025
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
IanWood1 pushed a commit to IanWood1/llvm-project that referenced this pull request May 6, 2025
Ankur-0429 pushed a commit to Ankur-0429/llvm-project that referenced this pull request May 9, 2025
Labels
backend:AMDGPU clang:openmp OpenMP related changes to Clang llvm:instcombine Covers the InstCombine, InstSimplify and AggressiveInstCombine passes llvm:ir llvm:transforms mlir:llvm mlir
7 participants