AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca #114144

arsenm · 2024-10-29T22:50:18Z

No description provided.

arsenm · 2024-10-29T22:50:34Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @arsenm and the rest of your teammates on Graphite

llvmbot · 2024-10-29T22:50:49Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Patch is 25.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114144.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+34-39)
(removed) llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll (-44)
(added) llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll (+243)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 7dd7388376f474..9834b0f7f4b2f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1115,9 +1115,10 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
   if (Val == OtherOp)
     OtherOp = Inst->getOperand(OpIdx1);
 
-  if (isa<ConstantPointerNull>(OtherOp))
+  if (isa<ConstantPointerNull>(OtherOp) || isa<ConstantAggregateZero>(OtherOp))
     return true;
 
+  // TODO: getUnderlyingObject will not work on a vector getelementptr
   Value *OtherObj = getUnderlyingObject(OtherOp);
   if (!isa<AllocaInst>(OtherObj))
     return false;
@@ -1195,36 +1196,19 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       continue;
     }
 
-    // TODO: If we know the address is only observed through flat pointers, we
-    // could still promote.
-    if (UseInst->getOpcode() == Instruction::AddrSpaceCast)
-      return false;
-
-    // Do not promote vector/aggregate type instructions. It is hard to track
-    // their users.
-    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
-      return false;
-
-    // TODO: Handle vectors of pointers.
-    if (!User->getType()->isPointerTy())
-      return false;
-
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
       // Be conservative if an address could be computed outside the bounds of
       // the alloca.
       if (!GEP->isInBounds())
         return false;
-    }
-
-    // Only promote a select if we know that the other select operand is from
-    // another pointer that will also be promoted.
-    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+      // Only promote a select if we know that the other select operand is from
+      // another pointer that will also be promoted.
       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
         return false;
-    }
+    } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+      // Repeat for phis.
 
-    // Repeat for phis.
-    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
       // TODO: Handle more complex cases. We should be able to replace loops
       // over arrays.
       switch (Phi->getNumIncomingValues()) {
@@ -1237,6 +1221,15 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       default:
         return false;
       }
+    } else if (!isa<ExtractElementInst>(User)) {
+      // Do not promote vector/aggregate type instructions. It is hard to track
+      // their users.
+
+      // Do not promote addrspacecast.
+      //
+      // TODO: If we know the address is only observed through flat pointers, we
+      // could still promote.
+      return false;
     }
 
     WorkList.push_back(User);
@@ -1490,17 +1483,21 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   SmallVector<IntrinsicInst *> DeferredIntrs;
 
+  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+
   for (Value *V : WorkList) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
-        PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+        Value *LHS = CI->getOperand(0);
+        Value *RHS = CI->getOperand(1);
 
-        if (isa<ConstantPointerNull>(CI->getOperand(0)))
-          CI->setOperand(0, ConstantPointerNull::get(NewTy));
+        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
+          CI->setOperand(0, Constant::getNullValue(NewTy));
 
-        if (isa<ConstantPointerNull>(CI->getOperand(1)))
-          CI->setOperand(1, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
+          CI->setOperand(1, Constant::getNullValue(NewTy));
 
         continue;
       }
@@ -1510,25 +1507,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
       if (isa<AddrSpaceCastInst>(V))
         continue;
 
-      PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
-
-      assert(isa<PointerType>(V->getType()));
+      assert(V->getType()->isPtrOrPtrVectorTy());
 
-      // FIXME: It doesn't really make sense to try to do this for all
-      // instructions.
+      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
       V->mutateType(NewTy);
 
       // Adjust the types of any constant operands.
       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
-        if (isa<ConstantPointerNull>(SI->getOperand(1)))
-          SI->setOperand(1, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
+          SI->setOperand(1, Constant::getNullValue(NewTy));
 
-        if (isa<ConstantPointerNull>(SI->getOperand(2)))
-          SI->setOperand(2, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
+          SI->setOperand(2, Constant::getNullValue(NewTy));
       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
-          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
-            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+          if (isa<ConstantPointerNull, ConstantAggregateZero>(
+                  Phi->getIncomingValue(I)))
+            Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
         }
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
deleted file mode 100644
index b0d578e421e280..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
-
-; Check that invalid IR is not produced on a vector typed
-; getelementptr with a scalar alloca pointer base.
-
-define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
-; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
-; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[GETELEMENTPTR]], i64 0
-; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
-; CHECK-NEXT:    ret void
-;
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-  %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
-  store i32 0, ptr addrspace(5) %extractelement
-  ret void
-}
-
-define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
-; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
-; CHECK-SAME: i1 [[COND:%.*]]) {
-; CHECK-NEXT:  [[BB:.*:]]
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
-; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[SELECT]], i64 1
-; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
-; CHECK-NEXT:    ret void
-;
-bb:
-  %alloca = alloca i32, align 4, addrspace(5)
-  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
-  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
-  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
-  store i32 0, ptr addrspace(5) %extractelement
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
new file mode 100644
index 00000000000000..355a2b8796b24d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Check that invalid IR is not produced on a vector typed
+; getelementptr with a scalar alloca pointer base.
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[GETELEMENTPTR]], i64 0
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+; TODO: Should be able to promote this
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> [[GETELEMENTPTR0]], <4 x ptr addrspace(3)> zeroinitializer
+; CHECK-NEXT:    [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT:    store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> zeroinitializer
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[BB0:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT:    [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq <4 x ptr addrspace(3)> [[SELECT]], zeroinitializer
+; CHECK-NEXT:    store <4 x i1> [[ICMP]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT:    ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %icmp = icmp eq <4 x ptr addrspace(5)> %select, zeroinitializer
+  store <4 x i1> %icmp,...
[truncated]

llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

shiltian

LG with nits mentioned.

arsenm · 2024-11-06T16:43:06Z

Merge activity

Nov 6, 11:43 AM EST: A user started a stack merge that includes this pull request via Graphite.
Nov 6, 11:47 AM EST: A user merged this pull request with Graphite.

arsenm added the backend:AMDGPU label Oct 29, 2024 — with Graphite App

arsenm requested review from jayfoad, Pierre-vh, rampitec and shiltian October 29, 2024 22:50

arsenm marked this pull request as ready for review October 29, 2024 22:50

This was referenced Oct 29, 2024

IR: Fix verifier missing addrspace mismatch in vector GEPs #114091

Merged

AMDGPU: Fix producing invalid IR on vector typed getelementptr #114113

Merged

arsenm force-pushed the users/arsenm/amdgpu-fix-promote-alloca-vector-gep-verifier-error branch from be705d0 to 38c949d Compare October 30, 2024 02:43

arsenm force-pushed the users/arsenm/amdgpu-promote-alloca-handle-vector-of-pointers branch from 5fb1bed to 164d408 Compare October 30, 2024 02:44

Base automatically changed from users/arsenm/amdgpu-fix-promote-alloca-vector-gep-verifier-error to main October 30, 2024 05:14

arsenm force-pushed the users/arsenm/amdgpu-promote-alloca-handle-vector-of-pointers branch 2 times, most recently from d834bc3 to b5022a1 Compare October 30, 2024 22:02

arsenm mentioned this pull request Oct 30, 2024

ValueTracking: Allow getUnderlyingObject to look at vectors #114311

Merged

Pierre-vh reviewed Nov 5, 2024

View reviewed changes

llvm/test/CodeGen/AMDGPU/promote-alloca-invalid-vector-gep.ll Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp Outdated Show resolved Hide resolved

shiltian approved these changes Nov 5, 2024

View reviewed changes

arsenm added 2 commits November 5, 2024 10:44

AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca

7e356e2

variadic isa

5c33264

arsenm force-pushed the users/arsenm/amdgpu-promote-alloca-handle-vector-of-pointers branch from b5022a1 to 5c33264 Compare November 5, 2024 18:56

Pierre-vh approved these changes Nov 6, 2024

View reviewed changes

arsenm merged commit efe87fb into main Nov 6, 2024
6 of 9 checks passed

arsenm deleted the users/arsenm/amdgpu-promote-alloca-handle-vector-of-pointers branch November 6, 2024 16:47

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca #114144

AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca #114144

Uh oh!

arsenm commented Oct 29, 2024

Uh oh!

arsenm commented Oct 29, 2024 •

edited

Loading

Uh oh!

llvmbot commented Oct 29, 2024

Uh oh!

Uh oh!

Uh oh!

shiltian left a comment

Uh oh!

arsenm commented Nov 6, 2024 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca #114144

AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca #114144

Uh oh!

Conversation

arsenm commented Oct 29, 2024

Uh oh!

arsenm commented Oct 29, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Oct 29, 2024

Uh oh!

Uh oh!

Uh oh!

shiltian left a comment

Choose a reason for hiding this comment

Uh oh!

arsenm commented Nov 6, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Merge activity

Uh oh!

Uh oh!

Uh oh!

arsenm commented Oct 29, 2024 •

edited

Loading

arsenm commented Nov 6, 2024 •

edited

Loading