llvm · krzysz00 · Feb 11, 2025 · Feb 11, 2025 · Feb 12, 2025 · Feb 12, 2025
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+// Estimates the registers needed for Ty, special-casing fixed vectors of
+// buffer fat pointers and buffer strided pointers.
+unsigned GCNTTIImpl::getRegUsageForType(Type *Ty) {
+  auto *VecTy = dyn_cast<FixedVectorType>(Ty);
+  auto *EltPtrTy =
+      VecTy ? dyn_cast<PointerType>(VecTy->getElementType()) : nullptr;
+  if (!EltPtrTy)
+    return BaseT::getRegUsageForType(Ty);
+  // The lanes are assumed to share one resource part (the constant 4), so
+  // only the remaining per-lane registers scale with the element count.
+  const unsigned NumElts = VecTy->getNumElements();
+  if (EltPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER)
+    return 4 + NumElts;
+  if (EltPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_STRIDED_POINTER)
+    return 4 + 2 * NumElts;
+  return BaseT::getRegUsageForType(Ty);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -113,6 +113,14 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                              TTI::PeelingPreferences &PP);
 
+  // The loop vectorizer queries the register usage of
+  // <N x ptr addrspace(7)> and <N x ptr addrspace(9)>, and the default
+  // implementation crashes on those types, so override it here. The override
+  // also reflects that, in loop vectorization (the user of this API), such
+  // vectors need fewer registers than N separate pointers because all
+  // elements are assumed to share a single resource part.
+  unsigned getRegUsageForType(Type *Ty);
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
     assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
     return TTI::PSK_FastHardware;

diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=loop-vectorize -S < %s | FileCheck %s
+
+; Reduced from a crash, with variables added to make things more realistic.
+; This is a roundabout test that asking for the register usage of <N x p7>
+; no longer hits the assertion in TargetLowering::getValueType().
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) {
+; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
+; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[_LR_PH5:.*:]]
+; CHECK-NEXT:    [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744)
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7)
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[DOTLR_PH5:%.*]] ], [ [[TMP5:%.*]], %[[BB2]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], [[DOT_CRIT_EDGE_LOOPEXIT:label %.*]], label %[[BB2]]
+; CHECK:       [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
+  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
+  br label %loop
+
+loop:                                                ; preds = %loop, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %ptr = getelementptr i32, ptr addrspace(7) %fat, i32 0
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %v
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:                             ; preds = %loop
+  ret void
+}
+
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)