[AMDGPU] Add custom MachineValueType entries for buffer fat poiners (#127692)

krzysz00 · web-flow · commit e697c99b6322 · 2025-03-04T17:19:06.000-06:00
The old hack of returning v5/v6i32 for the fat and strided buffer
pointers was causing issuse during vectorization queries that expected
to be able to construct a VectorType from the return value of `MVT
getPointerType()`. On example is in the test attached to this PR, which
used to crash.

Now, we define the custom MVT entries, the 160-bit
amdgpuBufferFatPointer and 192-bit amdgpuBufferStridedPointer, which are
used to represent ptr addrspace(7) and ptr addrspace(9) respectively.

Neither of these types will be present at the time of lowering to a
SelectionDAG or other MIR - MVT::amdgpuBufferFatPointer is eliminated by
the LowerBufferFatPointers pass and amdgpu::bufferStridedPointer is not
currently used outside of the SPIR-V translator (which does its own
lowering).

An alternative solution would be to add MVT::i160 and MVT::i192. We
elect not to do this now as it would require changes to unrelated code
and runs the risk of breaking any SelectionDAG code that assumes that
the MVT series are all powers of two (and so can be split apart and
merged back together) in ways that wouldn't be obvious if someone tried
to use MVT::i160 in codegen. If i160 is added at some future point,
these custom types can be retired.
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -331,6 +331,12 @@ def i64x8     : ValueType<512,  231>;  // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
               : ValueType<16,  232>;  // AArch64 predicate-as-counter
 def spirvbuiltin : ValueType<0, 233>; // SPIR-V's builtin type
+// AMDGPU buffer fat pointer, buffer rsrc + offset, rewritten before MIR translation.
+// FIXME: Remove this and the getPointerType() override if MVT::i160 is added.
+def amdgpuBufferFatPointer : ValueType<160, 234>;
+// AMDGPU buffer strided pointer, buffer rsrc + index + offset, doesn't reach MIR.
+// FIXME: Remove this and the getPointerType() override if MVT::i82 is added.
+def amdgpuBufferStridedPointer : ValueType<192, 235>;
 
 let isNormalValueType = false in {
 def token      : ValueType<0, 504>;  // TokenTy
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -194,6 +194,10 @@ std::string EVT::getEVTString() const {
     return "aarch64svcount";
   case MVT::spirvbuiltin:
     return "spirvbuiltin";
+  case MVT::amdgpuBufferFatPointer:
+    return "amdgpuBufferFatPointer";
+  case MVT::amdgpuBufferStridedPointer:
+    return "amdgpuBufferStridedPointer";
   }
 }
 
@@ -219,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
     return TargetExtType::get(Context, "aarch64.svcount");
   case MVT::x86amx:  return Type::getX86_AMXTy(Context);
   case MVT::i64x8:   return IntegerType::get(Context, 512);
+  case MVT::amdgpuBufferFatPointer:  return IntegerType::get(Context, 160);
+  case MVT::amdgpuBufferStridedPointer:  return IntegerType::get(Context, 192);
   case MVT::externref: return Type::getWasm_ExternrefTy(Context);
   case MVT::funcref: return Type::getWasm_FuncrefTy(Context);
   case MVT::Metadata: return Type::getMetadataTy(Context);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1180,19 +1180,20 @@ static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
   return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
 }
 
-/// Map address space 7 to MVT::v5i32 because that's its in-memory
-/// representation. This return value is vector-typed because there is no
-/// MVT::i160 and it is not clear if one can be added. While this could
-/// cause issues during codegen, these address space 7 pointers will be
-/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
-/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
-/// modeling, to work.
+/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
+/// in-memory representation. This return value is a custom type because there
+/// is no MVT::i160 and adding one breaks integer promotion logic. While this
+/// could cause issues during codegen, these address space 7 pointers will be
+/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
+/// in order to allow pre-codegen passes that query TargetTransformInfo, often
+/// for cost modeling, to work. (This also sets us up decently for doing the
+/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
 MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
   if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
-    return MVT::v5i32;
+    return MVT::amdgpuBufferFatPointer;
   if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
       DL.getPointerSizeInBits(AS) == 192)
-    return MVT::v6i32;
+    return MVT::amdgpuBufferStridedPointer;
   return AMDGPUTargetLowering::getPointerTy(DL, AS);
 }
 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -303,8 +303,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
 
   // While address space 7 should never make it to codegen, it still needs to
-  // have a MVT to prevent some analyses that query this function from breaking,
-  // so, to work around the lack of i160, map it to v5i32.
+  // have a MVT to prevent some analyses that query this function from breaking.
+  // We use the custum MVT::amdgpuBufferFatPointer and
+  // amdgpu::amdgpuBufferStridedPointer for this, though we use v8i32 for the
+  // memory type (which is probably unused).
   MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
   MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
 
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=loop-vectorize -S < %s | FileCheck %s
+
+; Reduced from a crash, variables added to make things more realistic.
+; This is a roundabout test for TargetLowering::getValueType() returning
+; a reasonable value for <N x p7> instead of asserting.
+define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) {
+; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(
+; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[_LR_PH5:.*]]:
+; CHECK-NEXT:    [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744)
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7)
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, %[[_LR_PH5]] ], [ [[TMP5:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP5]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[V]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[__CRIT_EDGE_LOOPEXIT:.*]], label %[[LOOP]]
+; CHECK:       [[__CRIT_EDGE_LOOPEXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744)
+  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
+  br label %loop
+
+loop:                                                ; preds = %loop, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %ptr = getelementptr i32, ptr addrspace(7) %fat, i32 0
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv, %v
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:                             ; preds = %exit
+  ret void
+}
+
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)