Skip to content

[AMDGPULowerBufferFatPointers] Use InstSimplifyFolder during rewrites #134137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2025

Conversation

krzysz00
Copy link
Contributor

@krzysz00 krzysz00 commented Apr 2, 2025

This PR updates AMDGPULowerBufferFatPointers to use the InstSimplifyFolder
when creating IR during buffer fat pointer lowering.

This shouldn't cause any large functional changes and might improve the
quality of the generated code.

Copy link
Contributor Author

krzysz00 commented Apr 2, 2025

@llvmbot
Copy link
Member

llvmbot commented Apr 2, 2025

@llvm/pr-subscribers-llvm-globalisel

Author: Krzysztof Drewniak (krzysz00)

Changes

This PR updates AMDGPULowerBufferFatPointers to use the InstSemilpfyFolder
when creating IR during buffer fat pointer lowering.

This shouldn't cause any large functional changes and might improve the
quality of the generated code.


Patch is 41.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134137.diff

8 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+32-22)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll (-2)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll (+4-6)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll (+2-3)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+4-8)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll (+8-52)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll (+24-41)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll (+18-24)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 5dd1fe14e5626..e6250ddf2c26b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -224,7 +224,7 @@
 #include "SIDefines.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/Utils/Local.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/AttributeMask.h"
@@ -445,7 +445,7 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
 
   ValueToValueMapTy ConvertedForStore;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   const TargetMachine *TM;
 
@@ -459,9 +459,10 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
 
 public:
   StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+                                           const DataLayout &DL,
                                            LLVMContext &Ctx,
                                            const TargetMachine *TM)
-      : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
+      : TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {}
   bool processFunction(Function &F);
 
   bool visitInstruction(Instruction &I) { return false; }
@@ -683,7 +684,7 @@ class LegalizeBufferContentTypesVisitor
     : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
   friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   const DataLayout &DL;
 
@@ -743,7 +744,7 @@ class LegalizeBufferContentTypesVisitor
 
 public:
   LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
-      : IRB(Ctx), DL(DL) {}
+      : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
   bool processFunction(Function &F);
 };
 } // namespace
@@ -1326,7 +1327,7 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
   const TargetMachine *TM;
   const GCNSubtarget *ST = nullptr;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   // Copy metadata between instructions if applicable.
   void copyMetadata(Value *Dest, Value *Src);
@@ -1363,8 +1364,9 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
                           bool IsVolatile, SyncScope::ID SSID);
 
 public:
-  SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
-      : TM(TM), IRB(Ctx) {}
+  SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx,
+                  const TargetMachine *TM)
+      : TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {}
 
   void processFunction(Function &F);
 
@@ -1415,7 +1417,7 @@ PtrParts SplitPtrStructs::getPtrParts(Value *V) {
     return {*RsrcEntry = Rsrc, *OffEntry = Off};
   }
 
-  IRBuilder<>::InsertPointGuard Guard(IRB);
+  IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
   if (auto *I = dyn_cast<Instruction>(V)) {
     LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
     auto [Rsrc, Off] = visit(*I);
@@ -1479,7 +1481,7 @@ void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
 }
 
 void SplitPtrStructs::processConditionals() {
-  SmallDenseMap<Instruction *, Value *> FoundRsrcs;
+  SmallDenseMap<Value *, Value *> FoundRsrcs;
   SmallPtrSet<Value *, 4> Roots;
   SmallPtrSet<Value *, 4> Seen;
   for (Instruction *I : Conditionals) {
@@ -1493,7 +1495,7 @@ void SplitPtrStructs::processConditionals() {
     if (MaybeFoundRsrc != FoundRsrcs.end()) {
       MaybeRsrc = MaybeFoundRsrc->second;
     } else {
-      IRBuilder<>::InsertPointGuard Guard(IRB);
+      IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
       Roots.clear();
       Seen.clear();
       getPossibleRsrcRoots(I, Roots, Seen);
@@ -1558,21 +1560,29 @@ void SplitPtrStructs::processConditionals() {
       // to put the corrections maps in an inconstent state. That'll be handed
       // during the rest of the killing. Also, `ValueToValueMapTy` guarantees
       // that references in that map will be updated as well.
-      ConditionalTemps.push_back(cast<Instruction>(Rsrc));
-      ConditionalTemps.push_back(cast<Instruction>(Off));
-      Rsrc->replaceAllUsesWith(NewRsrc);
-      Off->replaceAllUsesWith(NewOff);
+      // Note that if the temporary instruction got `InstSimplify`'d away, it
+      // might be something like a block argument.
+      if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
+        ConditionalTemps.push_back(RsrcInst);
+        RsrcInst->replaceAllUsesWith(NewRsrc);
+      }
+      if (auto *OffInst = dyn_cast<Instruction>(Off)) {
+        ConditionalTemps.push_back(OffInst);
+        OffInst->replaceAllUsesWith(NewOff);
+      }
 
       // Save on recomputing the cycle traversals in known-root cases.
       if (MaybeRsrc)
         for (Value *V : Seen)
-          FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
+          FoundRsrcs[V] = NewRsrc;
     } else if (isa<SelectInst>(I)) {
       if (MaybeRsrc) {
-        ConditionalTemps.push_back(cast<Instruction>(Rsrc));
-        Rsrc->replaceAllUsesWith(*MaybeRsrc);
+        if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
+          ConditionalTemps.push_back(RsrcInst);
+          RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+        }
         for (Value *V : Seen)
-          FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
+          FoundRsrcs[V] = *MaybeRsrc;
       }
     } else {
       llvm_unreachable("Only PHIs and selects go in the conditionals list");
@@ -2426,8 +2436,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
         /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
   }
 
-  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
-                                                         &TM);
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
+                                                         M.getContext(), &TM);
   LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
                                                               M.getContext());
   for (Function &F : M.functions()) {
@@ -2472,7 +2482,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   IntTM.clear();
   CloneMap.clear();
 
-  SplitPtrStructs Splitter(M.getContext(), &TM);
+  SplitPtrStructs Splitter(DL, M.getContext(), &TM);
   for (Function *F : NeedsPostProcess)
     Splitter.processFunction(*F);
   for (Function *F : Intrinsics) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
index c509cf4b1bf37..2450ca5063de0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
@@ -49,8 +49,6 @@ define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i6
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s32)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 022094bc633c8..3765bb0af79ba 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -48,9 +48,8 @@ define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) {
 ; CHECK-NEXT:    [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0
 ; CHECK-NEXT:    [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1
 ; CHECK-NEXT:    [[INC:%.*]] = add i32 [[X_OFF]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1
-; CHECK-NEXT:    [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]])
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[X]], i32 [[INC]], 1
+; CHECK-NEXT:    [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP1]], i32 [[V]])
 ; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[RET]]
 ;
   %inc = getelementptr i32, ptr addrspace(7) %x, i32 1
@@ -110,9 +109,8 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
 ; CHECK-NEXT:    [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
 ; CHECK-NEXT:    [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
 ; CHECK-NEXT:    [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
-; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[ARG]], i32 [[RET]], 1
+; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP1]]
 ;
   %ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1
   ret ptr addrspace(7) %ret
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index d18f0f8bd1ff9..a8e67a4a61816 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1898,10 +1898,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i6(
 ; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
-; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index e6c2d1907068f..ee51b0b84554e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -133,8 +133,7 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -328,8 +327,7 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -792,8 +790,7 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -987,8 +984,7 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
index 9b2e2f950a39d..a8473927e1bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
@@ -14,11 +14,7 @@ define void @scalar_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32
 ; CHECK-NEXT:    [[B1:%.*]] = getelementptr i160, ptr [[B]], i64 1
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[X_PTR_INT]], ptr [[B1]], align 32
+; CHECK-NEXT:    store i160 [[X]], ptr [[B1]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %x = load ptr addrspace(7), ptr %a
@@ -36,11 +32,7 @@ define void @vector_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr <4 x i128> [[TMP2]] to <4 x ptr addrspace(8)>
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc <4 x i160> [[X]] to <4 x i32>
 ; CHECK-NEXT:    [[B1:%.*]] = getelementptr <4 x i160>, ptr [[B]], i64 2
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint <4 x ptr addrspace(8)> [[X_PTR_RSRC]] to <4 x i160>
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i160> [[X_PTR_INT_RSRC]], splat (i160 32)
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext <4 x i32> [[X_PTR_OFF]] to <4 x i160>
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or <4 x i160> [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store <4 x i160> [[X_PTR_INT]], ptr [[B1]], align 128
+; CHECK-NEXT:    store <4 x i160> [[X]], ptr [[B1]], align 128
 ; CHECK-NEXT:    ret void
 ;
   %x = load <4 x ptr addrspace(7)>, ptr %a
@@ -59,21 +51,13 @@ define void @alloca(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32
 ; CHECK-NEXT:    [[L:%.*]] = getelementptr i160, ptr addrspace(5) [[ALLOCA]], i32 1
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[X_PTR_INT]], ptr addrspace(5) [[L]], align 32
+; CHECK-NEXT:    store i160 [[X]], ptr addrspace(5) [[L]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load i160, ptr addrspace(5) [[L]], align 32
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i160 [[Y]], 32
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128
-; CHECK-NEXT:    [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8)
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i160 [[Y]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128
+; CHECK-NEXT:    [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[Y_PTR_OFF:%.*]] = trunc i160 [[Y]] to i32
-; CHECK-NEXT:    [[Y_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[Y_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i160 [[Y_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[Y_PTR_INT_OFF:%.*]] = zext i32 [[Y_PTR_OFF]] to i160
-; CHECK-NEXT:    [[Y_PTR_INT:%.*]] = or i160 [[TMP6]], [[Y_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[Y_PTR_INT]], ptr [[B]], align 32
+; CHECK-NEXT:    store i160 [[Y]], ptr [[B]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [5 x ptr addrspace(7)], addrspace(5)
@@ -117,35 +101,7 @@ define void @complex_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_2_PTR_RSRC]], 0
 ; CHECK-NEXT:    [[X_2_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Apr 2, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Krzysztof Drewniak (krzysz00)

Changes

This PR updates AMDGPULowerBufferFatPointers to use the InstSemilpfyFolder
when creating IR during buffer fat pointer lowering.

This shouldn't cause any large functional changes and might improve the
quality of the generated code.


Patch is 41.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134137.diff

8 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+32-22)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll (-2)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll (+4-6)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll (+2-3)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+4-8)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll (+8-52)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll (+24-41)
  • (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll (+18-24)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 5dd1fe14e5626..e6250ddf2c26b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -224,7 +224,7 @@
 #include "SIDefines.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
 #include "llvm/Analysis/Utils/Local.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/AttributeMask.h"
@@ -445,7 +445,7 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
 
   ValueToValueMapTy ConvertedForStore;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   const TargetMachine *TM;
 
@@ -459,9 +459,10 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
 
 public:
   StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+                                           const DataLayout &DL,
                                            LLVMContext &Ctx,
                                            const TargetMachine *TM)
-      : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
+      : TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {}
   bool processFunction(Function &F);
 
   bool visitInstruction(Instruction &I) { return false; }
@@ -683,7 +684,7 @@ class LegalizeBufferContentTypesVisitor
     : public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
   friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   const DataLayout &DL;
 
@@ -743,7 +744,7 @@ class LegalizeBufferContentTypesVisitor
 
 public:
   LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
-      : IRB(Ctx), DL(DL) {}
+      : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
   bool processFunction(Function &F);
 };
 } // namespace
@@ -1326,7 +1327,7 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
   const TargetMachine *TM;
   const GCNSubtarget *ST = nullptr;
 
-  IRBuilder<> IRB;
+  IRBuilder<InstSimplifyFolder> IRB;
 
   // Copy metadata between instructions if applicable.
   void copyMetadata(Value *Dest, Value *Src);
@@ -1363,8 +1364,9 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
                           bool IsVolatile, SyncScope::ID SSID);
 
 public:
-  SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
-      : TM(TM), IRB(Ctx) {}
+  SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx,
+                  const TargetMachine *TM)
+      : TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {}
 
   void processFunction(Function &F);
 
@@ -1415,7 +1417,7 @@ PtrParts SplitPtrStructs::getPtrParts(Value *V) {
     return {*RsrcEntry = Rsrc, *OffEntry = Off};
   }
 
-  IRBuilder<>::InsertPointGuard Guard(IRB);
+  IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
   if (auto *I = dyn_cast<Instruction>(V)) {
     LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
     auto [Rsrc, Off] = visit(*I);
@@ -1479,7 +1481,7 @@ void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
 }
 
 void SplitPtrStructs::processConditionals() {
-  SmallDenseMap<Instruction *, Value *> FoundRsrcs;
+  SmallDenseMap<Value *, Value *> FoundRsrcs;
   SmallPtrSet<Value *, 4> Roots;
   SmallPtrSet<Value *, 4> Seen;
   for (Instruction *I : Conditionals) {
@@ -1493,7 +1495,7 @@ void SplitPtrStructs::processConditionals() {
     if (MaybeFoundRsrc != FoundRsrcs.end()) {
       MaybeRsrc = MaybeFoundRsrc->second;
     } else {
-      IRBuilder<>::InsertPointGuard Guard(IRB);
+      IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
       Roots.clear();
       Seen.clear();
       getPossibleRsrcRoots(I, Roots, Seen);
@@ -1558,21 +1560,29 @@ void SplitPtrStructs::processConditionals() {
       // to put the corrections maps in an inconstent state. That'll be handed
       // during the rest of the killing. Also, `ValueToValueMapTy` guarantees
       // that references in that map will be updated as well.
-      ConditionalTemps.push_back(cast<Instruction>(Rsrc));
-      ConditionalTemps.push_back(cast<Instruction>(Off));
-      Rsrc->replaceAllUsesWith(NewRsrc);
-      Off->replaceAllUsesWith(NewOff);
+      // Note that if the temporary instruction got `InstSimplify`'d away, it
+      // might be something like a block argument.
+      if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
+        ConditionalTemps.push_back(RsrcInst);
+        RsrcInst->replaceAllUsesWith(NewRsrc);
+      }
+      if (auto *OffInst = dyn_cast<Instruction>(Off)) {
+        ConditionalTemps.push_back(OffInst);
+        OffInst->replaceAllUsesWith(NewOff);
+      }
 
       // Save on recomputing the cycle traversals in known-root cases.
       if (MaybeRsrc)
         for (Value *V : Seen)
-          FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
+          FoundRsrcs[V] = NewRsrc;
     } else if (isa<SelectInst>(I)) {
       if (MaybeRsrc) {
-        ConditionalTemps.push_back(cast<Instruction>(Rsrc));
-        Rsrc->replaceAllUsesWith(*MaybeRsrc);
+        if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
+          ConditionalTemps.push_back(RsrcInst);
+          RsrcInst->replaceAllUsesWith(*MaybeRsrc);
+        }
         for (Value *V : Seen)
-          FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
+          FoundRsrcs[V] = *MaybeRsrc;
       }
     } else {
       llvm_unreachable("Only PHIs and selects go in the conditionals list");
@@ -2426,8 +2436,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
         /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
   }
 
-  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
-                                                         &TM);
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
+                                                         M.getContext(), &TM);
   LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
                                                               M.getContext());
   for (Function &F : M.functions()) {
@@ -2472,7 +2482,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   IntTM.clear();
   CloneMap.clear();
 
-  SplitPtrStructs Splitter(M.getContext(), &TM);
+  SplitPtrStructs Splitter(DL, M.getContext(), &TM);
   for (Function *F : NeedsPostProcess)
     Splitter.processFunction(*F);
   for (Function *F : Intrinsics) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
index c509cf4b1bf37..2450ca5063de0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
@@ -49,8 +49,6 @@ define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i6
   ; CHECK-NEXT:   [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s32)
   ; CHECK-NEXT:   [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0)
   ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
index 022094bc633c8..3765bb0af79ba 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
@@ -48,9 +48,8 @@ define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) {
 ; CHECK-NEXT:    [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0
 ; CHECK-NEXT:    [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1
 ; CHECK-NEXT:    [[INC:%.*]] = add i32 [[X_OFF]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1
-; CHECK-NEXT:    [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]])
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[X]], i32 [[INC]], 1
+; CHECK-NEXT:    [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP1]], i32 [[V]])
 ; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[RET]]
 ;
   %inc = getelementptr i32, ptr addrspace(7) %x, i32 1
@@ -110,9 +109,8 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
 ; CHECK-NEXT:    [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
 ; CHECK-NEXT:    [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
 ; CHECK-NEXT:    [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
-; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[ARG]], i32 [[RET]], 1
+; CHECK-NEXT:    ret { ptr addrspace(8), i32 } [[TMP1]]
 ;
   %ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1
   ret ptr addrspace(7) %ret
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
index d18f0f8bd1ff9..a8e67a4a61816 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll
@@ -1898,10 +1898,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
 ; CHECK-LABEL: define void @store_v32i6(
 ; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
-; CHECK-NEXT:    [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
-; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index e6c2d1907068f..ee51b0b84554e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -133,8 +133,7 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -328,8 +327,7 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -792,8 +790,7 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -987,8 +984,7 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(
 ; CHECK-NEXT:    [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
 ; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
-; CHECK-NEXT:    [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
index 9b2e2f950a39d..a8473927e1bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll
@@ -14,11 +14,7 @@ define void @scalar_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32
 ; CHECK-NEXT:    [[B1:%.*]] = getelementptr i160, ptr [[B]], i64 1
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[X_PTR_INT]], ptr [[B1]], align 32
+; CHECK-NEXT:    store i160 [[X]], ptr [[B1]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %x = load ptr addrspace(7), ptr %a
@@ -36,11 +32,7 @@ define void @vector_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr <4 x i128> [[TMP2]] to <4 x ptr addrspace(8)>
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc <4 x i160> [[X]] to <4 x i32>
 ; CHECK-NEXT:    [[B1:%.*]] = getelementptr <4 x i160>, ptr [[B]], i64 2
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint <4 x ptr addrspace(8)> [[X_PTR_RSRC]] to <4 x i160>
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw <4 x i160> [[X_PTR_INT_RSRC]], splat (i160 32)
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext <4 x i32> [[X_PTR_OFF]] to <4 x i160>
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or <4 x i160> [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store <4 x i160> [[X_PTR_INT]], ptr [[B1]], align 128
+; CHECK-NEXT:    store <4 x i160> [[X]], ptr [[B1]], align 128
 ; CHECK-NEXT:    ret void
 ;
   %x = load <4 x ptr addrspace(7)>, ptr %a
@@ -59,21 +51,13 @@ define void @alloca(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32
 ; CHECK-NEXT:    [[L:%.*]] = getelementptr i160, ptr addrspace(5) [[ALLOCA]], i32 1
-; CHECK-NEXT:    [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160
-; CHECK-NEXT:    [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[X_PTR_INT]], ptr addrspace(5) [[L]], align 32
+; CHECK-NEXT:    store i160 [[X]], ptr addrspace(5) [[L]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load i160, ptr addrspace(5) [[L]], align 32
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i160 [[Y]], 32
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128
-; CHECK-NEXT:    [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8)
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i160 [[Y]], 32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128
+; CHECK-NEXT:    [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8)
 ; CHECK-NEXT:    [[Y_PTR_OFF:%.*]] = trunc i160 [[Y]] to i32
-; CHECK-NEXT:    [[Y_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[Y_PTR_RSRC]] to i160
-; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i160 [[Y_PTR_INT_RSRC]], 32
-; CHECK-NEXT:    [[Y_PTR_INT_OFF:%.*]] = zext i32 [[Y_PTR_OFF]] to i160
-; CHECK-NEXT:    [[Y_PTR_INT:%.*]] = or i160 [[TMP6]], [[Y_PTR_INT_OFF]]
-; CHECK-NEXT:    store i160 [[Y_PTR_INT]], ptr [[B]], align 32
+; CHECK-NEXT:    store i160 [[Y]], ptr [[B]], align 32
 ; CHECK-NEXT:    ret void
 ;
   %alloca = alloca [5 x ptr addrspace(7)], addrspace(5)
@@ -117,35 +101,7 @@ define void @complex_copy(ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_2_PTR_RSRC]], 0
 ; CHECK-NEXT:    [[X_2_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [...
[truncated]

@krzysz00 krzysz00 requested review from arsenm, nikic and lialan April 2, 2025 18:51
Base automatically changed from users/krzysz00/is-legal-masked-load-with-addrspace to main April 2, 2025 20:38
This PR updates AMDGPULowerBufferFatPointers to use the InstSemilpfyFolder
when creating IR during buffer fat pointer lowering.

This shouldn't cause any large functional changes and might improve the
quality of the generated code.
@krzysz00 krzysz00 force-pushed the users/krzysz00/use-instsimplify-buffer-fat-pointers branch from 9f1a39d to a8da904 Compare April 2, 2025 20:40
@krzysz00 krzysz00 merged commit f23bb53 into main Apr 3, 2025
11 checks passed
@krzysz00 krzysz00 deleted the users/krzysz00/use-instsimplify-buffer-fat-pointers branch April 3, 2025 15:12
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants