Skip to content

[AMDGPULowerBufferFatPointers] Use InstSimplifyFolder during rewrites #134137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 32 additions & 22 deletions llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@
#include "SIDefines.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
Expand Down Expand Up @@ -445,7 +445,7 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor

ValueToValueMapTy ConvertedForStore;

IRBuilder<> IRB;
IRBuilder<InstSimplifyFolder> IRB;

const TargetMachine *TM;

Expand All @@ -459,9 +459,10 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor

public:
StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
const DataLayout &DL,
LLVMContext &Ctx,
const TargetMachine *TM)
: TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
: TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {}
bool processFunction(Function &F);

bool visitInstruction(Instruction &I) { return false; }
Expand Down Expand Up @@ -683,7 +684,7 @@ class LegalizeBufferContentTypesVisitor
: public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;

IRBuilder<> IRB;
IRBuilder<InstSimplifyFolder> IRB;

const DataLayout &DL;

Expand Down Expand Up @@ -743,7 +744,7 @@ class LegalizeBufferContentTypesVisitor

public:
LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
: IRB(Ctx), DL(DL) {}
: IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
bool processFunction(Function &F);
};
} // namespace
Expand Down Expand Up @@ -1326,7 +1327,7 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
const TargetMachine *TM;
const GCNSubtarget *ST = nullptr;

IRBuilder<> IRB;
IRBuilder<InstSimplifyFolder> IRB;

// Copy metadata between instructions if applicable.
void copyMetadata(Value *Dest, Value *Src);
Expand Down Expand Up @@ -1363,8 +1364,9 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
bool IsVolatile, SyncScope::ID SSID);

public:
SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
: TM(TM), IRB(Ctx) {}
SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx,
const TargetMachine *TM)
: TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {}

void processFunction(Function &F);

Expand Down Expand Up @@ -1415,7 +1417,7 @@ PtrParts SplitPtrStructs::getPtrParts(Value *V) {
return {*RsrcEntry = Rsrc, *OffEntry = Off};
}

IRBuilder<>::InsertPointGuard Guard(IRB);
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
if (auto *I = dyn_cast<Instruction>(V)) {
LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
auto [Rsrc, Off] = visit(*I);
Expand Down Expand Up @@ -1479,7 +1481,7 @@ void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
}

void SplitPtrStructs::processConditionals() {
SmallDenseMap<Instruction *, Value *> FoundRsrcs;
SmallDenseMap<Value *, Value *> FoundRsrcs;
SmallPtrSet<Value *, 4> Roots;
SmallPtrSet<Value *, 4> Seen;
for (Instruction *I : Conditionals) {
Expand All @@ -1493,7 +1495,7 @@ void SplitPtrStructs::processConditionals() {
if (MaybeFoundRsrc != FoundRsrcs.end()) {
MaybeRsrc = MaybeFoundRsrc->second;
} else {
IRBuilder<>::InsertPointGuard Guard(IRB);
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
Roots.clear();
Seen.clear();
getPossibleRsrcRoots(I, Roots, Seen);
Expand Down Expand Up @@ -1558,21 +1560,29 @@ void SplitPtrStructs::processConditionals() {
// to put the corrections maps in an inconsistent state. That'll be handled
// during the rest of the killing. Also, `ValueToValueMapTy` guarantees
// that references in that map will be updated as well.
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
ConditionalTemps.push_back(cast<Instruction>(Off));
Rsrc->replaceAllUsesWith(NewRsrc);
Off->replaceAllUsesWith(NewOff);
// Note that if the temporary instruction got `InstSimplify`'d away, it
// might be something like a block argument.
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
ConditionalTemps.push_back(RsrcInst);
RsrcInst->replaceAllUsesWith(NewRsrc);
}
if (auto *OffInst = dyn_cast<Instruction>(Off)) {
ConditionalTemps.push_back(OffInst);
OffInst->replaceAllUsesWith(NewOff);
}

// Save on recomputing the cycle traversals in known-root cases.
if (MaybeRsrc)
for (Value *V : Seen)
FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
FoundRsrcs[V] = NewRsrc;
} else if (isa<SelectInst>(I)) {
if (MaybeRsrc) {
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
Rsrc->replaceAllUsesWith(*MaybeRsrc);
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
ConditionalTemps.push_back(RsrcInst);
RsrcInst->replaceAllUsesWith(*MaybeRsrc);
}
for (Value *V : Seen)
FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
FoundRsrcs[V] = *MaybeRsrc;
}
} else {
llvm_unreachable("Only PHIs and selects go in the conditionals list");
Expand Down Expand Up @@ -2426,8 +2436,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
/*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
}

StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
&TM);
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
M.getContext(), &TM);
LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
M.getContext());
for (Function &F : M.functions()) {
Expand Down Expand Up @@ -2472,7 +2482,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
IntTM.clear();
CloneMap.clear();

SplitPtrStructs Splitter(M.getContext(), &TM);
SplitPtrStructs Splitter(DL, M.getContext(), &TM);
for (Function *F : NeedsPostProcess)
Splitter.processFunction(*F);
for (Function *F : Intrinsics) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i6
; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s32)
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0)
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>)
Expand Down
10 changes: 4 additions & 6 deletions llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,8 @@ define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) {
; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0
; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1
; CHECK-NEXT: [[INC:%.*]] = add i32 [[X_OFF]], 4
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0
; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1
; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]])
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[X]], i32 [[INC]], 1
; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP1]], i32 [[V]])
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]]
;
%inc = getelementptr i32, ptr addrspace(7) %x, i32 1
Expand Down Expand Up @@ -110,9 +109,8 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]]
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[ARG]], i32 [[RET]], 1
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]]
;
%ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1
ret ptr addrspace(7) %ret
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1898,10 +1898,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
; CHECK-LABEL: define void @store_v32i6(
; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
; CHECK-NEXT: ret void
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
Expand Down Expand Up @@ -328,8 +327,7 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
Expand Down Expand Up @@ -792,8 +790,7 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
Expand Down Expand Up @@ -987,8 +984,7 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
Expand Down
Loading