Skip to content

Commit f23bb53

Browse files
authored
[AMDGPULowerBufferFatPointers] Use InstSimplifyFolder during rewrites (#134137)
This PR updates AMDGPULowerBufferFatPointers to use the InstSimplifyFolder when creating IR during buffer fat pointer lowering. This shouldn't cause any large functional changes and might improve the quality of the generated code.
1 parent 2334fd2 commit f23bb53

8 files changed

+92
-158
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@
224224
#include "SIDefines.h"
225225
#include "llvm/ADT/SetOperations.h"
226226
#include "llvm/ADT/SmallVector.h"
227-
#include "llvm/Analysis/ConstantFolding.h"
227+
#include "llvm/Analysis/InstSimplifyFolder.h"
228228
#include "llvm/Analysis/Utils/Local.h"
229229
#include "llvm/CodeGen/TargetPassConfig.h"
230230
#include "llvm/IR/AttributeMask.h"
@@ -445,7 +445,7 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
445445

446446
ValueToValueMapTy ConvertedForStore;
447447

448-
IRBuilder<> IRB;
448+
IRBuilder<InstSimplifyFolder> IRB;
449449

450450
const TargetMachine *TM;
451451

@@ -459,9 +459,10 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
459459

460460
public:
461461
StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
462+
const DataLayout &DL,
462463
LLVMContext &Ctx,
463464
const TargetMachine *TM)
464-
: TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
465+
: TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {}
465466
bool processFunction(Function &F);
466467

467468
bool visitInstruction(Instruction &I) { return false; }
@@ -683,7 +684,7 @@ class LegalizeBufferContentTypesVisitor
683684
: public InstVisitor<LegalizeBufferContentTypesVisitor, bool> {
684685
friend class InstVisitor<LegalizeBufferContentTypesVisitor, bool>;
685686

686-
IRBuilder<> IRB;
687+
IRBuilder<InstSimplifyFolder> IRB;
687688

688689
const DataLayout &DL;
689690

@@ -743,7 +744,7 @@ class LegalizeBufferContentTypesVisitor
743744

744745
public:
745746
LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx)
746-
: IRB(Ctx), DL(DL) {}
747+
: IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {}
747748
bool processFunction(Function &F);
748749
};
749750
} // namespace
@@ -1326,7 +1327,7 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
13261327
const TargetMachine *TM;
13271328
const GCNSubtarget *ST = nullptr;
13281329

1329-
IRBuilder<> IRB;
1330+
IRBuilder<InstSimplifyFolder> IRB;
13301331

13311332
// Copy metadata between instructions if applicable.
13321333
void copyMetadata(Value *Dest, Value *Src);
@@ -1363,8 +1364,9 @@ class SplitPtrStructs : public InstVisitor<SplitPtrStructs, PtrParts> {
13631364
bool IsVolatile, SyncScope::ID SSID);
13641365

13651366
public:
1366-
SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM)
1367-
: TM(TM), IRB(Ctx) {}
1367+
SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx,
1368+
const TargetMachine *TM)
1369+
: TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {}
13681370

13691371
void processFunction(Function &F);
13701372

@@ -1415,7 +1417,7 @@ PtrParts SplitPtrStructs::getPtrParts(Value *V) {
14151417
return {*RsrcEntry = Rsrc, *OffEntry = Off};
14161418
}
14171419

1418-
IRBuilder<>::InsertPointGuard Guard(IRB);
1420+
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
14191421
if (auto *I = dyn_cast<Instruction>(V)) {
14201422
LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n");
14211423
auto [Rsrc, Off] = visit(*I);
@@ -1479,7 +1481,7 @@ void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I,
14791481
}
14801482

14811483
void SplitPtrStructs::processConditionals() {
1482-
SmallDenseMap<Instruction *, Value *> FoundRsrcs;
1484+
SmallDenseMap<Value *, Value *> FoundRsrcs;
14831485
SmallPtrSet<Value *, 4> Roots;
14841486
SmallPtrSet<Value *, 4> Seen;
14851487
for (Instruction *I : Conditionals) {
@@ -1493,7 +1495,7 @@ void SplitPtrStructs::processConditionals() {
14931495
if (MaybeFoundRsrc != FoundRsrcs.end()) {
14941496
MaybeRsrc = MaybeFoundRsrc->second;
14951497
} else {
1496-
IRBuilder<>::InsertPointGuard Guard(IRB);
1498+
IRBuilder<InstSimplifyFolder>::InsertPointGuard Guard(IRB);
14971499
Roots.clear();
14981500
Seen.clear();
14991501
getPossibleRsrcRoots(I, Roots, Seen);
@@ -1558,21 +1560,29 @@ void SplitPtrStructs::processConditionals() {
15581560
// to put the corrections maps in an inconsistent state. That'll be handled
15591561
// during the rest of the killing. Also, `ValueToValueMapTy` guarantees
15601562
// that references in that map will be updated as well.
1561-
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
1562-
ConditionalTemps.push_back(cast<Instruction>(Off));
1563-
Rsrc->replaceAllUsesWith(NewRsrc);
1564-
Off->replaceAllUsesWith(NewOff);
1563+
// Note that if the temporary instruction got `InstSimplify`'d away, it
1564+
// might be something like a block argument.
1565+
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
1566+
ConditionalTemps.push_back(RsrcInst);
1567+
RsrcInst->replaceAllUsesWith(NewRsrc);
1568+
}
1569+
if (auto *OffInst = dyn_cast<Instruction>(Off)) {
1570+
ConditionalTemps.push_back(OffInst);
1571+
OffInst->replaceAllUsesWith(NewOff);
1572+
}
15651573

15661574
// Save on recomputing the cycle traversals in known-root cases.
15671575
if (MaybeRsrc)
15681576
for (Value *V : Seen)
1569-
FoundRsrcs[cast<Instruction>(V)] = NewRsrc;
1577+
FoundRsrcs[V] = NewRsrc;
15701578
} else if (isa<SelectInst>(I)) {
15711579
if (MaybeRsrc) {
1572-
ConditionalTemps.push_back(cast<Instruction>(Rsrc));
1573-
Rsrc->replaceAllUsesWith(*MaybeRsrc);
1580+
if (auto *RsrcInst = dyn_cast<Instruction>(Rsrc)) {
1581+
ConditionalTemps.push_back(RsrcInst);
1582+
RsrcInst->replaceAllUsesWith(*MaybeRsrc);
1583+
}
15741584
for (Value *V : Seen)
1575-
FoundRsrcs[cast<Instruction>(V)] = *MaybeRsrc;
1585+
FoundRsrcs[V] = *MaybeRsrc;
15761586
}
15771587
} else {
15781588
llvm_unreachable("Only PHIs and selects go in the conditionals list");
@@ -2426,8 +2436,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
24262436
/*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
24272437
}
24282438

2429-
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
2430-
&TM);
2439+
StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL,
2440+
M.getContext(), &TM);
24312441
LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
24322442
M.getContext());
24332443
for (Function &F : M.functions()) {
@@ -2472,7 +2482,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
24722482
IntTM.clear();
24732483
CloneMap.clear();
24742484

2475-
SplitPtrStructs Splitter(M.getContext(), &TM);
2485+
SplitPtrStructs Splitter(DL, M.getContext(), &TM);
24762486
for (Function *F : NeedsPostProcess)
24772487
Splitter.processFunction(*F);
24782488
for (Function *F : Intrinsics) {

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,6 @@ define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i6
4949
; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32)
5050
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF
5151
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
52-
; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF
53-
; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
5452
; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s32)
5553
; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0)
5654
; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>)

llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@ define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) {
4848
; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0
4949
; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1
5050
; CHECK-NEXT: [[INC:%.*]] = add i32 [[X_OFF]], 4
51-
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0
52-
; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1
53-
; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]])
51+
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[X]], i32 [[INC]], 1
52+
; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP1]], i32 [[V]])
5453
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]]
5554
;
5655
%inc = getelementptr i32, ptr addrspace(7) %x, i32 1
@@ -110,9 +109,8 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n
110109
; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0
111110
; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1
112111
; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4
113-
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0
114-
; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1
115-
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]]
112+
; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[ARG]], i32 [[RET]], 1
113+
; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]]
116114
;
117115
%ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1
118116
ret ptr addrspace(7) %ret

llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1898,10 +1898,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) {
18981898
; CHECK-LABEL: define void @store_v32i6(
18991899
; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] {
19001900
; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6>
1901-
; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32>
1902-
; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1901+
; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
19031902
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0)
1904-
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
1903+
; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> <i32 4, i32 5>
19051904
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0)
19061905
; CHECK-NEXT: ret void
19071906
;

llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,7 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d
133133
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
134134
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
135135
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
136-
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
137-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
136+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
138137
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
139138
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
140139
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -328,8 +327,7 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre
328327
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
329328
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
330329
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
331-
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
332-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
330+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
333331
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
334332
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
335333
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -792,8 +790,7 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i
792790
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
793791
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
794792
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
795-
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
796-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
793+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
797794
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
798795
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192
799796
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]
@@ -987,8 +984,7 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(
987984
; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 56, i32 57, i32 58, i32 59>
988985
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
989986
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240
990-
; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> <i32 60, i32 61, i32 62, i32 63>
991-
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
987+
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
992988
; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
993989
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192
994990
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]]

0 commit comments

Comments
 (0)