
Commit 9e22764

krzysz00 authored and joaosaffran committed
Reapply "[AMDGPU] Handle memcpy()-like ops in LowerBufferFatPointers (llvm#126621)" (llvm#129078)
This reverts commit 1559a65. Fixed test (I suspect broken by unrelated change in the merge)
1 parent 63d1790 · commit 9e22764

File tree: 3 files changed (+3173, −24 lines)


llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp

Lines changed: 98 additions & 24 deletions
@@ -45,16 +45,16 @@
 //
 // This pass proceeds in three main phases:
 //
-// ## Rewriting loads and stores of p7
+// ## Rewriting loads and stores of p7 and memcpy()-like handling
 //
 // The first phase is to rewrite away all loads and stores of `ptr addrspace(7)`,
 // including aggregates containing such pointers, to ones that use `i160`. This
-// is handled by `StoreFatPtrsAsIntsVisitor`, which visits loads, stores, and
-// allocas and, if the loaded or stored type contains `ptr addrspace(7)`,
-// rewrites that type to one where the p7s are replaced by i160s, copying other
-// parts of aggregates as needed. In the case of a store, each pointer is
-// `ptrtoint`d to i160 before storing, and loaded integers are `inttoptr`d back.
-// This same transformation is applied to vectors of pointers.
+// is handled by `StoreFatPtrsAsIntsAndExpandMemcpyVisitor`, which visits
+// loads, stores, and allocas and, if the loaded or stored type contains `ptr
+// addrspace(7)`, rewrites that type to one where the p7s are replaced by i160s,
+// copying other parts of aggregates as needed. In the case of a store, each
+// pointer is `ptrtoint`d to i160 before storing, and loaded integers are
+// `inttoptr`d back. This same transformation is applied to vectors of pointers.
 //
 // Such a transformation allows the later phases of the pass to not need
 // to handle buffer fat pointers moving to and from memory, where we load
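To make the round trip described in that comment concrete, here is a minimal, self-contained sketch of storing a buffer fat pointer as an i160 and loading it back with IRBuilder. These are not the pass's own helpers; the function names and the `Slot` argument are illustrative only.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Store a `ptr addrspace(7)` value to `Slot` as an i160.
static void storeFatPtrAsInt(IRBuilder<> &IRB, Value *FatPtr, Value *Slot) {
  Type *I160 = IRB.getIntNTy(160);
  // `ptrtoint` the 160-bit buffer fat pointer before it goes to memory.
  Value *AsInt = IRB.CreatePtrToInt(FatPtr, I160, FatPtr->getName() + ".int");
  IRB.CreateStore(AsInt, Slot);
}

// Load an i160 from `Slot` and turn it back into a buffer fat pointer.
static Value *loadIntAsFatPtr(IRBuilder<> &IRB, Type *FatPtrTy, Value *Slot) {
  Value *AsInt = IRB.CreateLoad(IRB.getIntNTy(160), Slot, "fat.int");
  return IRB.CreateIntToPtr(AsInt, FatPtrTy, "fat.ptr");
}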
@@ -66,6 +66,10 @@
 // Atomic operations on `ptr addrspace(7)` values are not supported, as the
 // hardware does not include a 160-bit atomic.
 //
+// In order to save on O(N) work and to ensure that the contents type
+// legalizer correctly splits up wide loads, also unconditionally lower
+// memcpy-like intrinsics into loops here.
+//
 // ## Buffer contents type legalization
 //
 // The underlying buffer intrinsics only support types up to 128 bits long,
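A reduced, standalone sketch of what this lowering amounts to, mirroring the visitMemCpyInst implementation added further down in this diff; the free function `expandBufferMemCpy` and its name are illustrative, not part of the pass. Any memcpy whose source or destination lives in the buffer fat pointer address space is expanded into an explicit load/store loop, and the original intrinsic is erased.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

static bool expandBufferMemCpy(MemCpyInst &MCI, const TargetMachine &TM) {
  // Only copies that touch buffer fat pointers (addrspace(7)) need expansion.
  if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
      MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
    return false;
  // Emit an explicit copy loop, then drop the now-dead intrinsic.
  expandMemCpyAsLoop(&MCI, TM.getTargetTransformInfo(*MCI.getFunction()));
  MCI.eraseFromParent();
  return true;
}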
@@ -231,20 +235,24 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ReplaceConstant.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
 #define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers"
@@ -431,14 +439,16 @@ namespace {
 /// marshalling costs when reading or storing these values, but since placing
 /// such pointers into memory is an uncommon operation at best, we feel that
 /// this cost is acceptable for better performance in the common case.
-class StoreFatPtrsAsIntsVisitor
-    : public InstVisitor<StoreFatPtrsAsIntsVisitor, bool> {
+class StoreFatPtrsAsIntsAndExpandMemcpyVisitor
+    : public InstVisitor<StoreFatPtrsAsIntsAndExpandMemcpyVisitor, bool> {
   BufferFatPtrToIntTypeMap *TypeMap;
 
   ValueToValueMapTy ConvertedForStore;
 
   IRBuilder<> IRB;
 
+  const TargetMachine *TM;
+
   // Convert all the buffer fat pointers within the input value to integers
   // so that it can be stored in memory.
   Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name);
@@ -448,20 +458,27 @@ class StoreFatPtrsAsIntsVisitor
   Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name);
 
 public:
-  StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx)
-      : TypeMap(TypeMap), IRB(Ctx) {}
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap,
+                                           LLVMContext &Ctx,
+                                           const TargetMachine *TM)
+      : TypeMap(TypeMap), IRB(Ctx), TM(TM) {}
   bool processFunction(Function &F);
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitAllocaInst(AllocaInst &I);
   bool visitLoadInst(LoadInst &LI);
   bool visitStoreInst(StoreInst &SI);
   bool visitGetElementPtrInst(GetElementPtrInst &I);
+
+  bool visitMemCpyInst(MemCpyInst &MCI);
+  bool visitMemMoveInst(MemMoveInst &MMI);
+  bool visitMemSetInst(MemSetInst &MSI);
+  bool visitMemSetPatternInst(MemSetPatternInst &MSPI);
 };
 } // namespace
 
-Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
-                                                const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::fatPtrsToInts(
+    Value *V, Type *From, Type *To, const Twine &Name) {
   if (From == To)
     return V;
   ValueToValueMapTy::iterator Find = ConvertedForStore.find(V);
@@ -498,8 +515,8 @@ Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To,
   return Ret;
 }
 
-Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
-                                                const Twine &Name) {
+Value *StoreFatPtrsAsIntsAndExpandMemcpyVisitor::intsToFatPtrs(
+    Value *V, Type *From, Type *To, const Twine &Name) {
   if (From == To)
     return V;
   if (isBufferFatPtrOrVector(To)) {
@@ -531,18 +548,25 @@ Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To,
   return Ret;
 }
 
-bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::processFunction(Function &F) {
   bool Changed = false;
-  // The visitors will mutate GEPs and allocas, but will push loads and stores
-  // to the worklist to avoid invalidation.
+  // Process memcpy-like instructions after the main iteration because they can
+  // invalidate iterators.
+  SmallVector<WeakTrackingVH> CanBecomeLoops;
   for (Instruction &I : make_early_inc_range(instructions(F))) {
-    Changed |= visit(I);
+    if (isa<MemTransferInst, MemSetInst, MemSetPatternInst>(I))
+      CanBecomeLoops.push_back(&I);
+    else
+      Changed |= visit(I);
+  }
+  for (WeakTrackingVH VH : make_early_inc_range(CanBecomeLoops)) {
+    Changed |= visit(cast<Instruction>(VH));
   }
   ConvertedForStore.clear();
   return Changed;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitAllocaInst(AllocaInst &I) {
   Type *Ty = I.getAllocatedType();
   Type *NewTy = TypeMap->remapType(Ty);
   if (Ty == NewTy)
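A note on the iteration-order change in the hunk above: expanding a memcpy-like intrinsic splices in new blocks and erases the intrinsic, which would invalidate the instruction iterator if done during the main walk. The sketch below is a stripped-down, hypothetical version of that deferral pattern; the generic visitor parameter and helper name are illustrative and not part of the pass.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"

using namespace llvm;

template <typename VisitorT>
static bool visitDeferringMemIntrinsics(Function &F, VisitorT &V) {
  bool Changed = false;
  // Collect memory intrinsics behind weak handles instead of visiting them in
  // place, so that loop expansion cannot invalidate the iterator. (The real
  // pass also defers experimental memset.pattern intrinsics.)
  SmallVector<WeakTrackingVH> Deferred;
  for (Instruction &I : make_early_inc_range(instructions(F))) {
    if (isa<MemIntrinsic>(I))
      Deferred.push_back(&I);
    else
      Changed |= V.visit(I);
  }
  // Now it is safe to rewrite them, even if that restructures the CFG.
  for (WeakTrackingVH &VH : Deferred)
    if (VH) // skip anything that was erased in the meantime
      Changed |= V.visit(*cast<Instruction>(VH));
  return Changed;
}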
@@ -551,7 +575,8 @@ bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitGetElementPtrInst(
+    GetElementPtrInst &I) {
   Type *Ty = I.getSourceElementType();
   Type *NewTy = TypeMap->remapType(Ty);
   if (Ty == NewTy)
@@ -563,7 +588,7 @@ bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitLoadInst(LoadInst &LI) {
   Type *Ty = LI.getType();
   Type *IntTy = TypeMap->remapType(Ty);
   if (Ty == IntTy)
@@ -581,7 +606,7 @@ bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) {
   return true;
 }
 
-bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitStoreInst(StoreInst &SI) {
   Value *V = SI.getValueOperand();
   Type *Ty = V->getType();
   Type *IntTy = TypeMap->remapType(Ty);
@@ -597,6 +622,47 @@ bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) {
   return true;
 }
 
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemCpyInst(
+    MemCpyInst &MCI) {
+  // TODO: Allow memcpy.p7.p3 as a synonym for the direct-to-LDS copy, which'll
+  // need loop expansion here.
+  if (MCI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+      MCI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemCpyAsLoop(&MCI,
+                           TM->getTargetTransformInfo(*MCI.getFunction()));
+  MCI.eraseFromParent();
+  return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemMoveInst(
+    MemMoveInst &MMI) {
+  if (MMI.getSourceAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER &&
+      MMI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  report_fatal_error(
+      "memmove() on buffer descriptors is not implemented because pointer "
+      "comparison on buffer descriptors isn't implemented\n");
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
+    MemSetInst &MSI) {
+  if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemSetAsLoop(&MSI);
+  MSI.eraseFromParent();
+  return true;
+}
+
+bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
+    MemSetPatternInst &MSPI) {
+  if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
+    return false;
+  llvm::expandMemSetPatternAsLoop(&MSPI);
+  MSPI.eraseFromParent();
+  return true;
+}
+
 namespace {
 /// Convert loads/stores of types that the buffer intrinsics can't handle into
 /// one or more such loads/stores that consist of legal types.
@@ -1127,6 +1193,7 @@ bool LegalizeBufferContentTypesVisitor::visitStoreInst(StoreInst &SI) {
 
 bool LegalizeBufferContentTypesVisitor::processFunction(Function &F) {
   bool Changed = false;
+  // Note, memory transfer intrinsics won't
   for (Instruction &I : make_early_inc_range(instructions(F))) {
     Changed |= visit(I);
   }
@@ -2084,6 +2151,12 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
   case Intrinsic::invariant_end:
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
+  case Intrinsic::memcpy:
+  case Intrinsic::memcpy_inline:
+  case Intrinsic::memmove:
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline:
+  case Intrinsic::experimental_memset_pattern:
     return true;
   }
 }
@@ -2353,7 +2426,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
         /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true);
   }
 
-  StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext());
+  StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(),
+                                                         &TM);
   LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL,
                                                               M.getContext());
   for (Function &F : M.functions()) {
