Skip to content

Commit b95a6c7

Browse files
authored
[AMDGPU] Remove special cases in TTI::getMemcpyLoop(Residual)LoweringType (#125507)
These special cases limit the width of the memory operations we use for lowering memcpy/memmove when the smaller of the two pointer alignments is exactly 2 bytes, or when either pointer is in the LDS/GDS (local/region address space). I found that performance in microbenchmarks on gfx90a, gfx1030, and gfx1100 is better without these limitations.
1 parent b9fa35f commit b95a6c7

File tree

3 files changed

+258
-243
lines changed

3 files changed

+258
-243
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 18 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -416,8 +416,6 @@ int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
416416
return 1024;
417417
}
418418

419-
// FIXME: Should we use narrower types for local/region, or account for when
420-
// unaligned access is legal?
421419
Type *GCNTTIImpl::getMemcpyLoopLoweringType(
422420
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
423421
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
@@ -426,29 +424,12 @@ Type *GCNTTIImpl::getMemcpyLoopLoweringType(
426424
if (AtomicElementSize)
427425
return Type::getIntNTy(Context, *AtomicElementSize * 8);
428426

429-
Align MinAlign = std::min(SrcAlign, DestAlign);
430-
431-
// A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
432-
// hardware into byte accesses. If you assume all alignments are equally
433-
// probable, it's more efficient on average to use short accesses for this
434-
// case.
435-
if (MinAlign == Align(2))
436-
return Type::getInt16Ty(Context);
437-
438-
// Not all subtargets have 128-bit DS instructions, and we currently don't
439-
// form them by default.
440-
if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
441-
SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
442-
DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
443-
DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
444-
return FixedVectorType::get(Type::getInt32Ty(Context), 2);
445-
}
446-
447-
// Global memory works best with 16-byte accesses.
427+
// 16-byte accesses achieve the highest copy throughput.
448428
// If the operation has a fixed known length that is large enough, it is
449429
// worthwhile to return an even wider type and let legalization lower it into
450-
// multiple accesses, effectively unrolling the memcpy loop. Private memory
451-
// also hits this, although accesses may be decomposed.
430+
// multiple accesses, effectively unrolling the memcpy loop.
431+
// We also rely on legalization to decompose into smaller accesses for
432+
// subtargets and address spaces where it is necessary.
452433
//
453434
// Don't unroll if Length is not a constant, since unrolling leads to worse
454435
// performance for length values that are smaller or slightly larger than the
@@ -473,26 +454,22 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
473454
OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
474455
DestAlign, AtomicCpySize);
475456

476-
Align MinAlign = std::min(SrcAlign, DestAlign);
477-
478-
if (MinAlign != Align(2)) {
479-
Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
480-
while (RemainingBytes >= 16) {
481-
OpsOut.push_back(I32x4Ty);
482-
RemainingBytes -= 16;
483-
}
457+
Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
458+
while (RemainingBytes >= 16) {
459+
OpsOut.push_back(I32x4Ty);
460+
RemainingBytes -= 16;
461+
}
484462

485-
Type *I64Ty = Type::getInt64Ty(Context);
486-
while (RemainingBytes >= 8) {
487-
OpsOut.push_back(I64Ty);
488-
RemainingBytes -= 8;
489-
}
463+
Type *I64Ty = Type::getInt64Ty(Context);
464+
while (RemainingBytes >= 8) {
465+
OpsOut.push_back(I64Ty);
466+
RemainingBytes -= 8;
467+
}
490468

491-
Type *I32Ty = Type::getInt32Ty(Context);
492-
while (RemainingBytes >= 4) {
493-
OpsOut.push_back(I32Ty);
494-
RemainingBytes -= 4;
495-
}
469+
Type *I32Ty = Type::getInt32Ty(Context);
470+
while (RemainingBytes >= 4) {
471+
OpsOut.push_back(I32Ty);
472+
RemainingBytes -= 4;
496473
}
497474

498475
Type *I16Ty = Type::getInt16Ty(Context);

0 commit comments

Comments
 (0)