Commit 266db91
[AArch64][GlobalISel] Avoid splitting loads of large vector types into individual element loads
This patch adds custom legalization for G_LOAD that splits loads of fixed-width vector types larger than 128 bits into loads of 128-bit vectors with the same element type. This improves on the previous behaviour, where such loads were split into individual loads for each element of the vector.
1 parent e6c1c9f commit 266db91
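
For a concrete feel of the split described above, here is a minimal standalone C++ sketch (not LLVM code or its API) of the chunk arithmetic the new legalization performs. It uses the <6 x p0> case from the MIR test further down, where the 64-bit element size and the lane count are the only inputs; the names and the program itself are illustrative only.

// chunk_split.cpp -- standalone illustration, not LLVM code. Models the
// arithmetic for the <6 x p0> (6 x 64-bit) case exercised by the MIR test
// below: three <2 x s64> chunk loads at byte offsets 0, 16 and 32.
#include <cstdio>

int main() {
  const unsigned EltSize = 64;     // element size in bits (p0 is 64-bit here)
  const unsigned OldEltCount = 6;  // lanes in the original vector

  if (128 % EltSize != 0)          // the legalization bails out otherwise
    return 1;

  const unsigned NewEltCount = 128 / EltSize;             // lanes per chunk: 2
  const unsigned NumVecs = OldEltCount / NewEltCount;      // full chunks:    3
  const unsigned ExtraElems = OldEltCount % NewEltCount;   // leftover lanes: 0
  const unsigned ChunkBytes = NewEltCount * EltSize / 8;   // 16 bytes per chunk

  for (unsigned I = 0; I != NumVecs; ++I)
    std::printf("load <%u x s%u> at byte offset %u\n", NewEltCount, EltSize,
                I * ChunkBytes);
  if (ExtraElems != 0)
    std::printf("load <%u x s%u> tail at byte offset %u\n", ExtraElems, EltSize,
                NumVecs * ChunkBytes);
  return 0;
}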

File tree

3 files changed: +290, -1297 lines changed

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 70 additions & 0 deletions
@@ -356,6 +356,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
         return Query.Types[0] == s128 &&
                Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
       })
+      .customIf([=](const LegalityQuery &Query) {
+        // We need custom legalization for loads greater than 128-bits as they
+        // need to be split up into chunks.
+        return Query.Types[0].isFixedVector() &&
+               Query.Types[0].getSizeInBits() > 128;
+      })
       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                  {s16, p0, s16, 8},
                                  {s32, p0, s32, 8},
@@ -1632,6 +1638,70 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
   Register ValReg = MI.getOperand(0).getReg();
   const LLT ValTy = MRI.getType(ValReg);
 
+  if (ValTy.isFixedVector() && ValTy.getSizeInBits() > 128) {
+    // Break fixed-width vector loads of sizes greater than 128 bits into chunks
+    // of 128-bit vector loads with the same element type.
+    Register LoadReg = MI.getOperand(1).getReg();
+    Register LoadRegWithOffset = LoadReg;
+
+    unsigned EltSize = ValTy.getScalarSizeInBits();
+    // Only support element types which can cleanly divide into 128-bit wide
+    // vectors.
+    if (128 % EltSize != 0)
+      return false;
+
+    unsigned NewEltCount = 128 / EltSize;
+    LLT NewTy = LLT::fixed_vector(NewEltCount, ValTy.getElementType());
+
+    unsigned OldEltCount = ValTy.getNumElements();
+    unsigned NumVecs = OldEltCount / NewEltCount;
+
+    // Create registers to represent each element of ValReg. Load into these,
+    // then combine them at the end.
+    SmallVector<Register, 16> ComponentRegs;
+    for (unsigned i = 0, e = ValTy.getNumElements(); i != e; i++)
+      ComponentRegs.push_back(
+          MRI.createGenericVirtualRegister(ValTy.getElementType()));
+
+    MachineMemOperand &MMO = **MI.memoperands_begin();
+    auto GetMMO = [&MMO, &MI](int64_t Offset, LLT Ty) {
+      return MI.getMF()->getMachineMemOperand(&MMO, Offset, Ty);
+    };
+
+    for (unsigned i = 0, e = NumVecs; i != e; i++) {
+      auto LoadChunk = MIRBuilder.buildLoad(
+          NewTy, LoadRegWithOffset, *GetMMO(i * NewTy.getSizeInBytes(), NewTy));
+
+      auto LoadOffset = MIRBuilder.buildConstant(
+          LLT::scalar(64), (i + 1) * NewTy.getSizeInBytes());
+
+      LoadRegWithOffset =
+          MIRBuilder.buildPtrAdd(MRI.getType(LoadReg), LoadReg, LoadOffset)
+              .getReg(0);
+
+      Register *ChunkFirstReg = ComponentRegs.begin() + (i * NewEltCount);
+      MIRBuilder.buildUnmerge({ChunkFirstReg, ChunkFirstReg + NewEltCount},
+                              LoadChunk.getReg(0));
+    }
+
+    unsigned ExtraElems = OldEltCount % NewEltCount;
+    if (ExtraElems != 0) {
+      LLT ExtraTy = LLT::fixed_vector(ExtraElems, ValTy.getElementType());
+
+      auto ExtraLoadChunk = MIRBuilder.buildLoad(
+          ExtraTy, LoadRegWithOffset,
+          *GetMMO(NumVecs * NewTy.getSizeInBytes(), ExtraTy));
+
+      MIRBuilder.buildUnmerge({ComponentRegs.begin() + (NumVecs * NewEltCount),
+                               ComponentRegs.end()},
+                              ExtraLoadChunk.getReg(0));
+    }
+
+    MIRBuilder.buildBuildVector(ValReg, ComponentRegs);
+    MI.eraseFromParent();
+    return true;
+  }
+
   if (ValTy == LLT::scalar(128)) {
 
     AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
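
The chunk loop above fills a contiguous slice of ComponentRegs per 128-bit load via G_UNMERGE_VALUES, a smaller tail load handles any leftover lanes, and a single G_BUILD_VECTOR reassembles the original value. Below is a standalone C++ sketch of just that index bookkeeping, with plain integers standing in for virtual registers; it is not the MIRBuilder API, and the <6 x s32> type is a hypothetical example chosen to exercise the tail path, not a case from this commit.

// component_regs.cpp -- standalone illustration, not the MIRBuilder API.
// Models how the legalization partitions ComponentRegs for a hypothetical
// <6 x s32> load: one full <4 x s32> chunk, then a <2 x s32> tail, then a
// single build-vector over all six lanes.
#include <cstdio>
#include <vector>

int main() {
  const unsigned EltSize = 32, OldEltCount = 6;
  const unsigned NewEltCount = 128 / EltSize;             // 4 lanes per chunk
  const unsigned NumVecs = OldEltCount / NewEltCount;      // 1 full chunk
  const unsigned ExtraElems = OldEltCount % NewEltCount;   // 2 leftover lanes

  // One slot per lane of the original vector, as ComponentRegs has.
  std::vector<int> ComponentRegs(OldEltCount, -1);

  // Each full chunk unmerges into lanes [I*NewEltCount, (I+1)*NewEltCount).
  for (unsigned I = 0; I != NumVecs; ++I)
    for (unsigned Lane = 0; Lane != NewEltCount; ++Lane)
      ComponentRegs[I * NewEltCount + Lane] = static_cast<int>(I);

  // The tail load unmerges into the remaining ExtraElems lanes.
  for (unsigned Lane = 0; Lane != ExtraElems; ++Lane)
    ComponentRegs[NumVecs * NewEltCount + Lane] = static_cast<int>(NumVecs);

  // The final build-vector reads every lane in order.
  for (unsigned Lane = 0; Lane != OldEltCount; ++Lane)
    std::printf("lane %u filled by chunk %d\n", Lane, ComponentRegs[Lane]);
  return 0;
}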

llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir

Lines changed: 16 additions & 25 deletions
@@ -711,33 +711,24 @@ body: |
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
-    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p0) = G_LOAD %ptr(p0) :: (load (p0), align 64)
-    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD %ptr(p0) :: (load (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD]](<2 x s64>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
-    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD]](p0) :: (load (p0) from unknown-address + 8)
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD1]](<2 x s64>)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
     ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD1]](p0) :: (load (p0) from unknown-address + 16, align 16)
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
-    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C2]](s64)
-    ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD2]](p0) :: (load (p0) from unknown-address + 24)
-    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD3]](p0) :: (load (p0) from unknown-address + 32, align 32)
-    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-    ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C4]](s64)
-    ; CHECK-NEXT: [[LOAD5:%[0-9]+]]:_(p0) = G_LOAD [[PTR_ADD4]](p0) :: (load (p0) from unknown-address + 40)
-    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD]](p0), [[LOAD1]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD2]](p0), [[LOAD3]](p0)
-    ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x p0>) = G_BUILD_VECTOR [[LOAD4]](p0), [[LOAD5]](p0)
-    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
-    ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
-    ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR1]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST1]](<2 x s64>), [[PTR_ADD5]](p0) :: (store (<2 x s64>) into unknown-address + 16)
-    ; CHECK-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C3]](s64)
-    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BUILD_VECTOR2]](<2 x p0>)
-    ; CHECK-NEXT: G_STORE [[BITCAST2]](<2 x s64>), [[PTR_ADD6]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<2 x s64>) from unknown-address + 32, align 32)
+    ; CHECK-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x p0>) = G_BITCAST [[LOAD2]](<2 x s64>)
+    ; CHECK-NEXT: [[BITCAST3:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST3]](<2 x s64>), %ptr(p0) :: (store (<2 x s64>), align 64)
+    ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C]](s64)
+    ; CHECK-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST1]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST4]](<2 x s64>), [[PTR_ADD2]](p0) :: (store (<2 x s64>) into unknown-address + 16)
+    ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD %ptr, [[C1]](s64)
+    ; CHECK-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[BITCAST2]](<2 x p0>)
+    ; CHECK-NEXT: G_STORE [[BITCAST5]](<2 x s64>), [[PTR_ADD3]](p0) :: (store (<2 x s64>) into unknown-address + 32, align 32)
     ; CHECK-NEXT: RET_ReallyLR
     %ptr:_(p0) = COPY $x0
     %val:_(<6 x p0>) = G_LOAD %ptr(p0) :: (load (<6 x p0>))

0 commit comments