Skip to content

Commit c464896

Browse files
authored
[AArch64][GlobalISel] Select llvm.aarch64.neon.ld* intrinsics (#65630)
Similar to llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp.
1 parent 0e67a68 commit c464896

File tree

3 files changed

+650
-161
lines changed

3 files changed

+650
-161
lines changed

llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Lines changed: 310 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,12 @@ class AArch64InstructionSelector : public InstructionSelector {
143143
const TargetRegisterClass *DstRC,
144144
Register Scalar,
145145
MachineIRBuilder &MIRBuilder) const;
146+
/// Helper to narrow vector that was widened by emitScalarToVector.
147+
/// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
148+
/// vector, correspondingly.
149+
MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
150+
MachineIRBuilder &MIRBuilder,
151+
MachineRegisterInfo &MRI) const;
146152

147153
/// Emit a lane insert into \p DstReg, or a new vector register if
148154
/// std::nullopt is provided.
@@ -186,6 +192,8 @@ class AArch64InstructionSelector : public InstructionSelector {
186192
/// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
187193
bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
188194
MachineInstr &I);
195+
bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
196+
MachineInstr &I);
189197
bool selectIntrinsicWithSideEffects(MachineInstr &I,
190198
MachineRegisterInfo &MRI);
191199
bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
@@ -3897,6 +3905,31 @@ MachineInstr *AArch64InstructionSelector::emitScalarToVector(
38973905
}
38983906
}
38993907

3908+
MachineInstr *
AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
                                             MachineIRBuilder &MIB,
                                             MachineRegisterInfo &MRI) const {
  // Copy the low ssub/dsub piece of \p SrcReg into \p DstReg, undoing a
  // widening previously performed by emitScalarToVector. Returns the COPY on
  // success, nullptr if the destination cannot be expressed as a 32/64-bit
  // FPR sub-register.
  LLT DstTy = MRI.getType(DstReg);
  const TargetRegisterClass *DstRC =
      getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
  // Only FPR32/FPR64 destinations are supported here.
  if (DstRC != &AArch64::FPR32RegClass && DstRC != &AArch64::FPR64RegClass) {
    LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
    return nullptr;
  }
  unsigned SubReg = 0;
  if (!getSubRegForClass(DstRC, TRI, SubReg))
    return nullptr;
  // The class lookup above should hand back ssub or dsub; anything else means
  // an unexpected destination size.
  if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
    LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
                      << DstTy.getSizeInBits() << "\n");
    return nullptr;
  }
  // Emit the sub-register COPY and constrain the destination so the copy is
  // selectable.
  auto Narrowed = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
                      .addReg(SrcReg, 0, SubReg);
  RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
  return Narrowed;
}
3932+
39003933
bool AArch64InstructionSelector::selectMergeValues(
39013934
MachineInstr &I, MachineRegisterInfo &MRI) {
39023935
assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
@@ -5384,24 +5417,8 @@ bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
53845417
if (VecSize < 128) {
53855418
// If we had to widen to perform the insert, then we have to demote back to
53865419
// the original size to get the result we want.
5387-
Register DemoteVec = InsMI->getOperand(0).getReg();
5388-
const TargetRegisterClass *RC =
5389-
getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
5390-
if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5391-
LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5392-
return false;
5393-
}
5394-
unsigned SubReg = 0;
5395-
if (!getSubRegForClass(RC, TRI, SubReg))
5420+
if (!emitNarrowVector(DstReg, InsMI->getOperand(0).getReg(), MIB, MRI))
53965421
return false;
5397-
if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5398-
LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
5399-
<< "\n");
5400-
return false;
5401-
}
5402-
MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
5403-
.addReg(DemoteVec, 0, SubReg);
5404-
RBI.constrainGenericRegister(DstReg, *RC, MRI);
54055422
} else {
54065423
// No widening needed.
54075424
InsMI->getOperand(0).setReg(DstReg);
@@ -5630,6 +5647,60 @@ bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
56305647
return true;
56315648
}
56325649

5650+
bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
5651+
unsigned Opc, unsigned NumVecs, MachineInstr &I) {
5652+
assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5653+
assert(Opc && "Expected an opcode?");
5654+
assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5655+
auto &MRI = *MIB.getMRI();
5656+
LLT Ty = MRI.getType(I.getOperand(0).getReg());
5657+
bool Narrow = Ty.getSizeInBits() == 64;
5658+
5659+
auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
5660+
SmallVector<Register, 4> Regs(NumVecs);
5661+
std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
5662+
[](auto MO) { return MO.getReg(); });
5663+
5664+
if (Narrow) {
5665+
transform(Regs, Regs.begin(), [this](Register Reg) {
5666+
return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
5667+
->getOperand(0)
5668+
.getReg();
5669+
});
5670+
Ty = Ty.multiplyElements(2);
5671+
}
5672+
5673+
Register Tuple = createQTuple(Regs, MIB);
5674+
auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
5675+
if (!LaneNo)
5676+
return false;
5677+
5678+
Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
5679+
auto Load = MIB.buildInstr(Opc, {Ty}, {})
5680+
.addReg(Tuple)
5681+
.addImm(LaneNo->getZExtValue())
5682+
.addReg(Ptr);
5683+
Load.cloneMemRefs(I);
5684+
constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5685+
Register SelectedLoadDst = Load->getOperand(0).getReg();
5686+
unsigned SubReg = AArch64::qsub0;
5687+
for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5688+
auto Vec = MIB.buildInstr(TargetOpcode::COPY,
5689+
{Narrow ? DstOp(&AArch64::FPR128RegClass)
5690+
: DstOp(I.getOperand(Idx).getReg())},
5691+
{})
5692+
.addReg(SelectedLoadDst, 0, SubReg + Idx);
5693+
Register WideReg = Vec.getReg(0);
5694+
// Emit the subreg copies and immediately select them.
5695+
selectCopy(*Vec, TII, MRI, TRI, RBI);
5696+
if (Narrow &&
5697+
!emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
5698+
return false;
5699+
}
5700+
5701+
return true;
5702+
}
5703+
56335704
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
56345705
MachineInstr &I, MachineRegisterInfo &MRI) {
56355706
// Find the intrinsic ID.
@@ -5664,6 +5735,78 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
56645735
MIB.buildInstr(AArch64::BRK, {}, {})
56655736
.addImm(I.getOperand(1).getImm() | ('U' << 8));
56665737
break;
5738+
case Intrinsic::aarch64_neon_ld1x2: {
  // Select the LD1Two* multi-register load matching the destination vector
  // type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD1Twov8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD1Twov16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD1Twov4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD1Twov8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD1Twov2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD1Twov4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD1Twov2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD1Twov1d;
  else
    llvm_unreachable("Unexpected type for ld1x2!");
  selectVectorLoadIntrinsic(Opc, 2, I);
  break;
}
5762+
case Intrinsic::aarch64_neon_ld1x3: {
  // Select the LD1Three* multi-register load matching the destination vector
  // type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD1Threev8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD1Threev16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD1Threev4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD1Threev8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD1Threev2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD1Threev4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD1Threev2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD1Threev1d;
  else
    llvm_unreachable("Unexpected type for ld1x3!");
  selectVectorLoadIntrinsic(Opc, 3, I);
  break;
}
5786+
case Intrinsic::aarch64_neon_ld1x4: {
  // Select the LD1Four* multi-register load matching the destination vector
  // type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD1Fourv8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD1Fourv16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD1Fourv4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD1Fourv8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD1Fourv2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD1Fourv4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD1Fourv2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD1Fourv1d;
  else
    llvm_unreachable("Unexpected type for ld1x4!");
  selectVectorLoadIntrinsic(Opc, 4, I);
  break;
}
56675810
case Intrinsic::aarch64_neon_ld2: {
56685811
LLT Ty = MRI.getType(I.getOperand(0).getReg());
56695812
unsigned Opc = 0;
@@ -5688,6 +5831,114 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
56885831
selectVectorLoadIntrinsic(Opc, 2, I);
56895832
break;
56905833
}
5834+
case Intrinsic::aarch64_neon_ld2lane: {
  // Select the LD2 (single structure) lane load for the destination element
  // type; the lane index chooses which element of each vector is loaded.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  unsigned Opc = 0; // Initialized for consistency with the sibling cases.
  if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
    Opc = AArch64::LD2i8;
  else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
    Opc = AArch64::LD2i16;
  else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
    Opc = AArch64::LD2i32;
  else if (Ty == LLT::fixed_vector(2, S64) ||
           Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
    Opc = AArch64::LD2i64;
  else
    // Fixed: message previously said "st2lane" (copy-paste from the store
    // path).
    llvm_unreachable("Unexpected type for ld2lane!");
  if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
    return false;
  break;
}
5852+
case Intrinsic::aarch64_neon_ld2r: {
  // Select the LD2R load-and-replicate variant matching the destination
  // vector type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD2Rv8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD2Rv16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD2Rv4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD2Rv8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD2Rv2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD2Rv4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD2Rv2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD2Rv1d;
  else
    llvm_unreachable("Unexpected type for ld2r!");
  selectVectorLoadIntrinsic(Opc, 2, I);
  break;
}
5876+
case Intrinsic::aarch64_neon_ld3: {
  // Select the LD3Three* de-interleaving load matching the destination
  // vector type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD3Threev8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD3Threev16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD3Threev4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD3Threev8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD3Threev2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD3Threev4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD3Threev2d;
  else if (Ty == S64 || Ty == P0)
    // NOTE(review): intentionally LD1Threev1d (not an LD3 form) — presumably
    // mirrors the SelectionDAG selector's handling of the 1d case; confirm
    // against AArch64ISelDAGToDAG.cpp.
    Opc = AArch64::LD1Threev1d;
  else
    llvm_unreachable("Unexpected type for ld3!");
  selectVectorLoadIntrinsic(Opc, 3, I);
  break;
}
5900+
case Intrinsic::aarch64_neon_ld3lane: {
  // Select the LD3 (single structure) lane load for the destination element
  // type; the lane index chooses which element of each vector is loaded.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  unsigned Opc = 0; // Initialized for consistency with the sibling cases.
  if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
    Opc = AArch64::LD3i8;
  else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
    Opc = AArch64::LD3i16;
  else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
    Opc = AArch64::LD3i32;
  else if (Ty == LLT::fixed_vector(2, S64) ||
           Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
    Opc = AArch64::LD3i64;
  else
    // Fixed: message previously said "st3lane" (copy-paste from the store
    // path).
    llvm_unreachable("Unexpected type for ld3lane!");
  if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
    return false;
  break;
}
5918+
case Intrinsic::aarch64_neon_ld3r: {
  // Select the LD3R load-and-replicate variant matching the destination
  // vector type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD3Rv8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD3Rv16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD3Rv4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD3Rv8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD3Rv2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD3Rv4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD3Rv2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD3Rv1d;
  else
    llvm_unreachable("Unexpected type for ld3r!");
  selectVectorLoadIntrinsic(Opc, 3, I);
  break;
}
56915942
case Intrinsic::aarch64_neon_ld4: {
56925943
LLT Ty = MRI.getType(I.getOperand(0).getReg());
56935944
unsigned Opc = 0;
@@ -5712,6 +5963,48 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
57125963
selectVectorLoadIntrinsic(Opc, 4, I);
57135964
break;
57145965
}
5966+
case Intrinsic::aarch64_neon_ld4lane: {
  // Select the LD4 (single structure) lane load for the destination element
  // type; the lane index chooses which element of each vector is loaded.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  unsigned Opc = 0; // Initialized for consistency with the sibling cases.
  if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
    Opc = AArch64::LD4i8;
  else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
    Opc = AArch64::LD4i16;
  else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
    Opc = AArch64::LD4i32;
  else if (Ty == LLT::fixed_vector(2, S64) ||
           Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
    Opc = AArch64::LD4i64;
  else
    // Fixed: message previously said "st4lane" (copy-paste from the store
    // path).
    llvm_unreachable("Unexpected type for ld4lane!");
  if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
    return false;
  break;
}
5984+
case Intrinsic::aarch64_neon_ld4r: {
  // Select the LD4R load-and-replicate variant matching the destination
  // vector type.
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  auto IsVec = [&Ty](unsigned NumElts, LLT EltTy) {
    return Ty == LLT::fixed_vector(NumElts, EltTy);
  };
  unsigned Opc = 0;
  if (IsVec(8, S8))
    Opc = AArch64::LD4Rv8b;
  else if (IsVec(16, S8))
    Opc = AArch64::LD4Rv16b;
  else if (IsVec(4, S16))
    Opc = AArch64::LD4Rv4h;
  else if (IsVec(8, S16))
    Opc = AArch64::LD4Rv8h;
  else if (IsVec(2, S32))
    Opc = AArch64::LD4Rv2s;
  else if (IsVec(4, S32))
    Opc = AArch64::LD4Rv4s;
  else if (IsVec(2, S64) || IsVec(2, P0))
    Opc = AArch64::LD4Rv2d;
  else if (Ty == S64 || Ty == P0)
    Opc = AArch64::LD4Rv1d;
  else
    llvm_unreachable("Unexpected type for ld4r!");
  selectVectorLoadIntrinsic(Opc, 4, I);
  break;
}
57156008
case Intrinsic::aarch64_neon_st2: {
57166009
Register Src1 = I.getOperand(1).getReg();
57176010
Register Src2 = I.getOperand(2).getReg();

0 commit comments

Comments
 (0)