Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 212054e

Browse files
author
Ivan A. Kosarev
committed
[NEON] Support vldNq intrinsics in AArch32 (LLVM part)
This patch adds support for the q versions of the dup (load-to-all-lanes) NEON intrinsics, such as vld2q_dup_f16() for example. Currently, non-q versions of the dup intrinsics are implemented in clang by generating IR that first loads the elements of the structure into the first lane with the lane (to-single-lane) intrinsics, and then propagating it to the other lanes. There are at least two problems with this approach. First, there are no double-spaced to-single-lane byte-element instructions. For example, there is no such instruction as 'vld2.8 { d0[0], d2[0] }, [r0]'. That means we cannot rely on the to-single-lane intrinsics and instructions to implement the q versions of the dup intrinsics. Note that to-all-lanes instructions do support all sizes of data items, including bytes. The second problem with the current approach is that we need a separate vdup instruction to propagate the structure to each lane. So for vld4q_dup_f16() we would need four vdup instructions in addition to the initial vld instruction. This patch introduces dup LLVM intrinsics and reworks handling of the currently supported (non-q) NEON dup intrinsics to expand them into those LLVM intrinsics, thus eliminating the need for using to-single-lane intrinsics and instructions. Additionally, this patch adds support for u64 and s64 dup NEON intrinsics. These are marked as AArch64-only in the ARM NEON Reference, but there seems to be no reason not to support them in AArch32 mode. Please correct me if that is wrong. This is what we generate with this patch applied: vld2q_dup_f16: vld2.16 {d0[], d2[]}, [r0] vld2.16 {d1[], d3[]}, [r0] vld3q_dup_f16: vld3.16 {d0[], d2[], d4[]}, [r0] vld3.16 {d1[], d3[], d5[]}, [r0] vld4q_dup_f16: vld4.16 {d0[], d2[], d4[], d6[]}, [r0] vld4.16 {d1[], d3[], d5[], d7[]}, [r0] Differential Revision: https://reviews.llvm.org/D48439 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@335733 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a0883e0 commit 212054e

File tree

7 files changed

+510
-70
lines changed

7 files changed

+510
-70
lines changed

include/llvm/IR/IntrinsicsARM.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,20 @@ def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
652652
LLVMMatchType<0>, llvm_i32_ty,
653653
llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
654654

655+
// Vector load N-element structure to all lanes.
656+
// Source operands are the address and alignment.
657+
def int_arm_neon_vld2dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
658+
[llvm_anyptr_ty, llvm_i32_ty],
659+
[IntrReadMem, IntrArgMemOnly]>;
660+
def int_arm_neon_vld3dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
661+
LLVMMatchType<0>],
662+
[llvm_anyptr_ty, llvm_i32_ty],
663+
[IntrReadMem, IntrArgMemOnly]>;
664+
def int_arm_neon_vld4dup : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
665+
LLVMMatchType<0>, LLVMMatchType<0>],
666+
[llvm_anyptr_ty, llvm_i32_ty],
667+
[IntrReadMem, IntrArgMemOnly]>;
668+
655669
// Interleaving vector stores from N-element structures.
656670
// Source operands are: the address, the N vectors, and the alignment.
657671
def int_arm_neon_vst1 : Intrinsic<[],

lib/Target/ARM/ARMBaseInstrInfo.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4310,12 +4310,30 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
43104310
case ARM::VLD2DUPd8wb_register:
43114311
case ARM::VLD2DUPd16wb_register:
43124312
case ARM::VLD2DUPd32wb_register:
4313+
case ARM::VLD2DUPq8EvenPseudo:
4314+
case ARM::VLD2DUPq8OddPseudo:
4315+
case ARM::VLD2DUPq16EvenPseudo:
4316+
case ARM::VLD2DUPq16OddPseudo:
4317+
case ARM::VLD2DUPq32EvenPseudo:
4318+
case ARM::VLD2DUPq32OddPseudo:
4319+
case ARM::VLD3DUPq8EvenPseudo:
4320+
case ARM::VLD3DUPq8OddPseudo:
4321+
case ARM::VLD3DUPq16EvenPseudo:
4322+
case ARM::VLD3DUPq16OddPseudo:
4323+
case ARM::VLD3DUPq32EvenPseudo:
4324+
case ARM::VLD3DUPq32OddPseudo:
43134325
case ARM::VLD4DUPd8Pseudo:
43144326
case ARM::VLD4DUPd16Pseudo:
43154327
case ARM::VLD4DUPd32Pseudo:
43164328
case ARM::VLD4DUPd8Pseudo_UPD:
43174329
case ARM::VLD4DUPd16Pseudo_UPD:
43184330
case ARM::VLD4DUPd32Pseudo_UPD:
4331+
case ARM::VLD4DUPq8EvenPseudo:
4332+
case ARM::VLD4DUPq8OddPseudo:
4333+
case ARM::VLD4DUPq16EvenPseudo:
4334+
case ARM::VLD4DUPq16OddPseudo:
4335+
case ARM::VLD4DUPq32EvenPseudo:
4336+
case ARM::VLD4DUPq32OddPseudo:
43194337
case ARM::VLD1LNq8Pseudo:
43204338
case ARM::VLD1LNq16Pseudo:
43214339
case ARM::VLD1LNq32Pseudo:

lib/Target/ARM/ARMExpandPseudoInsts.cpp

Lines changed: 70 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,13 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
186186
{ ARM::VLD1q8LowQPseudo_UPD, ARM::VLD1d8Qwb_fixed, true, true, true, SingleLowSpc, 4, 8 ,false},
187187
{ ARM::VLD1q8LowTPseudo_UPD, ARM::VLD1d8Twb_fixed, true, true, true, SingleLowSpc, 3, 8 ,false},
188188

189+
{ ARM::VLD2DUPq16EvenPseudo, ARM::VLD2DUPd16x2, true, false, false, EvenDblSpc, 2, 4 ,false},
190+
{ ARM::VLD2DUPq16OddPseudo, ARM::VLD2DUPd16x2, true, false, false, OddDblSpc, 2, 4 ,false},
191+
{ ARM::VLD2DUPq32EvenPseudo, ARM::VLD2DUPd32x2, true, false, false, EvenDblSpc, 2, 2 ,false},
192+
{ ARM::VLD2DUPq32OddPseudo, ARM::VLD2DUPd32x2, true, false, false, OddDblSpc, 2, 2 ,false},
193+
{ ARM::VLD2DUPq8EvenPseudo, ARM::VLD2DUPd8x2, true, false, false, EvenDblSpc, 2, 8 ,false},
194+
{ ARM::VLD2DUPq8OddPseudo, ARM::VLD2DUPd8x2, true, false, false, OddDblSpc, 2, 8 ,false},
195+
189196
{ ARM::VLD2LNd16Pseudo, ARM::VLD2LNd16, true, false, false, SingleSpc, 2, 4 ,true},
190197
{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc, 2, 4 ,true},
191198
{ ARM::VLD2LNd32Pseudo, ARM::VLD2LNd32, true, false, false, SingleSpc, 2, 2 ,true},
@@ -213,6 +220,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
213220
{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true, SingleSpc, 3, 2,true},
214221
{ ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd8, true, false, false, SingleSpc, 3, 8,true},
215222
{ ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd8_UPD, true, true, true, SingleSpc, 3, 8,true},
223+
{ ARM::VLD3DUPq16EvenPseudo, ARM::VLD3DUPq16, true, false, false, EvenDblSpc, 3, 4 ,true},
224+
{ ARM::VLD3DUPq16OddPseudo, ARM::VLD3DUPq16, true, false, false, OddDblSpc, 3, 4 ,true},
225+
{ ARM::VLD3DUPq32EvenPseudo, ARM::VLD3DUPq32, true, false, false, EvenDblSpc, 3, 2 ,true},
226+
{ ARM::VLD3DUPq32OddPseudo, ARM::VLD3DUPq32, true, false, false, OddDblSpc, 3, 2 ,true},
227+
{ ARM::VLD3DUPq8EvenPseudo, ARM::VLD3DUPq8, true, false, false, EvenDblSpc, 3, 8 ,true},
228+
{ ARM::VLD3DUPq8OddPseudo, ARM::VLD3DUPq8, true, false, false, OddDblSpc, 3, 8 ,true},
216229

217230
{ ARM::VLD3LNd16Pseudo, ARM::VLD3LNd16, true, false, false, SingleSpc, 3, 4 ,true},
218231
{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true, SingleSpc, 3, 4 ,true},
@@ -248,6 +261,12 @@ static const NEONLdStTableEntry NEONLdStTable[] = {
248261
{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true, SingleSpc, 4, 2,true},
249262
{ ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd8, true, false, false, SingleSpc, 4, 8,true},
250263
{ ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd8_UPD, true, true, true, SingleSpc, 4, 8,true},
264+
{ ARM::VLD4DUPq16EvenPseudo, ARM::VLD4DUPq16, true, false, false, EvenDblSpc, 4, 4 ,true},
265+
{ ARM::VLD4DUPq16OddPseudo, ARM::VLD4DUPq16, true, false, false, OddDblSpc, 4, 4 ,true},
266+
{ ARM::VLD4DUPq32EvenPseudo, ARM::VLD4DUPq32, true, false, false, EvenDblSpc, 4, 2 ,true},
267+
{ ARM::VLD4DUPq32OddPseudo, ARM::VLD4DUPq32, true, false, false, OddDblSpc, 4, 2 ,true},
268+
{ ARM::VLD4DUPq8EvenPseudo, ARM::VLD4DUPq8, true, false, false, EvenDblSpc, 4, 8 ,true},
269+
{ ARM::VLD4DUPq8OddPseudo, ARM::VLD4DUPq8, true, false, false, OddDblSpc, 4, 8 ,true},
251270

252271
{ ARM::VLD4LNd16Pseudo, ARM::VLD4LNd16, true, false, false, SingleSpc, 4, 4 ,true},
253272
{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true, SingleSpc, 4, 4 ,true},
@@ -463,15 +482,31 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
463482

464483
bool DstIsDead = MI.getOperand(OpIdx).isDead();
465484
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
466-
unsigned D0, D1, D2, D3;
467-
GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
468-
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
469-
if (NumRegs > 1 && TableEntry->copyAllListRegs)
470-
MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
471-
if (NumRegs > 2 && TableEntry->copyAllListRegs)
472-
MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
473-
if (NumRegs > 3 && TableEntry->copyAllListRegs)
474-
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
485+
if(TableEntry->RealOpc == ARM::VLD2DUPd8x2 ||
486+
TableEntry->RealOpc == ARM::VLD2DUPd16x2 ||
487+
TableEntry->RealOpc == ARM::VLD2DUPd32x2) {
488+
unsigned SubRegIndex;
489+
if (RegSpc == EvenDblSpc) {
490+
SubRegIndex = ARM::dsub_0;
491+
} else {
492+
assert(RegSpc == OddDblSpc && "Unexpected spacing!");
493+
SubRegIndex = ARM::dsub_1;
494+
}
495+
unsigned SubReg = TRI->getSubReg(DstReg, SubRegIndex);
496+
unsigned DstRegPair = TRI->getMatchingSuperReg(SubReg, ARM::dsub_0,
497+
&ARM::DPairSpcRegClass);
498+
MIB.addReg(DstRegPair, RegState::Define | getDeadRegState(DstIsDead));
499+
} else {
500+
unsigned D0, D1, D2, D3;
501+
GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
502+
MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
503+
if (NumRegs > 1 && TableEntry->copyAllListRegs)
504+
MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
505+
if (NumRegs > 2 && TableEntry->copyAllListRegs)
506+
MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
507+
if (NumRegs > 3 && TableEntry->copyAllListRegs)
508+
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
509+
}
475510

476511
if (TableEntry->isUpdating)
477512
MIB.add(MI.getOperand(OpIdx++));
@@ -510,10 +545,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
510545
// has an extra operand that is a use of the super-register. Record the
511546
// operand index and skip over it.
512547
unsigned SrcOpIdx = 0;
513-
if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
514-
RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
515-
RegSpc == SingleHighTSpc)
516-
SrcOpIdx = OpIdx++;
548+
if(TableEntry->RealOpc != ARM::VLD2DUPd8x2 &&
549+
TableEntry->RealOpc != ARM::VLD2DUPd16x2 &&
550+
TableEntry->RealOpc != ARM::VLD2DUPd32x2) {
551+
if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc ||
552+
RegSpc == SingleLowSpc || RegSpc == SingleHighQSpc ||
553+
RegSpc == SingleHighTSpc)
554+
SrcOpIdx = OpIdx++;
555+
}
517556

518557
// Copy the predicate operands.
519558
MIB.add(MI.getOperand(OpIdx++));
@@ -1674,6 +1713,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
16741713
case ARM::VLD4DUPd8Pseudo_UPD:
16751714
case ARM::VLD4DUPd16Pseudo_UPD:
16761715
case ARM::VLD4DUPd32Pseudo_UPD:
1716+
case ARM::VLD2DUPq8EvenPseudo:
1717+
case ARM::VLD2DUPq8OddPseudo:
1718+
case ARM::VLD2DUPq16EvenPseudo:
1719+
case ARM::VLD2DUPq16OddPseudo:
1720+
case ARM::VLD2DUPq32EvenPseudo:
1721+
case ARM::VLD2DUPq32OddPseudo:
1722+
case ARM::VLD3DUPq8EvenPseudo:
1723+
case ARM::VLD3DUPq8OddPseudo:
1724+
case ARM::VLD3DUPq16EvenPseudo:
1725+
case ARM::VLD3DUPq16OddPseudo:
1726+
case ARM::VLD3DUPq32EvenPseudo:
1727+
case ARM::VLD3DUPq32OddPseudo:
1728+
case ARM::VLD4DUPq8EvenPseudo:
1729+
case ARM::VLD4DUPq8OddPseudo:
1730+
case ARM::VLD4DUPq16EvenPseudo:
1731+
case ARM::VLD4DUPq16OddPseudo:
1732+
case ARM::VLD4DUPq32EvenPseudo:
1733+
case ARM::VLD4DUPq32OddPseudo:
16771734
ExpandVLD(MBBI);
16781735
return true;
16791736

0 commit comments

Comments
 (0)