This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 9cca551

Hao Liu authored and committed
[AArch64] Add support for copying D tuples such as DPair/DTriple/DQuad and Q tuples such as QPair/QTriple/QQuad. There is no test case for the D tuples, as the original test cases are too large; since a D-tuple copy is handled the same way as a Q-tuple copy, its correctness can be inferred from the Q-tuple tests.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@198682 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent dff3861 commit 9cca551
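
For orientation: ORRvvv_8B and ORRvvv_16B are NEON bitwise-OR opcodes, and orr vD, vS, vS is the usual vector register-move idiom, so a tuple copy becomes one orr per sub-register. As a sketch, under a hypothetical register assignment (say a QPair copied from q0_q1 to q2_q3), the expansion would be:

    orr v2.16b, v0.16b, v0.16b    // q2 <- q0
    orr v3.16b, v1.16b, v1.16b    // q3 <- q1

These are exactly the instruction shapes the new test's CHECK lines match.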

3 files changed (+101, -1)

lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 51 additions & 1 deletion

@@ -134,7 +134,8 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
   } else {
-    llvm_unreachable("Unknown register class in copyPhysReg");
+    CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
+    return;
   }
 
   // E.g. ORR xDst, xzr, xSrc, lsl #0
@@ -144,6 +145,55 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     .addImm(0);
 }
 
+void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        DebugLoc DL, unsigned DestReg,
+                                        unsigned SrcReg) const {
+  unsigned SubRegs;
+  bool IsQRegs;
+  if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = false;
+  } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = false;
+  } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = false;
+  } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 2;
+    IsQRegs = true;
+  } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 3;
+    IsQRegs = true;
+  } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
+    SubRegs = 4;
+    IsQRegs = true;
+  } else
+    llvm_unreachable("Unknown register class");
+
+  unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
+  int Spacing = 1;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  // Copy register tuples backward when the first Dest reg overlaps
+  // with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + (SubRegs - 1);
+    Spacing = -1;
+  }
+
+  unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
+    assert(Dst && Src && "Bad sub-register");
+    BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+      .addReg(Src)
+      .addReg(Src);
+  }
+  return;
+}
+
 /// Does the Opcode represent a conditional branch that we can remove and re-add
 /// at the end of a basic block?
 static bool isCondBranch(unsigned Opc) {
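
The regsOverlap check above is the subtle part: if the first destination sub-register is also part of the source tuple, a forward walk would overwrite source data before reading it, so the copy is emitted in reverse. Below is a minimal standalone C++ sketch of that ordering decision; the plain integers stand in for LLVM's qsub_0..qsub_3 sub-register indices, and the hard-coded overlap flag replaces TRI->regsOverlap, so none of this is LLVM API, just an illustration:

    #include <cstdio>

    int main() {
      const int SubRegs = 3;            // e.g. a QTriple copy
      // Suppose Dest = {q1,q2,q3} and Src = {q0,q1,q2}: the first dest
      // sub-register (q1) is part of Src, so assume the overlap test fires.
      const bool FirstDestOverlapsSrc = true;

      int BeginIdx = 0, Spacing = 1;    // default: copy forward
      if (FirstDestOverlapsSrc) {
        BeginIdx = SubRegs - 1;         // start at the last sub-register
        Spacing = -1;                   // and walk backward
      }

      for (int i = 0; i != SubRegs; ++i) {
        int Idx = BeginIdx + i * Spacing;
        // Each step corresponds to one emitted ORR: dest sub-reg <- src sub-reg.
        std::printf("orr q%d <- q%d\n", 1 + Idx, 0 + Idx);
      }
      return 0;
    }

Run, this prints q3 <- q2, then q2 <- q1, then q1 <- q0: q2 is read into q3 before the second copy overwrites it, whereas a forward walk would clobber q1 on its very first step.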

lib/Target/AArch64/AArch64InstrInfo.h

Lines changed: 3 additions & 0 deletions

@@ -42,6 +42,9 @@ class AArch64InstrInfo : public AArch64GenInstrInfo {
                     MachineBasicBlock::iterator I, DebugLoc DL,
                     unsigned DestReg, unsigned SrcReg,
                     bool KillSrc) const;
+  void CopyPhysRegTuple(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I, DebugLoc DL,
+                        unsigned DestReg, unsigned SrcReg) const;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 1, i32 4)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
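
A note on how this test provokes the copy path (reading from the IR, not stated in the commit itself): each function feeds the tuple result of one lane-wise load into a second vldNlane call, chaining the tuple values so that the register allocator appears forced to materialize a Q-tuple copy between the two loads; FileCheck then expects one orr per 128-bit sub-register (two for QPair, three for QTriple, four for QQuad) before the final ld2/ld3/ld4.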
