Skip to content

Commit c0bf592

Browse files
committed
[AArch64] Improve vector reverse lowering
This improves the lowering of v8i16 and v16i8 (and, by the same element-count/element-size check, v8f16) vector reverse shuffles. Instead of going via a generic tbl, it uses a rev64 + ext pair, as already happens for v4i32. Differential Revision: https://reviews.llvm.org/D100882
1 parent 6f4ed8c commit c0bf592

File tree

3 files changed

+24
-69
lines changed

3 files changed

+24
-69
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9007,6 +9007,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
90079007
SDValue V1 = Op.getOperand(0);
90089008
SDValue V2 = Op.getOperand(1);
90099009

9010+
assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
9011+
assert(ShuffleMask.size() == VT.getVectorNumElements() &&
9012+
"Unexpected VECTOR_SHUFFLE mask size!");
9013+
90109014
if (SVN->isSplat()) {
90119015
int Lane = SVN->getSplatIndex();
90129016
// If this is undef splat, generate it via "just" vdup, if possible.
@@ -9053,6 +9057,14 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
90539057
if (isREVMask(ShuffleMask, VT, 16))
90549058
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
90559059

9060+
if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
9061+
(VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
9062+
ShuffleVectorInst::isReverseMask(ShuffleMask)) {
9063+
SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
9064+
return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
9065+
DAG.getConstant(8, dl, MVT::i32));
9066+
}
9067+
90569068
bool ReverseEXT = false;
90579069
unsigned Imm;
90589070
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {

llvm/test/CodeGen/AArch64/named-vector-shuffle-reverse-neon.ll

Lines changed: 6 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -9,57 +9,21 @@ target triple = "aarch64-unknown-linux-gnu"
99
;
1010

1111
define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
12-
; CHECK-LABEL: .LCPI0_0:
13-
; CHECK: .byte 15 // 0xf
14-
; CHECK-NEXT: .byte 14 // 0xe
15-
; CHECK-NEXT: .byte 13 // 0xd
16-
; CHECK-NEXT: .byte 12 // 0xc
17-
; CHECK-NEXT: .byte 11 // 0xb
18-
; CHECK-NEXT: .byte 10 // 0xa
19-
; CHECK-NEXT: .byte 9 // 0x9
20-
; CHECK-NEXT: .byte 8 // 0x8
21-
; CHECK-NEXT: .byte 7 // 0x7
22-
; CHECK-NEXT: .byte 6 // 0x6
23-
; CHECK-NEXT: .byte 5 // 0x5
24-
; CHECK-NEXT: .byte 4 // 0x4
25-
; CHECK-NEXT: .byte 3 // 0x3
26-
; CHECK-NEXT: .byte 2 // 0x2
27-
; CHECK-NEXT: .byte 1 // 0x1
28-
; CHECK-NEXT: .byte 0 // 0x0
2912
; CHECK-LABEL: reverse_v16i8:
3013
; CHECK: // %bb.0:
31-
; CHECK-NEXT: adrp x8, .LCPI0_0
32-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
33-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
14+
; CHECK-NEXT: rev64 v0.16b, v0.16b
15+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
3416
; CHECK-NEXT: ret
3517

3618
%res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
3719
ret <16 x i8> %res
3820
}
3921

4022
define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
41-
; CHECK-LABEL: .LCPI1_0:
42-
; CHECK: .byte 14 // 0xe
43-
; CHECK-NEXT: .byte 15 // 0xf
44-
; CHECK-NEXT: .byte 12 // 0xc
45-
; CHECK-NEXT: .byte 13 // 0xd
46-
; CHECK-NEXT: .byte 10 // 0xa
47-
; CHECK-NEXT: .byte 11 // 0xb
48-
; CHECK-NEXT: .byte 8 // 0x8
49-
; CHECK-NEXT: .byte 9 // 0x9
50-
; CHECK-NEXT: .byte 6 // 0x6
51-
; CHECK-NEXT: .byte 7 // 0x7
52-
; CHECK-NEXT: .byte 4 // 0x4
53-
; CHECK-NEXT: .byte 5 // 0x5
54-
; CHECK-NEXT: .byte 2 // 0x2
55-
; CHECK-NEXT: .byte 3 // 0x3
56-
; CHECK-NEXT: .byte 0 // 0x0
57-
; CHECK-NEXT: .byte 1 // 0x1
5823
; CHECK-LABEL: reverse_v8i16:
5924
; CHECK: // %bb.0:
60-
; CHECK-NEXT: adrp x8, .LCPI1_0
61-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
62-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
25+
; CHECK-NEXT: rev64 v0.8h, v0.8h
26+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
6327
; CHECK-NEXT: ret
6428

6529
%res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
@@ -88,28 +52,10 @@ define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
8852
}
8953

9054
define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
91-
; CHECK-LABEL: .LCPI4_0:
92-
; CHECK: .byte 14 // 0xe
93-
; CHECK-NEXT: .byte 15 // 0xf
94-
; CHECK-NEXT: .byte 12 // 0xc
95-
; CHECK-NEXT: .byte 13 // 0xd
96-
; CHECK-NEXT: .byte 10 // 0xa
97-
; CHECK-NEXT: .byte 11 // 0xb
98-
; CHECK-NEXT: .byte 8 // 0x8
99-
; CHECK-NEXT: .byte 9 // 0x9
100-
; CHECK-NEXT: .byte 6 // 0x6
101-
; CHECK-NEXT: .byte 7 // 0x7
102-
; CHECK-NEXT: .byte 4 // 0x4
103-
; CHECK-NEXT: .byte 5 // 0x5
104-
; CHECK-NEXT: .byte 2 // 0x2
105-
; CHECK-NEXT: .byte 3 // 0x3
106-
; CHECK-NEXT: .byte 0 // 0x0
107-
; CHECK-NEXT: .byte 1 // 0x1
10855
; CHECK-LABEL: reverse_v8f16:
10956
; CHECK: // %bb.0:
110-
; CHECK-NEXT: adrp x8, .LCPI4_0
111-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
112-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
57+
; CHECK-NEXT: rev64 v0.8h, v0.8h
58+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
11359
; CHECK-NEXT: ret
11460

11561
%res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)

llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,8 @@ entry:
3535
define <8 x i16> @v8i16(<8 x i16> %a) {
3636
; CHECK-LABEL: v8i16:
3737
; CHECK: // %bb.0: // %entry
38-
; CHECK-NEXT: adrp x8, .LCPI3_0
39-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
40-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
38+
; CHECK-NEXT: rev64 v0.8h, v0.8h
39+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
4140
; CHECK-NEXT: ret
4241
entry:
4342
%V128 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -69,9 +68,8 @@ entry:
6968
define <16 x i8> @v16i8(<16 x i8> %a) {
7069
; CHECK-LABEL: v16i8:
7170
; CHECK: // %bb.0: // %entry
72-
; CHECK-NEXT: adrp x8, .LCPI6_0
73-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
74-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
71+
; CHECK-NEXT: rev64 v0.16b, v0.16b
72+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
7573
; CHECK-NEXT: ret
7674
entry:
7775
%V128 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -136,9 +134,8 @@ entry:
136134
define <8 x half> @v8f16(<8 x half> %a) {
137135
; CHECK-LABEL: v8f16:
138136
; CHECK: // %bb.0: // %entry
139-
; CHECK-NEXT: adrp x8, .LCPI12_0
140-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
141-
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
137+
; CHECK-NEXT: rev64 v0.8h, v0.8h
138+
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
142139
; CHECK-NEXT: ret
143140
entry:
144141
%V128 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

0 commit comments

Comments
 (0)