Skip to content

Commit bb84dd8

Browse files
committed
[AArch64] Add a tablegen pattern for RADDHN/RADDHN2.
Converts RSHRN/RSHRN2 to RADDHN/RADDHN2 (with a zero vector as the second addend) when the shift amount is half the width of the source vector element. RADDHN/RADDHN2 have twice the throughput and half the latency of RSHRN/RSHRN2 on Arm out-of-order cores, and setting up the zero register adds no latency. Differential Revision: https://reviews.llvm.org/D116166
1 parent 3b4c040 commit bb84dd8

File tree

2 files changed

+76
-0
lines changed

2 files changed

+76
-0
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6603,6 +6603,34 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
66036603
// USRA: the selection fragment is an add of $LHS and the vector logical shift
// right (AArch64vlshr) of $MHS by $RHS.
defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
66046604
TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
66056605

6606+
// RADDHN patterns for when RSHRN shifts by half the size of the vector element.
// Each pattern matches the rshrn intrinsic only when its immediate is exactly
// half the source element width (8 for i16, 16 for i32, 32 for i64) and
// selects RADDHN with $Vn as the first operand and an all-zero vector,
// materialized by MOVIv2d_ns, as the second operand.
// NOTE(review): relies on RADDHN(x, 0) rounding identically to
// RSHRN(x, esize/2) -- confirm against the Arm ARM instruction pseudocode.
6607+
def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
6608+
(RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
6609+
def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))),
6610+
(RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
6611+
def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
6612+
(RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
6613+
6614+
// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element.
// Each pattern matches a concat_vectors whose first operand is an existing
// 64-bit value $Vd and whose second operand is the rshrn intrinsic with an
// immediate of exactly half the source element width. $Vd is placed in the
// dsub subregister of an otherwise undefined (IMPLICIT_DEF) register and
// passed as the first operand of the RADDHN "2" form, followed by $Vn and an
// all-zero vector built by MOVIv2d_ns.
// NOTE(review): the first operand is presumably the tied destination whose low
// half the instruction preserves -- confirm against the RADDHN*_v16i8 /
// _v8i16 / _v4i32 instruction definitions.
6615+
def : Pat<(v16i8 (concat_vectors
6616+
(v8i8 V64:$Vd),
6617+
(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))),
6618+
(RADDHNv8i16_v16i8
6619+
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
6620+
(v8i16 (MOVIv2d_ns (i32 0))))>;
6621+
def : Pat<(v8i16 (concat_vectors
6622+
(v4i16 V64:$Vd),
6623+
(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))),
6624+
(RADDHNv4i32_v8i16
6625+
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
6626+
(v4i32 (MOVIv2d_ns (i32 0))))>;
6627+
def : Pat<(v4i32 (concat_vectors
6628+
(v2i32 V64:$Vd),
6629+
(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))),
6630+
(RADDHNv2i64_v4i32
6631+
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
6632+
(v2i64 (MOVIv2d_ns (i32 0))))>;
6633+
66066634
// SHRN patterns for when a logical right shift was used instead of arithmetic
66076635
// (the immediate guarantees no sign bits actually end up in the result so it
66086636
// doesn't matter).
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s
3+
4+
; Two rshrn-by-8 results on <8 x i16> inputs are concatenated into <16 x i8>.
; Expected codegen: one zero vector (movi), raddhn for the low half from %x,
; and raddhn2 for the high half from %y, both using the same zero register.
define <16 x i8> @test_combine_v8i16_to_v16i8(<8 x i16> %x, <8 x i16> %y) {
5+
; CHECK-LABEL: test_combine_v8i16_to_v16i8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: movi v2.2d, #0000000000000000
8+
; CHECK-NEXT: raddhn v0.8b, v0.8h, v2.8h
9+
; CHECK-NEXT: raddhn2 v0.16b, v1.8h, v2.8h
10+
; CHECK-NEXT: ret
11+
entry:
12+
%res = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %x, i32 8)
13+
%res2 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %y, i32 8)
14+
%shuffle = shufflevector <8 x i8> %res, <8 x i8> %res2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
15+
ret <16 x i8> %shuffle
16+
}
17+
18+
; Two rshrn-by-16 results on <4 x i32> inputs are concatenated into <8 x i16>.
; Expected codegen: one zero vector (movi), raddhn for the low half from %x,
; and raddhn2 for the high half from %y, both using the same zero register.
define <8 x i16> @test_combine_v4i32_to_v8i16(<4 x i32> %x, <4 x i32> %y) {
19+
; CHECK-LABEL: test_combine_v4i32_to_v8i16:
20+
; CHECK: // %bb.0: // %entry
21+
; CHECK-NEXT: movi v2.2d, #0000000000000000
22+
; CHECK-NEXT: raddhn v0.4h, v0.4s, v2.4s
23+
; CHECK-NEXT: raddhn2 v0.8h, v1.4s, v2.4s
24+
; CHECK-NEXT: ret
25+
entry:
26+
%res = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %x, i32 16)
27+
%res2 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %y, i32 16)
28+
%shuffle = shufflevector <4 x i16> %res, <4 x i16> %res2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
29+
ret <8 x i16> %shuffle
30+
}
31+
32+
; Two rshrn-by-32 results on <2 x i64> inputs are concatenated into <4 x i32>.
; Expected codegen: one zero vector (movi), raddhn for the low half from %x,
; and raddhn2 for the high half from %y, both using the same zero register.
define <4 x i32> @test_combine_v2i64_to_v4i32(<2 x i64> %x, <2 x i64> %y) {
33+
; CHECK-LABEL: test_combine_v2i64_to_v4i32:
34+
; CHECK: // %bb.0: // %entry
35+
; CHECK-NEXT: movi v2.2d, #0000000000000000
36+
; CHECK-NEXT: raddhn v0.2s, v0.2d, v2.2d
37+
; CHECK-NEXT: raddhn2 v0.4s, v1.2d, v2.2d
38+
; CHECK-NEXT: ret
39+
entry:
40+
%res = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %x, i32 32)
41+
%res2 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %y, i32 32)
42+
%shuffle = shufflevector <2 x i32> %res, <2 x i32> %res2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
43+
ret <4 x i32> %shuffle
44+
}
45+
46+
; Declarations of the llvm.aarch64.neon.rshrn intrinsic overloads called by the
; tests in this file; each takes a 128-bit source vector and an i32 shift
; amount and returns a vector with the same lane count and half-width lanes.
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
47+
declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
48+
declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)

0 commit comments

Comments
 (0)