Skip to content

Commit 87f4240

Browse files
authored
[AArch64] Generate BSP instead of TBL for select shuffles. (#121474)
When using BIF/BIT/BSL, the constant mask is more likely to be regular and can therefore often be materialized with a single movi instead of a constant-pool load. On some CPUs, BIF/BIT/BSL is also slightly quicker than TBL.
1 parent e44f03d commit 87f4240

File tree

3 files changed

+32
-33
lines changed

3 files changed

+32
-33
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14004,6 +14004,23 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
1400414004
dl);
1400514005
}
1400614006

14007+
// Check for a "select shuffle", generating a BSL to pick between lanes in
14008+
// V1/V2.
14009+
if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14010+
assert(VT.getScalarSizeInBits() <= 32 &&
14011+
"Expected larger vector element sizes to be handled already");
14012+
SmallVector<SDValue> MaskElts;
14013+
for (int M : ShuffleMask)
14014+
MaskElts.push_back(DAG.getConstant(
14015+
M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, dl, MVT::i32));
14016+
EVT IVT = VT.changeVectorElementTypeToInteger();
14017+
SDValue MaskConst = DAG.getBuildVector(IVT, dl, MaskElts);
14018+
return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, dl, IVT, MaskConst,
14019+
DAG.getBitcast(IVT, V1),
14020+
DAG.getBitcast(IVT, V2)));
14021+
}
14022+
14023+
// Fall back to generating a TBL
1400714024
return GenerateTBL(Op, ShuffleMask, DAG);
1400814025
}
1400914026

llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2537,14 +2537,13 @@ entry:
25372537
define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) {
25382538
; CHECK-LABEL: cmplx_mul_combined_re_im:
25392539
; CHECK: // %bb.0: // %entry
2540-
; CHECK-NEXT: lsr x9, x0, #16
2541-
; CHECK-NEXT: adrp x8, .LCPI196_0
2540+
; CHECK-NEXT: lsr x8, x0, #16
2541+
; CHECK-NEXT: movi v1.2d, #0xffff0000ffff0000
25422542
; CHECK-NEXT: fmov d5, x0
2543-
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI196_0]
25442543
; CHECK-NEXT: rev32 v4.8h, v0.8h
2545-
; CHECK-NEXT: dup v1.8h, w9
2546-
; CHECK-NEXT: sqneg v2.8h, v1.8h
2547-
; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
2544+
; CHECK-NEXT: dup v2.8h, w8
2545+
; CHECK-NEXT: sqneg v3.8h, v2.8h
2546+
; CHECK-NEXT: bsl v1.16b, v2.16b, v3.16b
25482547
; CHECK-NEXT: sqdmull v2.4s, v0.4h, v5.h[0]
25492548
; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v5.h[0]
25502549
; CHECK-NEXT: sqdmlal v2.4s, v4.4h, v1.4h

llvm/test/CodeGen/AArch64/shuffle-select.ll

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,8 @@
44
define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
55
; CHECK-LABEL: sel_v8i8:
66
; CHECK: // %bb.0:
7-
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
8-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
9-
; CHECK-NEXT: adrp x8, .LCPI0_0
10-
; CHECK-NEXT: mov v0.d[1], v1.d[0]
11-
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
12-
; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
7+
; CHECK-NEXT: movi d2, #0xff00ff00ff00ff
8+
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
139
; CHECK-NEXT: ret
1410
%tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
1511
ret <8 x i8> %tmp0
@@ -18,11 +14,8 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
1814
define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
1915
; CHECK-LABEL: sel_v16i8:
2016
; CHECK: // %bb.0:
21-
; CHECK-NEXT: adrp x8, .LCPI1_0
22-
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
23-
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
24-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
25-
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
17+
; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff
18+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
2619
; CHECK-NEXT: ret
2720
%tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
2821
ret <16 x i8> %tmp0
@@ -32,10 +25,8 @@ define <16 x i8> @sel_v16i8_poison(<16 x i8> %v0, <16 x i8> %v1) {
3225
; CHECK-LABEL: sel_v16i8_poison:
3326
; CHECK: // %bb.0:
3427
; CHECK-NEXT: adrp x8, .LCPI2_0
35-
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
3628
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0]
37-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
38-
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
29+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
3930
; CHECK-NEXT: ret
4031
%tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
4132
ret <16 x i8> %tmp0
@@ -45,10 +36,8 @@ define <16 x i8> @sel_v16i8_unregular(<16 x i8> %v0, <16 x i8> %v1) {
4536
; CHECK-LABEL: sel_v16i8_unregular:
4637
; CHECK: // %bb.0:
4738
; CHECK-NEXT: adrp x8, .LCPI3_0
48-
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
4939
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
50-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
51-
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
40+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
5241
; CHECK-NEXT: ret
5342
%tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 31>
5443
ret <16 x i8> %tmp0
@@ -67,11 +56,8 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
6756
define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
6857
; CHECK-LABEL: sel_v8i16:
6958
; CHECK: // %bb.0:
70-
; CHECK-NEXT: adrp x8, .LCPI5_0
71-
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
72-
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0]
73-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
74-
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
59+
; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
60+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
7561
; CHECK-NEXT: ret
7662
%tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
7763
ret <8 x i16> %tmp0
@@ -121,11 +107,8 @@ define <4 x half> @sel_v4f16(<4 x half> %v0, <4 x half> %v1) {
121107
define <8 x half> @sel_v8f16(<8 x half> %v0, <8 x half> %v1) {
122108
; CHECK-LABEL: sel_v8f16:
123109
; CHECK: // %bb.0:
124-
; CHECK-NEXT: adrp x8, .LCPI10_0
125-
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
126-
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_0]
127-
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
128-
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
110+
; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
111+
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
129112
; CHECK-NEXT: ret
130113
%tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
131114
ret <8 x half> %tmp0

0 commit comments

Comments
 (0)