
Commit 167747a

davemgreen authored and fhahn committed
[AArch64] Sink splat shuffles to lane index intrinsics
This teaches AArch64TargetLowering::shouldSinkOperands to sink splat shuffles to certain neon intrinsics, so that they can make use of the lane variants of the instructions that are available.

Differential Revision: https://reviews.llvm.org/D112994

(cherry-picked from 760d4d0)
1 parent 3016fd1 · commit 167747a
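In effect, when one of these intrinsics has a splat-shuffle operand (a shufflevector whose mask repeats a single lane), the operand is now reported to CodeGenPrepare for sinking into the intrinsic's block, where instruction selection can fold it into the by-element (lane-indexed) form of the instruction instead of keeping a separate dup. A minimal sketch of the kind of IR that benefits (function and value names are illustrative, not taken from the commit):

declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)

; The splat of lane 3 is loop-invariant, so it is defined outside the loop.
; Without sinking it is selected as a standalone dup before the loop; with
; %splat sunk into %loop, ISel can select "sqdmulh v2.4s, v2.4s, v1.s[3]".
define <4 x i32> @sqdmulh_by_lane(<4 x i32> %x, <4 x i32>* %p, i32 %n) {
entry:
  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %v = load <4 x i32>, <4 x i32>* %p
  %mul = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %v, <4 x i32> %splat)
  %acc.next = add <4 x i32> %acc, %mul
  %iv.next = add i32 %iv, 1
  %done = icmp eq i32 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret <4 x i32> %acc.next
}

The sinksplat.ll changes below show exactly this effect on the generated assembly.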

3 files changed: 63 additions, 19 deletions


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 23 additions & 5 deletions
@@ -12107,6 +12107,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
 }
 
+static bool isSplatShuffle(Value *V) {
+  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
+    return is_splat(Shuf->getShuffleMask());
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -12117,12 +12123,24 @@ bool AArch64TargetLowering::shouldSinkOperands(
 
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_smull:
     case Intrinsic::aarch64_neon_umull:
-      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
-        return false;
-      Ops.push_back(&II->getOperandUse(0));
-      Ops.push_back(&II->getOperandUse(1));
-      return true;
+      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
+        Ops.push_back(&II->getOperandUse(0));
+        Ops.push_back(&II->getOperandUse(1));
+        return true;
+      }
+      LLVM_FALLTHROUGH;
+
+    case Intrinsic::aarch64_neon_sqdmull:
+    case Intrinsic::aarch64_neon_sqdmulh:
+    case Intrinsic::aarch64_neon_sqrdmulh:
+      // Sink splats for index lane variants
+      if (isSplatShuffle(II->getOperand(0)))
+        Ops.push_back(&II->getOperandUse(0));
+      if (isSplatShuffle(II->getOperand(1)))
+        Ops.push_back(&II->getOperandUse(1));
+      return !Ops.empty();
 
     case Intrinsic::aarch64_neon_pmull64:
       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
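Note the LLVM_FALLTHROUGH: for smull/umull the pre-existing extract-shuffle (high-half) pattern is tried first, and only when it does not match do the operands fall through to the splat checks shared with sqdmull/sqdmulh/sqrdmulh. Each operand is tested independently and the hook returns !Ops.empty(), so a single splatted operand is enough to trigger sinking. A hedged IR sketch of that single-operand case (hypothetical names; only operand 1 is a splat):

declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

; %b.splat is defined in a different block from the call, so CodeGenPrepare
; consults shouldSinkOperands. Only the second operand is a splat shuffle;
; sinking it lets ISel select the lane form "umull v0.4s, v0.4h, v1.h[0]".
define <4 x i32> @umull_one_splat(<4 x i16> %a, <4 x i16> %b) {
entry:
  %b.splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  br label %use

use:
  %r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b.splat)
  ret <4 x i32> %r
}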

llvm/test/CodeGen/AArch64/sinksplat.ll

Lines changed: 6 additions & 12 deletions
@@ -7,12 +7,11 @@ define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB0_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB0_1
 ; CHECK-NEXT:    // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -40,12 +39,11 @@ define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB1_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB1_1
 ; CHECK-NEXT:    // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -73,12 +71,11 @@ define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB2_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB2_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -107,12 +104,11 @@ define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB3_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB3_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -141,12 +137,11 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB4_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB4_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -175,12 +170,11 @@ define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB5_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.4h
+; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB5_1
 ; CHECK-NEXT:    // %bb.2: // %l2

llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll

Lines changed: 34 additions & 2 deletions
@@ -150,6 +150,38 @@ if.else:
   ret <8 x i16> %vmull1
 }
 
+; The masks used are suitable for umull, sink shufflevector to users.
+define <8 x i16> @sink_shufflevector_smull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_smull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
 ; Both exts and their shufflevector operands can be sunk.
 define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: @sink_shufflevector_ext_subadd(
@@ -271,8 +303,8 @@ if.else:
 }
 
 
-; Function Attrs: nounwind readnone
-declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
 
 ; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur.
 define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {
