Skip to content

Commit bbca78f

Browse files
authored
[PowerPC] vector shift word/double by element size - 1 use all ones (#139794)
A vector shift of words or doublewords by (element size - 1) requires a shift amount vector of 31 or 63, which is too big for a splat immediate and therefore requires a multi-instruction sequence. However, the PPC shift instructions only use the low 5 or 6 bits of each shift amount vector element, so an all-ones vector, which we can generate efficiently, works just as well.
1 parent 3c9812e commit bbca78f

File tree

6 files changed

+101
-73
lines changed

6 files changed

+101
-73
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18456,36 +18456,80 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
1845618456
return SDValue();
1845718457
}
1845818458

18459-
SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
18460-
DAGCombinerInfo &DCI) const {
18459+
SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
18460+
DAGCombinerInfo &DCI) const {
1846118461
EVT VT = N->getValueType(0);
1846218462
assert(VT.isVector() && "Vector type expected.");
1846318463

18464-
SDValue N1 = N->getOperand(1);
18465-
if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
18466-
!isOperationLegal(ISD::ADD, VT))
18464+
unsigned Opc = N->getOpcode();
18465+
assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
18466+
"Unexpected opcode.");
18467+
18468+
if (!isOperationLegal(Opc, VT))
1846718469
return SDValue();
1846818470

18469-
// For 64-bit there is no splat immediate so we want to catch shift by 1 here
18470-
// before the BUILD_VECTOR is replaced by a load.
1847118471
EVT EltTy = VT.getScalarType();
18472-
if (EltTy != MVT::i64)
18472+
unsigned EltBits = EltTy.getSizeInBits();
18473+
if (EltTy != MVT::i64 && EltTy != MVT::i32)
1847318474
return SDValue();
1847418475

18475-
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
18476-
APInt APSplatBits, APSplatUndef;
18477-
unsigned SplatBitSize;
18478-
bool HasAnyUndefs;
18479-
bool BVNIsConstantSplat =
18480-
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
18481-
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
18482-
if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
18476+
SDValue N1 = N->getOperand(1);
18477+
uint64_t SplatBits = 0;
18478+
bool AddSplatCase = false;
18479+
unsigned OpcN1 = N1.getOpcode();
18480+
if (OpcN1 == PPCISD::VADD_SPLAT &&
18481+
N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
18482+
AddSplatCase = true;
18483+
SplatBits = N1.getConstantOperandVal(0);
18484+
}
18485+
18486+
if (!AddSplatCase) {
18487+
if (OpcN1 != ISD::BUILD_VECTOR)
18488+
return SDValue();
18489+
18490+
unsigned SplatBitSize;
18491+
bool HasAnyUndefs;
18492+
APInt APSplatBits, APSplatUndef;
18493+
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
18494+
bool BVNIsConstantSplat =
18495+
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
18496+
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
18497+
if (!BVNIsConstantSplat || SplatBitSize != EltBits)
18498+
return SDValue();
18499+
SplatBits = APSplatBits.getZExtValue();
18500+
}
18501+
18502+
SDLoc DL(N);
18503+
SDValue N0 = N->getOperand(0);
18504+
// PPC vector shifts by word/double look at only the low 5/6 bits of the
18505+
// shift vector, which means the max value is 31/63. A shift vector of all
18506+
// 1s will be truncated to 31/63, which is useful as vspltisw is limited to
18507+
// -16 to 15 range.
18508+
if (SplatBits == (EltBits - 1)) {
18509+
unsigned NewOpc;
18510+
switch (Opc) {
18511+
case ISD::SHL:
18512+
NewOpc = PPCISD::SHL;
18513+
break;
18514+
case ISD::SRL:
18515+
NewOpc = PPCISD::SRL;
18516+
break;
18517+
case ISD::SRA:
18518+
NewOpc = PPCISD::SRA;
18519+
break;
18520+
}
18521+
SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
18522+
return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
18523+
}
18524+
18525+
if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
1848318526
return SDValue();
18484-
uint64_t SplatBits = APSplatBits.getZExtValue();
18485-
if (SplatBits != 1)
18527+
18528+
// For 64-bit there is no splat immediate so we want to catch shift by 1 here
18529+
// before the BUILD_VECTOR is replaced by a load.
18530+
if (EltTy != MVT::i64 || SplatBits != 1)
1848618531
return SDValue();
1848718532

18488-
SDValue N0 = N->getOperand(0);
1848918533
return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
1849018534
}
1849118535

@@ -18494,7 +18538,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
1849418538
return Value;
1849518539

1849618540
if (N->getValueType(0).isVector())
18497-
return combineVectorSHL(N, DCI);
18541+
return combineVectorShift(N, DCI);
1849818542

1849918543
SDValue N0 = N->getOperand(0);
1850018544
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -18526,13 +18570,19 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
1852618570
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
1852718571
return Value;
1852818572

18573+
if (N->getValueType(0).isVector())
18574+
return combineVectorShift(N, DCI);
18575+
1852918576
return SDValue();
1853018577
}
1853118578

1853218579
SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
1853318580
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
1853418581
return Value;
1853518582

18583+
if (N->getValueType(0).isVector())
18584+
return combineVectorShift(N, DCI);
18585+
1853618586
return SDValue();
1853718587
}
1853818588

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1441,7 +1441,7 @@ namespace llvm {
14411441
SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
14421442
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
14431443
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
1444-
SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
1444+
SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
14451445
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
14461446
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
14471447
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;

llvm/test/CodeGen/PowerPC/mul-const-vector.ll

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
252252
ret <4 x i32> %tmp.1
253253
}
254254
; CHECK-LABEL: test7_v4i32:
255-
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
256-
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
257-
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
255+
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
258256
; CHECK-NOT: vmul
259-
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
257+
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
260258

261259
define <4 x i32> @test8_v4i32(<4 x i32> %a) {
262260
%tmp.1 = mul nsw <4 x i32> %a, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> ; <<4 x i32>> [#uses=1]
263261
ret <4 x i32> %tmp.1
264262
}
265263
; CHECK-LABEL: test8_v4i32:
266-
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
267-
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
268-
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
264+
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
269265
; CHECK-NOT: vmul
270-
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
271-
; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
266+
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
267+
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
272268

273269
define <2 x i64> @test1_v2i64(<2 x i64> %a) {
274270
%tmp.1 = mul nsw <2 x i64> %a, <i64 16, i64 16> ; <<2 x i64>> [#uses=1]
@@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
356352
}
357353

358354
; CHECK-LABEL: test7_v2i64:
359-
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
360-
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
355+
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
361356
; CHECK-NOT: vmul
362357
; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
363358

@@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
367362
}
368363

369364
; CHECK-LABEL: test8_v2i64:
370-
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
371-
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
365+
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
372366
; CHECK-NOT: vmul
373367
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
374368
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2

llvm/test/CodeGen/PowerPC/pr47891.ll

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,11 @@
77
define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
88
; CHECK-LABEL: poly2_lshift1:
99
; CHECK: # %bb.0: # %entry
10-
; CHECK-NEXT: addis r6, r2, .LCPI0_0@toc@ha
10+
; CHECK-NEXT: ld r6, 0(r3)
1111
; CHECK-NEXT: li r4, 72
1212
; CHECK-NEXT: ld r5, 64(r3)
13-
; CHECK-NEXT: addi r6, r6, .LCPI0_0@toc@l
13+
; CHECK-NEXT: xxleqv v4, v4, v4
1414
; CHECK-NEXT: lxvd2x vs0, r3, r4
15-
; CHECK-NEXT: lxvd2x v4, 0, r6
16-
; CHECK-NEXT: ld r6, 0(r3)
1715
; CHECK-NEXT: sldi r7, r6, 1
1816
; CHECK-NEXT: rotldi r6, r6, 1
1917
; CHECK-NEXT: std r7, 0(r3)
@@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
3533
; CHECK-NEXT: std r7, 32(r3)
3634
; CHECK-NEXT: ld r7, 40(r3)
3735
; CHECK-NEXT: rldimi r6, r7, 1, 0
38-
; CHECK-NEXT: xxswapd v2, vs0
39-
; CHECK-NEXT: mtfprd f0, r5
4036
; CHECK-NEXT: rotldi r7, r7, 1
4137
; CHECK-NEXT: std r6, 40(r3)
4238
; CHECK-NEXT: ld r6, 48(r3)
39+
; CHECK-NEXT: xxswapd v2, vs0
40+
; CHECK-NEXT: mtfprd f0, r5
4341
; CHECK-NEXT: rldimi r7, r6, 1, 0
4442
; CHECK-NEXT: rotldi r6, r6, 1
4543
; CHECK-NEXT: std r7, 48(r3)

llvm/test/CodeGen/PowerPC/signbit-shift.ll

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
188188
define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
189189
; CHECK-LABEL: add_lshr_not_vec_splat:
190190
; CHECK: # %bb.0:
191-
; CHECK-NEXT: vspltisw 3, -16
192-
; CHECK-NEXT: vspltisw 4, 15
193191
; CHECK-NEXT: addis 3, 2, .LCPI15_0@toc@ha
194-
; CHECK-NEXT: vsubuwm 3, 4, 3
195-
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
192+
; CHECK-NEXT: xxleqv 35, 35, 35
196193
; CHECK-NEXT: vsraw 2, 2, 3
194+
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
197195
; CHECK-NEXT: lxvd2x 35, 0, 3
198196
; CHECK-NEXT: vadduwm 2, 2, 3
199197
; CHECK-NEXT: blr
@@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
218216
define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
219217
; CHECK-LABEL: sub_lshr_not_vec_splat:
220218
; CHECK: # %bb.0:
221-
; CHECK-NEXT: vspltisw 3, -16
222-
; CHECK-NEXT: vspltisw 4, 15
223219
; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha
224-
; CHECK-NEXT: vsubuwm 3, 4, 3
225-
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
220+
; CHECK-NEXT: xxleqv 35, 35, 35
226221
; CHECK-NEXT: vsrw 2, 2, 3
222+
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
227223
; CHECK-NEXT: lxvd2x 35, 0, 3
228224
; CHECK-NEXT: vadduwm 2, 2, 3
229225
; CHECK-NEXT: blr
@@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
247243
define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
248244
; CHECK-LABEL: sub_lshr_vec:
249245
; CHECK: # %bb.0:
250-
; CHECK-NEXT: vspltisw 4, -16
251-
; CHECK-NEXT: vspltisw 5, 15
252-
; CHECK-NEXT: vsubuwm 4, 5, 4
246+
; CHECK-NEXT: xxleqv 36, 36, 36
253247
; CHECK-NEXT: vsraw 2, 2, 4
254248
; CHECK-NEXT: vadduwm 2, 3, 2
255249
; CHECK-NEXT: blr
@@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
272266
define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
273267
; CHECK-LABEL: sub_const_op_lshr_vec:
274268
; CHECK: # %bb.0:
275-
; CHECK-NEXT: vspltisw 3, -16
276-
; CHECK-NEXT: vspltisw 4, 15
277269
; CHECK-NEXT: addis 3, 2, .LCPI21_0@toc@ha
278-
; CHECK-NEXT: vsubuwm 3, 4, 3
279-
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
270+
; CHECK-NEXT: xxleqv 35, 35, 35
280271
; CHECK-NEXT: vsraw 2, 2, 3
272+
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
281273
; CHECK-NEXT: lxvd2x 35, 0, 3
282274
; CHECK-NEXT: vadduwm 2, 2, 3
283275
; CHECK-NEXT: blr

llvm/test/CodeGen/PowerPC/vselect-constants.ll

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
1111
; CHECK-LABEL: sel_C1_or_C2_vec:
1212
; CHECK: # %bb.0:
1313
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
14-
; CHECK-NEXT: vspltisw 3, -16
15-
; CHECK-NEXT: vspltisw 4, 15
14+
; CHECK-NEXT: xxleqv 37, 37, 37
15+
; CHECK-NEXT: vslw 2, 2, 5
1616
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
17-
; CHECK-NEXT: vsubuwm 3, 4, 3
17+
; CHECK-NEXT: vsraw 2, 2, 5
1818
; CHECK-NEXT: lxvd2x 0, 0, 3
1919
; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
20-
; CHECK-NEXT: vslw 2, 2, 3
2120
; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
22-
; CHECK-NEXT: vsraw 2, 2, 3
23-
; CHECK-NEXT: xxswapd 37, 0
21+
; CHECK-NEXT: xxswapd 35, 0
2422
; CHECK-NEXT: lxvd2x 0, 0, 3
25-
; CHECK-NEXT: xxswapd 32, 0
26-
; CHECK-NEXT: xxsel 34, 32, 37, 34
23+
; CHECK-NEXT: xxswapd 36, 0
24+
; CHECK-NEXT: xxsel 34, 36, 35, 34
2725
; CHECK-NEXT: blr
2826
%add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
2927
ret <4 x i32> %add
@@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
8280
; CHECK-LABEL: sel_Cminus1_or_C_vec:
8381
; CHECK: # %bb.0:
8482
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
85-
; CHECK-NEXT: vspltisw 3, -16
86-
; CHECK-NEXT: vspltisw 4, 15
83+
; CHECK-NEXT: xxleqv 36, 36, 36
84+
; CHECK-NEXT: vslw 2, 2, 4
8785
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
88-
; CHECK-NEXT: vsubuwm 3, 4, 3
86+
; CHECK-NEXT: vsraw 2, 2, 4
8987
; CHECK-NEXT: lxvd2x 0, 0, 3
90-
; CHECK-NEXT: vslw 2, 2, 3
91-
; CHECK-NEXT: vsraw 2, 2, 3
92-
; CHECK-NEXT: xxswapd 37, 0
93-
; CHECK-NEXT: vadduwm 2, 2, 5
88+
; CHECK-NEXT: xxswapd 35, 0
89+
; CHECK-NEXT: vadduwm 2, 2, 3
9490
; CHECK-NEXT: blr
9591
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
9692
ret <4 x i32> %add
@@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
114110
define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
115111
; CHECK-LABEL: sel_minus1_or_0_vec:
116112
; CHECK: # %bb.0:
117-
; CHECK-NEXT: vspltisw 3, -16
118-
; CHECK-NEXT: vspltisw 4, 15
119-
; CHECK-NEXT: vsubuwm 3, 4, 3
113+
; CHECK-NEXT: xxleqv 35, 35, 35
120114
; CHECK-NEXT: vslw 2, 2, 3
121115
; CHECK-NEXT: vsraw 2, 2, 3
122116
; CHECK-NEXT: blr

0 commit comments

Comments
 (0)