Skip to content

Commit 7b021f2

Browse files
committed
[PowerPC] Optimize VPERM and fix code order for swapping vector operands on LE
This patch reverts commit 7614ba0 in order to optimize VPERM when one of its vector operands is an XXSWAPD, similar to what is done for XXPERM. It also reorganizes the operand-swap code for little-endian (LE) targets so that the vector operands are swapped after the mask operand has been adjusted. This ensures that the vector operands are swapped at the correct point in the code, resulting in a valid constant pool entry for the mask operand. Reviewed By: stefanp. Differential Revision: https://reviews.llvm.org/D149083
1 parent d6d4a52 commit 7b021f2

File tree

5 files changed

+93
-102
lines changed

5 files changed

+93
-102
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10314,11 +10314,6 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
1031410314
bool isLittleEndian = Subtarget.isLittleEndian();
1031510315
bool isPPC64 = Subtarget.isPPC64();
1031610316

10317-
// Only need to place items backwards in LE,
10318-
// the mask will be properly calculated.
10319-
if (isLittleEndian)
10320-
std::swap(V1, V2);
10321-
1032210317
if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
1032310318
(V1->hasOneUse() || V2->hasOneUse())) {
1032410319
LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
@@ -10328,7 +10323,8 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
1032810323
// The second input to XXPERM is also an output so if the second input has
1032910324
// multiple uses then copying is necessary, as a result we want the
1033010325
// single-use operand to be used as the second input to prevent copying.
10331-
if (!V2->hasOneUse() && V1->hasOneUse()) {
10326+
if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) ||
10327+
(isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) {
1033210328
std::swap(V1, V2);
1033310329
NeedSwap = !NeedSwap;
1033410330
}
@@ -10367,27 +10363,24 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
1036710363
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
1036810364
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
1036910365

10370-
if (Opcode == PPCISD::XXPERM) {
10371-
if (V1HasXXSWAPD) {
10372-
if (SrcElt < 8)
10373-
SrcElt += 8;
10374-
else if (SrcElt < 16)
10375-
SrcElt -= 8;
10376-
}
10377-
if (V2HasXXSWAPD) {
10378-
if (SrcElt > 23)
10379-
SrcElt -= 8;
10380-
else if (SrcElt > 15)
10381-
SrcElt += 8;
10382-
}
10383-
if (NeedSwap) {
10384-
if (SrcElt < 16)
10385-
SrcElt += 16;
10386-
else
10387-
SrcElt -= 16;
10388-
}
10366+
if (V1HasXXSWAPD) {
10367+
if (SrcElt < 8)
10368+
SrcElt += 8;
10369+
else if (SrcElt < 16)
10370+
SrcElt -= 8;
10371+
}
10372+
if (V2HasXXSWAPD) {
10373+
if (SrcElt > 23)
10374+
SrcElt -= 8;
10375+
else if (SrcElt > 15)
10376+
SrcElt += 8;
10377+
}
10378+
if (NeedSwap) {
10379+
if (SrcElt < 16)
10380+
SrcElt += 16;
10381+
else
10382+
SrcElt -= 16;
1038910383
}
10390-
1039110384
for (unsigned j = 0; j != BytesPerElement; ++j)
1039210385
if (isLittleEndian)
1039310386
ResultMask.push_back(
@@ -10397,18 +10390,19 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
1039710390
DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32));
1039810391
}
1039910392

10400-
if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) {
10401-
if (V1HasXXSWAPD) {
10402-
dl = SDLoc(V1->getOperand(0));
10403-
V1 = V1->getOperand(0)->getOperand(1);
10404-
}
10405-
if (V2HasXXSWAPD) {
10406-
dl = SDLoc(V2->getOperand(0));
10407-
V2 = V2->getOperand(0)->getOperand(1);
10408-
}
10409-
if (isPPC64 && ValType != MVT::v2f64)
10393+
if (V1HasXXSWAPD) {
10394+
dl = SDLoc(V1->getOperand(0));
10395+
V1 = V1->getOperand(0)->getOperand(1);
10396+
}
10397+
if (V2HasXXSWAPD) {
10398+
dl = SDLoc(V2->getOperand(0));
10399+
V2 = V2->getOperand(0)->getOperand(1);
10400+
}
10401+
10402+
if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) {
10403+
if (ValType != MVT::v2f64)
1041010404
V1 = DAG.getBitcast(MVT::v2f64, V1);
10411-
if (isPPC64 && V2.getValueType() != MVT::v2f64)
10405+
if (V2.getValueType() != MVT::v2f64)
1041210406
V2 = DAG.getBitcast(MVT::v2f64, V2);
1041310407
}
1041410408

@@ -10429,6 +10423,11 @@ SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
1042910423
if (Opcode == PPCISD::XXPERM)
1043010424
VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);
1043110425

10426+
// Only need to place items backwards in LE,
10427+
// the mask was properly calculated.
10428+
if (isLittleEndian)
10429+
std::swap(V1, V2);
10430+
1043210431
SDValue VPERMNode =
1043310432
DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);
1043410433

llvm/test/CodeGen/PowerPC/build-vector-tests.ll

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,16 +1058,15 @@ define <4 x i32> @fromDiffMemVarDi(ptr nocapture readonly %arr, i32 signext %ele
10581058
;
10591059
; P8LE-LABEL: fromDiffMemVarDi:
10601060
; P8LE: # %bb.0: # %entry
1061+
; P8LE-NEXT: addis r5, r2, .LCPI9_0@toc@ha
10611062
; P8LE-NEXT: sldi r4, r4, 2
1063+
; P8LE-NEXT: addi r5, r5, .LCPI9_0@toc@l
10621064
; P8LE-NEXT: add r3, r3, r4
1065+
; P8LE-NEXT: lxvd2x vs0, 0, r5
10631066
; P8LE-NEXT: addi r3, r3, -12
1064-
; P8LE-NEXT: lxvd2x vs0, 0, r3
1065-
; P8LE-NEXT: addis r3, r2, .LCPI9_0@toc@ha
1066-
; P8LE-NEXT: addi r3, r3, .LCPI9_0@toc@l
1067+
; P8LE-NEXT: lxvd2x v3, 0, r3
10671068
; P8LE-NEXT: xxswapd v2, vs0
1068-
; P8LE-NEXT: lxvd2x vs0, 0, r3
1069-
; P8LE-NEXT: xxswapd v3, vs0
1070-
; P8LE-NEXT: vperm v2, v2, v2, v3
1069+
; P8LE-NEXT: vperm v2, v3, v3, v2
10711070
; P8LE-NEXT: blr
10721071
entry:
10731072
%idxprom = sext i32 %elem to i64
@@ -1478,13 +1477,12 @@ define <4 x i32> @fromDiffMemConsDConvftoi(ptr nocapture readonly %ptr) {
14781477
;
14791478
; P8LE-LABEL: fromDiffMemConsDConvftoi:
14801479
; P8LE: # %bb.0: # %entry
1481-
; P8LE-NEXT: lxvd2x vs0, 0, r3
1482-
; P8LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha
1483-
; P8LE-NEXT: addi r3, r3, .LCPI18_0@toc@l
1480+
; P8LE-NEXT: addis r4, r2, .LCPI18_0@toc@ha
1481+
; P8LE-NEXT: lxvd2x v3, 0, r3
1482+
; P8LE-NEXT: addi r4, r4, .LCPI18_0@toc@l
1483+
; P8LE-NEXT: lxvd2x vs0, 0, r4
14841484
; P8LE-NEXT: xxswapd v2, vs0
1485-
; P8LE-NEXT: lxvd2x vs0, 0, r3
1486-
; P8LE-NEXT: xxswapd v3, vs0
1487-
; P8LE-NEXT: vperm v2, v2, v2, v3
1485+
; P8LE-NEXT: vperm v2, v3, v3, v2
14881486
; P8LE-NEXT: xvcvspsxws v2, v2
14891487
; P8LE-NEXT: blr
14901488
entry:
@@ -2580,16 +2578,15 @@ define <4 x i32> @fromDiffMemVarDui(ptr nocapture readonly %arr, i32 signext %el
25802578
;
25812579
; P8LE-LABEL: fromDiffMemVarDui:
25822580
; P8LE: # %bb.0: # %entry
2581+
; P8LE-NEXT: addis r5, r2, .LCPI41_0@toc@ha
25832582
; P8LE-NEXT: sldi r4, r4, 2
2583+
; P8LE-NEXT: addi r5, r5, .LCPI41_0@toc@l
25842584
; P8LE-NEXT: add r3, r3, r4
2585+
; P8LE-NEXT: lxvd2x vs0, 0, r5
25852586
; P8LE-NEXT: addi r3, r3, -12
2586-
; P8LE-NEXT: lxvd2x vs0, 0, r3
2587-
; P8LE-NEXT: addis r3, r2, .LCPI41_0@toc@ha
2588-
; P8LE-NEXT: addi r3, r3, .LCPI41_0@toc@l
2587+
; P8LE-NEXT: lxvd2x v3, 0, r3
25892588
; P8LE-NEXT: xxswapd v2, vs0
2590-
; P8LE-NEXT: lxvd2x vs0, 0, r3
2591-
; P8LE-NEXT: xxswapd v3, vs0
2592-
; P8LE-NEXT: vperm v2, v2, v2, v3
2589+
; P8LE-NEXT: vperm v2, v3, v3, v2
25932590
; P8LE-NEXT: blr
25942591
entry:
25952592
%idxprom = sext i32 %elem to i64
@@ -3000,13 +2997,12 @@ define <4 x i32> @fromDiffMemConsDConvftoui(ptr nocapture readonly %ptr) {
30002997
;
30012998
; P8LE-LABEL: fromDiffMemConsDConvftoui:
30022999
; P8LE: # %bb.0: # %entry
3003-
; P8LE-NEXT: lxvd2x vs0, 0, r3
3004-
; P8LE-NEXT: addis r3, r2, .LCPI50_0@toc@ha
3005-
; P8LE-NEXT: addi r3, r3, .LCPI50_0@toc@l
3000+
; P8LE-NEXT: addis r4, r2, .LCPI50_0@toc@ha
3001+
; P8LE-NEXT: lxvd2x v3, 0, r3
3002+
; P8LE-NEXT: addi r4, r4, .LCPI50_0@toc@l
3003+
; P8LE-NEXT: lxvd2x vs0, 0, r4
30063004
; P8LE-NEXT: xxswapd v2, vs0
3007-
; P8LE-NEXT: lxvd2x vs0, 0, r3
3008-
; P8LE-NEXT: xxswapd v3, vs0
3009-
; P8LE-NEXT: vperm v2, v2, v2, v3
3005+
; P8LE-NEXT: vperm v2, v3, v3, v2
30103006
; P8LE-NEXT: xvcvspuxws v2, v2
30113007
; P8LE-NEXT: blr
30123008
entry:

llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -183,14 +183,13 @@ entry:
183183
define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
184184
; CHECK-LE-P8-LABEL: test_none_v16i8:
185185
; CHECK-LE-P8: # %bb.0: # %entry
186-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
187-
; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha
186+
; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI2_0@toc@ha
187+
; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4
188188
; CHECK-LE-P8-NEXT: mtvsrd v4, r3
189-
; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l
189+
; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI2_0@toc@l
190+
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
190191
; CHECK-LE-P8-NEXT: xxswapd v2, vs0
191-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
192-
; CHECK-LE-P8-NEXT: xxswapd v3, vs0
193-
; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
192+
; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2
194193
; CHECK-LE-P8-NEXT: blr
195194
;
196195
; CHECK-LE-P9-LABEL: test_none_v16i8:
@@ -431,14 +430,13 @@ entry:
431430
define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) {
432431
; CHECK-LE-P8-LABEL: test_none_v8i16:
433432
; CHECK-LE-P8: # %bb.0: # %entry
434-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
435-
; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI5_0@toc@ha
433+
; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI5_0@toc@ha
434+
; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4
436435
; CHECK-LE-P8-NEXT: mtvsrd v4, r3
437-
; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI5_0@toc@l
436+
; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI5_0@toc@l
437+
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
438438
; CHECK-LE-P8-NEXT: xxswapd v2, vs0
439-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
440-
; CHECK-LE-P8-NEXT: xxswapd v3, vs0
441-
; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
439+
; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2
442440
; CHECK-LE-P8-NEXT: blr
443441
;
444442
; CHECK-LE-P9-LABEL: test_none_v8i16:

llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -469,19 +469,18 @@ entry:
469469
define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonly %ptr2) {
470470
; CHECK-LE-P8-LABEL: test_none_v2i64:
471471
; CHECK-LE-P8: # %bb.0: # %entry
472-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
473-
; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_0@toc@ha
474-
; CHECK-LE-P8-NEXT: lxsdx v4, 0, r3
472+
; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha
473+
; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3
475474
; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI4_1@toc@ha
476-
; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_0@toc@l
475+
; CHECK-LE-P8-NEXT: lxvd2x v4, 0, r4
476+
; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l
477477
; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI4_1@toc@l
478-
; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r4
478+
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
479479
; CHECK-LE-P8-NEXT: xxswapd v2, vs0
480480
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3
481-
; CHECK-LE-P8-NEXT: xxswapd v3, vs1
482-
; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
483-
; CHECK-LE-P8-NEXT: xxswapd v3, vs0
481+
; CHECK-LE-P8-NEXT: vperm v2, v3, v4, v2
484482
; CHECK-LE-P8-NEXT: xxlxor v4, v4, v4
483+
; CHECK-LE-P8-NEXT: xxswapd v3, vs0
485484
; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
486485
; CHECK-LE-P8-NEXT: xxswapd vs0, v2
487486
; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3

llvm/test/CodeGen/PowerPC/vperm-swap.ll

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,31 @@
44

55
define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) {
66
; CHECK-LE-P8: .LCPI0_0:
7-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
8-
; CHECK-LE-P8-NEXT: .byte 30 # 0x1e
7+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
8+
; CHECK-LE-P8-NEXT: .byte 22 # 0x16
99
; CHECK-LE-P8-NEXT: .byte 7 # 0x7
10-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
11-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
12-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
13-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
14-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
15-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
16-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
17-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
18-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
19-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
20-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
21-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
22-
; CHECK-LE-P8-NEXT: .byte 31 # 0x1f
10+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
11+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
12+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
13+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
14+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
15+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
16+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
17+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
18+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
19+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
20+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
21+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
22+
; CHECK-LE-P8-NEXT: .byte 23 # 0x17
2323
; CHECK-LE-P8-LABEL: test_none_v16i8:
2424
; CHECK-LE-P8: # %bb.0: # %entry
25-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
26-
; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha
25+
; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha
26+
; CHECK-LE-P8-NEXT: lxvd2x v3, 0, r4
2727
; CHECK-LE-P8-NEXT: mtvsrd v4, r3
28-
; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l
28+
; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI0_0@toc@l
29+
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5
2930
; CHECK-LE-P8-NEXT: xxswapd v2, vs0
30-
; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4
31-
; CHECK-LE-P8-NEXT: xxswapd v3, vs0
32-
; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3
31+
; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2
3332
; CHECK-LE-P8-NEXT: blr
3433
entry:
3534
%lhs = load <16 x i8>, ptr %b, align 4

0 commit comments

Comments
 (0)