Skip to content

Commit 63cd184

Browse files
author
Chen Zheng
committed
[PowerPC] use lvx + splat directly for aligned splat load
Reviewed By: nemanjai
Differential Revision: https://reviews.llvm.org/D114062
1 parent 8720247 commit 63cd184

File tree

3 files changed

+53
-54
lines changed

3 files changed

+53
-54
lines changed

llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4464,9 +4464,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
44644464
bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
44654465
LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
44664466
StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
4467+
MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N);
44674468
SDValue AddrOp;
4468-
if (LDN)
4469-
AddrOp = LDN->getOperand(1);
4469+
if (LDN || (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT))
4470+
AddrOp = N->getOperand(1);
44704471
else if (STN)
44714472
AddrOp = STN->getOperand(2);
44724473

@@ -5973,6 +5974,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
59735974
if (Type != MVT::v16i8 && Type != MVT::v8i16)
59745975
break;
59755976

5977+
// If the alignment for the load is 16 or bigger, we don't need the
5978+
// permuted mask to get the required value. The value must be element 0
5979+
// on big-endian targets, or element 7 (v8i16) / 15 (v16i8) on
5980+
// little-endian targets, of the result register of the lvx instruction.
5981+
// Break here so the patterns in the .td file select the instruction.
5982+
if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
5983+
isOffsetMultipleOf(N, 16))
5984+
break;
5985+
59765986
SDValue ZeroReg =
59775987
CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
59785988
Subtarget->isPPC64() ? MVT::i64 : MVT::i32);

llvm/lib/Target/PowerPC/PPCInstrVSX.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,11 @@ def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
158158
def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
159159
def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
160160

161+
def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{
162+
return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) &&
163+
isOffsetMultipleOf(N, 16);
164+
}]>;
165+
161166
//--------------------- VSX-specific instruction formats ---------------------//
162167
// By default, all VSX instructions are to be selected over their Altivec
163168
// counter parts and they do not have unmodeled sideeffects.
@@ -3180,13 +3185,25 @@ defm : ScalToVecWPermute<
31803185
v2f64, (f64 (load ForceXForm:$src)),
31813186
(XXPERMDIs (XFLOADf64 ForceXForm:$src), 2),
31823187
(SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>;
3188+
3189+
// Splat loads.
3190+
def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
3191+
(v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>;
3192+
def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
3193+
(v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>;
31833194
} // HasVSX, NoP9Vector, IsLittleEndian
31843195

31853196
let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in {
31863197
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x ForceXForm:$src)),
31873198
(LXVD2X ForceXForm:$src)>;
31883199
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst),
31893200
(STXVD2X $rS, ForceXForm:$dst)>;
3201+
3202+
// Splat loads.
3203+
def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)),
3204+
(v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>;
3205+
def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)),
3206+
(v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>;
31903207
} // HasVSX, NoP9Vector, IsBigEndian
31913208

31923209
// Any VSX subtarget that only has loads and stores that load in big endian

llvm/test/CodeGen/PowerPC/load-and-splat.ll

Lines changed: 24 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,18 +1065,13 @@ define <8 x i16> @test_aligned_v8i16_1(i16* %Ptr) {
10651065
;
10661066
; P8-LABEL: test_aligned_v8i16_1:
10671067
; P8: # %bb.0: # %entry
1068-
; P8-NEXT: lhzx r3, 0, r3
1069-
; P8-NEXT: mtvsrwz v2, r3
1070-
; P8-NEXT: vsplth v2, v2, 3
1068+
; P8-NEXT: lvx v2, 0, r3
1069+
; P8-NEXT: vsplth v2, v2, 7
10711070
; P8-NEXT: blr
10721071
;
10731072
; P7-LABEL: test_aligned_v8i16_1:
10741073
; P7: # %bb.0: # %entry
1075-
; P7-NEXT: li r4, 1
10761074
; P7-NEXT: lvx v2, 0, r3
1077-
; P7-NEXT: lvsl v4, 0, r3
1078-
; P7-NEXT: lvx v3, r4, r3
1079-
; P7-NEXT: vperm v2, v2, v3, v4
10801075
; P7-NEXT: vsplth v2, v2, 0
10811076
; P7-NEXT: blr
10821077
;
@@ -1088,18 +1083,13 @@ define <8 x i16> @test_aligned_v8i16_1(i16* %Ptr) {
10881083
;
10891084
; P8-AIX32-LABEL: test_aligned_v8i16_1:
10901085
; P8-AIX32: # %bb.0: # %entry
1091-
; P8-AIX32-NEXT: lhzx r3, 0, r3
1092-
; P8-AIX32-NEXT: mtvsrwz v2, r3
1093-
; P8-AIX32-NEXT: vsplth v2, v2, 3
1086+
; P8-AIX32-NEXT: lvx v2, 0, r3
1087+
; P8-AIX32-NEXT: vsplth v2, v2, 0
10941088
; P8-AIX32-NEXT: blr
10951089
;
10961090
; P7-AIX32-LABEL: test_aligned_v8i16_1:
10971091
; P7-AIX32: # %bb.0: # %entry
1098-
; P7-AIX32-NEXT: li r4, 1
10991092
; P7-AIX32-NEXT: lvx v2, 0, r3
1100-
; P7-AIX32-NEXT: lvsl v4, 0, r3
1101-
; P7-AIX32-NEXT: lvx v3, r4, r3
1102-
; P7-AIX32-NEXT: vperm v2, v2, v3, v4
11031093
; P7-AIX32-NEXT: vsplth v2, v2, 0
11041094
; P7-AIX32-NEXT: blr
11051095
entry:
@@ -1119,19 +1109,15 @@ define <8 x i16> @test_aligned_v8i16_2(i16* %Ptr) {
11191109
;
11201110
; P8-LABEL: test_aligned_v8i16_2:
11211111
; P8: # %bb.0: # %entry
1122-
; P8-NEXT: lhz r3, 32(r3)
1123-
; P8-NEXT: mtvsrwz v2, r3
1124-
; P8-NEXT: vsplth v2, v2, 3
1112+
; P8-NEXT: addi r3, r3, 32
1113+
; P8-NEXT: lvx v2, 0, r3
1114+
; P8-NEXT: vsplth v2, v2, 7
11251115
; P8-NEXT: blr
11261116
;
11271117
; P7-LABEL: test_aligned_v8i16_2:
11281118
; P7: # %bb.0: # %entry
1129-
; P7-NEXT: li r4, 1
11301119
; P7-NEXT: addi r3, r3, 32
11311120
; P7-NEXT: lvx v2, 0, r3
1132-
; P7-NEXT: lvx v3, r4, r3
1133-
; P7-NEXT: lvsl v4, 0, r3
1134-
; P7-NEXT: vperm v2, v2, v3, v4
11351121
; P7-NEXT: vsplth v2, v2, 0
11361122
; P7-NEXT: blr
11371123
;
@@ -1144,19 +1130,15 @@ define <8 x i16> @test_aligned_v8i16_2(i16* %Ptr) {
11441130
;
11451131
; P8-AIX32-LABEL: test_aligned_v8i16_2:
11461132
; P8-AIX32: # %bb.0: # %entry
1147-
; P8-AIX32-NEXT: lhz r3, 32(r3)
1148-
; P8-AIX32-NEXT: mtvsrwz v2, r3
1149-
; P8-AIX32-NEXT: vsplth v2, v2, 3
1133+
; P8-AIX32-NEXT: addi r3, r3, 32
1134+
; P8-AIX32-NEXT: lvx v2, 0, r3
1135+
; P8-AIX32-NEXT: vsplth v2, v2, 0
11501136
; P8-AIX32-NEXT: blr
11511137
;
11521138
; P7-AIX32-LABEL: test_aligned_v8i16_2:
11531139
; P7-AIX32: # %bb.0: # %entry
1154-
; P7-AIX32-NEXT: li r4, 1
11551140
; P7-AIX32-NEXT: addi r3, r3, 32
11561141
; P7-AIX32-NEXT: lvx v2, 0, r3
1157-
; P7-AIX32-NEXT: lvx v3, r4, r3
1158-
; P7-AIX32-NEXT: lvsl v4, 0, r3
1159-
; P7-AIX32-NEXT: vperm v2, v2, v3, v4
11601142
; P7-AIX32-NEXT: vsplth v2, v2, 0
11611143
; P7-AIX32-NEXT: blr
11621144
entry:
@@ -1176,16 +1158,13 @@ define <16 x i8> @test_aligned_v16i8_1(i8* %Ptr) {
11761158
;
11771159
; P8-LABEL: test_aligned_v16i8_1:
11781160
; P8: # %bb.0: # %entry
1179-
; P8-NEXT: lbzx r3, 0, r3
1180-
; P8-NEXT: mtvsrwz v2, r3
1181-
; P8-NEXT: vspltb v2, v2, 7
1161+
; P8-NEXT: lvx v2, 0, r3
1162+
; P8-NEXT: vspltb v2, v2, 15
11821163
; P8-NEXT: blr
11831164
;
11841165
; P7-LABEL: test_aligned_v16i8_1:
11851166
; P7: # %bb.0: # %entry
1186-
; P7-NEXT: lvsl v2, 0, r3
1187-
; P7-NEXT: lvx v3, 0, r3
1188-
; P7-NEXT: vperm v2, v3, v3, v2
1167+
; P7-NEXT: lvx v2, 0, r3
11891168
; P7-NEXT: vspltb v2, v2, 0
11901169
; P7-NEXT: blr
11911170
;
@@ -1197,16 +1176,13 @@ define <16 x i8> @test_aligned_v16i8_1(i8* %Ptr) {
11971176
;
11981177
; P8-AIX32-LABEL: test_aligned_v16i8_1:
11991178
; P8-AIX32: # %bb.0: # %entry
1200-
; P8-AIX32-NEXT: lbzx r3, 0, r3
1201-
; P8-AIX32-NEXT: mtvsrwz v2, r3
1202-
; P8-AIX32-NEXT: vspltb v2, v2, 7
1179+
; P8-AIX32-NEXT: lvx v2, 0, r3
1180+
; P8-AIX32-NEXT: vspltb v2, v2, 0
12031181
; P8-AIX32-NEXT: blr
12041182
;
12051183
; P7-AIX32-LABEL: test_aligned_v16i8_1:
12061184
; P7-AIX32: # %bb.0: # %entry
1207-
; P7-AIX32-NEXT: lvsl v2, 0, r3
1208-
; P7-AIX32-NEXT: lvx v3, 0, r3
1209-
; P7-AIX32-NEXT: vperm v2, v3, v3, v2
1185+
; P7-AIX32-NEXT: lvx v2, 0, r3
12101186
; P7-AIX32-NEXT: vspltb v2, v2, 0
12111187
; P7-AIX32-NEXT: blr
12121188
entry:
@@ -1226,17 +1202,15 @@ define <16 x i8> @test_aligned_v16i8_2(i8* %Ptr) {
12261202
;
12271203
; P8-LABEL: test_aligned_v16i8_2:
12281204
; P8: # %bb.0: # %entry
1229-
; P8-NEXT: lbz r3, 16(r3)
1230-
; P8-NEXT: mtvsrwz v2, r3
1231-
; P8-NEXT: vspltb v2, v2, 7
1205+
; P8-NEXT: addi r3, r3, 16
1206+
; P8-NEXT: lvx v2, 0, r3
1207+
; P8-NEXT: vspltb v2, v2, 15
12321208
; P8-NEXT: blr
12331209
;
12341210
; P7-LABEL: test_aligned_v16i8_2:
12351211
; P7: # %bb.0: # %entry
12361212
; P7-NEXT: addi r3, r3, 16
1237-
; P7-NEXT: lvsl v2, 0, r3
1238-
; P7-NEXT: lvx v3, 0, r3
1239-
; P7-NEXT: vperm v2, v3, v3, v2
1213+
; P7-NEXT: lvx v2, 0, r3
12401214
; P7-NEXT: vspltb v2, v2, 0
12411215
; P7-NEXT: blr
12421216
;
@@ -1249,17 +1223,15 @@ define <16 x i8> @test_aligned_v16i8_2(i8* %Ptr) {
12491223
;
12501224
; P8-AIX32-LABEL: test_aligned_v16i8_2:
12511225
; P8-AIX32: # %bb.0: # %entry
1252-
; P8-AIX32-NEXT: lbz r3, 16(r3)
1253-
; P8-AIX32-NEXT: mtvsrwz v2, r3
1254-
; P8-AIX32-NEXT: vspltb v2, v2, 7
1226+
; P8-AIX32-NEXT: addi r3, r3, 16
1227+
; P8-AIX32-NEXT: lvx v2, 0, r3
1228+
; P8-AIX32-NEXT: vspltb v2, v2, 0
12551229
; P8-AIX32-NEXT: blr
12561230
;
12571231
; P7-AIX32-LABEL: test_aligned_v16i8_2:
12581232
; P7-AIX32: # %bb.0: # %entry
12591233
; P7-AIX32-NEXT: addi r3, r3, 16
1260-
; P7-AIX32-NEXT: lvsl v2, 0, r3
1261-
; P7-AIX32-NEXT: lvx v3, 0, r3
1262-
; P7-AIX32-NEXT: vperm v2, v3, v3, v2
1234+
; P7-AIX32-NEXT: lvx v2, 0, r3
12631235
; P7-AIX32-NEXT: vspltb v2, v2, 0
12641236
; P7-AIX32-NEXT: blr
12651237
entry:

0 commit comments

Comments
 (0)