[PowerPC] vector shift word/double by element size - 1 use all ones (#139794)

RolandF77 · web-flow · commit bbca78fbcbc4 · 2025-05-23T10:49:37.000-04:00
Vector shift word or double requires a shift amount vector of 31 or 63
which is too big for splat immediate and requires a multi-instruction
sequence. However the PPC instructions only use 5 or 6 bits of the shift
amount vector elements so an all ones mask, which we can generate
efficiently, works.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18456,36 +18456,80 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
   return SDValue();
 }
 
-SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
   assert(VT.isVector() && "Vector type expected.");
 
-  SDValue N1 = N->getOperand(1);
-  if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
-      !isOperationLegal(ISD::ADD, VT))
+  unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
+         "Unexpected opcode.");
+
+  if (!isOperationLegal(Opc, VT))
     return SDValue();
 
-  // For 64-bit there is no splat immediate so we want to catch shift by 1 here
-  // before the BUILD_VECTOR is replaced by a load.
   EVT EltTy = VT.getScalarType();
-  if (EltTy != MVT::i64)
+  unsigned EltBits = EltTy.getSizeInBits();
+  if (EltTy != MVT::i64 && EltTy != MVT::i32)
     return SDValue();
 
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
-  APInt APSplatBits, APSplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  bool BVNIsConstantSplat =
-      BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
-                           HasAnyUndefs, 0, !Subtarget.isLittleEndian());
-  if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
+  SDValue N1 = N->getOperand(1);
+  uint64_t SplatBits = 0;
+  bool AddSplatCase = false;
+  unsigned OpcN1 = N1.getOpcode();
+  if (OpcN1 == PPCISD::VADD_SPLAT &&
+      N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
+    AddSplatCase = true;
+    SplatBits = N1.getConstantOperandVal(0);
+  }
+
+  if (!AddSplatCase) {
+    if (OpcN1 != ISD::BUILD_VECTOR)
+      return SDValue();
+
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    APInt APSplatBits, APSplatUndef;
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
+    bool BVNIsConstantSplat =
+        BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+    if (!BVNIsConstantSplat || SplatBitSize != EltBits)
+      return SDValue();
+    SplatBits = APSplatBits.getZExtValue();
+  }
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  // PPC vector shifts by word/double look at only the low 5/6 bits of the
+  // shift vector, which means the max value is 31/63. A shift vector of all
+  // 1s will be truncated to 31/63, which is useful as vspltiw is limited to
+  // -16 to 15 range.
+  if (SplatBits == (EltBits - 1)) {
+    unsigned NewOpc;
+    switch (Opc) {
+    case ISD::SHL:
+      NewOpc = PPCISD::SHL;
+      break;
+    case ISD::SRL:
+      NewOpc = PPCISD::SRL;
+      break;
+    case ISD::SRA:
+      NewOpc = PPCISD::SRA;
+      break;
+    }
+    SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
+    return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
+  }
+
+  if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
     return SDValue();
-  uint64_t SplatBits = APSplatBits.getZExtValue();
-  if (SplatBits != 1)
+
+  // For 64-bit there is no splat immediate so we want to catch shift by 1 here
+  // before the BUILD_VECTOR is replaced by a load.
+  if (EltTy != MVT::i64 || SplatBits != 1)
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
   return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
 }
 
@@ -18494,7 +18538,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
     return Value;
 
   if (N->getValueType(0).isVector())
-    return combineVectorSHL(N, DCI);
+    return combineVectorShift(N, DCI);
 
   SDValue N0 = N->getOperand(0);
   ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -18526,13 +18570,19 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
     return Value;
 
+  if (N->getValueType(0).isVector())
+    return combineVectorShift(N, DCI);
+
   return SDValue();
 }
 
 SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
     return Value;
 
+  if (N->getValueType(0).isVector())
+    return combineVectorShift(N, DCI);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1441,7 +1441,7 @@ namespace llvm {
     SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
-    SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
@@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
         ret <4 x i32> %tmp.1
 }
 ; CHECK-LABEL: test7_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
 ; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
 
 define <4 x i32> @test8_v4i32(<4 x i32> %a) {
         %tmp.1 = mul nsw <4 x i32> %a, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> ; <<4 x i32>> [#uses=1]
         ret <4 x i32> %tmp.1
 }
 ; CHECK-LABEL: test8_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
 ; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
-; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
+; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
 
 define <2 x i64> @test1_v2i64(<2 x i64> %a) {
         %tmp.1 = mul nsw <2 x i64> %a, <i64 16, i64 16>         ; <<2 x i64>> [#uses=1]
@@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
 }
 
 ; CHECK-LABEL: test7_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
 ; CHECK-NOT: vmul
 ; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
 
@@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
 }
 
 ; CHECK-LABEL: test8_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
 ; CHECK-NOT: vmul
 ; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
 ; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2
diff --git a/llvm/test/CodeGen/PowerPC/pr47891.ll b/llvm/test/CodeGen/PowerPC/pr47891.ll
@@ -7,13 +7,11 @@
 define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
 ; CHECK-LABEL: poly2_lshift1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis r6, r2, .LCPI0_0@toc@ha
+; CHECK-NEXT:    ld r6, 0(r3)
 ; CHECK-NEXT:    li r4, 72
 ; CHECK-NEXT:    ld r5, 64(r3)
-; CHECK-NEXT:    addi r6, r6, .LCPI0_0@toc@l
+; CHECK-NEXT:    xxleqv v4, v4, v4
 ; CHECK-NEXT:    lxvd2x vs0, r3, r4
-; CHECK-NEXT:    lxvd2x v4, 0, r6
-; CHECK-NEXT:    ld r6, 0(r3)
 ; CHECK-NEXT:    sldi r7, r6, 1
 ; CHECK-NEXT:    rotldi r6, r6, 1
 ; CHECK-NEXT:    std r7, 0(r3)
@@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
 ; CHECK-NEXT:    std r7, 32(r3)
 ; CHECK-NEXT:    ld r7, 40(r3)
 ; CHECK-NEXT:    rldimi r6, r7, 1, 0
-; CHECK-NEXT:    xxswapd v2, vs0
-; CHECK-NEXT:    mtfprd f0, r5
 ; CHECK-NEXT:    rotldi r7, r7, 1
 ; CHECK-NEXT:    std r6, 40(r3)
 ; CHECK-NEXT:    ld r6, 48(r3)
+; CHECK-NEXT:    xxswapd v2, vs0
+; CHECK-NEXT:    mtfprd f0, r5
 ; CHECK-NEXT:    rldimi r7, r6, 1, 0
 ; CHECK-NEXT:    rotldi r6, r6, 1
 ; CHECK-NEXT:    std r7, 48(r3)
diff --git a/llvm/test/CodeGen/PowerPC/signbit-shift.ll b/llvm/test/CodeGen/PowerPC/signbit-shift.ll
@@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
 define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: add_lshr_not_vec_splat:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI15_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI15_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsraw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI15_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
@@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
 define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
 ; CHECK-LABEL: sub_lshr_not_vec_splat:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI17_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI17_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsrw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI17_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
@@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
 define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: sub_lshr_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 4, -16
-; CHECK-NEXT:    vspltisw 5, 15
-; CHECK-NEXT:    vsubuwm 4, 5, 4
+; CHECK-NEXT:    xxleqv 36, 36, 36
 ; CHECK-NEXT:    vsraw 2, 2, 4
 ; CHECK-NEXT:    vadduwm 2, 3, 2
 ; CHECK-NEXT:    blr
@@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
 define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
 ; CHECK-LABEL: sub_const_op_lshr_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
 ; CHECK-NEXT:    addis 3, 2, .LCPI21_0@toc@ha
-; CHECK-NEXT:    vsubuwm 3, 4, 3
-; CHECK-NEXT:    addi 3, 3, .LCPI21_0@toc@l
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vsraw 2, 2, 3
+; CHECK-NEXT:    addi 3, 3, .LCPI21_0@toc@l
 ; CHECK-NEXT:    lxvd2x 35, 0, 3
 ; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/vselect-constants.ll b/llvm/test/CodeGen/PowerPC/vselect-constants.ll
@@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_C1_or_C2_vec:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
+; CHECK-NEXT:    xxleqv 37, 37, 37
+; CHECK-NEXT:    vslw 2, 2, 5
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    vsraw 2, 2, 5
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_1@toc@ha
-; CHECK-NEXT:    vslw 2, 2, 3
 ; CHECK-NEXT:    addi 3, 3, .LCPI0_1@toc@l
-; CHECK-NEXT:    vsraw 2, 2, 3
-; CHECK-NEXT:    xxswapd 37, 0
+; CHECK-NEXT:    xxswapd 35, 0
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    xxswapd 32, 0
-; CHECK-NEXT:    xxsel 34, 32, 37, 34
+; CHECK-NEXT:    xxswapd 36, 0
+; CHECK-NEXT:    xxsel 34, 36, 35, 34
 ; CHECK-NEXT:    blr
   %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
   ret <4 x i32> %add
@@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cminus1_or_C_vec:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI4_0@toc@ha
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
+; CHECK-NEXT:    xxleqv 36, 36, 36
+; CHECK-NEXT:    vslw 2, 2, 4
 ; CHECK-NEXT:    addi 3, 3, .LCPI4_0@toc@l
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    vsraw 2, 2, 4
 ; CHECK-NEXT:    lxvd2x 0, 0, 3
-; CHECK-NEXT:    vslw 2, 2, 3
-; CHECK-NEXT:    vsraw 2, 2, 3
-; CHECK-NEXT:    xxswapd 37, 0
-; CHECK-NEXT:    vadduwm 2, 2, 5
+; CHECK-NEXT:    xxswapd 35, 0
+; CHECK-NEXT:    vadduwm 2, 2, 3
 ; CHECK-NEXT:    blr
   %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
   ret <4 x i32> %add
@@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_minus1_or_0_vec:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vspltisw 3, -16
-; CHECK-NEXT:    vspltisw 4, 15
-; CHECK-NEXT:    vsubuwm 3, 4, 3
+; CHECK-NEXT:    xxleqv 35, 35, 35
 ; CHECK-NEXT:    vslw 2, 2, 3
 ; CHECK-NEXT:    vsraw 2, 2, 3
 ; CHECK-NEXT:    blr