Commit 933182e

[RISCV] Improve support for forming widening multiplies when one input is a scalar splat.
If one input of a fixed vector multiply is a sign/zero extend and the other operand is a splat of a scalar, we can use a widening multiply if the scalar value has sufficient sign/zero bits.

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D110028
1 parent 1f73f0c commit 933182e
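As a rough illustration of the condition described above, the combine can only narrow the splatted scalar when half of its bits are redundant, i.e. when it has enough sign bits (for vwmul) or known zero bits (for vwmulu). A minimal standalone sketch, not LLVM code and with hypothetical helper names, for a 16-bit element type narrowed to 8 bits:

#include <cstdint>

// fitsNarrowSigned   -> the scalar could feed vwmul  (sign-extended form)
// fitsNarrowUnsigned -> the scalar could feed vwmulu (zero-extended form)
bool fitsNarrowSigned(int16_t V) {
  return V >= INT8_MIN && V <= INT8_MAX; // at least 9 of the 16 bits are sign bits
}
bool fitsNarrowUnsigned(uint16_t V) {
  return (V >> 8) == 0;                  // the upper half is known to be zero
}

In the DAG combine below, the same test is performed with DAG.ComputeNumSignBits and DAG.MaskedValueIsZero on the scalar operand of the VMV_V_X_VL splat.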

File tree

3 files changed: +567, -39 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 86 additions & 37 deletions
@@ -6576,6 +6576,87 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
   return SDValue(N, 0);
 }
 
+// Try to form VWMUL or VWMULU.
+// FIXME: Support VWMULSU.
+static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
+                                    SelectionDAG &DAG) {
+  assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
+  bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
+  bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
+  if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
+    return SDValue();
+
+  SDValue Mask = N->getOperand(2);
+  SDValue VL = N->getOperand(3);
+
+  // Make sure the mask and VL match.
+  if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL)
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+
+  // Determine the narrow size for a widening multiply.
+  unsigned NarrowSize = VT.getScalarSizeInBits() / 2;
+  MVT NarrowVT = MVT::getVectorVT(MVT::getIntegerVT(NarrowSize),
+                                  VT.getVectorElementCount());
+
+  SDLoc DL(N);
+
+  // See if the other operand is the same opcode.
+  if (Op0.getOpcode() == Op1.getOpcode()) {
+    if (!Op1.hasOneUse())
+      return SDValue();
+
+    // Make sure the mask and VL match.
+    if (Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL)
+      return SDValue();
+
+    Op1 = Op1.getOperand(0);
+  } else if (Op1.getOpcode() == RISCVISD::VMV_V_X_VL) {
+    // The operand is a splat of a scalar.
+
+    // The VL must be the same.
+    if (Op1.getOperand(1) != VL)
+      return SDValue();
+
+    // Get the scalar value.
+    Op1 = Op1.getOperand(0);
+
+    // See if we have enough sign bits or zero bits in the scalar to use a
+    // widening multiply by splatting to a smaller element size.
+    unsigned EltBits = VT.getScalarSizeInBits();
+    unsigned ScalarBits = Op1.getValueSizeInBits();
+    // Make sure we're getting all element bits from the scalar register.
+    // FIXME: Support implicit sign extension of vmv.v.x?
+    if (ScalarBits < EltBits)
+      return SDValue();
+
+    if (IsSignExt) {
+      if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
+        return SDValue();
+    } else {
+      APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
+      if (!DAG.MaskedValueIsZero(Op1, Mask))
+        return SDValue();
+    }
+
+    Op1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, NarrowVT, Op1, VL);
+  } else
+    return SDValue();
+
+  Op0 = Op0.getOperand(0);
+
+  // Re-introduce narrower extends if needed.
+  unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
+  if (Op0.getValueType() != NarrowVT)
+    Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
+  if (Op1.getValueType() != NarrowVT)
+    Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
+
+  unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
+  return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -7027,45 +7108,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
   case RISCVISD::MUL_VL: {
-    // Try to form VWMUL or VWMULU.
-    // FIXME: Look for splat of extended scalar as well.
-    // FIXME: Support VWMULSU.
     SDValue Op0 = N->getOperand(0);
     SDValue Op1 = N->getOperand(1);
-    bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
-    bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
-    if ((!IsSignExt && !IsZeroExt) || Op0.getOpcode() != Op1.getOpcode())
-      return SDValue();
-
-    // Make sure the extends have a single use.
-    if (!Op0.hasOneUse() || !Op1.hasOneUse())
-      return SDValue();
-
-    SDValue Mask = N->getOperand(2);
-    SDValue VL = N->getOperand(3);
-    if (Op0.getOperand(1) != Mask || Op1.getOperand(1) != Mask ||
-        Op0.getOperand(2) != VL || Op1.getOperand(2) != VL)
-      return SDValue();
-
-    Op0 = Op0.getOperand(0);
-    Op1 = Op1.getOperand(0);
-
-    MVT VT = N->getSimpleValueType(0);
-    MVT NarrowVT =
-        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() / 2),
-                         VT.getVectorElementCount());
-
-    SDLoc DL(N);
-
-    // Re-introduce narrower extends if needed.
-    unsigned ExtOpc = IsSignExt ? RISCVISD::VSEXT_VL : RISCVISD::VZEXT_VL;
-    if (Op0.getValueType() != NarrowVT)
-      Op0 = DAG.getNode(ExtOpc, DL, NarrowVT, Op0, Mask, VL);
-    if (Op1.getValueType() != NarrowVT)
-      Op1 = DAG.getNode(ExtOpc, DL, NarrowVT, Op1, Mask, VL);
-
-    unsigned WMulOpc = IsSignExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL;
-    return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
+    if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG))
+      return V;
+    if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG))
+      return V;
+    return SDValue();
   }
   }
 
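As a side note, the reason the narrowed multiply is safe at all: once both operands are representable in the narrow element type, the doubled-width result of vwmul/vwmulu holds the exact product. A small standalone check (plain C++, independent of the patch) of the signed 8-to-16-bit case:

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively confirm that the product of two 8-bit signed values
  // always fits in a 16-bit result, so no information is lost.
  for (int a = -128; a <= 127; ++a)
    for (int b = -128; b <= 127; ++b) {
      int16_t Wide = static_cast<int16_t>(a * b); // |a*b| <= 16384 < 2^15
      assert(Wide == a * b);
    }
  return 0;
}

The unsigned case used by vwmulu is analogous: 255 * 255 = 65025 still fits in 16 bits.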

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll

Lines changed: 238 additions & 2 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 define <2 x i16> @vwmul_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: vwmul_v2i16:
@@ -649,3 +649,239 @@ define <16 x i64> @vwmul_vx_v16i64(<16 x i32>* %x, i32 %y) {
   ret <16 x i64> %f
 }
 
+define <8 x i16> @vwmul_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i16
+  %d = insertelement <8 x i16> undef, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmul_vx_v8i16_i16(<8 x i8>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v8i16_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v25, (a0)
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT:    vsext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i16, i16* %y
+  %d = insertelement <8 x i16> undef, i16 %b, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> undef, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lh a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = sext i16 %b to i32
+  %d = insertelement <4 x i32> undef, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmul_vx_v4i32_i32(<4 x i16>* %x, i32* %y) {
+; CHECK-LABEL: vwmul_vx_v4i32_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    lw a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
+; CHECK-NEXT:    vsext.vf2 v26, v25
+; CHECK-NEXT:    vmul.vx v8, v26, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i32, i32* %y
+  %d = insertelement <4 x i32> undef, i32 %b, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> undef, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lb a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lb a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lh a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lh a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = sext i16 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    srai a0, a1, 31
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    lw a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v25, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = sext i32 %b to i64
+  %d = insertelement <2 x i64> undef, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmul_vx_v2i64_i64(<2 x i32>* %x, i64* %y) {
+; RV32-LABEL: vwmul_vx_v2i64_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a2, 4(a1)
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v25, (a0)
+; RV32-NEXT:    sw a2, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v26, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v27, v25
+; RV32-NEXT:    vmul.vv v8, v26, v27
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmul_vx_v2i64_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v25, (a0)
+; RV64-NEXT:    ld a0, 0(a1)
+; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV64-NEXT:    vsext.vf2 v26, v25
+; RV64-NEXT:    vmul.vx v8, v26, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i64, i64* %y
+  %d = insertelement <2 x i64> undef, i64 %b, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> undef, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
