Skip to content

Commit ac518c7

Browse files
authored
[RISCV] Vector sub (zext, zext) -> sext (sub (zext, zext)) (#82455)
This is legal as long as the inner zext retains at least one bit of increase, so that the sub overflow case (0 - UINT_MAX) can be represented. Alive2 proof: https://alive2.llvm.org/ce/z/BKeV3W. For RVV, restrict this to power-of-two sizes with the operation type being at least e8, to stick to legal extends. We could arguably handle i1 source types with some care if we wanted to. This is likely profitable because it may allow us to perform the sub instruction in a narrow LMUL (equivalently, in fewer DLEN-sized pieces) before widening for the user. We could arguably avoid narrowing below DLEN, but the transform should at worst introduce one extra extend and one extra vsetvli toggle if the source could previously be handled via loads with an explicit EEW.
1 parent cd1d4d8 commit ac518c7

File tree

2 files changed

+40
-17
lines changed

2 files changed

+40
-17
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12887,21 +12887,44 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
1288712887
if (SDValue V = combineSubOfBoolean(N, DAG))
1288812888
return V;
1288912889

12890+
EVT VT = N->getValueType(0);
1289012891
SDValue N0 = N->getOperand(0);
1289112892
SDValue N1 = N->getOperand(1);
1289212893
// fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1)
1289312894
if (isNullConstant(N0) && N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
1289412895
isNullConstant(N1.getOperand(1))) {
1289512896
ISD::CondCode CCVal = cast<CondCodeSDNode>(N1.getOperand(2))->get();
1289612897
if (CCVal == ISD::SETLT) {
12897-
EVT VT = N->getValueType(0);
1289812898
SDLoc DL(N);
1289912899
unsigned ShAmt = N0.getValueSizeInBits() - 1;
1290012900
return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0),
1290112901
DAG.getConstant(ShAmt, DL, VT));
1290212902
}
1290312903
}
1290412904

12905+
// sub (zext, zext) -> sext (sub (zext, zext))
12906+
// where the sum of the extend widths matches, and the inner zexts
12907+
// add at least one bit. (For profitability on rvv, we use a
12908+
// power of two for both inner and outer extend.)
12909+
if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) &&
12910+
N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND &&
12911+
N0.hasOneUse() && N1.hasOneUse()) {
12912+
SDValue Src0 = N0.getOperand(0);
12913+
SDValue Src1 = N1.getOperand(0);
12914+
EVT SrcVT = Src0.getValueType();
12915+
if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) &&
12916+
SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 &&
12917+
SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) {
12918+
LLVMContext &C = *DAG.getContext();
12919+
EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C);
12920+
EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount());
12921+
Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0);
12922+
Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1);
12923+
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
12924+
DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1));
12925+
}
12926+
}
12927+
1290512928
// fold (sub x, (select lhs, rhs, cc, 0, y)) ->
1290612929
// (select lhs, rhs, cc, x, (sub x, y))
1290712930
return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -385,12 +385,12 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind {
385385
define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) {
386386
; CHECK-LABEL: vwsubu_v2i32_v2i8:
387387
; CHECK: # %bb.0:
388-
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
388+
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
389389
; CHECK-NEXT: vle8.v v8, (a0)
390390
; CHECK-NEXT: vle8.v v9, (a1)
391-
; CHECK-NEXT: vzext.vf2 v10, v8
392-
; CHECK-NEXT: vzext.vf2 v11, v9
393-
; CHECK-NEXT: vwsubu.vv v8, v10, v11
391+
; CHECK-NEXT: vwsubu.vv v10, v8, v9
392+
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
393+
; CHECK-NEXT: vsext.vf2 v8, v10
394394
; CHECK-NEXT: ret
395395
%a = load <2 x i8>, ptr %x
396396
%b = load <2 x i8>, ptr %y
@@ -899,12 +899,12 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
899899
define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
900900
; CHECK-LABEL: vwsubu_v2i32_of_v2i8:
901901
; CHECK: # %bb.0:
902-
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
902+
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
903903
; CHECK-NEXT: vle8.v v8, (a0)
904904
; CHECK-NEXT: vle8.v v9, (a1)
905-
; CHECK-NEXT: vzext.vf2 v10, v8
906-
; CHECK-NEXT: vzext.vf2 v11, v9
907-
; CHECK-NEXT: vwsubu.vv v8, v10, v11
905+
; CHECK-NEXT: vwsubu.vv v10, v8, v9
906+
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
907+
; CHECK-NEXT: vsext.vf2 v8, v10
908908
; CHECK-NEXT: ret
909909
%a = load <2 x i8>, ptr %x
910910
%b = load <2 x i8>, ptr %y
@@ -917,12 +917,12 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
917917
define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
918918
; CHECK-LABEL: vwsubu_v2i64_of_v2i8:
919919
; CHECK: # %bb.0:
920-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
920+
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
921921
; CHECK-NEXT: vle8.v v8, (a0)
922922
; CHECK-NEXT: vle8.v v9, (a1)
923-
; CHECK-NEXT: vzext.vf4 v10, v8
924-
; CHECK-NEXT: vzext.vf4 v11, v9
925-
; CHECK-NEXT: vwsubu.vv v8, v10, v11
923+
; CHECK-NEXT: vwsubu.vv v10, v8, v9
924+
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
925+
; CHECK-NEXT: vsext.vf4 v8, v10
926926
; CHECK-NEXT: ret
927927
%a = load <2 x i8>, ptr %x
928928
%b = load <2 x i8>, ptr %y
@@ -935,12 +935,12 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
935935
define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) {
936936
; CHECK-LABEL: vwsubu_v2i64_of_v2i16:
937937
; CHECK: # %bb.0:
938-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
938+
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
939939
; CHECK-NEXT: vle16.v v8, (a0)
940940
; CHECK-NEXT: vle16.v v9, (a1)
941-
; CHECK-NEXT: vzext.vf2 v10, v8
942-
; CHECK-NEXT: vzext.vf2 v11, v9
943-
; CHECK-NEXT: vwsubu.vv v8, v10, v11
941+
; CHECK-NEXT: vwsubu.vv v10, v8, v9
942+
; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
943+
; CHECK-NEXT: vsext.vf2 v8, v10
944944
; CHECK-NEXT: ret
945945
%a = load <2 x i16>, ptr %x
946946
%b = load <2 x i16>, ptr %y

0 commit comments

Comments
 (0)