Commit e7e72a9
[RISCV] Add DAG combine for forming VAADDU_VL from VP intrinsics. (llvm#124848)
This adds a VP version of an existing DAG combine. I've put it in RISCVISelLowering since we would need to add an ISD::VP_AVGCEIL opcode otherwise. This pattern appears in 525.x264_r.
1 parent a643e44 commit e7e72a9
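For context, the matched pattern is the unsigned rounding ("ceiling") average. A minimal scalar sketch of the equivalence, assuming 8-bit elements widened to 32 bits as in the comment below (function names are illustrative, not part of the patch):

#include <cstdint>

// Shape of the matched IR: widen so the adds cannot overflow,
// add 1, shift right by 1, truncate back.
uint8_t avgceilu_widened(uint8_t a, uint8_t b) {
  uint32_t wide = uint32_t{a} + uint32_t{b} + 1; // zext + add + add splat(1)
  return static_cast<uint8_t>(wide >> 1);        // lshr splat(1) + trunc
}

// What vaaddu computes under vxrm=0 (round-to-nearest-up): the sum is
// halved and the shifted-out bit is added back, which equals (a+b+1)>>1,
// so no widening is needed.
uint8_t avgceilu_vaaddu(uint8_t a, uint8_t b) {
  uint16_t sum = uint16_t{a} + uint16_t{b};
  return static_cast<uint8_t>((sum >> 1) + (sum & 1));
}

Both functions return the same value for every input pair, which is why the widen/add/shift/truncate chain can collapse to a single averaging add.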

File tree: 2 files changed, +268 -12 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 99 additions & 12 deletions
@@ -1526,18 +1526,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                        ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
   if (Subtarget.hasVInstructions())
-    setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER,
-                         ISD::MSCATTER, ISD::VP_GATHER,
-                         ISD::VP_SCATTER, ISD::SRA,
-                         ISD::SRL, ISD::SHL,
-                         ISD::STORE, ISD::SPLAT_VECTOR,
-                         ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
-                         ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE,
-                         ISD::MUL, ISD::SDIV,
-                         ISD::UDIV, ISD::SREM,
-                         ISD::UREM, ISD::INSERT_VECTOR_ELT,
-                         ISD::ABS, ISD::CTPOP,
-                         ISD::VECTOR_SHUFFLE, ISD::VSELECT});
+    setTargetDAGCombine(
+        {ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
+         ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA,
+         ISD::SRL, ISD::SHL, ISD::STORE,
+         ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
+         ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE,
+         ISD::MUL, ISD::SDIV, ISD::UDIV,
+         ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT,
+         ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE,
+         ISD::VSELECT});
 
   if (Subtarget.hasVendorXTHeadMemPair())
     setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -16373,6 +16371,93 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
                      VPStore->isTruncatingStore(), VPStore->isCompressingStore());
 }
 
+// Peephole avgceil pattern.
+//   %1 = zext <N x i8> %a to <N x i32>
+//   %2 = zext <N x i8> %b to <N x i32>
+//   %3 = add nuw nsw <N x i32> %1, splat (i32 1)
+//   %4 = add nuw nsw <N x i32> %3, %2
+//   %5 = lshr <N x i32> %4, splat (i32 1)
+//   %6 = trunc <N x i32> %5 to <N x i8>
+static SDValue performVP_TRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+                                         const RISCVSubtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  // Ignore fixed vectors.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isScalableVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  SDValue In = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  SDValue VL = N->getOperand(2);
+
+  // Input should be a vp_srl with the same mask and VL.
+  if (In.getOpcode() != ISD::VP_SRL || In.getOperand(2) != Mask ||
+      In.getOperand(3) != VL)
+    return SDValue();
+
+  // Shift amount should be 1.
+  if (!isOneOrOneSplat(In.getOperand(1)))
+    return SDValue();
+
+  // Shifted value should be a vp_add with the same mask and VL.
+  SDValue LHS = In.getOperand(0);
+  if (LHS.getOpcode() != ISD::VP_ADD || LHS.getOperand(2) != Mask ||
+      LHS.getOperand(3) != VL)
+    return SDValue();
+
+  SDValue Operands[3];
+
+  // Matches another VP_ADD with the same VL and Mask.
+  auto FindAdd = [&](SDValue V, SDValue Other) {
+    if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask ||
+        V.getOperand(3) != VL)
+      return false;
+
+    Operands[0] = Other;
+    Operands[1] = V.getOperand(1);
+    Operands[2] = V.getOperand(0);
+    return true;
+  };
+
+  // We need to find another VP_ADD in one of the operands.
+  SDValue LHS0 = LHS.getOperand(0);
+  SDValue LHS1 = LHS.getOperand(1);
+  if (!FindAdd(LHS0, LHS1) && !FindAdd(LHS1, LHS0))
+    return SDValue();
+
+  // Now we have the three operands of the two additions. Check that one of
+  // them is a constant vector of ones.
+  auto I = llvm::find_if(Operands,
+                         [](const SDValue &Op) { return isOneOrOneSplat(Op); });
+  if (I == std::end(Operands))
+    return SDValue();
+  // We found a vector of ones; move it to the end of the Operands array.
+  std::swap(*I, Operands[2]);
+
+  // Make sure the other two operands can be promoted from the result type.
+  for (SDValue Op : drop_end(Operands)) {
+    if (Op.getOpcode() != ISD::VP_ZERO_EXTEND || Op.getOperand(1) != Mask ||
+        Op.getOperand(2) != VL)
+      return SDValue();
+    // Input must be the same size or smaller than our result.
+    if (Op.getOperand(0).getScalarValueSizeInBits() > VT.getScalarSizeInBits())
+      return SDValue();
+  }
+
+  // Pattern is detected.
+  // Rebuild the zero extends in case the inputs are smaller than our result.
+  SDValue NewOp0 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT,
+                               Operands[0].getOperand(0), Mask, VL);
+  SDValue NewOp1 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT,
+                               Operands[1].getOperand(0), Mask, VL);
+  // Build an AVGCEILU_VL, which will be selected as a VAADDU with RNU rounding
+  // mode.
+  SDLoc DL(N);
+  return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT,
+                     {NewOp0, NewOp1, DAG.getUNDEF(VT), Mask, VL});
+}
+
 // Convert from one FMA opcode to another based on whether we are negating the
 // multiply result and/or the accumulator.
 // NOTE: Only supports RVV operations with VL.
@@ -17930,6 +18015,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     if (SDValue V = combineTruncOfSraSext(N, DAG))
       return V;
     return combineTruncToVnclip(N, DAG, Subtarget);
+  case ISD::VP_TRUNCATE:
+    return performVP_TRUNCATECombine(N, DAG, Subtarget);
   case ISD::TRUNCATE:
     return performTRUNCATECombine(N, DAG, Subtarget);
   case ISD::SELECT:
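A note on the rounding mode, per the RVV spec's definition of the averaging add (a short derivation, with shift amount d = 1):

    vaaddu.vv:  vd = roundoff_unsigned(vs2 + vs1, 1)
    roundoff(v, d) = (v >> d) + r, where r = v[d-1] under vxrm = 0 (RNU)

    => result = ((a + b) >> 1) + ((a + b) & 1) = (a + b + 1) >> 1

This is exactly the avgceilu value the matched IR computes in the wider type, which is why the generated code in the tests below sets csrwi vxrm, 0 before vaaddu.vv.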
Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vaaddu_1(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_2(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_3(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_4(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_5(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_6(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+; Test where the size is reduced by 4x instead of 2x.
+define <vscale x 2 x i8> @vaaddu_7(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT:    vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
+
+; Test where the zext can't be completely removed.
+define <vscale x 2 x i16> @vaaddu_8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vzext.vf2 v8, v9, v0.t
+; CHECK-NEXT:    vaaddu.vv v8, v10, v8, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i16> %d
+}
+
+; Negative test. The truncate has a smaller type than the zero extend.
+; TODO: Could still handle this by truncating after an i16 vaaddu.
+define <vscale x 2 x i8> @vaaddu_9(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vwaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vi v8, v10, 1, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    ret
+  %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1> %m, i32 %vl)
+  %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 %vl)
+  %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+  %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
+  %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
+  ret <vscale x 2 x i8> %d
+}
