Skip to content

Commit f7c8a03

Browse files
authored
[RISCV] Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) -> (bitcast (sra (v2Xi16 (bitcast X)), 15)) (#93565)
Similar for i16 and i64 elements, for both fixed and scalable vectors. This reduces the number of vector instructions, but increases vl/vtype toggles. This reduces some code in 525.x264_r from SPEC2017. In that usage, the vectors are fixed-length with a small number of elements, so vsetivli can be used. This is similar to `performMulVectorCmpZeroCombine` from AArch64.
1 parent 760c2aa commit f7c8a03

File tree

2 files changed

+158
-0
lines changed

2 files changed

+158
-0
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13704,6 +13704,44 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1370413704
return SDValue();
1370513705
}
1370613706

13707+
// Combine vXi32 (mul (and (lshr X, 15), 0x10001), 0xffff) ->
// (bitcast (sra (v2Xi16 (bitcast X)), 15))
// Same for other equivalent types with other equivalent constants.
//
// Why this is sound: (lshr X, HalfSize-1) moves the sign bit of each
// half-width lane of X to bit 0 and bit HalfSize of the full-width lane;
// the AND with (1 | 1 << HalfSize) isolates those two bits, and the
// multiply by the half-width all-ones mask smears each bit across its
// half. The net effect is an arithmetic shift right by HalfSize-1 of the
// same data viewed as a vector of half-sized elements, which needs only a
// single shift instruction after the (free) bitcasts.
static SDValue combineVectorMulToSraBitcast(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Do this for legal vectors unless they are i1 or i8 vectors.
  // (Halving an i8 element would give i4; the < 16 check rules that out.)
  if (!VT.isVector() || !TLI.isTypeLegal(VT) || VT.getScalarSizeInBits() < 16)
    return SDValue();

  // Match the shape mul (and (srl X, V3), V2), V1 before looking at the
  // constants.
  if (N->getOperand(0).getOpcode() != ISD::AND ||
      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();

  SDValue And = N->getOperand(0);
  SDValue Srl = And.getOperand(0);

  // All three operands must be constant splats: V1 is the multiplier,
  // V2 the AND mask, V3 the logical shift amount.
  APInt V1, V2, V3;
  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
    return SDValue();

  // Require V1 == (1 << HalfSize) - 1, V2 == 1 | (1 << HalfSize) and
  // V3 == HalfSize - 1, e.g. 0xffff / 0x10001 / 15 for i32 elements.
  // NOTE(review): no one-use check on And/Srl — if either has additional
  // users the combine may add instructions rather than remove them;
  // confirm this is intended.
  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return SDValue();

  // Build the vector type with half-sized elements and twice the element
  // count, then emit bitcast -> sra (HalfSize-1) -> bitcast back.
  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
                                VT.getVectorElementCount() * 2);
  SDLoc DL(N);
  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, HalfVT, Srl.getOperand(0));
  SDValue Sra = DAG.getNode(ISD::SRA, DL, HalfVT, Cast,
                            DAG.getConstant(HalfSize - 1, DL, HalfVT));
  return DAG.getNode(ISD::BITCAST, DL, VT, Sra);
}
1370713745

1370813746
static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
1370913747
TargetLowering::DAGCombinerInfo &DCI,
@@ -13748,6 +13786,9 @@ static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
1374813786
if (SDValue V = combineBinOpOfZExt(N, DAG))
1374913787
return V;
1375013788

13789+
if (SDValue V = combineVectorMulToSraBitcast(N, DAG))
13790+
return V;
13791+
1375113792
return SDValue();
1375213793
}
1375313794

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV32
3+
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64
4+
5+
; Positive test: shift 7 == 8-1, mask 257 == 0x101, multiplier 255 == 0xff,
; so the mul(and(lshr)) pattern combines to a single e8 vsra.vi by 7.
define <2 x i16> @test_v2i16(<2 x i16> %x) {
; CHECK-RV32-LABEL: test_v2i16:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-RV32-NEXT:    vsra.vi v8, v8, 7
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_v2i16:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-RV64-NEXT:    vsra.vi v8, v8, 7
; CHECK-RV64-NEXT:    ret
  %1 = lshr <2 x i16> %x, <i16 7, i16 7>
  %2 = and <2 x i16> %1, <i16 257, i16 257>
  %3 = mul <2 x i16> %2, <i16 255, i16 255>
  ret <2 x i16> %3
}
22+
23+
; Negative test: the multiplier is 256, not the required mask 255, so the
; combine must not fire and codegen keeps the vsrl/vand/vsll sequence
; (mul by 256 is lowered as a left shift by 8).
define <vscale x 2 x i16> @test_nxv2i16(<vscale x 2 x i16> %x) {
; CHECK-RV32-LABEL: test_nxv2i16:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-RV32-NEXT:    vsrl.vi v8, v8, 7
; CHECK-RV32-NEXT:    li a0, 257
; CHECK-RV32-NEXT:    vand.vx v8, v8, a0
; CHECK-RV32-NEXT:    vsll.vi v8, v8, 8
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_nxv2i16:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
; CHECK-RV64-NEXT:    vsrl.vi v8, v8, 7
; CHECK-RV64-NEXT:    li a0, 257
; CHECK-RV64-NEXT:    vand.vx v8, v8, a0
; CHECK-RV64-NEXT:    vsll.vi v8, v8, 8
; CHECK-RV64-NEXT:    ret
  %1 = lshr <vscale x 2 x i16> %x, splat (i16 7)
  %2 = and <vscale x 2 x i16> %1, splat (i16 257)
  %3 = mul <vscale x 2 x i16> %2, splat (i16 256)
  ret <vscale x 2 x i16> %3
}
46+
47+
; Positive test: shift 15 == 16-1, mask 65537 == 0x10001, multiplier
; 65535 == 0xffff, so this combines to a single e16 vsra.vi by 15.
define <2 x i32> @test_v2i32(<2 x i32> %x) {
; CHECK-RV32-LABEL: test_v2i32:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_v2i32:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
; CHECK-RV64-NEXT:    ret
  %1 = lshr <2 x i32> %x, <i32 15, i32 15>
  %2 = and <2 x i32> %1, <i32 65537, i32 65537>
  %3 = mul <2 x i32> %2, <i32 65535, i32 65535>
  ret <2 x i32> %3
}
64+
65+
; Same i32 pattern as test_v2i32 but with a scalable vector; the combine
; applies equally and a vsetvli (rather than vsetivli) sets VL.
define <vscale x 2 x i32> @test_nxv2i32(<vscale x 2 x i32> %x) {
; CHECK-RV32-LABEL: test_nxv2i32:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-RV32-NEXT:    vsra.vi v8, v8, 15
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_nxv2i32:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
; CHECK-RV64-NEXT:    vsra.vi v8, v8, 15
; CHECK-RV64-NEXT:    ret
  %1 = lshr <vscale x 2 x i32> %x, splat (i32 15)
  %2 = and <vscale x 2 x i32> %1, splat (i32 65537)
  %3 = mul <vscale x 2 x i32> %2, splat (i32 65535)
  ret <vscale x 2 x i32> %3
}
82+
83+
; Positive test: shift 31 == 32-1, mask 4294967297 == 0x100000001,
; multiplier 4294967295 == 0xffffffff, so this combines to a single
; e32 vsra.vi by 31.
define <2 x i64> @test_v2i64(<2 x i64> %x) {
; CHECK-RV32-LABEL: test_v2i64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_v2i64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
; CHECK-RV64-NEXT:    ret
  %1 = lshr <2 x i64> %x, <i64 31, i64 31>
  %2 = and <2 x i64> %1, <i64 4294967297, i64 4294967297>
  %3 = mul <2 x i64> %2, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %3
}
100+
101+
; Same i64 pattern as test_v2i64 but with a scalable vector; combines to
; a single e32 vsra.vi by 31.
define <vscale x 2 x i64> @test_nxv2i64(<vscale x 2 x i64> %x) {
; CHECK-RV32-LABEL: test_nxv2i64:
; CHECK-RV32:       # %bb.0:
; CHECK-RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; CHECK-RV32-NEXT:    vsra.vi v8, v8, 31
; CHECK-RV32-NEXT:    ret
;
; CHECK-RV64-LABEL: test_nxv2i64:
; CHECK-RV64:       # %bb.0:
; CHECK-RV64-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
; CHECK-RV64-NEXT:    vsra.vi v8, v8, 31
; CHECK-RV64-NEXT:    ret
  %1 = lshr <vscale x 2 x i64> %x, splat (i64 31)
  %2 = and <vscale x 2 x i64> %1, splat (i64 4294967297)
  %3 = mul <vscale x 2 x i64> %2, splat (i64 4294967295)
  ret <vscale x 2 x i64> %3
}

0 commit comments

Comments
 (0)