Commit d28b4d8

[RISCV] Lower BUILD_VECTOR with i64 type to VID on RV32 if possible (#132339)
The element type i64 of a BUILD_VECTOR is not legal on RV32, and once the node has been type-legalized the VID pattern is no longer recognized. So try to custom-lower it to VID during type legalization instead.
1 parent fbc6241 · commit d28b4d8
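To make the pattern concrete, here is a small illustration (hypothetical IR, not part of the patch; the function name is invented): an i64 BUILD_VECTOR whose elements form the index sequence 0,1,2,3. Before this change, RV32 materialized such vectors through a constant pool; with the custom lowering they become a single vid.v, as the updated tests below show.

; Hypothetical example for illustration only.
define <4 x i64> @vid_example() {
  ret <4 x i64> <i64 0, i64 1, i64 2, i64 3>
}
; Expected RV32 output after this patch (cf. buildvec_vid_step1_add0_v4i64
; in fixed-vectors-int-buildvec.ll below):
;   vsetivli zero, 4, e64, m2, ta, ma
;   vid.v v8
;   ret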

File tree

5 files changed: +136 −174 lines


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 83 additions & 59 deletions
@@ -1289,6 +1289,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       if (!Subtarget.is64Bit() && VT.getVectorElementType() == MVT::i64) {
         setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
         setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);
+
+        // Lower BUILD_VECTOR with i64 type to VID on RV32 if possible.
+        setOperationAction(ISD::BUILD_VECTOR, MVT::i64, Custom);
       }
 
       setOperationAction(
@@ -3622,6 +3625,78 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
   return Gather;
 }
 
+static SDValue lowerBuildVectorViaVID(SDValue Op, SelectionDAG &DAG,
+                                      const RISCVSubtarget &Subtarget) {
+  MVT VT = Op.getSimpleValueType();
+  assert(VT.isFixedLengthVector() && "Unexpected vector!");
+
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+  SDLoc DL(Op);
+  auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
+    int64_t StepNumerator = SimpleVID->StepNumerator;
+    unsigned StepDenominator = SimpleVID->StepDenominator;
+    int64_t Addend = SimpleVID->Addend;
+
+    assert(StepNumerator != 0 && "Invalid step");
+    bool Negate = false;
+    int64_t SplatStepVal = StepNumerator;
+    unsigned StepOpcode = ISD::MUL;
+    // Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it
+    // anyway as the shift of 63 won't fit in uimm5.
+    if (StepNumerator != 1 && StepNumerator != INT64_MIN &&
+        isPowerOf2_64(std::abs(StepNumerator))) {
+      Negate = StepNumerator < 0;
+      StepOpcode = ISD::SHL;
+      SplatStepVal = Log2_64(std::abs(StepNumerator));
+    }
+
+    // Only emit VIDs with suitably-small steps/addends. We use imm5 as a
+    // threshold since it's the immediate value many RVV instructions accept.
+    // There is no vmul.vi instruction so ensure multiply constant can fit in
+    // a single addi instruction.
+    if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
+         (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
+        isPowerOf2_32(StepDenominator) &&
+        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
+      MVT VIDVT =
+          VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
+      MVT VIDContainerVT =
+          getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
+      SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
+      // Convert right out of the scalable type so we can use standard ISD
+      // nodes for the rest of the computation. If we used scalable types with
+      // these, we'd lose the fixed-length vector info and generate worse
+      // vsetvli code.
+      VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
+      if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
+          (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
+        SDValue SplatStep = DAG.getSignedConstant(SplatStepVal, DL, VIDVT);
+        VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
+      }
+      if (StepDenominator != 1) {
+        SDValue SplatStep =
+            DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
+        VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
+      }
+      if (Addend != 0 || Negate) {
+        SDValue SplatAddend = DAG.getSignedConstant(Addend, DL, VIDVT);
+        VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
+                          VID);
+      }
+      if (VT.isFloatingPoint()) {
+        // TODO: Use vfwcvt to reduce register pressure.
+        VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
+      }
+      return VID;
+    }
+  }
+
+  return SDValue();
+}
+
 /// Try and optimize BUILD_VECTORs with "dominant values" - these are values
 /// which constitute a large proportion of the elements. In such cases we can
 /// splat a vector with the dominant element and make up the shortfall with
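A quick worked example of the arithmetic above (hypothetical test; the expected output is inferred from the bounds checks in the code and from the updated tests below): for the sequence 3,5,7,9, isSimpleVIDSequence yields StepNumerator = 2, StepDenominator = 1, Addend = 3. Since 2 is a power of two, StepOpcode becomes ISD::SHL with SplatStepVal = 1, which fits in uimm5, and the addend fits in imm5, so the sequence is built from vid.v plus a doubling and an add.

; Hypothetical input: v[i] = 2*i + 3.
define <4 x i64> @vid_step2_add3() {
  ret <4 x i64> <i64 3, i64 5, i64 7, i64 9>
}
; Plausible RV32 lowering under those assumptions (the shl-by-1 can surface
; as vadd.vv, as in buildvec_vid_step2_add0_v4i64 below):
;   vsetivli zero, 4, e64, m2, ta, ma
;   vid.v v8            ; {0, 1, 2, 3}
;   vadd.vv v8, v8, v8  ; {0, 2, 4, 6}
;   vadd.vi v8, v8, 3   ; {3, 5, 7, 9}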
@@ -3839,64 +3914,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
   // Try and match index sequences, which we can lower to the vid instruction
   // with optional modifications. An all-undef vector is matched by
   // getSplatValue, above.
-  if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
-    int64_t StepNumerator = SimpleVID->StepNumerator;
-    unsigned StepDenominator = SimpleVID->StepDenominator;
-    int64_t Addend = SimpleVID->Addend;
-
-    assert(StepNumerator != 0 && "Invalid step");
-    bool Negate = false;
-    int64_t SplatStepVal = StepNumerator;
-    unsigned StepOpcode = ISD::MUL;
-    // Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it
-    // anyway as the shift of 63 won't fit in uimm5.
-    if (StepNumerator != 1 && StepNumerator != INT64_MIN &&
-        isPowerOf2_64(std::abs(StepNumerator))) {
-      Negate = StepNumerator < 0;
-      StepOpcode = ISD::SHL;
-      SplatStepVal = Log2_64(std::abs(StepNumerator));
-    }
-
-    // Only emit VIDs with suitably-small steps/addends. We use imm5 as a
-    // threshold since it's the immediate value many RVV instructions accept.
-    // There is no vmul.vi instruction so ensure multiply constant can fit in
-    // a single addi instruction.
-    if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
-         (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
-        isPowerOf2_32(StepDenominator) &&
-        (SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
-      MVT VIDVT =
-          VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
-      MVT VIDContainerVT =
-          getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
-      SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
-      // Convert right out of the scalable type so we can use standard ISD
-      // nodes for the rest of the computation. If we used scalable types with
-      // these, we'd lose the fixed-length vector info and generate worse
-      // vsetvli code.
-      VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
-      if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
-          (StepOpcode == ISD::SHL && SplatStepVal != 0)) {
-        SDValue SplatStep = DAG.getSignedConstant(SplatStepVal, DL, VIDVT);
-        VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
-      }
-      if (StepDenominator != 1) {
-        SDValue SplatStep =
-            DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
-        VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
-      }
-      if (Addend != 0 || Negate) {
-        SDValue SplatAddend = DAG.getSignedConstant(Addend, DL, VIDVT);
-        VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
-                          VID);
-      }
-      if (VT.isFloatingPoint()) {
-        // TODO: Use vfwcvt to reduce register pressure.
-        VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
-      }
-      return VID;
-    }
-  }
+  if (SDValue Res = lowerBuildVectorViaVID(Op, DAG, Subtarget))
+    return Res;
 
   // For very small build_vectors, use a single scalar insert of a constant.
   // TODO: Base this on constant rematerialization cost, not size.
@@ -7586,8 +7605,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerVECTOR_REVERSE(Op, DAG);
   case ISD::VECTOR_SPLICE:
     return lowerVECTOR_SPLICE(Op, DAG);
-  case ISD::BUILD_VECTOR:
+  case ISD::BUILD_VECTOR: {
+    MVT VT = Op.getSimpleValueType();
+    MVT EltVT = VT.getVectorElementType();
+    if (!Subtarget.is64Bit() && EltVT == MVT::i64)
+      return lowerBuildVectorViaVID(Op, DAG, Subtarget);
     return lowerBUILD_VECTOR(Op, DAG, Subtarget);
+  }
   case ISD::SPLAT_VECTOR: {
     MVT VT = Op.getSimpleValueType();
     MVT EltVT = VT.getVectorElementType();
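One point worth noting about the dispatch above (an assumption from reading the code, not stated in the patch): on RV32, lowerBuildVectorViaVID returns an empty SDValue when the elements do not form a VID sequence, so such BUILD_VECTORs should still fall back to the generic type-legalization path. For instance:

; Hypothetical non-VID case: {7, 42} is not an affine index sequence, so on
; RV32 this presumably still legalizes the old way (e.g. via a constant pool).
define <2 x i64> @not_a_vid() {
  ret <2 x i64> <i64 7, i64 42>
}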

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll

Lines changed: 5 additions & 11 deletions
@@ -290,15 +290,11 @@ define void @buildvec_vid_stepn3_addn3_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3)
   ret void
 }
 
-; FIXME: RV32 doesn't catch this pattern due to BUILD_VECTOR legalization.
 define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
 ; RV32-LABEL: buildvec_vid_step1_add0_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI25_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI25_0)
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle8.v v10, (a0)
-; RV32-NEXT:    vsext.vf4 v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: buildvec_vid_step1_add0_v4i64:
@@ -323,11 +319,9 @@ define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
 define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
 ; RV32-LABEL: buildvec_vid_step2_add0_v4i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI26_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI26_0)
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle8.v v10, (a0)
-; RV32-NEXT:    vsext.vf4 v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: buildvec_vid_step2_add0_v4i64:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll

Lines changed: 11 additions & 21 deletions
@@ -1193,15 +1193,11 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI69_0)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a1)
-; RV32-NEXT:    lui a1, 32
-; RV32-NEXT:    addi a1, a1, 1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vmulhu.vv v8, v8, v9
-; RV32-NEXT:    vmv.s.x v9, a1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v9
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vv v8, v8, v10
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    vadd.vi v9, v9, 1
+; RV32-NEXT:    vsrl.vv v8, v8, v9
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
@@ -1348,27 +1344,21 @@ define void @mulhs_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    li a2, 63
 ; RV32-NEXT:    addi a1, a1, 1366
-; RV32-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
-; RV32-NEXT:    vmv.s.x v10, a1
-; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vsrl.vi v9, v9, 1
-; RV32-NEXT:    vrsub.vi v9, v9, 0
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vrsub.vi v11, v9, 0
+; RV32-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
+; RV32-NEXT:    vmv.s.x v10, a1
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmulh.vv v10, v8, v10
-; RV32-NEXT:    vmadd.vv v9, v8, v10
-; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v8
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v9, a2
-; RV32-NEXT:    vsra.vv v9, v9, v10
+; RV32-NEXT:    vmadd.vv v11, v8, v10
+; RV32-NEXT:    vsrl.vx v8, v11, a2
+; RV32-NEXT:    vsra.vv v9, v11, v9
 ; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll

Lines changed: 22 additions & 58 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
 
 declare <2 x i8> @llvm.stepvector.v2i8()
 
@@ -161,83 +161,47 @@ define <16 x i32> @stepvector_v16i32() {
 declare <2 x i64> @llvm.stepvector.v2i64()
 
 define <2 x i64> @stepvector_v2i64() {
-; RV32-LABEL: stepvector_v2i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, 16
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.s.x v9, a0
-; RV32-NEXT:    vsext.vf4 v8, v9
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: stepvector_v2i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    ret
+; CHECK-LABEL: stepvector_v2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    ret
   %v = call <2 x i64> @llvm.stepvector.v2i64()
   ret <2 x i64> %v
 }
 
 declare <4 x i64> @llvm.stepvector.v4i64()
 
 define <4 x i64> @stepvector_v4i64() {
-; RV32-LABEL: stepvector_v4i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI14_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI14_0)
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle8.v v10, (a0)
-; RV32-NEXT:    vsext.vf4 v8, v10
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: stepvector_v4i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    ret
+; CHECK-LABEL: stepvector_v4i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    ret
   %v = call <4 x i64> @llvm.stepvector.v4i64()
   ret <4 x i64> %v
 }
 
 declare <8 x i64> @llvm.stepvector.v8i64()
 
 define <8 x i64> @stepvector_v8i64() {
-; RV32-LABEL: stepvector_v8i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    lui a0, %hi(.LCPI15_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI15_0)
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vle8.v v12, (a0)
-; RV32-NEXT:    vsext.vf4 v8, v12
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: stepvector_v8i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    ret
+; CHECK-LABEL: stepvector_v8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    ret
   %v = call <8 x i64> @llvm.stepvector.v8i64()
   ret <8 x i64> %v
 }
 
 declare <16 x i64> @llvm.stepvector.v16i64()
 
 define <16 x i64> @stepvector_v16i64() {
-; RV32-LABEL: stepvector_v16i64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    lui a1, %hi(.LCPI16_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI16_0)
-; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a1)
-; RV32-NEXT:    vsext.vf4 v8, v16
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: stepvector_v16i64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    ret
+; CHECK-LABEL: stepvector_v16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    ret
   %v = call <16 x i64> @llvm.stepvector.v16i64()
   ret <16 x i64> %v
 }
