[RISCV] Lower BUILD_VECTOR with i64 type to VID on RV32 if possible #132339

Merged: 1 commit, Apr 9, 2025
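For context, here is a sketch (not taken from the PR description) of the kind of input this change affects. On RV32, a constant i64 index-sequence build_vector previously went through a constant-pool load, vle8.v, and vsext.vf4 because i64 is not a legal scalar type there; with this patch such sequences are matched to a single vid.v, as the updated checks in fixed-vectors-int-buildvec.ll and fixed-vectors-stepvector.ll below show. The function name here is hypothetical:

```llvm
; Hypothetical RV32 (+v) input: a plain index sequence with i64 elements.
; Before this patch (RV32): lui/addi of a constant-pool address, vle8.v, vsext.vf4.
; After this patch (RV32):  vsetivli zero, 4, e64, m2, ta, ma  followed by  vid.v v8.
define <4 x i64> @index_seq_v4i64() {
  ret <4 x i64> <i64 0, i64 1, i64 2, i64 3>
}
```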
142 changes: 83 additions & 59 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1289,6 +1289,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (!Subtarget.is64Bit() && VT.getVectorElementType() == MVT::i64) {
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::SPLAT_VECTOR_PARTS, VT, Custom);

// Lower BUILD_VECTOR with i64 type to VID on RV32 if possible.
setOperationAction(ISD::BUILD_VECTOR, MVT::i64, Custom);
Collaborator
Shouldn't this be VT, not MVT::i64? (i.e. the vector type, not the scalar element type)

Collaborator
MVT::i64 is correct so that the type legalizer calls it when the scalar type is illegal. Calling it on the vector type would mean it only gets called after the type legalizer since the vector type is legal.
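To make the reply concrete, here is a minimal sketch (hypothetical function name, not part of the patch) of the situation it describes: the fixed-length vector type is legal on RV32 with +v, but its i64 element type is not, so it is the type legalizer that handles the illegal scalar operands and consults the BUILD_VECTOR action registered for MVT::i64, reaching the new custom lowering before operation legalization would ever look at the vector-typed node.

```llvm
; On rv32 with +v, <2 x i64> is a legal vector type but i64 is an illegal
; scalar type. The type legalizer therefore processes the i64 operands and
; queries the BUILD_VECTOR action for MVT::i64, which is why the action is
; registered on the scalar type rather than on the (legal) vector VT.
define <2 x i64> @offset_index_seq() {
  ret <2 x i64> <i64 1, i64 2>
}
```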

}

setOperationAction(
@@ -3622,6 +3625,78 @@ static SDValue matchSplatAsGather(SDValue SplatVal, MVT VT, const SDLoc &DL,
return Gather;
}

static SDValue lowerBuildVectorViaVID(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
assert(VT.isFixedLengthVector() && "Unexpected vector!");

MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);

SDLoc DL(Op);
auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);

if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
int64_t StepNumerator = SimpleVID->StepNumerator;
unsigned StepDenominator = SimpleVID->StepDenominator;
int64_t Addend = SimpleVID->Addend;

assert(StepNumerator != 0 && "Invalid step");
bool Negate = false;
int64_t SplatStepVal = StepNumerator;
unsigned StepOpcode = ISD::MUL;
// Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it
// anyway as the shift of 63 won't fit in uimm5.
if (StepNumerator != 1 && StepNumerator != INT64_MIN &&
isPowerOf2_64(std::abs(StepNumerator))) {
Negate = StepNumerator < 0;
StepOpcode = ISD::SHL;
SplatStepVal = Log2_64(std::abs(StepNumerator));
}

// Only emit VIDs with suitably-small steps/addends. We use imm5 as a
// threshold since it's the immediate value many RVV instructions accept.
// There is no vmul.vi instruction so ensure multiply constant can fit in
// a single addi instruction.
if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
(StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
isPowerOf2_32(StepDenominator) &&
(SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
MVT VIDVT =
VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
MVT VIDContainerVT =
getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
// Convert right out of the scalable type so we can use standard ISD
// nodes for the rest of the computation. If we used scalable types with
// these, we'd lose the fixed-length vector info and generate worse
// vsetvli code.
VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
(StepOpcode == ISD::SHL && SplatStepVal != 0)) {
SDValue SplatStep = DAG.getSignedConstant(SplatStepVal, DL, VIDVT);
VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
}
if (StepDenominator != 1) {
SDValue SplatStep =
DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
}
if (Addend != 0 || Negate) {
SDValue SplatAddend = DAG.getSignedConstant(Addend, DL, VIDVT);
VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
VID);
}
if (VT.isFloatingPoint()) {
// TODO: Use vfwcvt to reduce register pressure.
VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
}
return VID;
}
}

return SDValue();
}

/// Try and optimize BUILD_VECTORs with "dominant values" - these are values
/// which constitute a large proportion of the elements. In such cases we can
/// splat a vector with the dominant element and make up the shortfall with
@@ -3839,64 +3914,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
// Try and match index sequences, which we can lower to the vid instruction
// with optional modifications. An all-undef vector is matched by
// getSplatValue, above.
if (auto SimpleVID = isSimpleVIDSequence(Op, Op.getScalarValueSizeInBits())) {
int64_t StepNumerator = SimpleVID->StepNumerator;
unsigned StepDenominator = SimpleVID->StepDenominator;
int64_t Addend = SimpleVID->Addend;

assert(StepNumerator != 0 && "Invalid step");
bool Negate = false;
int64_t SplatStepVal = StepNumerator;
unsigned StepOpcode = ISD::MUL;
// Exclude INT64_MIN to avoid passing it to std::abs. We won't optimize it
// anyway as the shift of 63 won't fit in uimm5.
if (StepNumerator != 1 && StepNumerator != INT64_MIN &&
isPowerOf2_64(std::abs(StepNumerator))) {
Negate = StepNumerator < 0;
StepOpcode = ISD::SHL;
SplatStepVal = Log2_64(std::abs(StepNumerator));
}

// Only emit VIDs with suitably-small steps/addends. We use imm5 as a
// threshold since it's the immediate value many RVV instructions accept.
// There is no vmul.vi instruction so ensure multiply constant can fit in
// a single addi instruction.
if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) ||
(StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) &&
isPowerOf2_32(StepDenominator) &&
(SplatStepVal >= 0 || StepDenominator == 1) && isInt<5>(Addend)) {
MVT VIDVT =
VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
MVT VIDContainerVT =
getContainerForFixedLengthVector(DAG, VIDVT, Subtarget);
SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, VIDContainerVT, Mask, VL);
// Convert right out of the scalable type so we can use standard ISD
// nodes for the rest of the computation. If we used scalable types with
// these, we'd lose the fixed-length vector info and generate worse
// vsetvli code.
VID = convertFromScalableVector(VIDVT, VID, DAG, Subtarget);
if ((StepOpcode == ISD::MUL && SplatStepVal != 1) ||
(StepOpcode == ISD::SHL && SplatStepVal != 0)) {
SDValue SplatStep = DAG.getSignedConstant(SplatStepVal, DL, VIDVT);
VID = DAG.getNode(StepOpcode, DL, VIDVT, VID, SplatStep);
}
if (StepDenominator != 1) {
SDValue SplatStep =
DAG.getConstant(Log2_64(StepDenominator), DL, VIDVT);
VID = DAG.getNode(ISD::SRL, DL, VIDVT, VID, SplatStep);
}
if (Addend != 0 || Negate) {
SDValue SplatAddend = DAG.getSignedConstant(Addend, DL, VIDVT);
VID = DAG.getNode(Negate ? ISD::SUB : ISD::ADD, DL, VIDVT, SplatAddend,
VID);
}
if (VT.isFloatingPoint()) {
// TODO: Use vfwcvt to reduce register pressure.
VID = DAG.getNode(ISD::SINT_TO_FP, DL, VT, VID);
}
return VID;
}
}
if (SDValue Res = lowerBuildVectorViaVID(Op, DAG, Subtarget))
return Res;

// For very small build_vectors, use a single scalar insert of a constant.
// TODO: Base this on constant rematerialization cost, not size.
@@ -7588,8 +7607,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerVECTOR_REVERSE(Op, DAG);
case ISD::VECTOR_SPLICE:
return lowerVECTOR_SPLICE(Op, DAG);
case ISD::BUILD_VECTOR:
case ISD::BUILD_VECTOR: {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
if (!Subtarget.is64Bit() && EltVT == MVT::i64)
return lowerBuildVectorViaVID(Op, DAG, Subtarget);
return lowerBUILD_VECTOR(Op, DAG, Subtarget);
}
case ISD::SPLAT_VECTOR: {
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
16 changes: 5 additions & 11 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -290,15 +290,11 @@ define void @buildvec_vid_stepn3_addn3_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3)
ret void
}

; FIXME: RV32 doesn't catch this pattern due to BUILD_VECTOR legalization.
define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
; RV32-LABEL: buildvec_vid_step1_add0_v4i64:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, %hi(.LCPI25_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI25_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle8.v v10, (a0)
; RV32-NEXT: vsext.vf4 v8, v10
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vid.v v8
; RV32-NEXT: ret
;
; RV64V-LABEL: buildvec_vid_step1_add0_v4i64:
@@ -323,11 +319,9 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
; RV32-LABEL: buildvec_vid_step2_add0_v4i64:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, %hi(.LCPI26_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI26_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle8.v v10, (a0)
; RV32-NEXT: vsext.vf4 v8, v10
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vid.v v8
; RV32-NEXT: vadd.vv v8, v8, v8
; RV32-NEXT: ret
;
; RV64V-LABEL: buildvec_vid_step2_add0_v4i64:
32 changes: 11 additions & 21 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1193,15 +1193,11 @@ define void @mulhu_v2i64(ptr %x) {
; RV32-NEXT: addi a1, a1, %lo(.LCPI69_0)
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vle32.v v9, (a1)
; RV32-NEXT: lui a1, 32
; RV32-NEXT: addi a1, a1, 1
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vmulhu.vv v8, v8, v9
; RV32-NEXT: vmv.s.x v9, a1
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vsext.vf4 v10, v9
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vsrl.vv v8, v8, v10
; RV32-NEXT: vid.v v9
; RV32-NEXT: vadd.vi v9, v9, 1
; RV32-NEXT: vsrl.vv v8, v8, v9
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -1348,27 +1344,21 @@ define void @mulhs_v2i64(ptr %x) {
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: lui a1, 349525
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vid.v v9
; RV32-NEXT: addi a2, a1, 1365
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.v.x v10, a2
; RV32-NEXT: li a2, 63
; RV32-NEXT: addi a1, a1, 1366
; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, ma
; RV32-NEXT: vmv.s.x v10, a1
; RV32-NEXT: lui a1, 16
; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV32-NEXT: vsrl.vi v9, v9, 1
; RV32-NEXT: vrsub.vi v9, v9, 0
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vrsub.vi v11, v9, 0
; RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma
; RV32-NEXT: vmv.s.x v10, a1
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV32-NEXT: vmulh.vv v10, v8, v10
; RV32-NEXT: vmadd.vv v9, v8, v10
; RV32-NEXT: vmv.s.x v8, a1
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vsext.vf4 v10, v8
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vsrl.vx v8, v9, a2
; RV32-NEXT: vsra.vv v9, v9, v10
; RV32-NEXT: vmadd.vv v11, v8, v10
; RV32-NEXT: vsrl.vx v8, v11, a2
; RV32-NEXT: vsra.vv v9, v11, v9
; RV32-NEXT: vadd.vv v8, v9, v8
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
80 changes: 22 additions & 58 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s

declare <2 x i8> @llvm.stepvector.v2i8()

@@ -161,83 +161,47 @@ define <16 x i32> @stepvector_v16i32() {
declare <2 x i64> @llvm.stepvector.v2i64()

define <2 x i64> @stepvector_v2i64() {
; RV32-LABEL: stepvector_v2i64:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, 16
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmv.s.x v9, a0
; RV32-NEXT: vsext.vf4 v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: stepvector_v2i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vid.v v8
; RV64-NEXT: ret
; CHECK-LABEL: stepvector_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: ret
%v = call <2 x i64> @llvm.stepvector.v2i64()
ret <2 x i64> %v
}

declare <4 x i64> @llvm.stepvector.v4i64()

define <4 x i64> @stepvector_v4i64() {
; RV32-LABEL: stepvector_v4i64:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, %hi(.LCPI14_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI14_0)
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vle8.v v10, (a0)
; RV32-NEXT: vsext.vf4 v8, v10
; RV32-NEXT: ret
;
; RV64-LABEL: stepvector_v4i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vid.v v8
; RV64-NEXT: ret
; CHECK-LABEL: stepvector_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: ret
%v = call <4 x i64> @llvm.stepvector.v4i64()
ret <4 x i64> %v
}

declare <8 x i64> @llvm.stepvector.v8i64()

define <8 x i64> @stepvector_v8i64() {
; RV32-LABEL: stepvector_v8i64:
; RV32: # %bb.0:
; RV32-NEXT: lui a0, %hi(.LCPI15_0)
; RV32-NEXT: addi a0, a0, %lo(.LCPI15_0)
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vle8.v v12, (a0)
; RV32-NEXT: vsext.vf4 v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: stepvector_v8i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vid.v v8
; RV64-NEXT: ret
; CHECK-LABEL: stepvector_v8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: ret
%v = call <8 x i64> @llvm.stepvector.v8i64()
ret <8 x i64> %v
}

declare <16 x i64> @llvm.stepvector.v16i64()

define <16 x i64> @stepvector_v16i64() {
; RV32-LABEL: stepvector_v16i64:
; RV32: # %bb.0:
; RV32-NEXT: li a0, 32
; RV32-NEXT: lui a1, %hi(.LCPI16_0)
; RV32-NEXT: addi a1, a1, %lo(.LCPI16_0)
; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
; RV32-NEXT: vle8.v v16, (a1)
; RV32-NEXT: vsext.vf4 v8, v16
; RV32-NEXT: ret
;
; RV64-LABEL: stepvector_v16i64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vid.v v8
; RV64-NEXT: ret
; CHECK-LABEL: stepvector_v16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: ret
%v = call <16 x i64> @llvm.stepvector.v16i64()
ret <16 x i64> %v
}