Commit 275eeda

preames and lukel97 authored
[RISCV] Split long build_vector sequences to reduce critical path (#81312)
If we have a long chain of vslide1down instructions to build e.g. a <16 x i8> from scalar, we end up with a critical path going through the entire chain. We can instead build two halves, and then combine them with a vselect. This costs one additional temporary register, but reduces the critical path by roughly half.

To avoid needing to change VL, we fill each half with undefs for the elements which will come from the other half. The vselect will at worst become a vmerge, but is often folded back into the final instruction of the sequence building the lower half.

A couple notes on the heuristic here:

* This is restricted to LMUL1 to avoid quadratic costing reasoning.
* This only splits once. In future work, we can explore recursive splitting here, but I'm a bit worried about register pressure and thus decided to be conservative. It also happens to be "enough" at the default zvl of 128.
* "8" is picked somewhat arbitrarily as being "long". In practice, our build_vector codegen for 2 defined elements in a VL=4 vector appears to need some work. 4 defined elements in a VL=8 vector seems to generally produce reasonable results.
* Halves may not be an optimal split point. I went down the rabbit hole of trying to find the optimal one, and decided it wasn't worth the effort to start with.

---------

Co-authored-by: Luke Lau <[email protected]>
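To make the "roughly half" claim above concrete, here is a minimal standalone sketch (not part of the patch; it ignores vsetvli and mask setup and just counts the dependent vector operations in each lowering). The helper names serialDepth and splitDepth are illustrative only.

// Standalone sketch, not LLVM code: approximate dependence depth of the two
// build_vector lowerings, counting only dependent vector instructions.
#include <cstdio>
#include <initializer_list>

// Serial chain: one vector move for element 0, then N-1 dependent slide1downs.
static unsigned serialDepth(unsigned NumElts) { return NumElts; }

// Split scheme: the two half chains are independent, so the path is the depth
// of one half plus the single vselect/masked vslidedown that recombines them.
static unsigned splitDepth(unsigned NumElts) {
  unsigned Half = (NumElts + 1) / 2;
  return serialDepth(Half) + 1;
}

int main() {
  for (unsigned N : {8u, 16u}) // e.g. <8 x float>, <16 x i8>
    std::printf("N=%u serial=%u split=%u\n", N, serialDepth(N), splitDepth(N));
  return 0;
}

For N = 16 this gives a depth of 16 for the single chain versus 9 for the split form, which matches the "roughly half" estimate in the description.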
1 parent d99d258 commit 275eeda

5 files changed: +258 -275 lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 41 additions & 0 deletions
@@ -3877,6 +3877,47 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return convertFromScalableVector(VT, Vec, DAG, Subtarget);
   }
 
+  // For m1 vectors, if we have non-undef values in both halves of our vector,
+  // split the vector into low and high halves, build them separately, then
+  // use a vselect to combine them. For long vectors, this cuts the critical
+  // path of the vslide1down sequence in half, and gives us an opportunity
+  // to special case each half independently. Note that we don't change the
+  // length of the sub-vectors here, so if both fallback to the generic
+  // vslide1down path, we should be able to fold the vselect into the final
+  // vslidedown (for the undef tail) for the first half w/ masking.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumUndefElts =
+      count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
+  unsigned NumDefElts = NumElts - NumUndefElts;
+  if (NumDefElts >= 8 && NumDefElts > NumElts / 2 &&
+      ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) {
+    SmallVector<SDValue> SubVecAOps, SubVecBOps;
+    SmallVector<SDValue> MaskVals;
+    SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0));
+    SubVecAOps.reserve(NumElts);
+    SubVecBOps.reserve(NumElts);
+    for (unsigned i = 0; i < NumElts; i++) {
+      SDValue Elem = Op->getOperand(i);
+      if (i < NumElts / 2) {
+        SubVecAOps.push_back(Elem);
+        SubVecBOps.push_back(UndefElem);
+      } else {
+        SubVecAOps.push_back(UndefElem);
+        SubVecBOps.push_back(Elem);
+      }
+      bool SelectMaskVal = (i < NumElts / 2);
+      MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+    }
+    assert(SubVecAOps.size() == NumElts && SubVecBOps.size() == NumElts &&
+           MaskVals.size() == NumElts);
+
+    SDValue SubVecA = DAG.getBuildVector(VT, DL, SubVecAOps);
+    SDValue SubVecB = DAG.getBuildVector(VT, DL, SubVecBOps);
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+    SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, SubVecA, SubVecB);
+  }
+
   // Cap the cost at a value linear to the number of elements in the vector.
   // The default lowering is to use the stack. The vector store + scalar loads
   // is linear in VL. However, at high lmuls vslide1down and vslidedown end up
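For reference, a minimal standalone sketch of the element-count gate used in the hunk above; the ContainerVT/LMUL1 check is omitted because it depends on the container type, and shouldSplit is an illustrative name rather than an LLVM helper.

// Standalone sketch: the element-count half of the splitting heuristic.
// A build_vector is split only when it has at least 8 defined elements and
// more than half of its elements are defined.
#include <cstdio>

static bool shouldSplit(unsigned NumElts, unsigned NumUndefElts) {
  unsigned NumDefElts = NumElts - NumUndefElts;
  return NumDefElts >= 8 && NumDefElts > NumElts / 2;
}

int main() {
  std::printf("%d\n", shouldSplit(16, 0));  // 1: 16 defined elements
  std::printf("%d\n", shouldSplit(16, 10)); // 0: only 6 defined elements
  std::printf("%d\n", shouldSplit(8, 0));   // 1: 8 defined elements
  return 0;
}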

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 10 additions & 6 deletions
@@ -1399,15 +1399,17 @@ define <2 x double> @vid_step2_v2f64() {
 define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float %e3, float %e4, float %e5, float %e6, float %e7) vscale_range(4, 128) {
 ; CHECK-LABEL: buildvec_v8f32_zvl256:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, mu
 ; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT: vfslide1down.vf v9, v8, fa3
+; CHECK-NEXT: vfmv.v.f v8, fa4
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT: vmv.v.i v0, 15
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT: ret
 %v0 = insertelement <8 x float> poison, float %e0, i64 0
 %v1 = insertelement <8 x float> %v0, float %e1, i64 1
@@ -1448,15 +1450,17 @@ define <8 x double> @buildvec_v8f64_zvl256(double %e0, double %e1, double %e2, d
 define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7) vscale_range(8, 128) {
 ; CHECK-LABEL: buildvec_v8f64_zvl512:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e64, m1, ta, mu
 ; CHECK-NEXT: vfmv.v.f v8, fa0
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa1
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa3
-; CHECK-NEXT: vfslide1down.vf v8, v8, fa4
+; CHECK-NEXT: vfslide1down.vf v9, v8, fa3
+; CHECK-NEXT: vfmv.v.f v8, fa4
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
+; CHECK-NEXT: vmv.v.i v0, 15
 ; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT: ret
 %v0 = insertelement <8 x double> poison, double %e0, i64 0
 %v1 = insertelement <8 x double> %v0, double %e1, i64 1

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll

Lines changed: 60 additions & 52 deletions
@@ -359,44 +359,46 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT: feq.d a0, fa3, fa3
 ; RV32-NEXT: fmax.d fa3, fa3, fa5
 ; RV32-NEXT: fmin.d fa3, fa3, fa4
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT: fld fa2, 40(sp)
 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: fld fa3, 32(sp)
 ; RV32-NEXT: neg a0, a0
 ; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: feq.d a0, fa3, fa3
-; RV32-NEXT: fmax.d fa3, fa3, fa5
+; RV32-NEXT: feq.d a2, fa2, fa2
+; RV32-NEXT: fmax.d fa3, fa2, fa5
 ; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a2, fa3, rtz
-; RV32-NEXT: fld fa3, 40(sp)
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: feq.d a0, fa3, fa3
+; RV32-NEXT: fcvt.w.d a3, fa3, rtz
+; RV32-NEXT: fld fa3, 32(sp)
+; RV32-NEXT: vslide1down.vx v8, v10, a0
+; RV32-NEXT: neg a0, a2
+; RV32-NEXT: and a0, a0, a3
+; RV32-NEXT: feq.d a2, fa3, fa3
+; RV32-NEXT: neg a2, a2
 ; RV32-NEXT: fmax.d fa3, fa3, fa5
 ; RV32-NEXT: fmin.d fa3, fa3, fa4
-; RV32-NEXT: fcvt.w.d a2, fa3, rtz
+; RV32-NEXT: fcvt.w.d a3, fa3, rtz
 ; RV32-NEXT: fld fa3, 48(sp)
-; RV32-NEXT: neg a0, a0
-; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: and a2, a2, a3
+; RV32-NEXT: vmv.v.x v9, a2
+; RV32-NEXT: vslide1down.vx v9, v9, a0
 ; RV32-NEXT: feq.d a0, fa3, fa3
 ; RV32-NEXT: fmax.d fa3, fa3, fa5
 ; RV32-NEXT: fmin.d fa3, fa3, fa4
 ; RV32-NEXT: fcvt.w.d a2, fa3, rtz
 ; RV32-NEXT: fld fa3, 56(sp)
 ; RV32-NEXT: neg a0, a0
 ; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vslide1down.vx v9, v9, a0
 ; RV32-NEXT: feq.d a0, fa3, fa3
 ; RV32-NEXT: neg a0, a0
 ; RV32-NEXT: fmax.d fa5, fa3, fa5
 ; RV32-NEXT: fmin.d fa5, fa5, fa4
 ; RV32-NEXT: fcvt.w.d a2, fa5, rtz
 ; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vse8.v v8, (a1)
+; RV32-NEXT: vmv.v.i v0, 15
+; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV32-NEXT: vse8.v v9, (a1)
 ; RV32-NEXT: addi sp, s0, -128
 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
@@ -458,44 +460,46 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT: feq.d a0, fa3, fa3
 ; RV64-NEXT: fmax.d fa3, fa3, fa5
 ; RV64-NEXT: fmin.d fa3, fa3, fa4
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV64-NEXT: fld fa2, 40(sp)
 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: fld fa3, 32(sp)
 ; RV64-NEXT: neg a0, a0
 ; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: feq.d a0, fa3, fa3
-; RV64-NEXT: fmax.d fa3, fa3, fa5
+; RV64-NEXT: feq.d a2, fa2, fa2
+; RV64-NEXT: fmax.d fa3, fa2, fa5
 ; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a2, fa3, rtz
-; RV64-NEXT: fld fa3, 40(sp)
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: feq.d a0, fa3, fa3
+; RV64-NEXT: fcvt.l.d a3, fa3, rtz
+; RV64-NEXT: fld fa3, 32(sp)
+; RV64-NEXT: vslide1down.vx v8, v10, a0
+; RV64-NEXT: neg a0, a2
+; RV64-NEXT: and a0, a0, a3
+; RV64-NEXT: feq.d a2, fa3, fa3
+; RV64-NEXT: negw a2, a2
 ; RV64-NEXT: fmax.d fa3, fa3, fa5
 ; RV64-NEXT: fmin.d fa3, fa3, fa4
-; RV64-NEXT: fcvt.l.d a2, fa3, rtz
+; RV64-NEXT: fcvt.l.d a3, fa3, rtz
 ; RV64-NEXT: fld fa3, 48(sp)
-; RV64-NEXT: neg a0, a0
-; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: and a2, a2, a3
+; RV64-NEXT: vmv.v.x v9, a2
+; RV64-NEXT: vslide1down.vx v9, v9, a0
 ; RV64-NEXT: feq.d a0, fa3, fa3
 ; RV64-NEXT: fmax.d fa3, fa3, fa5
 ; RV64-NEXT: fmin.d fa3, fa3, fa4
 ; RV64-NEXT: fcvt.l.d a2, fa3, rtz
 ; RV64-NEXT: fld fa3, 56(sp)
 ; RV64-NEXT: neg a0, a0
 ; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vslide1down.vx v9, v9, a0
 ; RV64-NEXT: feq.d a0, fa3, fa3
 ; RV64-NEXT: neg a0, a0
 ; RV64-NEXT: fmax.d fa5, fa3, fa5
 ; RV64-NEXT: fmin.d fa5, fa5, fa4
 ; RV64-NEXT: fcvt.l.d a2, fa5, rtz
 ; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vse8.v v8, (a1)
+; RV64-NEXT: vmv.v.i v0, 15
+; RV64-NEXT: vslide1down.vx v9, v9, a0
+; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV64-NEXT: vse8.v v9, (a1)
 ; RV64-NEXT: addi sp, s0, -128
 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
@@ -553,11 +557,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT: vslidedown.vi v8, v8, 3
 ; RV32-NEXT: vfmv.f.s fa4, v8
 ; RV32-NEXT: fmax.d fa4, fa4, fa3
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: fld fa2, 32(sp)
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT: fld fa2, 40(sp)
 ; RV32-NEXT: fmin.d fa4, fa4, fa5
 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
-; RV32-NEXT: fld fa4, 40(sp)
+; RV32-NEXT: fld fa4, 32(sp)
 ; RV32-NEXT: fmax.d fa2, fa2, fa3
 ; RV32-NEXT: fmin.d fa2, fa2, fa5
 ; RV32-NEXT: fcvt.wu.d a2, fa2, rtz
@@ -570,14 +574,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV32-NEXT: fmin.d fa4, fa4, fa5
 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz
 ; RV32-NEXT: fld fa4, 56(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: vslide1down.vx v8, v8, a0
+; RV32-NEXT: vmv.v.x v9, a3
+; RV32-NEXT: vslide1down.vx v9, v9, a2
+; RV32-NEXT: vslide1down.vx v9, v9, a0
 ; RV32-NEXT: fmax.d fa4, fa4, fa3
 ; RV32-NEXT: fmin.d fa5, fa4, fa5
 ; RV32-NEXT: fcvt.wu.d a0, fa5, rtz
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vse8.v v8, (a1)
+; RV32-NEXT: vmv.v.i v0, 15
+; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV32-NEXT: vse8.v v9, (a1)
 ; RV32-NEXT: addi sp, s0, -128
 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
@@ -627,11 +633,11 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT: vslidedown.vi v8, v8, 3
 ; RV64-NEXT: vfmv.f.s fa4, v8
 ; RV64-NEXT: fmax.d fa4, fa4, fa3
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: fld fa2, 32(sp)
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV64-NEXT: fld fa2, 40(sp)
 ; RV64-NEXT: fmin.d fa4, fa4, fa5
 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
-; RV64-NEXT: fld fa4, 40(sp)
+; RV64-NEXT: fld fa4, 32(sp)
 ; RV64-NEXT: fmax.d fa2, fa2, fa3
 ; RV64-NEXT: fmin.d fa2, fa2, fa5
 ; RV64-NEXT: fcvt.lu.d a2, fa2, rtz
@@ -644,14 +650,16 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
 ; RV64-NEXT: fmin.d fa4, fa4, fa5
 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz
 ; RV64-NEXT: fld fa4, 56(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vmv.v.x v9, a3
+; RV64-NEXT: vslide1down.vx v9, v9, a2
+; RV64-NEXT: vslide1down.vx v9, v9, a0
 ; RV64-NEXT: fmax.d fa4, fa4, fa3
 ; RV64-NEXT: fmin.d fa5, fa4, fa5
 ; RV64-NEXT: fcvt.lu.d a0, fa5, rtz
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vse8.v v8, (a1)
+; RV64-NEXT: vmv.v.i v0, 15
+; RV64-NEXT: vslide1down.vx v9, v9, a0
+; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
+; RV64-NEXT: vse8.v v9, (a1)
 ; RV64-NEXT: addi sp, s0, -128
 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
 ; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
