Skip to content

Commit 55bbe70

Browse files
committed
[RISCV] Merge shuffle sources if lanes are disjoint
In x264, there are a few kernels with shuffles like this: %41 = add nsw <16 x i32> %39, %40 %42 = sub nsw <16 x i32> %39, %40 %43 = shufflevector <16 x i32> %41, <16 x i32> %42, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16> Because this is a complex two-source shuffle, this will get lowered as two vrgather.vvs that are blended together. vadd.vv v20, v16, v12 vsub.vv v12, v16, v12 vrgatherei16.vv v24, v20, v10 vrgatherei16.vv v24, v12, v16, v0.t However, the indices coming from each source are disjoint (the first source only supplies odd lanes, the second only even lanes), so we can blend the two together and perform a single source shuffle instead: %41 = add nsw <16 x i32> %39, %40 %42 = sub nsw <16 x i32> %39, %40 %43 = select <0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1> %41, %42 %44 = shufflevector <16 x i32> %43, <16 x i32> poison, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 10, i32 14, i32 6, i32 2, i32 9, i32 13, i32 5, i32 1, i32 8, i32 12, i32 4, i32 0> The select will likely get merged into the preceding instruction, and then we only have to do one vrgather.vv: vadd.vv v20, v16, v12 vsub.vv v20, v16, v12, v0.t vrgatherei16.vv v24, v20, v10 This patch bails if either of the sources is a splat, however, since that will usually already have some sort of cheaper lowering via vrgather.vi. This improves performance on 525.x264_r by 4.12% with -O3 -flto -march=rva22u64_v on the spacemit-x60: https://lnt.lukelau.me/db_default/v4/nts/71?compare_to=70
1 parent 8812992 commit 55bbe70

File tree

5 files changed

+859
-730
lines changed

5 files changed

+859
-730
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5197,6 +5197,67 @@ static bool isCompressMask(ArrayRef<int> Mask) {
51975197
return true;
51985198
}
51995199

5200+
/// Given a shuffle where the indices are disjoint between the two sources,
5201+
/// e.g.:
5202+
///
5203+
/// t2:v4i8 = vector_shuffle t0:v4i8, t1:v4i8, <2, 7, 1, 4>
5204+
///
5205+
/// Merge the two sources into one and do a single source shuffle:
5206+
///
5207+
/// t2:v4i8 = vselect t1:v4i8, t0:v4i8, <0, 1, 0, 1>
5208+
/// t3:v4i8 = vector_shuffle t2:v4i8, undef, <2, 3, 1, 0>
5209+
///
5210+
/// A vselect will either be merged into a masked instruction or be lowered as a
5211+
/// vmerge.vvm, which is cheaper than a vrgather.vv.
5212+
static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
5213+
SelectionDAG &DAG,
5214+
const RISCVSubtarget &Subtarget) {
5215+
MVT VT = SVN->getSimpleValueType(0);
5216+
MVT XLenVT = Subtarget.getXLenVT();
5217+
SDLoc DL(SVN);
5218+
5219+
const ArrayRef<int> Mask = SVN->getMask();
5220+
5221+
// Work out which source each lane will come from.
5222+
SmallVector<int, 16> Srcs(Mask.size(), -1);
5223+
5224+
for (int Idx : Mask) {
5225+
if (Idx == -1)
5226+
continue;
5227+
unsigned SrcIdx = Idx % Mask.size();
5228+
int Src = (uint32_t)Idx < Mask.size() ? 0 : 1;
5229+
if (Srcs[SrcIdx] == -1)
5230+
// Mark this source as using this lane.
5231+
Srcs[SrcIdx] = Src;
5232+
else if (Srcs[SrcIdx] != Src)
5233+
// The other source is using this lane: not disjoint.
5234+
return SDValue();
5235+
}
5236+
5237+
SmallVector<SDValue> SelectMaskVals;
5238+
for (int Lane : Srcs) {
5239+
if (Lane == -1)
5240+
SelectMaskVals.push_back(DAG.getUNDEF(XLenVT));
5241+
else
5242+
SelectMaskVals.push_back(DAG.getConstant(Lane, DL, XLenVT));
5243+
}
5244+
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
5245+
SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, SelectMaskVals);
5246+
SDValue Select = DAG.getNode(ISD::VSELECT, DL, VT, SelectMask,
5247+
SVN->getOperand(1), SVN->getOperand(0));
5248+
5249+
// Move all indices relative to the first source.
5250+
SmallVector<int> NewMask(Mask.size());
5251+
for (unsigned I = 0; I < Mask.size(); I++) {
5252+
if (Mask[I] == -1)
5253+
NewMask[I] = -1;
5254+
else
5255+
NewMask[I] = Mask[I] % Mask.size();
5256+
}
5257+
5258+
return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
5259+
}
5260+
52005261
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
52015262
const RISCVSubtarget &Subtarget) {
52025263
SDValue V1 = Op.getOperand(0);
@@ -5540,6 +5601,16 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55405601
ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts));
55415602
}
55425603

5604+
// If the mask indices are disjoint between the two sources, we can lower it
5605+
// as a vselect + a single source vrgather.vv. Don't do this if the operands
5606+
// will be splatted since they will be lowered to something cheaper like
5607+
// vrgather.vi anyway.
5608+
if (!DAG.isSplatValue(V2) && !DAG.isSplatValue(V1) &&
5609+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskLHS.data(), VT) &&
5610+
!ShuffleVectorSDNode::isSplatMask(ShuffleMaskRHS.data(), VT))
5611+
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
5612+
return V;
5613+
55435614
// Try to pick a profitable operand order.
55445615
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
55455616
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ define <4 x half> @shuffle_v4f16(<4 x half> %x, <4 x half> %y) {
2929
define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) {
3030
; CHECK-LABEL: shuffle_v8f32:
3131
; CHECK: # %bb.0:
32-
; CHECK-NEXT: li a0, -20
32+
; CHECK-NEXT: li a0, 19
3333
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
3434
; CHECK-NEXT: vmv.s.x v0, a0
35-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
35+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
3636
; CHECK-NEXT: ret
3737
%s = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 12, i32 5, i32 6, i32 7>
3838
ret <8 x float> %s
@@ -401,17 +401,16 @@ define <16 x float> @shuffle_disjoint_lanes(<16 x float> %v, <16 x float> %w) {
401401
; CHECK: # %bb.0:
402402
; CHECK-NEXT: lui a0, %hi(.LCPI30_0)
403403
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0)
404-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
405-
; CHECK-NEXT: vle16.v v20, (a0)
406-
; CHECK-NEXT: lui a0, %hi(.LCPI30_1)
407-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_1)
408-
; CHECK-NEXT: vle16.v v22, (a0)
409-
; CHECK-NEXT: lui a0, 15
410-
; CHECK-NEXT: addi a0, a0, 240
404+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
405+
; CHECK-NEXT: vle8.v v16, (a0)
406+
; CHECK-NEXT: lui a0, 5
407+
; CHECK-NEXT: addi a0, a0, 1365
411408
; CHECK-NEXT: vmv.s.x v0, a0
412-
; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
413-
; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
414-
; CHECK-NEXT: vmv.v.v v8, v16
409+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
410+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
411+
; CHECK-NEXT: vsext.vf2 v18, v16
412+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
413+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
415414
; CHECK-NEXT: ret
416415
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
417416
ret <16 x float> %out
@@ -422,11 +421,15 @@ define <16 x float> @shuffle_disjoint_lanes_one_identity(<16 x float> %v, <16 x
422421
; CHECK: # %bb.0:
423422
; CHECK-NEXT: lui a0, %hi(.LCPI31_0)
424423
; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0)
425-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
426-
; CHECK-NEXT: vle16.v v16, (a0)
427-
; CHECK-NEXT: li a0, -272
424+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
425+
; CHECK-NEXT: vle8.v v16, (a0)
426+
; CHECK-NEXT: li a0, -304
428427
; CHECK-NEXT: vmv.s.x v0, a0
429-
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
428+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
429+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
430+
; CHECK-NEXT: vsext.vf2 v18, v16
431+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
432+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
430433
; CHECK-NEXT: ret
431434
%out = shufflevector <16 x float> %v, <16 x float> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
432435
ret <16 x float> %out

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 31 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ define <4 x i16> @shuffle_v4i16(<4 x i16> %x, <4 x i16> %y) {
1616
define <8 x i32> @shuffle_v8i32(<8 x i32> %x, <8 x i32> %y) {
1717
; CHECK-LABEL: shuffle_v8i32:
1818
; CHECK: # %bb.0:
19-
; CHECK-NEXT: li a0, 203
19+
; CHECK-NEXT: li a0, 52
2020
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
2121
; CHECK-NEXT: vmv.s.x v0, a0
22-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
22+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
2323
; CHECK-NEXT: ret
2424
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
2525
ret <8 x i32> %s
@@ -451,21 +451,14 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
451451
define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
452452
; CHECK-LABEL: splat_ve2_we0_ins_i2ve4_i5we6:
453453
; CHECK: # %bb.0:
454+
; CHECK-NEXT: lui a0, %hi(.LCPI26_0)
455+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0)
454456
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
455-
; CHECK-NEXT: vmv.v.i v10, 6
456-
; CHECK-NEXT: vmv.v.i v11, 0
457-
; CHECK-NEXT: lui a0, 8256
458-
; CHECK-NEXT: addi a0, a0, 2
459-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
460-
; CHECK-NEXT: vmv.v.x v12, a0
461-
; CHECK-NEXT: li a0, 98
462-
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
463-
; CHECK-NEXT: vslideup.vi v11, v10, 5
457+
; CHECK-NEXT: vle8.v v10, (a0)
458+
; CHECK-NEXT: li a0, 65
464459
; CHECK-NEXT: vmv.s.x v0, a0
465-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
466-
; CHECK-NEXT: vrgather.vv v10, v8, v12
467-
; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t
468-
; CHECK-NEXT: vmv1r.v v8, v10
460+
; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
461+
; CHECK-NEXT: vrgather.vv v8, v9, v10
469462
; CHECK-NEXT: ret
470463
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 14, i32 8, i32 2>
471464
ret <8 x i8> %shuff
@@ -693,12 +686,12 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) {
693686
; CHECK: # %bb.0:
694687
; CHECK-NEXT: lui a0, %hi(.LCPI46_0)
695688
; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0)
696-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
689+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
697690
; CHECK-NEXT: vle8.v v10, (a0)
698-
; CHECK-NEXT: li a0, -22
691+
; CHECK-NEXT: li a0, 171
699692
; CHECK-NEXT: vmv.s.x v0, a0
700-
; CHECK-NEXT: vslidedown.vi v8, v8, 2
701-
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
693+
; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0
694+
; CHECK-NEXT: vrgather.vv v8, v9, v10
702695
; CHECK-NEXT: ret
703696
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 9, i32 4, i32 11, i32 6, i32 13, i32 8, i32 15>
704697
ret <8 x i8> %res
@@ -709,9 +702,9 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
709702
; CHECK-LABEL: shuffle_v8i32_2:
710703
; CHECK: # %bb.0:
711704
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
712-
; CHECK-NEXT: vmv.v.i v0, -13
705+
; CHECK-NEXT: vmv.v.i v0, 12
713706
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
714-
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
707+
; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0
715708
; CHECK-NEXT: ret
716709
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
717710
ret <8 x i32> %s
@@ -1027,17 +1020,16 @@ define <16 x i32> @shuffle_disjoint_lanes(<16 x i32> %v, <16 x i32> %w) {
10271020
; CHECK: # %bb.0:
10281021
; CHECK-NEXT: lui a0, %hi(.LCPI70_0)
10291022
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0)
1030-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1031-
; CHECK-NEXT: vle16.v v20, (a0)
1032-
; CHECK-NEXT: lui a0, %hi(.LCPI70_1)
1033-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_1)
1034-
; CHECK-NEXT: vle16.v v22, (a0)
1035-
; CHECK-NEXT: lui a0, 15
1036-
; CHECK-NEXT: addi a0, a0, 240
1023+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1024+
; CHECK-NEXT: vle8.v v16, (a0)
1025+
; CHECK-NEXT: lui a0, 5
1026+
; CHECK-NEXT: addi a0, a0, 1365
10371027
; CHECK-NEXT: vmv.s.x v0, a0
1038-
; CHECK-NEXT: vrgatherei16.vv v16, v8, v20
1039-
; CHECK-NEXT: vrgatherei16.vv v16, v12, v22, v0.t
1040-
; CHECK-NEXT: vmv.v.v v8, v16
1028+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
1029+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
1030+
; CHECK-NEXT: vsext.vf2 v18, v16
1031+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1032+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
10411033
; CHECK-NEXT: ret
10421034
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 11, i32 15, i32 7, i32 3, i32 26, i32 30, i32 22, i32 18, i32 9, i32 13, i32 5, i32 1, i32 24, i32 28, i32 20, i32 16>
10431035
ret <16 x i32> %out
@@ -1048,11 +1040,15 @@ define <16 x i32> @shuffle_disjoint_lanes_one_identity(<16 x i32> %v, <16 x i32>
10481040
; CHECK: # %bb.0:
10491041
; CHECK-NEXT: lui a0, %hi(.LCPI71_0)
10501042
; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0)
1051-
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu
1052-
; CHECK-NEXT: vle16.v v16, (a0)
1053-
; CHECK-NEXT: li a0, -272
1043+
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
1044+
; CHECK-NEXT: vle8.v v16, (a0)
1045+
; CHECK-NEXT: li a0, -304
10541046
; CHECK-NEXT: vmv.s.x v0, a0
1055-
; CHECK-NEXT: vrgatherei16.vv v8, v12, v16, v0.t
1047+
; CHECK-NEXT: vmerge.vvm v12, v8, v12, v0
1048+
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
1049+
; CHECK-NEXT: vsext.vf2 v18, v16
1050+
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1051+
; CHECK-NEXT: vrgatherei16.vv v8, v12, v18
10561052
; CHECK-NEXT: ret
10571053
%out = shufflevector <16 x i32> %v, <16 x i32> %w, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 26, i32 30, i32 22, i32 20, i32 8, i32 31, i32 29, i32 28, i32 27, i32 23, i32 25, i32 22>
10581054
ret <16 x i32> %out

0 commit comments

Comments
 (0)