
Commit 7ced328

[RISCV] Use ri.vunzip2{a,b} for e64 fixed length deinterleave(2) shuffles (#137217)
If we have xrivosvizip, we can use the vunzip2{a,b} instructions for these cases *provided* that we can prove the layout in the two registers matches the fixed-length semantics. The majority of this patch is a straightforward port of the existing vnsrl logic, which has the same requirement (though for slightly different reasons). The one complicated bit is the addition of the scalable splitting logic inside lowerVZIP, which exploits the independent register operands and allows the use of a lower LMUL. This bit is annoyingly complicated, and really "should" be a DAG combine - except that reducing the VL and mask becomes hard when the VL isn't known to be a constant.
1 parent 27143f2 · commit 7ced328
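For illustration only (not part of the commit): a minimal LLVM IR sketch of the fixed-length e64 deinterleave(2) shuffles this lowering targets, modeled on the vnsrl_0_i64/vnsrl_64_i64 tests updated below. The function names are hypothetical; per the updated checks, with xrivosvizip the even-lane case (deinterleave index 0) now selects ri.vunzip2a.vv and the odd-lane case (index 1) selects ri.vunzip2b.vv instead of slide/zip sequences.

; Hypothetical example mirroring the updated tests below.
define <2 x i64> @deinterleave2_even(<4 x i64> %v) {
  %even = shufflevector <4 x i64> %v, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %even
}

define <2 x i64> @deinterleave2_odd(<4 x i64> %v) {
  %odd = shufflevector <4 x i64> %v, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %odd
}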

File tree

3 files changed: +94, -96 lines


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 36 additions & 3 deletions
@@ -5110,9 +5110,23 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
     Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
   }
 
-  auto [Mask, VL] = getDefaultVLOps(IntVT, ContainerVT, DL, DAG, Subtarget);
-  SDValue Passthru = DAG.getUNDEF(ContainerVT);
-  SDValue Res = DAG.getNode(Opc, DL, ContainerVT, Op0, Op1, Passthru, Mask, VL);
+  MVT InnerVT = ContainerVT;
+  auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget);
+  if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+      (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) {
+    InnerVT = ContainerVT.getHalfNumVectorElementsVT();
+    VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL,
+                         Subtarget.getXLenVT());
+    Mask = getAllOnesMask(InnerVT, VL, DL, DAG);
+    unsigned HighIdx = InnerVT.getVectorElementCount().getKnownMinValue();
+    Op1 = DAG.getExtractSubvector(DL, InnerVT, Op0, HighIdx);
+    Op0 = DAG.getExtractSubvector(DL, InnerVT, Op0, 0);
+  }
+
+  SDValue Passthru = DAG.getUNDEF(InnerVT);
+  SDValue Res = DAG.getNode(Opc, DL, InnerVT, Op0, Op1, Passthru, Mask, VL);
+  if (InnerVT.bitsLT(ContainerVT))
+    Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), Res, 0);
   if (IntVT.isFixedLengthVector())
     Res = convertFromScalableVector(IntVT, Res, DAG, Subtarget);
   Res = DAG.getBitcast(VT, Res);
@@ -5808,6 +5822,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  // If this is a deinterleave(2), try using vunzip{a,b}. This mostly catches
+  // e64 which can't match above.
+  unsigned Index = 0;
+  if (Subtarget.hasVendorXRivosVizip() &&
+      ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2, Index) &&
+      1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
+    unsigned Opc =
+        Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
+    if (V2.isUndef())
+      return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+    if (SDValue Src = foldConcatVector(V1, V2)) {
+      EVT NewVT = VT.getDoubleNumVectorElementsVT();
+      Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
+      SDValue Res =
+          lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+      return DAG.getExtractSubvector(DL, VT, Res, 0);
+    }
+  }
+
   if (SDValue V =
           lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return V;

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll

Lines changed: 6 additions & 10 deletions
@@ -347,9 +347,8 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) {
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    vslideup.vi v8, v9, 1
-; ZIP-NEXT:    vse64.v v8, (a1)
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
   %0 = load <4 x i64>, ptr %in, align 8
@@ -383,8 +382,7 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    ri.vzipodd.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v10, v8, v9
 ; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
@@ -417,10 +415,9 @@ define void @vnsrl_0_double(ptr %in, ptr %out) {
 ; ZIP:       # %bb.0: # %entry
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    vslideup.vi v8, v9, 1
-; ZIP-NEXT:    vse64.v v8, (a1)
+; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
   %0 = load <4 x double>, ptr %in, align 8
@@ -453,9 +450,8 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
 ; ZIP:       # %bb.0: # %entry
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
+; ZIP-NEXT:    ri.vunzip2b.vv v10, v8, v9
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    ri.vzipodd.vv v10, v8, v9
 ; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll

Lines changed: 52 additions & 83 deletions
@@ -85,11 +85,11 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ;
 ; ZIP-LABEL: vector_deinterleave_v2i64_v4i64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT:    vslideup.vi v8, v10, 1
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v11, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    vmv.v.v v9, v11
 ; ZIP-NEXT:    ret
   %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>} %retval
@@ -129,62 +129,51 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
 ; ZIP-LABEL: vector_deinterleave_v4i64_v8i64:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v12, v8, 1
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 2
-; ZIP-NEXT:    vmv.v.i v14, 12
-; ZIP-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT:    vslidedown.vi v16, v8, 4
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslidedown.vi v12, v8, 2, v0.t
-; ZIP-NEXT:    ri.vzip2a.vv v18, v8, v10
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v8, v16, 2
-; ZIP-NEXT:    vmv1r.v v0, v14
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    ri.vzip2a.vv v12, v16, v8, v0.t
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslideup.vi v8, v16, 2
-; ZIP-NEXT:    vslideup.vi v8, v16, 1, v0.t
-; ZIP-NEXT:    vmv1r.v v0, v14
-; ZIP-NEXT:    vmerge.vvm v8, v18, v8, v0
-; ZIP-NEXT:    vmv2r.v v10, v12
+; ZIP-NEXT:    ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT:    ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT:    vmv.v.v v8, v12
+; ZIP-NEXT:    vmv.v.v v10, v14
 ; ZIP-NEXT:    ret
   %retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec)
   ret {<4 x i64>, <4 x i64>} %retval
 }
 
 define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_v8i64_v16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, -16
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v8, 8
-; CHECK-NEXT:    vmv.s.x v12, a0
-; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v20, v16, v16
-; CHECK-NEXT:    vmv.s.x v21, a0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vcompress.vm v16, v8, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v22, v20, -8
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vcompress.vm v12, v8, v21
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v8, v20, -7
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v16, v24, v22, v0.t
-; CHECK-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
+; V-LABEL: vector_deinterleave_v8i64_v16i64:
+; V:       # %bb.0:
+; V-NEXT:    li a0, 85
+; V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT:    vmv.v.i v0, -16
+; V-NEXT:    vid.v v16
+; V-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
+; V-NEXT:    vslidedown.vi v24, v8, 8
+; V-NEXT:    vmv.s.x v12, a0
+; V-NEXT:    li a0, 170
+; V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT:    vadd.vv v20, v16, v16
+; V-NEXT:    vmv.s.x v21, a0
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT:    vcompress.vm v16, v8, v12
+; V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT:    vadd.vi v22, v20, -8
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT:    vcompress.vm v12, v8, v21
+; V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT:    vadd.vi v8, v20, -7
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; V-NEXT:    vrgatherei16.vv v16, v24, v22, v0.t
+; V-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
+; V-NEXT:    vmv.v.v v8, v16
+; V-NEXT:    ret
+;
+; ZIP-LABEL: vector_deinterleave_v8i64_v16i64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT:    ri.vunzip2b.vv v20, v8, v12
+; ZIP-NEXT:    vmv.v.v v8, v16
+; ZIP-NEXT:    vmv.v.v v12, v20
+; ZIP-NEXT:    ret
   %retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec)
   ret {<8 x i64>, <8 x i64>} %retval
 }
@@ -498,11 +487,11 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ;
 ; ZIP-LABEL: vector_deinterleave_v2f64_v4f64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT:    vslideup.vi v8, v10, 1
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v12, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    vmv.v.v v9, v12
 ; ZIP-NEXT:    ret
   %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>} %retval
@@ -541,31 +530,11 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
 ;
 ; ZIP-LABEL: vector_deinterleave_v4f64_v8f64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT:    vslidedown.vi v16, v8, 4
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v12, v8, 2
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v10, 12
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslideup.vi v14, v16, 2
-; ZIP-NEXT:    vslideup.vi v14, v16, 1, v0.t
-; ZIP-NEXT:    ri.vzip2a.vv v18, v8, v12
-; ZIP-NEXT:    vmv1r.v v0, v10
-; ZIP-NEXT:    vmerge.vvm v12, v18, v14, v0
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v14, v16, 2
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 2
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    ri.vzip2a.vv v18, v16, v14
-; ZIP-NEXT:    vslidedown.vi v14, v8, 1
-; ZIP-NEXT:    vslidedown.vi v14, v8, 2, v0.t
-; ZIP-NEXT:    vmv1r.v v0, v10
-; ZIP-NEXT:    vmerge.vvm v10, v14, v18, v0
-; ZIP-NEXT:    vmv2r.v v8, v12
+; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT:    ri.vunzip2b.vv v16, v8, v10
+; ZIP-NEXT:    vmv.v.v v8, v12
+; ZIP-NEXT:    vmv.v.v v10, v16
 ; ZIP-NEXT:    ret
   %retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec)
   ret {<4 x double>, <4 x double>} %retval
