Skip to content

Commit a6e7749

Browse files
authored
[RISCV] Improve lowering of spread(2) shuffles (#118658)
A spread(2) shuffle is just an interleave with an undef lane. The existing lowering reused the even lane for the undef value. This was entirely legal, but non-optimal.
1 parent c3d1518 commit a6e7749

File tree

4 files changed

+75
-81
lines changed

4 files changed

+75
-81
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5331,17 +5331,32 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
53315331
// Extract the halves of the vectors.
53325332
MVT HalfVT = VT.getHalfNumVectorElementsVT();
53335333

5334+
// Recognize if one half is actually undef; the matching above will
5335+
// otherwise reuse the even stream for the undef one. This improves
5336+
// spread(2) shuffles.
5337+
bool LaneIsUndef[2] = { true, true};
5338+
for (unsigned i = 0; i < Mask.size(); i++)
5339+
LaneIsUndef[i % 2] &= (Mask[i] == -1);
5340+
53345341
int Size = Mask.size();
53355342
SDValue EvenV, OddV;
5336-
assert(EvenSrc >= 0 && "Undef source?");
5337-
EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
5338-
EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
5339-
DAG.getVectorIdxConstant(EvenSrc % Size, DL));
5340-
5341-
assert(OddSrc >= 0 && "Undef source?");
5342-
OddV = (OddSrc / Size) == 0 ? V1 : V2;
5343-
OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
5344-
DAG.getVectorIdxConstant(OddSrc % Size, DL));
5343+
if (LaneIsUndef[0]) {
5344+
EvenV = DAG.getUNDEF(HalfVT);
5345+
} else {
5346+
assert(EvenSrc >= 0 && "Undef source?");
5347+
EvenV = (EvenSrc / Size) == 0 ? V1 : V2;
5348+
EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV,
5349+
DAG.getVectorIdxConstant(EvenSrc % Size, DL));
5350+
}
5351+
5352+
if (LaneIsUndef[1]) {
5353+
OddV = DAG.getUNDEF(HalfVT);
5354+
} else {
5355+
assert(OddSrc >= 0 && "Undef source?");
5356+
OddV = (OddSrc / Size) == 0 ? V1 : V2;
5357+
OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV,
5358+
DAG.getVectorIdxConstant(OddSrc % Size, DL));
5359+
}
53455360

53465361
return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
53475362
}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -242,33 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
242242
; V128-NEXT: slli a0, a0, 3
243243
; V128-NEXT: sub sp, sp, a0
244244
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
245-
; V128-NEXT: vmv8r.v v24, v16
246-
; V128-NEXT: vmv8r.v v16, v8
247-
; V128-NEXT: vmv8r.v v8, v24
248245
; V128-NEXT: addi a0, sp, 16
249-
; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
246+
; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
250247
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
251-
; V128-NEXT: vslidedown.vi v0, v24, 16
252-
; V128-NEXT: li a0, -1
253-
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
254-
; V128-NEXT: vwaddu.vv v24, v8, v0
255-
; V128-NEXT: vwmaccu.vx v24, a0, v0
256-
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
257-
; V128-NEXT: vslidedown.vi v0, v16, 16
248+
; V128-NEXT: vslidedown.vi v24, v16, 16
249+
; V128-NEXT: li a0, 32
250+
; V128-NEXT: vslidedown.vi v0, v8, 16
258251
; V128-NEXT: lui a1, 699051
259-
; V128-NEXT: li a2, 32
260-
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
261-
; V128-NEXT: vwaddu.vv v8, v0, v16
252+
; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
253+
; V128-NEXT: vzext.vf2 v8, v24
254+
; V128-NEXT: vzext.vf2 v24, v0
262255
; V128-NEXT: addi a1, a1, -1366
263256
; V128-NEXT: vmv.s.x v0, a1
264-
; V128-NEXT: vwmaccu.vx v8, a0, v16
265-
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
266-
; V128-NEXT: vmerge.vvm v24, v8, v24, v0
267-
; V128-NEXT: addi a1, sp, 16
268-
; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
257+
; V128-NEXT: vsll.vx v8, v8, a0
258+
; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
259+
; V128-NEXT: vmerge.vvm v24, v24, v8, v0
260+
; V128-NEXT: addi a0, sp, 16
261+
; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
269262
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
270-
; V128-NEXT: vwaddu.vv v0, v16, v8
271-
; V128-NEXT: vwmaccu.vx v0, a0, v8
263+
; V128-NEXT: vwaddu.vv v0, v8, v16
264+
; V128-NEXT: li a0, -1
265+
; V128-NEXT: vwmaccu.vx v0, a0, v16
272266
; V128-NEXT: vmv8r.v v8, v0
273267
; V128-NEXT: vmv8r.v v16, v24
274268
; V128-NEXT: csrr a0, vlenb

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

Lines changed: 29 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -186,35 +186,29 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
186186
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
187187
; V128-LABEL: interleave_v4i32_offset_1:
188188
; V128: # %bb.0:
189-
; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
190-
; V128-NEXT: vwaddu.vv v10, v8, v8
191-
; V128-NEXT: li a0, -1
192189
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
193-
; V128-NEXT: vid.v v11
194-
; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
195-
; V128-NEXT: vwmaccu.vx v10, a0, v8
196-
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
197-
; V128-NEXT: vsrl.vi v8, v11, 1
190+
; V128-NEXT: vid.v v10
198191
; V128-NEXT: vmv.v.i v0, 10
199-
; V128-NEXT: vadd.vi v8, v8, 1
200-
; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
192+
; V128-NEXT: vsrl.vi v10, v10, 1
193+
; V128-NEXT: vadd.vi v11, v10, 1
194+
; V128-NEXT: vsetivli zero, 2, e64, m1, ta, ma
195+
; V128-NEXT: vzext.vf2 v10, v8
196+
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
197+
; V128-NEXT: vrgather.vv v10, v9, v11, v0.t
201198
; V128-NEXT: vmv.v.v v8, v10
202199
; V128-NEXT: ret
203200
;
204201
; V512-LABEL: interleave_v4i32_offset_1:
205202
; V512: # %bb.0:
206-
; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
207-
; V512-NEXT: vwaddu.vv v10, v8, v8
208-
; V512-NEXT: li a0, -1
209203
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma
210-
; V512-NEXT: vid.v v11
211-
; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
212-
; V512-NEXT: vwmaccu.vx v10, a0, v8
213-
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
214-
; V512-NEXT: vsrl.vi v8, v11, 1
204+
; V512-NEXT: vid.v v10
215205
; V512-NEXT: vmv.v.i v0, 10
216-
; V512-NEXT: vadd.vi v8, v8, 1
217-
; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
206+
; V512-NEXT: vsrl.vi v10, v10, 1
207+
; V512-NEXT: vadd.vi v11, v10, 1
208+
; V512-NEXT: vsetivli zero, 2, e64, m1, ta, ma
209+
; V512-NEXT: vzext.vf2 v10, v8
210+
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
211+
; V512-NEXT: vrgather.vv v10, v9, v11, v0.t
218212
; V512-NEXT: vmv1r.v v8, v10
219213
; V512-NEXT: ret
220214
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
@@ -411,33 +405,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
411405
; V128-NEXT: slli a0, a0, 3
412406
; V128-NEXT: sub sp, sp, a0
413407
; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
414-
; V128-NEXT: vmv8r.v v24, v16
415-
; V128-NEXT: vmv8r.v v16, v8
416-
; V128-NEXT: vmv8r.v v8, v24
417408
; V128-NEXT: addi a0, sp, 16
418-
; V128-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
419-
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
420-
; V128-NEXT: vslidedown.vi v0, v24, 16
421-
; V128-NEXT: li a0, -1
422-
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
423-
; V128-NEXT: vwaddu.vv v24, v8, v0
424-
; V128-NEXT: vwmaccu.vx v24, a0, v0
409+
; V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
425410
; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma
426-
; V128-NEXT: vslidedown.vi v0, v16, 16
411+
; V128-NEXT: vslidedown.vi v24, v16, 16
412+
; V128-NEXT: li a0, 32
413+
; V128-NEXT: vslidedown.vi v0, v8, 16
427414
; V128-NEXT: lui a1, 699051
428-
; V128-NEXT: li a2, 32
429-
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
430-
; V128-NEXT: vwaddu.vv v8, v0, v16
415+
; V128-NEXT: vsetivli zero, 16, e64, m8, ta, ma
416+
; V128-NEXT: vzext.vf2 v8, v24
417+
; V128-NEXT: vzext.vf2 v24, v0
431418
; V128-NEXT: addi a1, a1, -1366
432419
; V128-NEXT: vmv.s.x v0, a1
433-
; V128-NEXT: vwmaccu.vx v8, a0, v16
434-
; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma
435-
; V128-NEXT: vmerge.vvm v24, v8, v24, v0
436-
; V128-NEXT: addi a1, sp, 16
437-
; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
420+
; V128-NEXT: vsll.vx v8, v8, a0
421+
; V128-NEXT: vsetvli zero, a0, e32, m8, ta, ma
422+
; V128-NEXT: vmerge.vvm v24, v24, v8, v0
423+
; V128-NEXT: addi a0, sp, 16
424+
; V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
438425
; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
439-
; V128-NEXT: vwaddu.vv v0, v16, v8
440-
; V128-NEXT: vwmaccu.vx v0, a0, v8
426+
; V128-NEXT: vwaddu.vv v0, v8, v16
427+
; V128-NEXT: li a0, -1
428+
; V128-NEXT: vwmaccu.vx v0, a0, v16
441429
; V128-NEXT: vmv8r.v v8, v0
442430
; V128-NEXT: vmv8r.v v16, v24
443431
; V128-NEXT: csrr a0, vlenb

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -801,11 +801,9 @@ define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) {
801801
define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
802802
; CHECK-LABEL: shuffle_spread2_singlesrc_e32:
803803
; CHECK: # %bb.0:
804-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
805-
; CHECK-NEXT: vwaddu.vv v10, v8, v8
806-
; CHECK-NEXT: li a0, -1
807-
; CHECK-NEXT: vwmaccu.vx v10, a0, v8
808-
; CHECK-NEXT: vmv2r.v v8, v10
804+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
805+
; CHECK-NEXT: vzext.vf2 v10, v8
806+
; CHECK-NEXT: vmv.v.v v8, v10
809807
; CHECK-NEXT: ret
810808
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
811809
ret <8 x i32> %out
@@ -814,11 +812,10 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32(<8 x i32> %v) {
814812
define <8 x i32> @shuffle_spread2_singlesrc_e32_index1(<8 x i32> %v) {
815813
; CHECK-LABEL: shuffle_spread2_singlesrc_e32_index1:
816814
; CHECK: # %bb.0:
817-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
818-
; CHECK-NEXT: vwaddu.vv v10, v8, v8
819-
; CHECK-NEXT: li a0, -1
820-
; CHECK-NEXT: vwmaccu.vx v10, a0, v8
821-
; CHECK-NEXT: vmv2r.v v8, v10
815+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
816+
; CHECK-NEXT: vzext.vf2 v10, v8
817+
; CHECK-NEXT: li a0, 32
818+
; CHECK-NEXT: vsll.vx v8, v10, a0
822819
; CHECK-NEXT: ret
823820
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
824821
ret <8 x i32> %out

0 commit comments

Comments
 (0)