Commit c50bb99

[RISCV] Allow vslidedown.vx in isExtractSubvectorCheap for half VT case (#114886)
We have a special case where we allow the extract of the high half of a vector and consider it cheap. However, we had previously required that the source type have no more than 32 elements for this to work. (Because 64/2=32, and the largest immediate for a vslidedown.vi is 31.)

This has the effect of pessimizing shuffle vector lowering for long vectors - i.e. at SEW=e8, zvl128b, an m2 or m4 deinterleave can't be matched because it gets scalarized during DAG construction and can't be "profitably" rebuilt by DAG combine.

Note that for RISCV, scalarization via insert and extract is extremely expensive (i.e. two vslides per element), so a slide + two half width shuffles is almost always a net win. (i.e., this isn't really specific to vnsrl.)

Separately, I want to look at the decision to scalarize at all, but it seems worthwhile adjusting this while we're at it regardless.
1 parent 0a68171 commit c50bb99
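For illustration only (this IR is not part of the change, and the function and value names are invented): the "slide + two half width shuffles" argument can be pictured at SEW=e8 with a <64 x i8> source. Once the low and high halves are in hand, with the high half produced by the single slide this patch now reports as cheap, an odd-lane deinterleave needs only two half-width shuffles plus a concatenation instead of per-element extracts and inserts:

; Sketch: %lo and %hi stand for the two <32 x i8> halves of a <64 x i8> value;
; obtaining %hi is the extract-upper-half step (one slide) discussed above.
define <32 x i8> @deinterleave_odd_from_halves(<32 x i8> %lo, <32 x i8> %hi) {
  ; Odd lanes of each half (a half-width shuffle each).
  %lo.odd = shufflevector <32 x i8> %lo, <32 x i8> poison,
    <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
                i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %hi.odd = shufflevector <32 x i8> %hi, <32 x i8> poison,
    <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
                i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ; Concatenate the two <16 x i8> results into the final <32 x i8>.
  %res = shufflevector <16 x i8> %lo.odd, <16 x i8> %hi.odd,
    <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
                i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %res
}

The backend remains free to match the original single shuffle to a vnsrl, as the updated test below shows; the point of the sketch is only that the non-scalarized fallback never needs to go through scalar registers.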

2 files changed, +10 -247 lines
llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 3 additions & 11 deletions
@@ -2209,20 +2209,12 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
     return true;
 
   // Convervatively only handle extracting half of a vector.
+  // TODO: We can do arbitrary slidedowns, but for now only support extracting
+  // the upper half of a vector until we have more test coverage.
   // TODO: For sizes which aren't multiples of VLEN sizes, this may not be
   // a cheap extract. However, this case is important in practice for
   // shuffled extracts of longer vectors. How resolve?
-  if ((ResElts * 2) != SrcElts)
-    return false;
-
-  // Slide can support arbitrary index, but we only treat vslidedown.vi as
-  // cheap.
-  if (Index >= 32)
-    return false;
-
-  // TODO: We can do arbitrary slidedowns, but for now only support extracting
-  // the upper half of a vector until we have more test coverage.
-  return Index == 0 || Index == ResElts;
+  return (ResElts * 2) == SrcElts && (Index == 0 || Index == ResElts);
 }
 
 MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
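As a concrete reading of the new condition (an illustrative example, not taken from the test suite; the function name is made up): for a <64 x i8> source and a <32 x i8> result, SrcElts is 64, ResElts is 32, and the upper-half extract has Index 32. The removed "Index >= 32" guard rejected it because 32 does not fit vslidedown.vi's 5-bit immediate (maximum 31); the new condition accepts it, on the understanding that the slide amount can live in a scalar register instead (vslidedown.vx).

; Hypothetical upper-half extract: SrcElts = 64, ResElts = 32, Index = 32,
; so (ResElts * 2) == SrcElts and Index == ResElts both hold.
define <32 x i8> @extract_upper_half(<64 x i8> %v) {
  %hi = shufflevector <64 x i8> %v, <64 x i8> poison,
    <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
                i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47,
                i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55,
                i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <32 x i8> %hi
}

Whether a given IR input actually reaches this hook as an extract_subvector depends on the surrounding lowering; the example only pins down the (SrcElts, ResElts, Index) triple the predicate now accepts.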

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll

Lines changed: 7 additions & 236 deletions
@@ -307,243 +307,14 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) {
   ret <32 x i32> %2
 }
 
-; TODO: This case should be a simple vnsrl, but gets scalarized instead
 define <32 x i8> @vnsrl_v32i8_v64i8(<64 x i8> %in) {
-; RV32-LABEL: vnsrl_v32i8_v64i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -128
-; RV32-NEXT: .cfi_def_cfa_offset 128
-; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: addi s0, sp, 128
-; RV32-NEXT: .cfi_def_cfa s0, 0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: li a0, 64
-; RV32-NEXT: mv a1, sp
-; RV32-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT: vse8.v v8, (a1)
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vmv.v.x v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 3
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 5
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 7
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 9
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 11
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 13
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v12, v8, 15
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 17
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 19
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 21
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 23
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 25
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 27
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v12, v8, 29
-; RV32-NEXT: vmv.x.s a0, v12
-; RV32-NEXT: vslide1down.vx v10, v10, a0
-; RV32-NEXT: vslidedown.vi v8, v8, 31
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vslide1down.vx v8, v10, a0
-; RV32-NEXT: lbu a0, 33(sp)
-; RV32-NEXT: lbu a1, 35(sp)
-; RV32-NEXT: lbu a2, 37(sp)
-; RV32-NEXT: lbu a3, 39(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 41(sp)
-; RV32-NEXT: lbu a1, 43(sp)
-; RV32-NEXT: lbu a2, 45(sp)
-; RV32-NEXT: lbu a3, 47(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 49(sp)
-; RV32-NEXT: lbu a1, 51(sp)
-; RV32-NEXT: lbu a2, 53(sp)
-; RV32-NEXT: lbu a3, 55(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: lbu a0, 57(sp)
-; RV32-NEXT: lbu a1, 59(sp)
-; RV32-NEXT: lbu a2, 61(sp)
-; RV32-NEXT: lbu a3, 63(sp)
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: vslide1down.vx v8, v8, a2
-; RV32-NEXT: vslide1down.vx v8, v8, a3
-; RV32-NEXT: addi sp, s0, -128
-; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 128
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vnsrl_v32i8_v64i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -128
-; RV64-NEXT: .cfi_def_cfa_offset 128
-; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: addi s0, sp, 128
-; RV64-NEXT: .cfi_def_cfa s0, 0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: li a0, 64
-; RV64-NEXT: mv a1, sp
-; RV64-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; RV64-NEXT: vse8.v v8, (a1)
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 1
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: li a1, 32
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vmv.v.x v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 3
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 5
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 7
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 9
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 11
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 13
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v12, v8, 15
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 17
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 19
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 21
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 23
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 25
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 27
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v12, v8, 29
-; RV64-NEXT: vmv.x.s a0, v12
-; RV64-NEXT: vslide1down.vx v10, v10, a0
-; RV64-NEXT: vslidedown.vi v8, v8, 31
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vslide1down.vx v8, v10, a0
-; RV64-NEXT: lbu a0, 33(sp)
-; RV64-NEXT: lbu a1, 35(sp)
-; RV64-NEXT: lbu a2, 37(sp)
-; RV64-NEXT: lbu a3, 39(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 41(sp)
-; RV64-NEXT: lbu a1, 43(sp)
-; RV64-NEXT: lbu a2, 45(sp)
-; RV64-NEXT: lbu a3, 47(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 49(sp)
-; RV64-NEXT: lbu a1, 51(sp)
-; RV64-NEXT: lbu a2, 53(sp)
-; RV64-NEXT: lbu a3, 55(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: lbu a0, 57(sp)
-; RV64-NEXT: lbu a1, 59(sp)
-; RV64-NEXT: lbu a2, 61(sp)
-; RV64-NEXT: lbu a3, 63(sp)
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a2
-; RV64-NEXT: vslide1down.vx v8, v8, a3
-; RV64-NEXT: addi sp, s0, -128
-; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 128
-; RV64-NEXT: ret
+; CHECK-LABEL: vnsrl_v32i8_v64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vnsrl.wi v12, v8, 8
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
   %res = shufflevector <64 x i8> %in, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
   ret <32 x i8> %res
 }
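For readers wondering why this shuffle is "a simple vnsrl": picking the odd bytes of a vector is the same as viewing it as 16-bit lanes, shifting each lane right by 8, and truncating back to bytes, which is what the vnsrl.wi with shift amount 8 in the new CHECK lines does. A hedged, illustrative IR equivalent (not part of the test file; the function name is invented):

; On a little-endian target such as RISC-V, byte 2*i+1 is the high byte of
; 16-bit lane i, so a right shift by 8 followed by a truncate yields the odd
; lanes of the original byte vector.
define <32 x i8> @odd_lanes_via_narrowing_shift(<64 x i8> %in) {
  %wide  = bitcast <64 x i8> %in to <32 x i16>
  %shift = lshr <32 x i16> %wide,
    <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8,
     i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8,
     i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8,
     i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %odd   = trunc <32 x i16> %shift to <32 x i8>
  ret <32 x i8> %odd
}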
