Commit b8545e1

[RISCV] Consider all subvector extracts within a single VREG cheap (#81032)
This adjusts the isExtractSubvectorCheap callback to consider any extract that fits entirely within the first VLEN bits of the source vector (and whose slide amount fits in a 5-bit immediate) as cheap. Such an extract can be done with a single m1 vslidedown.vi instruction. This allows our generic DAG combine logic to kick in and recognize a few more cases where the shuffle source is longer than the destination but using a wider shuffle is still profitable. (Or, as shown in the test diff, we can split the wider source and do two narrower shuffles.)
1 parent 705fcd4 commit b8545e1
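
To make the new cost model concrete, here is a minimal standalone sketch of the heuristic described above. It is not the LLVM code itself (that lives in RISCVTargetLowering::isExtractSubvectorCheap, shown in the diff below); the function name isExtractSubvectorCheapSketch and the plain-integer parameters are illustrative stand-ins for the EVT-based queries.

bool isExtractSubvectorCheapSketch(unsigned EltBits, unsigned ResElts,
                                   unsigned SrcElts, unsigned Index,
                                   unsigned MinVLen) {
  // The smallest type we can slide is i8, so i1 (mask) extracts are rejected
  // here, as in the real hook.
  if (EltBits == 1)
    return false;

  // Number of elements guaranteed to fit in one vector register of VLEN bits.
  unsigned MinVLMAX = MinVLen / EltBits;

  // New in this commit: if every extracted element lies within the first
  // VLEN bits of the source and the slide amount fits a 5-bit immediate,
  // a single m1 vslidedown.vi suffices, so report the extract as cheap.
  if (Index + ResElts <= MinVLMAX && Index < 31)
    return true;

  // Pre-existing fallback: only extracting exactly half of the source with a
  // vslidedown.vi-encodable index is treated as cheap; the remaining checks
  // of the real function are omitted from this sketch.
  if (ResElts * 2 != SrcElts)
    return false;
  return Index < 32;
}

The key change is the early return: it fires whenever the extracted elements are guaranteed to live in the first vector register, regardless of whether the result is exactly half of the source.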

2 files changed (+40, -97 lines)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 21 additions & 6 deletions
@@ -2173,19 +2173,34 @@ bool RISCVTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   if (ResVT.isScalableVector() || SrcVT.isScalableVector())
     return false;
 
+  EVT EltVT = ResVT.getVectorElementType();
+  assert(EltVT == SrcVT.getVectorElementType() && "Should hold for node");
+
+  // The smallest type we can slide is i8.
+  // TODO: We can extract index 0 from a mask vector without a slide.
+  if (EltVT == MVT::i1)
+    return false;
+
   unsigned ResElts = ResVT.getVectorNumElements();
   unsigned SrcElts = SrcVT.getVectorNumElements();
 
+  unsigned MinVLen = Subtarget.getRealMinVLen();
+  unsigned MinVLMAX = MinVLen / EltVT.getSizeInBits();
+
+  // If we're extracting only data from the first VLEN bits of the source
+  // then we can always do this with an m1 vslidedown.vx. Restricting the
+  // Index ensures we can use a vslidedown.vi.
+  // TODO: We can generalize this when the exact VLEN is known.
+  if (Index + ResElts <= MinVLMAX && Index < 31)
+    return true;
+
   // Conservatively only handle extracting half of a vector.
-  // TODO: Relax this.
+  // TODO: For sizes which aren't multiples of VLEN sizes, this may not be
+  // a cheap extract. However, this case is important in practice for
+  // shuffled extracts of longer vectors. How to resolve?
   if ((ResElts * 2) != SrcElts)
     return false;
 
-  // The smallest type we can slide is i8.
-  // TODO: We can extract index 0 from a mask vector without a slide.
-  if (ResVT.getVectorElementType() == MVT::i1)
-    return false;
-
   // Slide can support arbitrary index, but we only treat vslidedown.vi as
   // cheap.
   if (Index >= 32)
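
As a quick sanity check of the arithmetic behind the new early return above, the following self-contained snippet plugs in concrete numbers; the 128-bit VLEN is an assumed minimum (e.g. a Zvl128b target), not something the hook hard-codes.

#include <cassert>

int main() {
  // Assumed minimum VLEN of 128 bits (Zvl128b) and 8-bit (i8) elements.
  unsigned MinVLen = 128, EltBits = 8;
  unsigned MinVLMAX = MinVLen / EltBits; // 16 elements per vector register
  unsigned ResElts = 8;                  // extracting an <8 x i8> subvector

  // Index 8: the extract stays within the first vector register and the
  // index fits a 5-bit slide immediate, so the new early return fires.
  unsigned Index1 = 8;
  assert(Index1 + ResElts <= MinVLMAX && Index1 < 31);

  // Index 16: the extract reaches past the first vector register, so it
  // falls back to the older "exactly half of the source" test instead.
  unsigned Index2 = 16;
  assert(!(Index2 + ResElts <= MinVLMAX));
  return 0;
}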

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 19 additions & 91 deletions
@@ -722,97 +722,25 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
 
 ; FIXME: This could be expressed as a vrgather.vv
 define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
-; RV32-LABEL: shuffle_v64i8_v8i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -128
-; RV32-NEXT:    .cfi_def_cfa_offset 128
-; RV32-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    addi s0, sp, 128
-; RV32-NEXT:    .cfi_def_cfa s0, 0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    li a0, 64
-; RV32-NEXT:    mv a1, sp
-; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vse8.v v8, (a1)
-; RV32-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 8
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 16
-; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 24
-; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
-; RV32-NEXT:    lbu a0, 32(sp)
-; RV32-NEXT:    lbu a1, 40(sp)
-; RV32-NEXT:    lbu a2, 48(sp)
-; RV32-NEXT:    lbu a3, 56(sp)
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    addi sp, s0, -128
-; RV32-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 128
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: shuffle_v64i8_v8i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -128
-; RV64-NEXT:    .cfi_def_cfa_offset 128
-; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    addi s0, sp, 128
-; RV64-NEXT:    .cfi_def_cfa s0, 0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    mv a1, sp
-; RV64-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64-NEXT:    vse8.v v8, (a1)
-; RV64-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 8
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    vmv.x.s a1, v8
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 16
-; RV64-NEXT:    vmv.x.s a0, v12
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetivli zero, 1, e8, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 24
-; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
-; RV64-NEXT:    lbu a0, 32(sp)
-; RV64-NEXT:    lbu a1, 40(sp)
-; RV64-NEXT:    lbu a2, 48(sp)
-; RV64-NEXT:    lbu a3, 56(sp)
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
-; RV64-NEXT:    addi sp, s0, -128
-; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 128
-; RV64-NEXT:    ret
+; CHECK-LABEL: shuffle_v64i8_v8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vsll.vi v14, v12, 3
+; CHECK-NEXT:    vrgather.vv v12, v8, v14
+; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    li a1, 240
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 98561
+; CHECK-NEXT:    addi a1, a1, -2048
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
+; CHECK-NEXT:    vrgather.vv v12, v8, v10, v0.t
+; CHECK-NEXT:    vmv1r.v v8, v12
+; CHECK-NEXT:    ret
   %s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
   ret <8 x i8> %s
 }
