
Commit a0d74e2

lukel97 authored and yuxuanchen1997 committed
[RISCV] Add unit strided load/store to whole register peephole (#100116)
Summary: This adds a new vector peephole that converts unmasked vleN.v/vseN.v with a VLMAX AVL into their whole-register equivalents. It replaces the existing TableGen patterns on ISD::LOAD/ISD::STORE and is a bit more general, since it also catches VP loads and stores and @llvm.riscv intrinsics. The heavy lifting of detecting a VLMAX AVL and an all-ones mask is already taken care of by existing peepholes.

Differential Revision: https://phabricator.intern.facebook.com/D60250662
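To make the effect concrete, here is a minimal sketch (it mirrors the vpload.ll test added below; the function name is illustrative): an EVL of vscale * 8 on a <vscale x 8 x i8> vp.load is exactly VLMAX, so the backend can now emit a single whole-register vl1r.v instead of a vsetvli + vle8.v pair.

```llvm
; Expected codegen after this patch (per the vpload.ll test below):
;   vl1r.v  v8, (a0)
;   ret
define <vscale x 8 x i8> @example_all_active_load(ptr %ptr) {
  %vscale = call i32 @llvm.vscale()
  %evl = mul i32 %vscale, 8   ; vscale * 8 elements == VLMAX for nxv8i8
  %load = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 8 x i8> %load
}
```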
1 parent: a611c16 · commit: a0d74e2

7 files changed (+94, −69 lines)

llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td

Lines changed: 1 addition & 31 deletions
@@ -41,24 +41,6 @@ multiclass VPatUSLoadStoreSDNode<ValueType type,
             (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>;
 }
 
-multiclass VPatUSLoadStoreWholeVRSDNode<ValueType type,
-                                        int log2sew,
-                                        LMULInfo vlmul,
-                                        VReg reg_class,
-                                        int sew = !shl(1, log2sew)> {
-  defvar load_instr =
-    !cast<Instruction>("VL"#!substr(vlmul.MX, 1)#"RE"#sew#"_V");
-  defvar store_instr =
-    !cast<Instruction>("VS"#!substr(vlmul.MX, 1)#"R_V");
-
-  // Load
-  def : Pat<(type (load GPR:$rs1)),
-            (load_instr GPR:$rs1)>;
-  // Store
-  def : Pat<(store type:$rs2, GPR:$rs1),
-            (store_instr reg_class:$rs2, GPR:$rs1)>;
-}
-
 multiclass VPatUSLoadStoreMaskSDNode<MTypeInfo m> {
   defvar load_instr = !cast<Instruction>("PseudoVLM_V_"#m.BX);
   defvar store_instr = !cast<Instruction>("PseudoVSM_V_"#m.BX);
@@ -895,23 +877,11 @@ multiclass VPatAVGADD_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
 //===----------------------------------------------------------------------===//
 
 // 7.4. Vector Unit-Stride Instructions
-foreach vti = !listconcat(FractionalGroupIntegerVectors,
-                          FractionalGroupFloatVectors,
-                          FractionalGroupBFloatVectors) in
+foreach vti = AllVectors in
   let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
                        GetVTypePredicates<vti>.Predicates) in
   defm : VPatUSLoadStoreSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
                                vti.AVL, vti.RegClass>;
-foreach vti = [VI8M1, VI16M1, VI32M1, VI64M1, VBF16M1, VF16M1, VF32M1, VF64M1] in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in
-  defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
-                                      vti.RegClass>;
-foreach vti = !listconcat(GroupIntegerVectors, GroupFloatVectors, GroupBFloatVectors) in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in
-  defm : VPatUSLoadStoreWholeVRSDNode<vti.Vector, vti.Log2SEW, vti.LMul,
-                                      vti.RegClass>;
 foreach mti = AllMasks in
   let Predicates = [HasVInstructions] in
   defm : VPatUSLoadStoreMaskSDNode<mti>;

llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp

Lines changed: 54 additions & 0 deletions
@@ -59,6 +59,7 @@ class RISCVVectorPeephole : public MachineFunctionPass {
 
 private:
   bool convertToVLMAX(MachineInstr &MI) const;
+  bool convertToWholeRegister(MachineInstr &MI) const;
   bool convertToUnmasked(MachineInstr &MI) const;
   bool convertVMergeToVMv(MachineInstr &MI) const;
 
@@ -155,6 +156,58 @@ bool RISCVVectorPeephole::isAllOnesMask(const MachineInstr *MaskDef) const {
   }
 }
 
+/// Convert unit strided unmasked loads and stores to whole-register equivalents
+/// to avoid the dependency on $vl and $vtype.
+///
+/// %x = PseudoVLE8_V_M1 %passthru, %ptr, %vlmax, policy
+/// PseudoVSE8_V_M1 %v, %ptr, %vlmax
+///
+/// ->
+///
+/// %x = VL1RE8_V %ptr
+/// VS1R_V %v, %ptr
+bool RISCVVectorPeephole::convertToWholeRegister(MachineInstr &MI) const {
+#define CASE_WHOLE_REGISTER_LMUL_SEW(lmul, sew)                                \
+  case RISCV::PseudoVLE##sew##_V_M##lmul:                                      \
+    NewOpc = RISCV::VL##lmul##RE##sew##_V;                                     \
+    break;                                                                     \
+  case RISCV::PseudoVSE##sew##_V_M##lmul:                                      \
+    NewOpc = RISCV::VS##lmul##R_V;                                             \
+    break;
+#define CASE_WHOLE_REGISTER_LMUL(lmul)                                         \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 8)                                        \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 16)                                       \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 32)                                       \
+  CASE_WHOLE_REGISTER_LMUL_SEW(lmul, 64)
+
+  unsigned NewOpc;
+  switch (MI.getOpcode()) {
+    CASE_WHOLE_REGISTER_LMUL(1)
+    CASE_WHOLE_REGISTER_LMUL(2)
+    CASE_WHOLE_REGISTER_LMUL(4)
+    CASE_WHOLE_REGISTER_LMUL(8)
+  default:
+    return false;
+  }
+
+  MachineOperand &VLOp = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel)
+    return false;
+
+  // Whole register instructions aren't pseudos so they don't have
+  // policy/SEW/AVL ops, and they don't have passthrus.
+  if (RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags))
+    MI.removeOperand(RISCVII::getVecPolicyOpNum(MI.getDesc()));
+  MI.removeOperand(RISCVII::getSEWOpNum(MI.getDesc()));
+  MI.removeOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  if (RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()))
+    MI.removeOperand(1);
+
+  MI.setDesc(TII->get(NewOpc));
+
+  return true;
+}
+
 // Transform (VMERGE_VVM_<LMUL> false, false, true, allones, vl, sew) to
 // (VMV_V_V_<LMUL> false, true, vl, sew). It may decrease uses of VMSET.
 bool RISCVVectorPeephole::convertVMergeToVMv(MachineInstr &MI) const {
@@ -281,6 +334,7 @@ bool RISCVVectorPeephole::runOnMachineFunction(MachineFunction &MF) {
     for (MachineInstr &MI : MBB) {
       Changed |= convertToVLMAX(MI);
      Changed |= convertToUnmasked(MI);
+      Changed |= convertToWholeRegister(MI);
       Changed |= convertVMergeToVMv(MI);
     }
   }
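Conversely, the peephole only fires once earlier peepholes have folded the AVL into the VLMaxSentinel immediate. A hypothetical counterexample (not a test from this commit): with a runtime EVL, the VL operand stays a register, the VLOp.isImm() check above fails, and the usual vsetvli + vle8.v sequence is kept.

```llvm
; Hypothetical sketch: %evl is only known at runtime, so the VL operand of
; PseudoVLE8_V_M1 is a virtual register rather than the -1 VLMAX sentinel,
; and convertToWholeRegister returns false.
define <vscale x 8 x i8> @runtime_evl_load(ptr %ptr, i32 zeroext %evl) {
  %load = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 8 x i8> %load
}
```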

llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll

Lines changed: 7 additions & 4 deletions
@@ -20,7 +20,8 @@ define dso_local void @lots_args(i32 signext %x0, i32 signext %x1, <vscale x 16
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub a0, s0, a0
 ; CHECK-NEXT:    addi a0, a0, -64
-; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    sw a2, -36(s0)
 ; CHECK-NEXT:    sw a3, -40(s0)
 ; CHECK-NEXT:    sw a4, -44(s0)
@@ -85,7 +86,8 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    slli s1, s1, 3
 ; CHECK-NEXT:    sub s1, s0, s1
 ; CHECK-NEXT:    addi s1, s1, -112
-; CHECK-NEXT:    vs8r.v v8, (s1)
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (s1)
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    sw a0, -76(s0)
 ; CHECK-NEXT:    sw a0, -80(s0)
@@ -99,7 +101,7 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    sw a0, -112(s0)
 ; CHECK-NEXT:    lw a0, -76(s0)
 ; CHECK-NEXT:    lw a1, -80(s0)
-; CHECK-NEXT:    vl8re32.v v8, (s1)
+; CHECK-NEXT:    vle32.v v8, (s1)
 ; CHECK-NEXT:    lw a2, -84(s0)
 ; CHECK-NEXT:    lw a3, -88(s0)
 ; CHECK-NEXT:    lw a4, -92(s0)
@@ -115,7 +117,8 @@ define dso_local signext i32 @main() #0 {
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    lw a0, -76(s0)
 ; CHECK-NEXT:    lw a1, -80(s0)
-; CHECK-NEXT:    vl8re32.v v8, (s1)
+; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v8, (s1)
 ; CHECK-NEXT:    lw a2, -84(s0)
 ; CHECK-NEXT:    lw a3, -88(s0)
 ; CHECK-NEXT:    lw a4, -92(s0)

llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops-mir.ll

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@ define void @vpmerge_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
 ; CHECK-NEXT: $v0 = COPY [[COPY1]]
 ; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 8)
-; CHECK-NEXT: VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
+; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
 ; CHECK-NEXT: PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
   %b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
@@ -36,7 +36,7 @@ define void @vpselect_vpload_store(<vscale x 2 x i32> %passthru, ptr %p, <vscale
 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
 ; CHECK-NEXT: $v0 = COPY [[COPY1]]
 ; CHECK-NEXT: [[PseudoVLE32_V_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVLE32_V_M1_MASK [[COPY3]], [[COPY2]], $v0, [[COPY]], 5 /* e32 */, 1 /* ta, mu */ :: (load unknown-size from %ir.p, align 8)
-; CHECK-NEXT: VS1R_V killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]] :: (store (<vscale x 1 x s64>) into %ir.p)
+; CHECK-NEXT: PseudoVSE32_V_M1 killed [[PseudoVLE32_V_M1_MASK]], [[COPY2]], -1, 5 /* e32 */ :: (store (<vscale x 1 x s64>) into %ir.p)
 ; CHECK-NEXT: PseudoRET
   %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
   %b = call <vscale x 2 x i32> @llvm.vp.select.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)

llvm/test/CodeGen/RISCV/rvv/vpload.ll

Lines changed: 11 additions & 0 deletions
@@ -536,3 +536,14 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
   store <vscale x 1 x double> %hi, ptr %out
   ret <vscale x 16 x double> %lo
 }
+
+define <vscale x 8 x i8> @vpload_all_active_nxv8i8(ptr %ptr) {
+; CHECK-LABEL: vpload_all_active_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %evl = mul i32 %vscale, 8
+  %load = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 8 x i8> %load
+}

llvm/test/CodeGen/RISCV/rvv/vpstore.ll

Lines changed: 11 additions & 0 deletions
@@ -459,3 +459,14 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
   call void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 %evl)
   ret void
 }
+
+define void @vpstore_all_active_nxv8i8(<vscale x 8 x i8> %val, ptr %ptr) {
+; CHECK-LABEL: vpstore_all_active_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    ret
+  %vscale = call i32 @llvm.vscale()
+  %evl = mul i32 %vscale, 8
+  call void @llvm.vp.store.nxv8i8.p0(<vscale x 8 x i8> %val, ptr %ptr, <vscale x 8 x i1> splat (i1 true), i32 %evl)
+  ret void
+}

llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll

Lines changed: 8 additions & 32 deletions
@@ -487,42 +487,18 @@ define <vscale x 8 x double> @vfmerge_nzv_nxv8f64(<vscale x 8 x double> %va, <vs
 define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %va, <vscale x 16 x double> %vb) {
 ; CHECK-LABEL: vselect_combine_regression:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmseq.vi v24, v16, 0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmseq.vi v0, v16, 0
+; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    vmseq.vi v7, v24, 0
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vle64.v v8, (a0), v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vle64.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
   %cond = icmp eq <vscale x 16 x i64> %va, zeroinitializer
   %sel = select <vscale x 16 x i1> %cond, <vscale x 16 x double> %vb, <vscale x 16 x double> zeroinitializer
