Skip to content

Commit 4dc9a2c

Browse files
committed
[RISCV] Use scalar stores for splats of zero to memory up to XLen
The direct motivation here is to undo an unprofitable vectorization performed by the SLP vectorizer, but the transform seems generally useful as well. If we are storing a zero to memory and the access is sufficiently aligned, we can use a single scalar store (from X0) for all power-of-two sizes up to XLen bits. Differential Revision: https://reviews.llvm.org/D150717
1 parent 6a81e69 commit 4dc9a2c

File tree

3 files changed

+103
-55
lines changed

3 files changed

+103
-55
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12170,7 +12170,33 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
1217012170
break;
1217112171

1217212172
auto *Store = cast<StoreSDNode>(N);
12173+
EVT MemVT = Store->getMemoryVT();
1217312174
SDValue Val = Store->getValue();
12175+
12176+
// Using vector to store zeros requires e.g.:
12177+
// vsetivli zero, 2, e64, m1, ta, ma
12178+
// vmv.v.i v8, 0
12179+
// vse64.v v8, (a0)
12180+
// If sufficiently aligned, we can use at most one scalar store to zero
12181+
// initialize any power-of-two size up to XLen bits.
12182+
if (DCI.isBeforeLegalize() && !Store->isTruncatingStore() &&
12183+
!Store->isIndexed() && ISD::isBuildVectorAllZeros(Val.getNode()) &&
12184+
MemVT.getVectorElementType().bitsLE(Subtarget.getXLenVT()) &&
12185+
isPowerOf2_64(MemVT.getSizeInBits()) &&
12186+
MemVT.getSizeInBits() <= Subtarget.getXLen()) {
12187+
assert(!MemVT.isScalableVector());
12188+
auto NewVT = MVT::getIntegerVT(MemVT.getSizeInBits());
12189+
if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12190+
NewVT, *Store->getMemOperand())) {
12191+
SDLoc DL(N);
12192+
SDValue Chain = Store->getChain();
12193+
auto NewV = DAG.getConstant(0, DL, NewVT);
12194+
return DAG.getStore(Chain, DL, NewV, Store->getBasePtr(),
12195+
Store->getPointerInfo(), Store->getOriginalAlign(),
12196+
Store->getMemOperand()->getFlags());
12197+
}
12198+
}
12199+
1217412200
// Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1.
1217512201
// vfmv.f.s is represented as extract element from 0. Match it late to avoid
1217612202
// any illegal types.
@@ -12180,7 +12206,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
1218012206
isNullConstant(Val.getOperand(1)))) {
1218112207
SDValue Src = Val.getOperand(0);
1218212208
MVT VecVT = Src.getSimpleValueType();
12183-
EVT MemVT = Store->getMemoryVT();
1218412209
// VecVT should be scalable and memory VT should match the element type.
1218512210
if (VecVT.isScalableVector() &&
1218612211
MemVT == VecVT.getVectorElementType()) {

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll

Lines changed: 71 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -450,9 +450,7 @@ define void @splat_zero_v4i64(ptr %x) {
450450
define void @splat_zero_v2i16(ptr %p) {
451451
; CHECK-LABEL: splat_zero_v2i16:
452452
; CHECK: # %bb.0:
453-
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
454-
; CHECK-NEXT: vmv.v.i v8, 0
455-
; CHECK-NEXT: vse16.v v8, (a0)
453+
; CHECK-NEXT: sw zero, 0(a0)
456454
; CHECK-NEXT: ret
457455
store <2 x i16> zeroinitializer, ptr %p
458456
ret void
@@ -471,23 +469,81 @@ define void @splat_zero_v2i16_unaligned(ptr %p) {
471469
}
472470

473471
define void @splat_zero_v4i16(ptr %p) {
474-
; CHECK-LABEL: splat_zero_v4i16:
475-
; CHECK: # %bb.0:
476-
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
477-
; CHECK-NEXT: vmv.v.i v8, 0
478-
; CHECK-NEXT: vse16.v v8, (a0)
479-
; CHECK-NEXT: ret
472+
; LMULMAX8-RV32-LABEL: splat_zero_v4i16:
473+
; LMULMAX8-RV32: # %bb.0:
474+
; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
475+
; LMULMAX8-RV32-NEXT: vmv.v.i v8, 0
476+
; LMULMAX8-RV32-NEXT: vse16.v v8, (a0)
477+
; LMULMAX8-RV32-NEXT: ret
478+
;
479+
; LMULMAX2-RV32-LABEL: splat_zero_v4i16:
480+
; LMULMAX2-RV32: # %bb.0:
481+
; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
482+
; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0
483+
; LMULMAX2-RV32-NEXT: vse16.v v8, (a0)
484+
; LMULMAX2-RV32-NEXT: ret
485+
;
486+
; LMULMAX1-RV32-LABEL: splat_zero_v4i16:
487+
; LMULMAX1-RV32: # %bb.0:
488+
; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
489+
; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
490+
; LMULMAX1-RV32-NEXT: vse16.v v8, (a0)
491+
; LMULMAX1-RV32-NEXT: ret
492+
;
493+
; LMULMAX8-RV64-LABEL: splat_zero_v4i16:
494+
; LMULMAX8-RV64: # %bb.0:
495+
; LMULMAX8-RV64-NEXT: sd zero, 0(a0)
496+
; LMULMAX8-RV64-NEXT: ret
497+
;
498+
; LMULMAX2-RV64-LABEL: splat_zero_v4i16:
499+
; LMULMAX2-RV64: # %bb.0:
500+
; LMULMAX2-RV64-NEXT: sd zero, 0(a0)
501+
; LMULMAX2-RV64-NEXT: ret
502+
;
503+
; LMULMAX1-RV64-LABEL: splat_zero_v4i16:
504+
; LMULMAX1-RV64: # %bb.0:
505+
; LMULMAX1-RV64-NEXT: sd zero, 0(a0)
506+
; LMULMAX1-RV64-NEXT: ret
480507
store <4 x i16> zeroinitializer, ptr %p
481508
ret void
482509
}
483510

484511
define void @splat_zero_v2i32(ptr %p) {
485-
; CHECK-LABEL: splat_zero_v2i32:
486-
; CHECK: # %bb.0:
487-
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
488-
; CHECK-NEXT: vmv.v.i v8, 0
489-
; CHECK-NEXT: vse32.v v8, (a0)
490-
; CHECK-NEXT: ret
512+
; LMULMAX8-RV32-LABEL: splat_zero_v2i32:
513+
; LMULMAX8-RV32: # %bb.0:
514+
; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
515+
; LMULMAX8-RV32-NEXT: vmv.v.i v8, 0
516+
; LMULMAX8-RV32-NEXT: vse32.v v8, (a0)
517+
; LMULMAX8-RV32-NEXT: ret
518+
;
519+
; LMULMAX2-RV32-LABEL: splat_zero_v2i32:
520+
; LMULMAX2-RV32: # %bb.0:
521+
; LMULMAX2-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
522+
; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0
523+
; LMULMAX2-RV32-NEXT: vse32.v v8, (a0)
524+
; LMULMAX2-RV32-NEXT: ret
525+
;
526+
; LMULMAX1-RV32-LABEL: splat_zero_v2i32:
527+
; LMULMAX1-RV32: # %bb.0:
528+
; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
529+
; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
530+
; LMULMAX1-RV32-NEXT: vse32.v v8, (a0)
531+
; LMULMAX1-RV32-NEXT: ret
532+
;
533+
; LMULMAX8-RV64-LABEL: splat_zero_v2i32:
534+
; LMULMAX8-RV64: # %bb.0:
535+
; LMULMAX8-RV64-NEXT: sd zero, 0(a0)
536+
; LMULMAX8-RV64-NEXT: ret
537+
;
538+
; LMULMAX2-RV64-LABEL: splat_zero_v2i32:
539+
; LMULMAX2-RV64: # %bb.0:
540+
; LMULMAX2-RV64-NEXT: sd zero, 0(a0)
541+
; LMULMAX2-RV64-NEXT: ret
542+
;
543+
; LMULMAX1-RV64-LABEL: splat_zero_v2i32:
544+
; LMULMAX1-RV64: # %bb.0:
545+
; LMULMAX1-RV64-NEXT: sd zero, 0(a0)
546+
; LMULMAX1-RV64-NEXT: ret
491547
store <2 x i32> zeroinitializer, ptr %p
492548
ret void
493549
}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll

Lines changed: 6 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,7 @@ define void @splat_ones_v1i1(ptr %x) {
2626
define void @splat_zeros_v2i1(ptr %x) {
2727
; CHECK-LABEL: splat_zeros_v2i1:
2828
; CHECK: # %bb.0:
29-
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
30-
; CHECK-NEXT: vmclr.m v0
31-
; CHECK-NEXT: vmv.v.i v8, 0
32-
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
33-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
34-
; CHECK-NEXT: vmv.v.i v9, 0
35-
; CHECK-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
36-
; CHECK-NEXT: vslideup.vi v9, v8, 0
37-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
38-
; CHECK-NEXT: vmsne.vi v8, v9, 0
39-
; CHECK-NEXT: vsm.v v8, (a0)
29+
; CHECK-NEXT: sb zero, 0(a0)
4030
; CHECK-NEXT: ret
4131
store <2 x i1> zeroinitializer, ptr %x
4232
ret void
@@ -135,9 +125,7 @@ define void @splat_v4i1(ptr %x, i1 %y) {
135125
define void @splat_zeros_v8i1(ptr %x) {
136126
; CHECK-LABEL: splat_zeros_v8i1:
137127
; CHECK: # %bb.0:
138-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
139-
; CHECK-NEXT: vmclr.m v8
140-
; CHECK-NEXT: vsm.v v8, (a0)
128+
; CHECK-NEXT: sb zero, 0(a0)
141129
; CHECK-NEXT: ret
142130
store <8 x i1> zeroinitializer, ptr %x
143131
ret void
@@ -185,31 +173,10 @@ define void @splat_v16i1(ptr %x, i1 %y) {
185173
}
186174

187175
define void @splat_zeros_v32i1(ptr %x) {
188-
; LMULMAX2-LABEL: splat_zeros_v32i1:
189-
; LMULMAX2: # %bb.0:
190-
; LMULMAX2-NEXT: li a1, 32
191-
; LMULMAX2-NEXT: vsetvli zero, a1, e8, m2, ta, ma
192-
; LMULMAX2-NEXT: vmclr.m v8
193-
; LMULMAX2-NEXT: vsm.v v8, (a0)
194-
; LMULMAX2-NEXT: ret
195-
;
196-
; LMULMAX1-RV32-LABEL: splat_zeros_v32i1:
197-
; LMULMAX1-RV32: # %bb.0:
198-
; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
199-
; LMULMAX1-RV32-NEXT: vmclr.m v8
200-
; LMULMAX1-RV32-NEXT: vsm.v v8, (a0)
201-
; LMULMAX1-RV32-NEXT: addi a0, a0, 2
202-
; LMULMAX1-RV32-NEXT: vsm.v v8, (a0)
203-
; LMULMAX1-RV32-NEXT: ret
204-
;
205-
; LMULMAX1-RV64-LABEL: splat_zeros_v32i1:
206-
; LMULMAX1-RV64: # %bb.0:
207-
; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
208-
; LMULMAX1-RV64-NEXT: vmclr.m v8
209-
; LMULMAX1-RV64-NEXT: vsm.v v8, (a0)
210-
; LMULMAX1-RV64-NEXT: addi a0, a0, 2
211-
; LMULMAX1-RV64-NEXT: vsm.v v8, (a0)
212-
; LMULMAX1-RV64-NEXT: ret
176+
; CHECK-LABEL: splat_zeros_v32i1:
177+
; CHECK: # %bb.0:
178+
; CHECK-NEXT: sw zero, 0(a0)
179+
; CHECK-NEXT: ret
213180
store <32 x i1> zeroinitializer, ptr %x
214181
ret void
215182
}

0 commit comments

Comments
 (0)