[RISCV] Prefer strided store for interleave store with one lane active #119027
Conversation
If we're performing a segment store and all but one of the segments are undefined, that's equivalent to performing a strided store of the one active segment. This is the store side of a905203. As before, this only covers fixed vectors.
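As a rough sketch of the equivalence (function names here are illustrative, and RV64 is assumed so the stride operand is an i64), the factor-4 i32 case from the tests below goes from a spread shuffle feeding a wide store to a single strided store of the active segment:

```llvm
; Before: a factor-4 interleave store where only segment 0 is defined.
define void @store_one_active(ptr %p, <4 x i32> %v) {
  %spread = shufflevector <4 x i32> %v, <4 x i32> poison,
      <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                  i32 1, i32 undef, i32 undef, i32 undef,
                  i32 2, i32 undef, i32 undef, i32 undef,
                  i32 3, i32 undef, i32 undef, i32 undef>
  store <16 x i32> %spread, ptr %p
  ret void
}

; After: the lowering instead emits a strided store of the one active
; segment, with stride = Factor * sizeof(i32) = 16 bytes.
define void @store_one_active_strided(ptr %p, <4 x i32> %v) {
  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
      <4 x i32> %v, ptr %p, i64 16,
      <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret void
}

declare void @llvm.experimental.vp.strided.store.v4i32.p0.i64(<4 x i32>, ptr, i64, <4 x i1>, i32)
```

The stride is Factor * sizeof(element) bytes, and a nonzero active lane only changes the base pointer, not the stride.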
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes

If we're performing a segment store and all but one of the segments are undefined, that's equivalent to performing a strided store of the one active segment. This is the store side of a905203. As before, this only covers fixed vectors.

Full diff: https://github.com/llvm/llvm-project/pull/119027.diff

2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 83308682835394..743d68523bff76 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21906,6 +21906,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
IRBuilder<> Builder(SI);
+ auto Mask = SVI->getShuffleMask();
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
@@ -21917,11 +21918,35 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+ unsigned Index;
+ // If the segment store only has one active lane (i.e. the interleave is
+ // just a spread shuffle), we can use a strided store instead. This will
+ // be equally fast, and create less vector register pressure.
+ if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
+ isSpreadMask(Mask, Factor, Index)) {
+ unsigned ScalarSizeInBytes = ShuffleVTy->getScalarSizeInBits() / 8;
+ Value *Data = SVI->getOperand(0);
+ auto *DataVTy = cast<FixedVectorType>(Data->getType());
+ Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+ Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
+ Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
+ Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
+ Value *VL = Builder.getInt32(VTy->getNumElements());
+
+ CallInst *CI =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
+ {Data->getType(), BasePtr->getType(), Stride->getType()},
+ {Data, BasePtr, Stride, Mask, VL});
+ CI->addParamAttr(
+ 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+
+ return true;
+ }
+
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
SI->getModule(), FixedVssegIntrIds[Factor - 2],
{VTy, SI->getPointerOperandType(), XLenTy});
- auto Mask = SVI->getShuffleMask();
SmallVector<Value *, 10> Ops;
for (unsigned i = 0; i < Factor; i++) {
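Two conditions gate the new path above: the subtarget must not already provide optimized segment load/store support for this Factor (otherwise the existing vsseg lowering is kept), and isSpreadMask must match the shuffle. Judging from its use here and from the load-side change in a905203, isSpreadMask presumably recognizes masks that place element i of the input at position i * Factor + Index for a single lane Index, with every other position undef, and reports that Index through the out parameter; that Index then feeds the base-pointer offset, while the stride covers a full group of Factor elements.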
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 5649ee20a47092..8833634be1a0ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1285,17 +1285,55 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
ret void
}
-; TODO: This should be a strided store
-define void @store_factor4_one_active_storeback(ptr %ptr, <4 x i32> %v) {
-; CHECK-LABEL: store_factor4_one_active_storeback:
+define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active:
; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 16
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v9, v8, 1
-; CHECK-NEXT: vmv.v.v v10, v9
-; CHECK-NEXT: vmv.v.v v11, v9
-; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: vsse32.v v8, (a0), a1
; CHECK-NEXT: ret
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
store <16 x i32> %v0, ptr %ptr
ret void
}
+
+define void @store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_idx1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, 4
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vsse32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef>
+ store <16 x i32> %v0, ptr %ptr
+ ret void
+}
+
+define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_fullwidth:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT: vsse32.v v8, (a0), a1
+; CHECK-NEXT: ret
+ %v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+ store <16 x i32> %v0, ptr %ptr
+ ret void
+}
+
+; TODO: This could be a vslidedown followed by a strided store
+define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_slidedown:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 1
+; CHECK-NEXT: vslideup.vi v10, v8, 1
+; CHECK-NEXT: vmv.v.v v11, v10
+; CHECK-NEXT: vmv.v.v v12, v10
+; CHECK-NEXT: vsseg4e32.v v9, (a0)
+; CHECK-NEXT: ret
+ %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
+ store <16 x i32> %v0, ptr %ptr
+ ret void
+}
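Connecting the generated code back to the lowering: with Factor = 4 and i32 elements, Stride = Factor * ScalarSizeInBytes = 4 * 4 = 16 bytes, which is the `li a1, 16` feeding `vsse32.v` in the checks above. In the idx1 variant the active lane is Index = 1, so the base pointer is advanced by Index * ScalarSizeInBytes = 4 bytes, hence the `addi a0, a0, 4`. The slidedown case is presumably not matched because its shuffle reads source elements starting at 1 rather than 0, so it is not a spread mask as written; per the TODO, a vslidedown of the source followed by the same strided store could handle it.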
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM