Skip to content

Commit 02ad623

Browse files
authored
[RISCV] Prefer strided store for interleave store with one lane active (#119027)
If we're performing a segment store and all but one of the segments are undefined, that's equivalent to performing a strided store of the one active segment. This is the store-side counterpart of commit a905203. As before, this only covers fixed vectors.
1 parent a07e8cd commit 02ad623

File tree

2 files changed

+71
-8
lines changed

2 files changed

+71
-8
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21906,6 +21906,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
2190621906
ShuffleVectorInst *SVI,
2190721907
unsigned Factor) const {
2190821908
IRBuilder<> Builder(SI);
21909+
auto Mask = SVI->getShuffleMask();
2190921910
auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
2191021911
// Given SVI : <n*factor x ty>, then VTy : <n x ty>
2191121912
auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
@@ -21917,11 +21918,35 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
2191721918

2191821919
auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
2191921920

21921+
unsigned Index;
21922+
// If the segment store only has one active lane (i.e. the interleave is
21923+
// just a spread shuffle), we can use a strided store instead. This will
21924+
// be equally fast, and create less vector register pressure.
21925+
if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
21926+
isSpreadMask(Mask, Factor, Index)) {
21927+
unsigned ScalarSizeInBytes = ShuffleVTy->getScalarSizeInBits() / 8;
21928+
Value *Data = SVI->getOperand(0);
21929+
auto *DataVTy = cast<FixedVectorType>(Data->getType());
21930+
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
21931+
Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
21932+
Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
21933+
Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
21934+
Value *VL = Builder.getInt32(VTy->getNumElements());
21935+
21936+
CallInst *CI = Builder.CreateIntrinsic(
21937+
Intrinsic::experimental_vp_strided_store,
21938+
{Data->getType(), BasePtr->getType(), Stride->getType()},
21939+
{Data, BasePtr, Stride, Mask, VL});
21940+
CI->addParamAttr(
21941+
1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
21942+
21943+
return true;
21944+
}
21945+
2192021946
Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
2192121947
SI->getModule(), FixedVssegIntrIds[Factor - 2],
2192221948
{VTy, SI->getPointerOperandType(), XLenTy});
2192321949

21924-
auto Mask = SVI->getShuffleMask();
2192521950
SmallVector<Value *, 10> Ops;
2192621951

2192721952
for (unsigned i = 0; i < Factor; i++) {

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,17 +1285,55 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
12851285
ret void
12861286
}
12871287

1288-
; TODO: This should be a strided store
1289-
define void @store_factor4_one_active_storeback(ptr %ptr, <4 x i32> %v) {
1290-
; CHECK-LABEL: store_factor4_one_active_storeback:
1288+
define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
1289+
; CHECK-LABEL: store_factor4_one_active:
12911290
; CHECK: # %bb.0:
1291+
; CHECK-NEXT: li a1, 16
12921292
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1293-
; CHECK-NEXT: vslideup.vi v9, v8, 1
1294-
; CHECK-NEXT: vmv.v.v v10, v9
1295-
; CHECK-NEXT: vmv.v.v v11, v9
1296-
; CHECK-NEXT: vsseg4e32.v v8, (a0)
1293+
; CHECK-NEXT: vsse32.v v8, (a0), a1
12971294
; CHECK-NEXT: ret
12981295
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
12991296
store <16 x i32> %v0, ptr %ptr
13001297
ret void
13011298
}
1299+
1300+
define void @store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) {
1301+
; CHECK-LABEL: store_factor4_one_active_idx1:
1302+
; CHECK: # %bb.0:
1303+
; CHECK-NEXT: addi a0, a0, 4
1304+
; CHECK-NEXT: li a1, 16
1305+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1306+
; CHECK-NEXT: vsse32.v v8, (a0), a1
1307+
; CHECK-NEXT: ret
1308+
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef>
1309+
store <16 x i32> %v0, ptr %ptr
1310+
ret void
1311+
}
1312+
1313+
define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
1314+
; CHECK-LABEL: store_factor4_one_active_fullwidth:
1315+
; CHECK: # %bb.0:
1316+
; CHECK-NEXT: li a1, 16
1317+
; CHECK-NEXT: vsetivli zero, 4, e32, m4, ta, ma
1318+
; CHECK-NEXT: vsse32.v v8, (a0), a1
1319+
; CHECK-NEXT: ret
1320+
%v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
1321+
store <16 x i32> %v0, ptr %ptr
1322+
ret void
1323+
}
1324+
1325+
; TODO: This could be a vslidedown followed by a strided store
1326+
define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
1327+
; CHECK-LABEL: store_factor4_one_active_slidedown:
1328+
; CHECK: # %bb.0:
1329+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
1330+
; CHECK-NEXT: vslidedown.vi v9, v8, 1
1331+
; CHECK-NEXT: vslideup.vi v10, v8, 1
1332+
; CHECK-NEXT: vmv.v.v v11, v10
1333+
; CHECK-NEXT: vmv.v.v v12, v10
1334+
; CHECK-NEXT: vsseg4e32.v v9, (a0)
1335+
; CHECK-NEXT: ret
1336+
%v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
1337+
store <16 x i32> %v0, ptr %ptr
1338+
ret void
1339+
}

0 commit comments

Comments
 (0)