Skip to content

Commit ed612d2

Browse files
committed
Add tuning feature for p470 and p670
1 parent 297bfa2 commit ed612d2

File tree

4 files changed

+31
-6
lines changed

4 files changed

+31
-6
lines changed

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1357,6 +1357,11 @@ def TuneOptimizedZeroStrideLoad
13571357
"true", "Optimized (perform fewer memory operations)"
13581358
"zero-stride vector load">;
13591359

1360+
def TuneOptimizedVectorGather
1361+
: SubtargetFeature<"optimized-vector-gather", "HasOptimizedVectorGather",
1362+
"true", "At LMUL > 1 vrgather.vv doesn't read from"
1363+
" registers that have no indices">;
1364+
13601365
def Experimental
13611366
: SubtargetFeature<"experimental", "HasExperimental",
13621367
"true", "Experimental intrinsics">;

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10331,10 +10331,11 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
1033110331
MVT XLenVT = Subtarget.getXLenVT();
1033210332
auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
1033310333

10334-
// On most uarchs vrgather.vv is quadratic in LMUL because each output
10335-
// register may read from LMUL registers. However to reverse a vector each
10336-
// output register only needs to read from one register. So decompose it into
10337-
// LMUL * M1 vrgather.vvs, so we get O(LMUL) performance instead of O(LMUL^2).
10334+
// On some uarchs vrgather.vv will read from every input register for each
10335+
// output register, regardless of the indices. However to reverse a vector
10336+
// each output register only needs to read from one register. So decompose it
10337+
// into LMUL * M1 vrgather.vvs, so we get O(LMUL) performance instead of
10338+
// O(LMUL^2).
1033810339
//
1033910340
// vsetvli a1, zero, e64, m4, ta, ma
1034010341
// vrgatherei16.vv v12, v8, v16
@@ -10344,7 +10345,8 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
1034410345
// vrgather.vv v14, v9, v16
1034510346
// vrgather.vv v13, v10, v16
1034610347
// vrgather.vv v12, v11, v16
10347-
if (ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
10348+
if (!Subtarget.hasOptimizedVectorGather() &&
10349+
ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
1034810350
ContainerVT.getVectorElementCount().isKnownMultipleOf(2)) {
1034910351
auto [Lo, Hi] = DAG.SplitVector(Vec, DL);
1035010352
Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, Lo.getSimpleValueType(), Lo);

llvm/lib/Target/RISCV/RISCVProcessors.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,8 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model,
269269
FeatureUnalignedScalarMem,
270270
FeatureUnalignedVectorMem]),
271271
!listconcat(SiFiveP400TuneFeatures,
272-
[TuneNoSinkSplatOperands])>;
272+
[TuneNoSinkSplatOperands,
273+
TuneOptimizedVectorGather])>;
273274

274275

275276
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
@@ -290,6 +291,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
290291
TuneLUIADDIFusion,
291292
TuneAUIPCADDIFusion,
292293
TuneNoSinkSplatOperands,
294+
TuneOptimizedVectorGather,
293295
FeaturePostRAScheduler]>;
294296

295297
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",

llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2014,3 +2014,19 @@ declare <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double>
20142014
declare <vscale x 3 x i64> @llvm.vector.reverse.nxv3i64(<vscale x 3 x i64>)
20152015
declare <vscale x 6 x i64> @llvm.vector.reverse.nxv6i64(<vscale x 6 x i64>)
20162016
declare <vscale x 12 x i64> @llvm.vector.reverse.nxv12i64(<vscale x 12 x i64>)
2017+
2018+
define <vscale x 8 x i64> @reverse_nxv8i64_optimized_vector_gather(<vscale x 8 x i64> %a) "target-features"="+optimized-vector-gather" {
2019+
; CHECK-LABEL: reverse_nxv8i64_optimized_vector_gather:
2020+
; CHECK: # %bb.0:
2021+
; CHECK-NEXT: csrr a0, vlenb
2022+
; CHECK-NEXT: addi a0, a0, -1
2023+
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
2024+
; CHECK-NEXT: vid.v v16
2025+
; CHECK-NEXT: vrsub.vx v24, v16, a0
2026+
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
2027+
; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
2028+
; CHECK-NEXT: vmv.v.v v8, v16
2029+
; CHECK-NEXT: ret
2030+
%res = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
2031+
ret <vscale x 8 x i64> %res
2032+
}

0 commit comments

Comments
 (0)