Skip to content

Commit 16f895c

Browse files
committed
[RISCV] Add +optimized-nf{3,4}-segment-load-store
This is a follow up to llvm#111511, where after benchmarking we learnt that the Banana Pi F3 has fast segmented loads for not just NF=2, but also NF=3 and NF=4: https://github.com/preames/bp3-microarch#vlseg_lmul_x_sew_throughput This adds a tuning feature to allow these segment loads and stores to be costed cheaper and enables it for the spacemit-x60.
1 parent 6da5968 commit 16f895c

File tree

4 files changed

+62
-26
lines changed

4 files changed

+62
-26
lines changed

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,13 @@ def TuneOptimizedZeroStrideLoad
13861386
"true", "Optimized (perform fewer memory operations)"
13871387
"zero-stride vector load">;
13881388

1389+
foreach nf = {3-4} in
1390+
def TuneOptimizedNF#nf#SegmentLoadStore :
1391+
SubtargetFeature<"optimized-nf"#nf#"-segment-load-store",
1392+
"HasOptimizedNF"#nf#"SegmentLoadStore",
1393+
"true", "vlseg"#nf#"eN.v and vsseg"#nf#"eN.v are"
1394+
"implemented as a wide memory op and shuffle">;
1395+
13891396
def Experimental
13901397
: SubtargetFeature<"experimental", "HasExperimental",
13911398
"true", "Experimental intrinsics">;

llvm/lib/Target/RISCV/RISCVProcessors.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,9 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
472472
FeatureStdExtZvfh,
473473
FeatureStdExtZvkt,
474474
FeatureStdExtZvl256b]),
475-
[TuneDLenFactor2]>;
475+
[TuneDLenFactor2,
476+
TuneOptimizedNF3SegmentLoadStore,
477+
TuneOptimizedNF4SegmentLoadStore]>;
476478

477479
def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3",
478480
NoSchedModel,

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -738,8 +738,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
738738
AddressSpace, DL)) {
739739

740740
// Most available hardware today optimizes NF=2 as as one wide memory op
741-
// + Factor * LMUL shuffle ops.
742-
if (Factor == 2) {
741+
// + Factor * LMUL shuffle ops. Some processors may also optimize NF=3
742+
// and NF=4.
743+
if (Factor == 2 ||
744+
(Factor == 3 && ST->hasOptimizedNF3SegmentLoadStore()) ||
745+
(Factor == 4 && ST->hasOptimizedNF4SegmentLoadStore())) {
743746
InstructionCost Cost =
744747
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
745748
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll

Lines changed: 47 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; REQUIRES: asserts
2-
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
2+
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,NO-OPT
3+
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf3-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF3
4+
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf4-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF4
35

46
%i8.2 = type {i8, i8}
57
define void @i8_factor_2(ptr %data, i64 %n) {
@@ -48,17 +50,28 @@ for.end:
4850
define void @i8_factor_3(ptr %data, i64 %n) {
4951
entry:
5052
br label %for.body
51-
; CHECK-LABEL: Checking a loop in 'i8_factor_3'
52-
; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
53-
; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
54-
; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
55-
; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
56-
; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
57-
; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
58-
; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
59-
; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
60-
; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
61-
; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
53+
; OPT-NF3-LABEL: Checking a loop in 'i8_factor_3'
54+
; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
55+
; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
56+
; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
57+
; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
58+
; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
59+
; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
60+
; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
61+
; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
62+
; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
63+
; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
64+
; NO-OPT-LABEL: Checking a loop in 'i8_factor_3'
65+
; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
66+
; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
67+
; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
68+
; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
69+
; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
70+
; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
71+
; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
72+
; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
73+
; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0>
74+
; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at <badref>, ir<%p0>
6275
for.body:
6376
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
6477
%p0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -85,17 +98,28 @@ for.end:
8598
define void @i8_factor_4(ptr %data, i64 %n) {
8699
entry:
87100
br label %for.body
88-
; CHECK-LABEL: Checking a loop in 'i8_factor_4'
89-
; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
90-
; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
91-
; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
92-
; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
93-
; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
94-
; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
95-
; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
96-
; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
97-
; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
98-
; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
101+
; OPT-NF4-LABEL: Checking a loop in 'i8_factor_4'
102+
; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
103+
; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
104+
; OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
105+
; OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
106+
; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
107+
; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
108+
; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
109+
; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
110+
; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
111+
; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
112+
; NO-OPT-LABEL: Checking a loop in 'i8_factor_4'
113+
; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
114+
; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
115+
; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
116+
; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
117+
; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
118+
; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
119+
; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
120+
; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
121+
; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0>
122+
; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at <badref>, ir<%p0>
99123
for.body:
100124
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
101125
%p0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0

0 commit comments

Comments
 (0)