
Commit b7df650

[RISCV] Prefer VLS over VLA if costs are equal
This is inspired by llvm#95819. Some kernels, such as s000, show improvements: we can remove the code that computes the vector length and fully unroll the tail epilogue. For now this adds a SubtargetFeature, which processors can enable if it benefits them.
1 parent a466db2 commit b7df650
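
To illustrate the behavior the message describes: the vectorizer's cost comparison normally breaks a cost tie in favor of a scalable VF, and this feature flips that tie-break. The sketch below is a simplified, self-contained rendering of that comparison; the names and the exact rule are assumptions modeled on the hook introduced in llvm#95819, not a copy of the in-tree code.

// Simplified sketch of the VF tie-break this feature controls; names and
// structure are approximations of the cost-model logic from llvm#95819.
#include <cstdint>

struct VFCandidate {
  uint64_t Cost;    // cost-model estimate for the candidate vector loop
  bool IsScalable;  // true for a VLA VF (<vscale x N x ...>), false for VLS
};

// Returns true if candidate A should be picked over candidate B.
bool isMoreProfitable(const VFCandidate &A, const VFCandidate &B,
                      bool PreferFixedOverScalableIfEqualCost) {
  // By default a scalable VF wins a cost tie against a fixed VF, since
  // vscale may be larger than 1 at run time. With the new subtarget
  // feature, the tie goes to the fixed VF instead.
  bool PreferScalable = !PreferFixedOverScalableIfEqualCost &&
                        A.IsScalable && !B.IsScalable;
  return PreferScalable ? A.Cost <= B.Cost : A.Cost < B.Cost;
}

In practice the feature is toggled the same way as any other subtarget attribute, e.g. via -mattr=+use-fixed-over-scalable-if-equal-cost, as the RUN lines in the new test below do.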

3 files changed: +175 -0 lines changed

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 6 additions & 0 deletions
@@ -1324,6 +1324,12 @@ def FeaturePredictableSelectIsExpensive
     : SubtargetFeature<"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
                        "Prefer likely predicted branches over selects">;
 
+def FeatureUseFixedOverScalableIfEqualCost
+    : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
+                       "UseFixedOverScalableIfEqualCost", "true",
+                       "Prefer fixed width loop vectorization over scalable "
+                       "if the cost-model assigns equal costs">;
+
 def TuneOptimizedZeroStrideLoad
     : SubtargetFeature<"optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
                        "true", "Optimized (perform fewer memory operations)"

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
@@ -342,6 +342,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool enableInterleavedAccessVectorization() { return true; }
 
+  bool preferFixedOverScalableIfEqualCost() const {
+    return ST->useFixedOverScalableIfEqualCost();
+  }
+
   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
   unsigned getNumberOfRegisters(unsigned ClassID) const {
     switch (ClassID) {
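
This override shadows the generic TTI hook, which (going by llvm#95819) presumably defaults to false, so targets that never set the feature keep the scalable-favoring tie-break. A minimal sketch of that assumed default:

// Assumed default in the generic TTI layer (per llvm#95819): without an
// override, ties between fixed and scalable VFs keep favoring scalable.
struct TTIDefaultSketch {
  bool preferFixedOverScalableIfEqualCost() const { return false; }
};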
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple riscv64 -S -passes=loop-vectorize -force-target-instruction-cost=1 < %s \
+; RUN:   -mattr=+v | FileCheck %s -check-prefix=SCALABLE
+; RUN: opt -mtriple riscv64 -S -passes=loop-vectorize -force-target-instruction-cost=1 < %s \
+; RUN:   -mattr=+v,+use-fixed-over-scalable-if-equal-cost \
+; RUN:   | FileCheck %s -check-prefix=FIXED
+
+define void @s000(ptr %a, ptr %b, i32 %n) {
+; SCALABLE-LABEL: define void @s000(
+; SCALABLE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; SCALABLE-NEXT: [[ENTRY:.*:]]
+; SCALABLE-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; SCALABLE-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; SCALABLE-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; SCALABLE-NEXT: br i1 [[CMP6]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; SCALABLE: [[FOR_BODY_PREHEADER]]:
+; SCALABLE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 8, i64 [[TMP1]])
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP2]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; SCALABLE: [[VECTOR_MEMCHECK]]:
+; SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; SCALABLE-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B2]]
+; SCALABLE-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]]
+; SCALABLE-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; SCALABLE: [[VECTOR_PH]]:
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP8]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; SCALABLE-NEXT: br label %[[VECTOR_BODY:.*]]
+; SCALABLE: [[VECTOR_BODY]]:
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]]
+; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4
+; SCALABLE-NEXT: [[TMP14:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0
+; SCALABLE-NEXT: store <vscale x 4 x float> [[TMP14]], ptr [[TMP16]], align 4
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALABLE: [[MIDDLE_BLOCK]]:
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; SCALABLE: [[SCALAR_PH]]:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
+; SCALABLE: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; SCALABLE-NEXT: br label %[[FOR_COND_CLEANUP]]
+; SCALABLE: [[FOR_COND_CLEANUP]]:
+; SCALABLE-NEXT: ret void
+; SCALABLE: [[FOR_BODY]]:
+; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; SCALABLE-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; SCALABLE-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], 1.000000e+00
+; SCALABLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; SCALABLE-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+; FIXED-LABEL: define void @s000(
+; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; FIXED-NEXT: [[ENTRY:.*:]]
+; FIXED-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64
+; FIXED-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64
+; FIXED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N]], 0
+; FIXED-NEXT: br i1 [[CMP6]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; FIXED: [[FOR_BODY_PREHEADER]]:
+; FIXED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; FIXED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; FIXED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; FIXED: [[VECTOR_MEMCHECK]]:
+; FIXED-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]]
+; FIXED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
+; FIXED-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; FIXED: [[VECTOR_PH]]:
+; FIXED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; FIXED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; FIXED-NEXT: br label %[[VECTOR_BODY:.*]]
+; FIXED: [[VECTOR_BODY]]:
+; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
+; FIXED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
+; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
+; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0
+; FIXED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8
+; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; FIXED-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; FIXED-NEXT: [[TMP8:%.*]] = fadd <8 x float> [[WIDE_LOAD3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; FIXED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]]
+; FIXED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP2]]
+; FIXED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; FIXED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 8
+; FIXED-NEXT: store <8 x float> [[TMP7]], ptr [[TMP11]], align 4
+; FIXED-NEXT: store <8 x float> [[TMP8]], ptr [[TMP12]], align 4
+; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FIXED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; FIXED: [[MIDDLE_BLOCK]]:
+; FIXED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; FIXED-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; FIXED: [[SCALAR_PH]]:
+; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; FIXED-NEXT: br label %[[FOR_BODY:.*]]
+; FIXED: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; FIXED-NEXT: br label %[[FOR_COND_CLEANUP]]
+; FIXED: [[FOR_COND_CLEANUP]]:
+; FIXED-NEXT: ret void
+; FIXED: [[FOR_BODY]]:
+; FIXED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
+; FIXED-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; FIXED-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], 1.000000e+00
+; FIXED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; FIXED-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4
+; FIXED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; SCALABLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; SCALABLE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; SCALABLE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; SCALABLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
+; FIXED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; FIXED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; FIXED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; FIXED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.

Note: the IR body lines above are prefixed per the unified diff (the whole file is new), with standard block indentation restored; no CHECK or IR content was altered.
