Skip to content

Commit b44fbde

Browse files
authored
[RISCV] Tune flag for fast vrgather.vv (#124664)
Add tune knob for N*Log2(N) vrgather.vv cost.
1 parent ee09df8 commit b44fbde

File tree

7 files changed

+76
-2
lines changed

7 files changed

+76
-2
lines changed

llvm/docs/ReleaseNotes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ Changes to the RISC-V Backend
111111
extension.
112112
* Adds experimental assembler support for the Qualcomm 'Xqccmp' extension, which
113113
is a frame-pointer convention compatible version of Zcmp.
114+
* Adds a non-quadratic ``log-vrgather`` cost model for the ``vrgather.vv`` instruction.
114115

115116
Changes to the WebAssembly Backend
116117
----------------------------------

llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1445,6 +1445,10 @@ def FeatureUnalignedVectorMem
14451445
"true", "Has reasonably performant unaligned vector "
14461446
"loads and stores">;
14471447

1448+
def TuneNLogNVRGather
1449+
: SubtargetFeature<"log-vrgather", "RISCVVRGatherCostModel", "NLog2N",
1450+
"Has vrgather.vv with LMUL*log2(LMUL) latency">;
1451+
14481452
def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
14491453
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
14501454

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2869,10 +2869,19 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const {
28692869

28702870

28712871
/// Return the cost of a vrgather.vv instruction for the type VT. vrgather.vv
2872-
/// is generally quadratic in the number of vreg implied by LMUL. Note that
2872+
/// may be quadratic in the number of vreg implied by LMUL, and is assumed to
2873+
/// be by default. VRGatherCostModel reflects available options. Note that
28732874
/// operands (index and possibly mask) are handled separately.
28742875
InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const {
2875-
return getLMULCost(VT) * getLMULCost(VT);
2876+
auto LMULCost = getLMULCost(VT);
2877+
bool Log2CostModel =
2878+
Subtarget.getVRGatherCostModel() == llvm::RISCVSubtarget::NLog2N;
2879+
if (Log2CostModel && LMULCost.isValid()) {
2880+
unsigned Log = Log2_64(*LMULCost.getValue());
2881+
if (Log > 0)
2882+
return LMULCost * Log;
2883+
}
2884+
return LMULCost * LMULCost;
28762885
}
28772886

28782887
/// Return the cost of a vrgather.vi (or vx) instruction for the type VT.

llvm/lib/Target/RISCV/RISCVProcessors.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
494494
FeatureUnalignedScalarMem,
495495
FeatureUnalignedVectorMem]),
496496
[TuneNoDefaultUnroll,
497+
TuneNLogNVRGather,
497498
TuneOptimizedZeroStrideLoad,
498499
TunePostRAScheduler]>;
499500

llvm/lib/Target/RISCV/RISCVSubtarget.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,16 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
8484
VentanaVeyron,
8585
MIPSP8700,
8686
};
87+
enum RISCVVRGatherCostModelEnum : uint8_t {
88+
Quadratic,
89+
NLog2N,
90+
};
8791
// clang-format on
8892
private:
8993
virtual void anchor();
9094

9195
RISCVProcFamilyEnum RISCVProcFamily = Others;
96+
RISCVVRGatherCostModelEnum RISCVVRGatherCostModel = Quadratic;
9297

9398
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
9499
bool ATTRIBUTE = DEFAULT;
@@ -155,6 +160,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
155160
/// initializeProperties().
156161
RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; }
157162

163+
RISCVVRGatherCostModelEnum getVRGatherCostModel() const { return RISCVVRGatherCostModel; }
164+
158165
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
159166
bool GETTER() const { return ATTRIBUTE; }
160167
#include "RISCVGenSubtargetInfo.inc"

llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s
44
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE
55
; RUN: opt < %s -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE
6+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
7+
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG
68
; Check that we don't crash querying costs when vectors are not enabled.
79
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=riscv32
810

@@ -44,6 +46,24 @@ define void @general_permute_single_source() {
4446
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
4547
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
4648
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
49+
;
50+
; LOG-VRG-LABEL: 'general_permute_single_source'
51+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
52+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
53+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 9, i32 6, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
54+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
55+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 5, i32 2, i32 1, i32 0>
56+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 11, i32 11, i32 11, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
57+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 0>
58+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
59+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
60+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
61+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
62+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 12, i32 12, i32 12, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
63+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 1>
64+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
65+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 0>
66+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4767
;
4868
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
4969
%v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
@@ -133,6 +153,37 @@ define void @general_permute_two_source() {
133153
; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
134154
; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
135155
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
156+
;
157+
; LOG-VRG-LABEL: 'general_permute_two_source'
158+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
159+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
160+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
161+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
162+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
163+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
164+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
165+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
166+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
167+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
168+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
169+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
170+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 3, i32 0>
171+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
172+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
173+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
174+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 3, i32 0>
175+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
176+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
177+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
178+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 3, i32 0>
179+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
180+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
181+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
182+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 3, i32 0>
183+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
184+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> <i32 14, i32 6, i32 5, i32 4, i32 13, i32 2, i32 1, i32 0>
185+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 17, i32 11, i32 20, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
186+
; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
136187
;
137188
%v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
138189
%v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 5, i32 7, i32 1, i32 0>

llvm/test/CodeGen/RISCV/features-info.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
; CHECK: h - 'H' (Hypervisor).
3636
; CHECK: i - 'I' (Base Integer Instruction Set).
3737
; CHECK: ld-add-fusion - Enable LD+ADD macrofusion.
38+
; CHECK: log-vrgather - Has vrgather.vv with LMUL*log2(LMUL) latency
3839
; CHECK: lui-addi-fusion - Enable LUI+ADDI macro fusion.
3940
; CHECK: m - 'M' (Integer Multiplication and Division).
4041
; CHECK: mips-p8700 - MIPS p8700 processor.

0 commit comments

Comments
 (0)