Commit 95ce3c2

[RISCV] Be more aggressive about shrinking constant build_vector etype (llvm#67175)
If LMUL is greater than m1, we can be more aggressive about narrowing the build_vector via a vsext when that is legal. If the narrow build_vector gets lowered as a load, then while both forms are linear in LMUL, load uops are generally more expensive than extend uops. If the narrow build_vector gets lowered via dominant values, that work is linear in both the number of unique elements and LMUL. So provided the number of unique values is greater than 2, this is a net win in work performed.
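
To make that work comparison concrete, here is a standalone, illustrative-only C++ sketch of the argument in the message above. The per-LMUL uop weights and the i64-to-i8 narrowing factor are assumptions chosen for illustration, not numbers taken from this patch or from any particular core.

// Illustrative-only model of the work comparison described in the commit
// message above. The uop weights are assumptions, not measurements.
#include <algorithm>
#include <cstdio>

int main() {
  const int loadUopsPerLMUL = 2;   // assumed: vector load uops cost more
  const int extendUopsPerLMUL = 1; // assumed: vsext uops are cheaper

  const int lmuls[] = {1, 2, 4, 8};
  for (int lmul : lmuls) {
    // Direct lowering: one vle at the full LMUL of the i64 build_vector.
    int direct = loadUopsPerLMUL * lmul;
    // Narrowed lowering: a vle of the <N x i8> form at roughly LMUL/8,
    // plus a vsext.vf8 whose destination is written at the full LMUL.
    int narrowed = loadUopsPerLMUL * std::max(1, lmul / 8) +
                   extendUopsPerLMUL * lmul;
    std::printf("LMUL=%d direct=%d narrowed=%d\n", lmul, direct, narrowed);
  }
  return 0;
}

Under these assumed weights the narrowed form loses slightly at LMUL=1 but wins increasingly at LMUL=2 and above, which is the shape of the trade-off the heuristic targets.
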
1 parent d374a78 commit 95ce3c2

11 files changed: +406 −283 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 8 additions & 8 deletions
@@ -3511,17 +3511,14 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
-  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
-    return Res;
-
   // If the number of signbits allows, see if we can lower as a <N x i8>.
-  // We restrict this to N <= 4 to ensure the resulting narrow vector is
-  // 32 bits of smaller and can thus be materialized cheaply from scalar.
-  // The main motivation for this is the constant index vector required
-  // by vrgather.vv. This covers all indice vectors up to size 4.
+  // Our main goal here is to reduce LMUL (and thus work) required to
+  // build the constant, but we will also narrow if the resulting
+  // narrow vector is known to materialize cheaply.
   // TODO: We really should be costing the smaller vector. There are
   // profitable cases this misses.
-  if (EltBitSize > 8 && NumElts <= 4) {
+  if (EltBitSize > 8 &&
+      (NumElts <= 4 || VT.getSizeInBits() > Subtarget.getRealMinVLen())) {
     unsigned SignBits = DAG.ComputeNumSignBits(Op);
     if (EltBitSize - SignBits < 8) {
       SDValue Source =
@@ -3533,6 +3530,9 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  if (SDValue Res = lowerBuildVectorViaDominantValues(Op, DAG, Subtarget))
+    return Res;
+
   // For constant vectors, use generic constant pool lowering. Otherwise,
   // we'd have to materialize constants in GPRs just to move them into the
   // vector.
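
As a rough illustration of the new guard (the helper below is hypothetical and not part of the patch; VLEN=128 is just an example configuration), VT.getSizeInBits() > Subtarget.getRealMinVLen() is precisely the case where the fixed vector needs more than one register, i.e. LMUL > 1, which is when narrowing pays off per the commit message.

// Hypothetical helper (not from the patch): the LMUL a fixed-width vector
// needs for a given minimum VLEN, mirroring the new
// VT.getSizeInBits() > Subtarget.getRealMinVLen() check above.
#include <cstdio>

static unsigned lmulFor(unsigned vectorBits, unsigned minVLen) {
  unsigned lmul = 1;
  while (lmul * minVLen < vectorBits)
    lmul *= 2;
  return lmul;
}

int main() {
  const unsigned MinVLen = 128; // example, e.g. a zvl128b configuration
  std::printf("<16 x i64> build_vector: LMUL=%u\n", lmulFor(16 * 64, MinVLen));
  std::printf("<16 x i8> narrowed form: LMUL=%u\n", lmulFor(16 * 8, MinVLen));
  return 0;
}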

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 33 additions & 22 deletions
@@ -106,11 +106,12 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI8_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI8_0)
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 2
@@ -125,27 +126,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
@@ -160,59 +164,66 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vsaddu.vx v16, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 10
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 12
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 14

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll

Lines changed: 16 additions & 16 deletions
@@ -950,16 +950,16 @@ define i32 @extractelt_mul_v4i32(<4 x i32> %x) {
 define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV32NOM:       # %bb.0:
-; RV32NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32NOM-NEXT:    vmv.v.i v9, 0
 ; RV32NOM-NEXT:    lui a0, %hi(.LCPI42_0)
 ; RV32NOM-NEXT:    addi a0, a0, %lo(.LCPI42_0)
-; RV32NOM-NEXT:    vle32.v v10, (a0)
-; RV32NOM-NEXT:    li a0, -1
-; RV32NOM-NEXT:    vslide1down.vx v9, v9, a0
-; RV32NOM-NEXT:    vand.vv v9, v8, v9
-; RV32NOM-NEXT:    vmulh.vv v8, v8, v10
-; RV32NOM-NEXT:    vadd.vv v8, v8, v9
+; RV32NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32NOM-NEXT:    vle32.v v9, (a0)
+; RV32NOM-NEXT:    vmulh.vv v9, v8, v9
+; RV32NOM-NEXT:    lui a0, 1044480
+; RV32NOM-NEXT:    vmv.s.x v10, a0
+; RV32NOM-NEXT:    vsext.vf4 v11, v10
+; RV32NOM-NEXT:    vand.vv v8, v8, v11
+; RV32NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV32NOM-NEXT:    lui a0, 12320
 ; RV32NOM-NEXT:    addi a0, a0, 257
 ; RV32NOM-NEXT:    vmv.s.x v9, a0
@@ -986,16 +986,16 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ;
 ; RV64NOM-LABEL: extractelt_sdiv_v4i32:
 ; RV64NOM:       # %bb.0:
-; RV64NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64NOM-NEXT:    vmv.v.i v9, 0
 ; RV64NOM-NEXT:    lui a0, %hi(.LCPI42_0)
 ; RV64NOM-NEXT:    addi a0, a0, %lo(.LCPI42_0)
-; RV64NOM-NEXT:    vle32.v v10, (a0)
-; RV64NOM-NEXT:    li a0, -1
-; RV64NOM-NEXT:    vslide1down.vx v9, v9, a0
-; RV64NOM-NEXT:    vand.vv v9, v8, v9
-; RV64NOM-NEXT:    vmulh.vv v8, v8, v10
-; RV64NOM-NEXT:    vadd.vv v8, v8, v9
+; RV64NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64NOM-NEXT:    vle32.v v9, (a0)
+; RV64NOM-NEXT:    vmulh.vv v9, v8, v9
+; RV64NOM-NEXT:    lui a0, 1044480
+; RV64NOM-NEXT:    vmv.s.x v10, a0
+; RV64NOM-NEXT:    vsext.vf4 v11, v10
+; RV64NOM-NEXT:    vand.vv v8, v8, v11
+; RV64NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV64NOM-NEXT:    lui a0, 12320
 ; RV64NOM-NEXT:    addiw a0, a0, 257
 ; RV64NOM-NEXT:    vmv.s.x v9, a0
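
For readers decoding the updated RV32NOM/RV64NOM sequence above, the snippet below is an illustrative-only C++ emulation (not from the patch) of what lui a0, 1044480 followed by vmv.s.x and vsext.vf4 materializes: the 32-bit immediate 0xFF000000 sign-extends byte-wise to the <0, 0, 0, -1> mask that the removed vmv.v.i/li/vslide1down sequence used to build.

// Illustrative only: emulate vsext.vf4 on the scalar 0xFF000000 that
// lui a0, 1044480 produces, to show it yields the mask <0, 0, 0, -1>.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Imm = 1044480u << 12; // 0xFF000000
  int32_t Mask[4];
  for (int i = 0; i < 4; ++i) {
    int8_t Byte = static_cast<int8_t>((Imm >> (8 * i)) & 0xFF); // byte/element i
    Mask[i] = Byte; // sign-extend i8 -> i32, as vsext.vf4 does per element
  }
  std::printf("<%d, %d, %d, %d>\n", Mask[0], Mask[1], Mask[2], Mask[3]);
  return 0;
}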

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll

Lines changed: 4 additions & 2 deletions
@@ -292,7 +292,8 @@ define <4 x i64> @buildvec_vid_step1_add0_v4i64() {
 ; RV32-NEXT:    lui a0, %hi(.LCPI25_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI25_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vle8.v v10, (a0)
+; RV32-NEXT:    vsext.vf4 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: buildvec_vid_step1_add0_v4i64:
@@ -309,7 +310,8 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
 ; RV32-NEXT:    lui a0, %hi(.LCPI26_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI26_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vle8.v v10, (a0)
+; RV32-NEXT:    vsext.vf4 v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: buildvec_vid_step2_add0_v4i64:
