
Commit 131b7fe

[RISCV][VLOPT] Add support for widening integer mul-add instructions (#112219)
This adds support for these instructions and tests getOperandInfo for them as well. I think the VL on the add instruction that uses the result can be optimized further once we add support for optimizing non-VLMAX VLs.
1 parent ab0dc29 commit 131b7fe
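
The pattern this targets: an intrinsic such as @llvm.riscv.vwmacc executed at VLMAX whose only user (here a vadd) runs at a smaller VL can have its own VL shrunk to that VL. Below is a minimal standalone C++ sketch of that idea; it deliberately does not use the LLVM APIs, and the names (VecInst, tryReduceVL) are invented for illustration only.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative model only -- not the LLVM API.
constexpr uint64_t VLMAX = UINT64_MAX; // stand-in for the "VL is VLMAX" sentinel

struct VecInst {
  uint64_t VL;                  // number of elements this instruction processes
  bool Supported;               // opcode handled by the optimizer (e.g. vwmacc.*)
  std::vector<VecInst *> Users; // instructions that read the result
};

// Shrink Def's VL to the largest VL any user actually demands -- the
// high-level shape of what RISCVVLOptimizer does along a def-use chain.
bool tryReduceVL(VecInst &Def) {
  uint64_t Demanded = 0;
  for (const VecInst *U : Def.Users) {
    if (!U->Supported)
      return false;             // unknown consumer: leave the VL alone
    Demanded = std::max(Demanded, U->VL);
  }
  if (Demanded >= Def.VL)
    return false;               // nothing to shrink
  Def.VL = Demanded;            // e.g. a vwmacc at VLMAX feeding a vadd at %vl
  return true;
}

int main() {
  VecInst Add{/*VL=*/4, /*Supported=*/true, {}};            // vadd at %vl == 4
  VecInst MulAdd{/*VL=*/VLMAX, /*Supported=*/true, {&Add}}; // vwmacc at VLMAX
  return (tryReduceVL(MulAdd) && MulAdd.VL == 4) ? 0 : 1;
}

The real pass additionally consults getOperandInfo so that the producer and each use agree on EEW and EMUL; that bookkeeping is omitted from the sketch.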

File tree: 3 files changed, +156 -32 lines changed

llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp

Lines changed: 20 additions & 15 deletions
@@ -403,7 +403,19 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
   case RISCV::VWMULSU_VV:
   case RISCV::VWMULSU_VX:
   case RISCV::VWMULU_VV:
-  case RISCV::VWMULU_VX: {
+  case RISCV::VWMULU_VX:
+  // Vector Widening Integer Multiply-Add Instructions
+  // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
+  // A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
+  // is then added to the 2*SEW-bit Dest. These instructions never have a
+  // passthru operand.
+  case RISCV::VWMACCU_VV:
+  case RISCV::VWMACCU_VX:
+  case RISCV::VWMACC_VV:
+  case RISCV::VWMACC_VX:
+  case RISCV::VWMACCSU_VV:
+  case RISCV::VWMACCSU_VX:
+  case RISCV::VWMACCUS_VX: {
     unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW;
     RISCVII::VLMUL EMUL =
         IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
@@ -418,18 +430,7 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
   case RISCV::VWADD_WV:
   case RISCV::VWADD_WX:
   case RISCV::VWSUB_WV:
-  case RISCV::VWSUB_WX:
-  // Vector Widening Integer Multiply-Add Instructions
-  // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
-  // Even though the add is a 2*SEW addition, the operands of the add are the
-  // Dest which is 2*SEW and the result of the multiply which is 2*SEW.
-  case RISCV::VWMACCU_VV:
-  case RISCV::VWMACCU_VX:
-  case RISCV::VWMACC_VV:
-  case RISCV::VWMACC_VX:
-  case RISCV::VWMACCSU_VV:
-  case RISCV::VWMACCSU_VX:
-  case RISCV::VWMACCUS_VX: {
+  case RISCV::VWSUB_WX: {
     bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
     bool TwoTimes = IsMODef || IsOp1;
     unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
@@ -571,9 +572,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
   // Vector Single-Width Integer Multiply-Add Instructions
   // FIXME: Add support
   // Vector Widening Integer Multiply-Add Instructions
-  // FIXME: Add support
-  case RISCV::VWMACC_VX:
+  case RISCV::VWMACCU_VV:
   case RISCV::VWMACCU_VX:
+  case RISCV::VWMACC_VV:
+  case RISCV::VWMACC_VX:
+  case RISCV::VWMACCSU_VV:
+  case RISCV::VWMACCSU_VX:
+  case RISCV::VWMACCUS_VX:
   // Vector Integer Merge Instructions
   // FIXME: Add support
   // Vector Integer Move Instructions
llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll

Lines changed: 122 additions & 17 deletions
@@ -1248,44 +1248,149 @@ define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
   ret <vscale x 4 x i64> %2
 }
 
-define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
+; NOVLOPT-LABEL: vwmacc_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmacc_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
 ; NOVLOPT-LABEL: vwmacc_vx:
 ; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmacc.vx v10, a0, v8
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmacc.vx v8, a0, v10
 ; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
 ; NOVLOPT-NEXT: ret
 ;
 ; VLOPT-LABEL: vwmacc_vx:
 ; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmacc.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmacc.vx v8, a0, v10
 ; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
 ; VLOPT-NEXT: ret
-  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
   %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
   ret <vscale x 4 x i32> %2
 }
 
-define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
+define <vscale x 4 x i32> @vwmaccu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vv v8, v10, v11
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i32> %d, i32 %e, iXLen %vl) {
 ; NOVLOPT-LABEL: vwmaccu_vx:
 ; NOVLOPT: # %bb.0:
-; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; NOVLOPT-NEXT: vwmaccu.vx v10, a0, v8
-; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; NOVLOPT-NEXT: vadd.vv v8, v10, v10
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccu.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v12
 ; NOVLOPT-NEXT: ret
 ;
 ; VLOPT-LABEL: vwmaccu_vx:
 ; VLOPT: # %bb.0:
-; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; VLOPT-NEXT: vwmaccu.vx v10, a0, v8
+; VLOPT-NEXT: vsetvli zero, a2, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccu.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v12
+; VLOPT-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %d, iXLen %vl)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vv:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vv v8, v10, v11
+; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vv:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vv v8, v10, v11
 ; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; VLOPT-NEXT: vadd.vv v8, v10, v10
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccsu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccsu_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccsu_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccsu.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
+; VLOPT-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @vwmaccus_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; NOVLOPT-LABEL: vwmaccus_vx:
+; NOVLOPT: # %bb.0:
+; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
+; NOVLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT: vadd.vv v8, v8, v8
+; NOVLOPT-NEXT: ret
+;
+; VLOPT-LABEL: vwmaccus_vx:
+; VLOPT: # %bb.0:
+; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
+; VLOPT-NEXT: vwmaccus.vx v8, a0, v10
+; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; VLOPT-NEXT: vadd.vv v8, v8, v8
 ; VLOPT-NEXT: ret
-  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccus.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
   %2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
   ret <vscale x 4 x i32> %2
 }

llvm/test/CodeGen/RISCV/rvv/vl-opt.ll

Lines changed: 14 additions & 0 deletions
@@ -136,3 +136,17 @@ define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru
   %w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4)
   ret <vscale x 4 x i32> %w
 }
+
+define <vscale x 4 x i32> @dont_optimize_tied_def(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
+; CHECK-LABEL: dont_optimize_tied_def:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vv v8, v10, v11
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vwmacc.vv v8, v10, v11
+; CHECK-NEXT: ret
+  %1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
+  %2 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %1, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl, iXLen 0)
+  ret <vscale x 4 x i32> %2
+}
+
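
The dont_optimize_tied_def test pins down a case the optimizer must refuse: %1 is consumed as the tied destination (the accumulator) of the second vwmacc.vv, and under a tail-undisturbed policy the elements of %1 past %vl may still be read and carried into the result, so the first vwmacc.vv keeps running at VLMAX. A standalone C++ sketch of that guard, with invented names and the simplifying assumption that any tied use forces the full VL:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative model only -- not the LLVM API.
constexpr uint64_t VLMAX = UINT64_MAX;

struct Use {
  uint64_t UserVL; // VL the using instruction runs at
  bool IsTiedDef;  // operand doubles as the user's destination/accumulator
};

// How many elements of the defining instruction's result are actually needed.
uint64_t demandedVL(const std::vector<Use> &Uses) {
  uint64_t Demanded = 0;
  for (const Use &U : Uses) {
    if (U.IsTiedDef)
      return VLMAX; // tail elements stay live, so the producer cannot shrink
    Demanded = std::max(Demanded, U.UserVL);
  }
  return Demanded;
}

int main() {
  std::vector<Use> AddAtVL4 = {{4, false}}; // a plain vadd use at %vl == 4
  std::vector<Use> TiedAccum = {{4, true}}; // like dont_optimize_tied_def
  return (demandedVL(AddAtVL4) == 4 && demandedVL(TiedAccum) == VLMAX) ? 0 : 1;
}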
