Skip to content

Commit 4cb75a2

Browse files
[RISCV][VLOPT] Add support for 11.14 widening integer mul-add instructions
1 parent ae68d53 commit 4cb75a2

File tree

3 files changed

+164
-32
lines changed

3 files changed

+164
-32
lines changed

llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,19 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
404404
case RISCV::VWMULSU_VV:
405405
case RISCV::VWMULSU_VX:
406406
case RISCV::VWMULU_VV:
407-
case RISCV::VWMULU_VX: {
407+
case RISCV::VWMULU_VX:
408+
// Vector Widening Integer Multiply-Add Instructions
409+
// Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
410+
// A SEW-bit*SEW-bit multiply of the sources forms a 2*SEW-bit value, which
411+
// is then added to the 2*SEW-bit Dest. These instructions never have a
412+
// passthru operand.
413+
case RISCV::VWMACCU_VV:
414+
case RISCV::VWMACCU_VX:
415+
case RISCV::VWMACC_VV:
416+
case RISCV::VWMACC_VX:
417+
case RISCV::VWMACCSU_VV:
418+
case RISCV::VWMACCSU_VX:
419+
case RISCV::VWMACCUS_VX: {
408420
unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW;
409421
RISCVII::VLMUL EMUL =
410422
IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul;
@@ -419,18 +431,7 @@ static OperandInfo getOperandInfo(const MachineInstr &MI,
419431
case RISCV::VWADD_WV:
420432
case RISCV::VWADD_WX:
421433
case RISCV::VWSUB_WV:
422-
case RISCV::VWSUB_WX:
423-
// Vector Widening Integer Multiply-Add Instructions
424-
// Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL.
425-
// Even though the add is a 2*SEW addition, the operands of the add are the
426-
// Dest which is 2*SEW and the result of the multiply which is 2*SEW.
427-
case RISCV::VWMACCU_VV:
428-
case RISCV::VWMACCU_VX:
429-
case RISCV::VWMACC_VV:
430-
case RISCV::VWMACC_VX:
431-
case RISCV::VWMACCSU_VV:
432-
case RISCV::VWMACCSU_VX:
433-
case RISCV::VWMACCUS_VX: {
434+
case RISCV::VWSUB_WX: {
434435
bool IsOp1 = HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 1;
435436
bool TwoTimes = IsMODef || IsOp1;
436437
unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW;
@@ -572,9 +573,13 @@ static bool isSupportedInstr(const MachineInstr &MI) {
572573
// Vector Single-Width Integer Multiply-Add Instructions
573574
// FIXME: Add support
574575
// Vector Widening Integer Multiply-Add Instructions
575-
// FIXME: Add support
576-
case RISCV::VWMACC_VX:
576+
case RISCV::VWMACCU_VV:
577577
case RISCV::VWMACCU_VX:
578+
case RISCV::VWMACC_VV:
579+
case RISCV::VWMACC_VX:
580+
case RISCV::VWMACCSU_VV:
581+
case RISCV::VWMACCSU_VX:
582+
case RISCV::VWMACCUS_VX:
578583
// Vector Integer Merge Instructions
579584
// FIXME: Add support
580585
// Vector Integer Move Instructions

llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll

Lines changed: 130 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,44 +1248,157 @@ define <vscale x 4 x i64> @vwmulu_vx(<vscale x 4 x i32> %a, i32 %b, iXLen %vl) {
12481248
ret <vscale x 4 x i64> %2
12491249
}
12501250

1251-
define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
1251+
define <vscale x 4 x i64> @vwmacc_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i64> %d, iXLen %vl) {
1252+
; NOVLOPT-LABEL: vwmacc_vv:
1253+
; NOVLOPT: # %bb.0:
1254+
; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
1255+
; NOVLOPT-NEXT: vwmacc.vv v8, v10, v11
1256+
; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
1257+
; NOVLOPT-NEXT: vwmacc.vv v12, v8, v8
1258+
; NOVLOPT-NEXT: vmv4r.v v8, v12
1259+
; NOVLOPT-NEXT: ret
1260+
;
1261+
; VLOPT-LABEL: vwmacc_vv:
1262+
; VLOPT: # %bb.0:
1263+
; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
1264+
; VLOPT-NEXT: vwmacc.vv v8, v10, v11
1265+
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, tu, ma
1266+
; VLOPT-NEXT: vwmacc.vv v12, v8, v8
1267+
; VLOPT-NEXT: vmv4r.v v8, v12
1268+
; VLOPT-NEXT: ret
1269+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
1270+
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmacc.nxv4i64.nxv4i32(<vscale x 4 x i64> %d, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl, iXLen 0)
1271+
ret <vscale x 4 x i64> %2
1272+
}
1273+
1274+
define <vscale x 4 x i32> @vwmacc_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
12521275
; NOVLOPT-LABEL: vwmacc_vx:
12531276
; NOVLOPT: # %bb.0:
1254-
; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
1255-
; NOVLOPT-NEXT: vwmacc.vx v10, a0, v8
1277+
; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
1278+
; NOVLOPT-NEXT: vwmacc.vx v8, a0, v10
12561279
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
1257-
; NOVLOPT-NEXT: vadd.vv v8, v10, v10
1280+
; NOVLOPT-NEXT: vadd.vv v8, v8, v8
12581281
; NOVLOPT-NEXT: ret
12591282
;
12601283
; VLOPT-LABEL: vwmacc_vx:
12611284
; VLOPT: # %bb.0:
1262-
; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
1263-
; VLOPT-NEXT: vwmacc.vx v10, a0, v8
1285+
; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
1286+
; VLOPT-NEXT: vwmacc.vx v8, a0, v10
12641287
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1265-
; VLOPT-NEXT: vadd.vv v8, v10, v10
1288+
; VLOPT-NEXT: vadd.vv v8, v8, v8
12661289
; VLOPT-NEXT: ret
1267-
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
1290+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
12681291
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
12691292
ret <vscale x 4 x i32> %2
12701293
}
12711294

1272-
define <vscale x 4 x i32> @vwmaccu_vx(<vscale x 4 x i16> %a, i16 %b, iXLen %vl) {
1295+
define <vscale x 4 x i64> @vwmaccu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i64> %d, iXLen %vl) {
1296+
; NOVLOPT-LABEL: vwmaccu_vv:
1297+
; NOVLOPT: # %bb.0:
1298+
; NOVLOPT-NEXT: vmv2r.v v16, v8
1299+
; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
1300+
; NOVLOPT-NEXT: vwmaccu.vv v16, v10, v11
1301+
; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, tu, ma
1302+
; NOVLOPT-NEXT: vwmaccu.vv v12, v8, v16
1303+
; NOVLOPT-NEXT: vmv4r.v v8, v12
1304+
; NOVLOPT-NEXT: ret
1305+
;
1306+
; VLOPT-LABEL: vwmaccu_vv:
1307+
; VLOPT: # %bb.0:
1308+
; VLOPT-NEXT: vmv2r.v v16, v8
1309+
; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
1310+
; VLOPT-NEXT: vwmaccu.vv v16, v10, v11
1311+
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, tu, ma
1312+
; VLOPT-NEXT: vwmaccu.vv v12, v8, v16
1313+
; VLOPT-NEXT: vmv4r.v v8, v12
1314+
; VLOPT-NEXT: ret
1315+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
1316+
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmaccu.nxv4i64.nxv4i32(<vscale x 4 x i64> %d, <vscale x 4 x i32> %a, <vscale x 4 x i32> %1, iXLen %vl, iXLen 0)
1317+
ret <vscale x 4 x i64> %2
1318+
}
1319+
1320+
define <vscale x 4 x i64> @vwmaccu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i64> %d, i32 %e, iXLen %vl) {
12731321
; NOVLOPT-LABEL: vwmaccu_vx:
12741322
; NOVLOPT: # %bb.0:
1275-
; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma
1276-
; NOVLOPT-NEXT: vwmaccu.vx v10, a0, v8
1277-
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
1278-
; NOVLOPT-NEXT: vadd.vv v8, v10, v10
1323+
; NOVLOPT-NEXT: vsetvli a3, zero, e16, m1, tu, ma
1324+
; NOVLOPT-NEXT: vwmaccu.vx v8, a0, v10
1325+
; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, tu, ma
1326+
; NOVLOPT-NEXT: vwmaccu.vx v12, a1, v8
1327+
; NOVLOPT-NEXT: vmv4r.v v8, v12
12791328
; NOVLOPT-NEXT: ret
12801329
;
12811330
; VLOPT-LABEL: vwmaccu_vx:
12821331
; VLOPT: # %bb.0:
1283-
; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma
1284-
; VLOPT-NEXT: vwmaccu.vx v10, a0, v8
1332+
; VLOPT-NEXT: vsetvli zero, a2, e16, m1, tu, ma
1333+
; VLOPT-NEXT: vwmaccu.vx v8, a0, v10
1334+
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, tu, ma
1335+
; VLOPT-NEXT: vwmaccu.vx v12, a1, v8
1336+
; VLOPT-NEXT: vmv4r.v v8, v12
1337+
; VLOPT-NEXT: ret
1338+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
1339+
%2 = call <vscale x 4 x i64> @llvm.riscv.vwmaccu.nxv4i64.i32(<vscale x 4 x i64> %d, i32 %e, <vscale x 4 x i32> %1, iXLen %vl, iXLen 0)
1340+
ret <vscale x 4 x i64> %2
1341+
}
1342+
1343+
define <vscale x 4 x i32> @vwmaccsu_vv(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
1344+
; NOVLOPT-LABEL: vwmaccsu_vv:
1345+
; NOVLOPT: # %bb.0:
1346+
; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, tu, ma
1347+
; NOVLOPT-NEXT: vwmaccsu.vv v8, v10, v11
1348+
; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
1349+
; NOVLOPT-NEXT: vadd.vv v8, v8, v8
1350+
; NOVLOPT-NEXT: ret
1351+
;
1352+
; VLOPT-LABEL: vwmaccsu_vv:
1353+
; VLOPT: # %bb.0:
1354+
; VLOPT-NEXT: vsetvli zero, a0, e16, m1, tu, ma
1355+
; VLOPT-NEXT: vwmaccsu.vv v8, v10, v11
1356+
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1357+
; VLOPT-NEXT: vadd.vv v8, v8, v8
1358+
; VLOPT-NEXT: ret
1359+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
1360+
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
1361+
ret <vscale x 4 x i32> %2
1362+
}
1363+
1364+
define <vscale x 4 x i32> @vwmaccsu_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
1365+
; NOVLOPT-LABEL: vwmaccsu_vx:
1366+
; NOVLOPT: # %bb.0:
1367+
; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
1368+
; NOVLOPT-NEXT: vwmaccsu.vx v8, a0, v10
1369+
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
1370+
; NOVLOPT-NEXT: vadd.vv v8, v8, v8
1371+
; NOVLOPT-NEXT: ret
1372+
;
1373+
; VLOPT-LABEL: vwmaccsu_vx:
1374+
; VLOPT: # %bb.0:
1375+
; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
1376+
; VLOPT-NEXT: vwmaccsu.vx v8, a0, v10
12851377
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1286-
; VLOPT-NEXT: vadd.vv v8, v10, v10
1378+
; VLOPT-NEXT: vadd.vv v8, v8, v8
1379+
; VLOPT-NEXT: ret
1380+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccsu.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
1381+
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
1382+
ret <vscale x 4 x i32> %2
1383+
}
1384+
1385+
define <vscale x 4 x i32> @vwmaccus_vx(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen %vl) {
1386+
; NOVLOPT-LABEL: vwmaccus_vx:
1387+
; NOVLOPT: # %bb.0:
1388+
; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, tu, ma
1389+
; NOVLOPT-NEXT: vwmaccus.vx v8, a0, v10
1390+
; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma
1391+
; NOVLOPT-NEXT: vadd.vv v8, v8, v8
1392+
; NOVLOPT-NEXT: ret
1393+
;
1394+
; VLOPT-LABEL: vwmaccus_vx:
1395+
; VLOPT: # %bb.0:
1396+
; VLOPT-NEXT: vsetvli zero, a1, e16, m1, tu, ma
1397+
; VLOPT-NEXT: vwmaccus.vx v8, a0, v10
1398+
; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma
1399+
; VLOPT-NEXT: vadd.vv v8, v8, v8
12871400
; VLOPT-NEXT: ret
1288-
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccu.nxv4i32.i16(<vscale x 4 x i32> poison, i16 %b, <vscale x 4 x i16> %a, iXLen -1, iXLen 0)
1401+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmaccus.nxv4i32.i16(<vscale x 4 x i32> %a, i16 %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
12891402
%2 = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %1, <vscale x 4 x i32> %1, iXLen %vl)
12901403
ret <vscale x 4 x i32> %2
12911404
}

llvm/test/CodeGen/RISCV/rvv/vl-opt.ll

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,3 +136,17 @@ define <vscale x 4 x i32> @different_imm_vl_with_tu(<vscale x 4 x i32> %passthru
136136
%w = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i32> %a,iXLen 4)
137137
ret <vscale x 4 x i32> %w
138138
}
139+
140+
define <vscale x 4 x i32> @dont_optimize_tied_def(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl) {
141+
; CHECK-LABEL: dont_optimize_tied_def:
142+
; CHECK: # %bb.0:
143+
; CHECK-NEXT: vsetvli a1, zero, e16, m1, tu, ma
144+
; CHECK-NEXT: vwmacc.vv v8, v10, v11
145+
; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
146+
; CHECK-NEXT: vwmacc.vv v8, v10, v11
147+
; CHECK-NEXT: ret
148+
%1 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen -1, iXLen 0)
149+
%2 = call <vscale x 4 x i32> @llvm.riscv.vwmacc.nxv4i32.nxv4i16(<vscale x 4 x i32> %1, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, iXLen %vl, iXLen 0)
150+
ret <vscale x 4 x i32> %2
151+
}
152+

0 commit comments

Comments
 (0)