[PowerPC] Add Dense Math binary integer outer-Product accumulate to DMR Instructions #130791
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-powerpc

Author: Maryam Moghadas (maryammo)

Changes

This commit adds the following Dense Math Facility integer calculation instructions: dmxvi8gerx4, dmxvi8gerx4pp, dmxvi8gerx4spp, pmdmxvi8gerx4, pmdmxvi8gerx4pp, and pmdmxvi8gerx4spp, along with their corresponding intrinsics and tests.

Patch is 24.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130791.diff

7 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 6f49ed39d8a09..d5c45c03b0eb6 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -280,6 +280,13 @@ multiclass PowerPC_MMA_ACC_PP_Intrinsic<list<LLVMType> args> {
[IntrNoMem]>;
}
+multiclass PowerPC_MMA_DMR_PP_Intrinsic<list<LLVMType> args> {
+ def NAME: DefaultAttrsIntrinsic<[llvm_v1024i1_ty], args, [IntrNoMem]>;
+ def pp : DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+ !listconcat([llvm_v1024i1_ty], args),
+ [IntrNoMem]>;
+}
+
//===----------------------------------------------------------------------===//
// PowerPC Altivec Intrinsic Class Definitions.
//
@@ -1701,6 +1708,20 @@ let TargetPrefix = "ppc" in {
[llvm_v512i1_ty, llvm_v16i8_ty, llvm_v16i8_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
+ defm int_ppc_mma_dmxvi8gerx4 :
+ PowerPC_MMA_DMR_PP_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
+ defm int_ppc_mma_pmdmxvi8gerx4 :
+ PowerPC_MMA_DMR_PP_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty]>;
+ def int_ppc_mma_dmxvi8gerx4spp :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+ [llvm_v1024i1_ty, llvm_v256i1_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
+ def int_ppc_mma_pmdmxvi8gerx4spp :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
+ [llvm_v1024i1_ty, llvm_v256i1_ty, llvm_v16i8_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
}
// XL Compat intrinsics.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
index 4da2969857d55..9d5bf1eab13ad 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
@@ -11,6 +11,13 @@
//
//===----------------------------------------------------------------------===//
+// Mask immediates for MMA instructions (2, 4 and 8 bits).
+def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
+def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
+def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
+
+def MMA : Predicate<"Subtarget->hasMMA()">;
+
class XX3Form_AT3_XABp5_P1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
string asmstr, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, NoItinerary> {
@@ -69,6 +76,96 @@ class XForm_ATB3<bits<6> opcode, bits<5> o, bits<10> xo, dag OOL, dag IOL,
let Inst{31} = 0;
}
+class XX3Form_AT3_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<5> XAp;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = AT;
+ let Inst{9-10} = 0;
+ let Inst{11-14} = XAp{3-0};
+ let Inst{15} = 0;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-28} = xo;
+ let Inst{29} = XAp{4};
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+class MMIRR_XX3Form_X8YP4_XAp5B6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<3> AT;
+ bits<6> XAp;
+ bits<6> XB;
+ bits<8> XMSK;
+ bits<4> YMSK;
+ bits<4> PMSK;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 3;
+ let Inst{8-11} = 9;
+ let Inst{12-15} = 0;
+ let Inst{16-19} = PMSK;
+ let Inst{20-27} = XMSK;
+ let Inst{28-31} = YMSK;
+
+ // The instruction.
+ let Inst{38-40} = AT;
+ let Inst{41-42} = 0;
+ let Inst{43-46} = XAp{3-0};
+ let Inst{47} = 0;
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-60} = xo;
+ let Inst{61} = XAp{4};
+ let Inst{62} = XB{5};
+ let Inst{63} = 0;
+}
+
+multiclass DMR_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ let Predicates = [IsISAFuture] in {
+ def NAME :
+ XX3Form_AT3_XAp5B6<opcode, !or(xo, 0x01), (outs dmr:$AT), IOL,
+ !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PP :
+ XX3Form_AT3_XAp5B6<opcode, xo, (outs dmr:$AT), !con((ins dmr:$ATi), IOL),
+ !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
+multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
+ string asmstr> {
+ defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
+ let Predicates = [IsISAFuture] in {
+ def PM#NAME :
+ MMIRR_XX3Form_X8YP4_XAp5B6<
+ opcode, !or(xo, 0x01), (outs dmr:$AT),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)),
+ !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"@earlyclobber $AT">;
+ def PM#NAME#PP :
+ MMIRR_XX3Form_X8YP4_XAp5B6<
+ opcode, xo, (outs dmr:$AT),
+ !con((ins dmr:$ATi),
+ !con(IOL, (ins u8imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))),
+ !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"),
+ IIC_VecFP, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+ }
+}
+
let Predicates = [IsISAFuture] in {
def DMXXEXTFDMR512 : XX3Form_AT3_XABp5_P1<60, 226,
(outs vsrprc:$XAp, vsrprc:$XBp),
@@ -113,4 +210,58 @@ let Predicates = [IsISAFuture] in {
def DMSETDMRZ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins),
"dmsetdmrz $AT", NoItinerary, []>;
+
+}
+
+// MMA+ accumulating/non-accumulating instructions.
+
+// DMXVI8GERX4, DMXVI8GERX4PP, PMDMXVI8GERX4, PMDMXVI8GERX4PP
+defm DMXVI8GERX4 : DMR_UM_M448_XOEO<59, 10, (ins vsrprc:$XAp, vsrc:$XB),
+ "dmxvi8gerx4", "$AT, $XAp, $XB">;
+
+let Predicates = [MMA, IsISAFuture] in {
+ def DMXVI8GERX4SPP :
+ XX3Form_AT3_XAp5B6<59, 98, (outs dmr:$AT), (ins dmr:$ATi, vsrprc:$XAp, vsrc:$XB),
+ "dmxvi8gerx4spp $AT, $XAp, $XB", IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+}
+
+let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+ def PMDMXVI8GERX4SPP :
+ MMIRR_XX3Form_X8YP4_XAp5B6<59, 98, (outs dmr:$AT),
+ (ins dmr:$ATi, vsrprc:$XAp,vsrc:$XB, u8imm:$XMSK,
+ u4imm:$YMSK, u4imm:$PMSK),
+ "pmdmxvi8gerx4spp $AT, $XAp, $XB, $XMSK, $YMSK, $PMSK",
+ IIC_VecGeneral, []>,
+ RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
+}
+
+// MMA+ Intrinsics
+let Predicates = [MMA, IsISAFuture] in {
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4 v256i1:$XAp, v16i8:$XB)),
+ (DMXVI8GERX4 $XAp, RCCp.BToVSRC)>;
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_dmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB)),
+ (DMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC)>;
+}
+
+let Predicates = [MMA, PrefixInstrs, IsISAFuture] in {
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4 v256i1:$XAp, v16i8:$XB, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
+ (PMDMXVI8GERX4 $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4pp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk4Imm:$PMSK)),
+ (PMDMXVI8GERX4PP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
+
+ def : Pat<(v1024i1 (int_ppc_mma_pmdmxvi8gerx4spp v1024i1:$ATi, v256i1:$XAp, v16i8:$XB,
+ Msk8Imm:$XMSK, Msk4Imm:$YMSK,
+ Msk2Imm:$PMSK)),
+ (PMDMXVI8GERX4SPP $ATi, $XAp, RCCp.BToVSRC, Msk8Imm:$XMSK,
+ Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
index 161d4d3c492f3..fd8418a6c50ea 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td
@@ -1,12 +1,4 @@
-// Mask immediates for MMA instructions (2, 4 and 8 bits).
-def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
-def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
-def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
-
-def MMA : Predicate<"Subtarget->hasMMA()">;
-
-
// Multiclass definitions for MMA accumulator instructions.
// ----------------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
new file mode 100644
index 0000000000000..774b13e0fd2d3
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \
+; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1>, <16 x i8>)
+
+define void @test_dmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvi8gerx4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxv vs0, 0(r4)
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvi8gerx4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <256 x i1>, ptr %vpp, align 32
+ %1 = load <16 x i8>, ptr %vcp, align 32
+ %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> %0, <16 x i8> %1)
+ store <1024 x i1> %2, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvi8gerx4pp:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: dmxvi8gerx4pp dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvi8gerx4pp:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxvi8gerx4pp dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <1024 x i1>, ptr %vop, align 64
+ %1 = load <256 x i1>, ptr %vpp, align 32
+ %2 = load <16 x i8>, ptr %vcp, align 32
+ %3 = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4pp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2)
+ store <1024 x i1> %3, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1>, <256 x i1>, <16 x i8>)
+
+define void @test_dmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_dmxvi8gerx4spp:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: dmxvi8gerx4spp dmr0, vsp34, vs0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_dmxvi8gerx4spp:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxvi8gerx4spp dmr0, vsp34, vs0
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <1024 x i1>, ptr %vop, align 64
+ %1 = load <256 x i1>, ptr %vpp, align 32
+ %2 = load <16 x i8>, ptr %vcp, align 32
+ %3 = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4spp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2)
+ store <1024 x i1> %3, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvi8gerx4pp:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: pmdmxvi8gerx4pp dmr0, vsp34, vs0, 0, 0, 0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_pmdmxvi8gerx4pp:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r5)
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: pmdmxvi8gerx4pp dmr0, vsp34, vs0, 0, 0, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r6)
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <1024 x i1>, ptr %vop, align 64
+ %1 = load <256 x i1>, ptr %vpp, align 32
+ %2 = load <16 x i8>, ptr %vcp, align 32
+ %3 = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2, i32 0, i32 0, i32 0)
+ store <1024 x i1> %3, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1>, <16 x i8>, i32, i32, i32)
+
+define void @test_pmdmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvi8gerx4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxv vs0, 0(r4)
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: pmdmxvi8gerx4 dmr0, vsp34, vs0, 0, 0, 0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r5)
+; CHECK-NEXT: stxvp vsp36, 64(r5)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r5)
+; CHECK-NEXT: stxvp vsp36, 0(r5)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_pmdmxvi8gerx4:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: pmdmxvi8gerx4 dmr0, vsp34, vs0, 0, 0, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
+; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: blr
+entry:
+ %0 = load <256 x i1>, ptr %vpp, align 32
+ %1 = load <16 x i8>, ptr %vcp, align 32
+ %2 = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> %0, <16 x i8> %1, i32 0, i32 0, i32 0)
+ store <1024 x i1> %2, ptr %resp, align 64
+ ret void
+}
+
+declare <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1>, <256 x i1>, <16 x i8>, i32, i32, i32)
+
+define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
+; CHECK-LABEL: test_pmdmxvi8gerx4spp:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 32(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-NEXT: lxvp vsp34, 64(r3)
+; CHECK-NEXT: lxvp vsp36, 96(r3)
+; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: lxv vs0, 0(r5)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: pmdmxvi8gerx4spp dmr0, vsp34, vs0, 0, 0, 0
+; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: stxvp vsp34, 96(r6)
+; CHECK-NEXT: stxvp vsp36, 64(r6)
+; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: stxvp vsp34, 32(r6)
+; CHECK-NEXT: stxvp vsp36, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: test_pmdmxvi8gerx4spp:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
+; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
+; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
+; CHECK-B...
[truncated]
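
Since the patch text is truncated above, here is a minimal usage sketch of one of the new intrinsics. It mirrors the test_dmxvi8gerx4 case added in llvm/test/CodeGen/PowerPC/dmf-outer-product.ll; the function and value names below are illustrative and not part of the patch.

; Sketch: call the new dmxvi8gerx4 intrinsic and store its 1024-bit DMR result.
declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1>, <16 x i8>)

define void @dmxvi8gerx4_sketch(ptr %vpp, ptr %vcp, ptr %resp) {
entry:
  ; Load the VSX register-pair operand and the single vector operand.
  %pair = load <256 x i1>, ptr %vpp, align 32
  %vec = load <16 x i8>, ptr %vcp, align 16
  ; The intrinsic returns the full 1024-bit dense-math accumulator value.
  %acc = tail call <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1> %pair, <16 x i8> %vec)
  store <1024 x i1> %acc, ptr %resp, align 64
  ret void
}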
def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;

def MMA : Predicate<"Subtarget->hasMMA()">;
Can this stuff be moved to PPCInstrInfo.td instead? It doesn't make sense to me for MMA to depend on MMA Future.
You are right, it is moved to PPCInstrInfo.td.
multiclass DMR_UM_M448_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase,
                            string asmstr> {
  defm NAME : DMR_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
  let Predicates = [IsISAFuture] in {
Should there be a PrefixInstrs predicate here?
Yes, I added both MMA and PrefixInstrs.
    PowerPC_MMA_DMR_PP_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty]>;
  defm int_ppc_mma_pmdmxvi8gerx4 :
    PowerPC_MMA_DMR_PP_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty,
                                  llvm_i32_ty, llvm_i32_ty]>;
I think the 32-bit int args should probably be marked as ImmArg.
When I add ImmArg to the intrinsics definition, the original ImmLeaf types that make sure the mask values fit into 2/4/8 bits won't work:
def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>;
def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>;
def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>;
and an IR test like the following
declare <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1>, <256 x i1>, <16 x i8>, i32 immarg, i32 immarg, i32 immarg)
define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
entry:
%0 = load <1024 x i1>, ptr %vop, align 64
%1 = load <256 x i1>, ptr %vpp, align 32
%2 = load <16 x i8>, ptr %vcp, align 32
%3 = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4spp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2, i32 0, i32 0, i32 0)
store <1024 x i1> %3, ptr %resp, align 64
ret void
}
fails with
LLVM ERROR: Cannot select: intrinsic %llvm.ppc.mma.pmdmxvi8gerx4pp
By changing the Pat for them to use i32 instead of Msk*Imm, the IR compiles fine; however, it won't have the original constraint on the masks.
Hmmm. Not sure why that is failing, but I am okay staying with your original code.
  def int_ppc_mma_pmdmxvi8gerx4spp :
    DefaultAttrsIntrinsic<[llvm_v1024i1_ty],
                          [llvm_v1024i1_ty, llvm_v256i1_ty, llvm_v16i8_ty,
                           llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
ImmArg here as well.
; CHECK-BE-NEXT:    stxvp vsp34, 0(r5)
; CHECK-BE-NEXT:    blr
entry:
  %0 = load <256 x i1>, ptr %vpp, align 32
Use named results rather than numbered.
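
For illustration, the quoted test body with named values rather than numbered ones (the names are arbitrary, chosen only to show the requested style):

entry:
  %vp = load <256 x i1>, ptr %vpp, align 32
  %vc = load <16 x i8>, ptr %vcp, align 32
  %acc = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1> %vp, <16 x i8> %vc, i32 0, i32 0, i32 0)
  store <1024 x i1> %acc, ptr %resp, align 64
  ret void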
  %0 = load <1024 x i1>, ptr %vop, align 64
  %1 = load <256 x i1>, ptr %vpp, align 32
  %2 = load <16 x i8>, ptr %vcp, align 32
  %3 = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2, i32 0, i32 0, i32 0)
Can we make the masks non-zero? Otherwise the result is a constant 0.
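
For example, the call could use illustrative non-zero mask values (for pmdmxvi8gerx4pp the patterns in this patch accept an 8-bit XMSK and 4-bit YMSK/PMSK):

  %3 = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4pp(<1024 x i1> %0, <256 x i1> %1, <16 x i8> %2, i32 255, i32 15, i32 15)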
LGTM
…MR Instructions

This commit adds the following Dense Math Facility integer calculation instructions: dmxvi8gerx4, dmxvi8gerx4pp, dmxvi8gerx4spp, pmdmxvi8gerx4, pmdmxvi8gerx4pp, and pmdmxvi8gerx4spp, along with their corresponding intrinsics and tests.
There was a merge conflict, which I resolved and force-pushed.