Skip to content

[PowerPC] Add intrinsics and tests for basic Dense Math enablement instructions #129913

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsPowerPC.td
Original file line number Diff line number Diff line change
Expand Up @@ -1644,6 +1644,16 @@ let TargetPrefix = "ppc" in {
def int_ppc_mma_xxsetaccz :
DefaultAttrsIntrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;

// Basic Dense Math intrinsics operating on 1024-bit (v1024i1) values.
// All three are declared IntrNoMem.

// No operands; returns a v1024i1 value.
// NOTE(review): presumably yields an all-zero DMR value, by analogy with
// xxsetaccz above -- confirm against the ISA description.
def int_ppc_mma_dmsetdmrz :
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [], [IntrNoMem]>;

// One v1024i1 operand; returns a v1024i1 value.
def int_ppc_mma_dmmr :
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty], [IntrNoMem]>;

// Two v1024i1 operands; returns a v1024i1 value.
def int_ppc_mma_dmxor :
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty,
llvm_v1024i1_ty], [IntrNoMem]>;

// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,15 @@ let Predicates = [IsISAFuture] in {
"dmxxinstfdmr256 $AT, $XBp, $P", []>;

def DMMR : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB),
"dmmr $AT, $AB", []>;
"dmmr $AT, $AB",
[(set v1024i1:$AT, (int_ppc_mma_dmmr v1024i1:$AB))]>;

def DMXOR : XForm_ATB3<31, 7, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB),
"dmxor $AT, $AB", []>,
"dmxor $AT, $AB",
[(set v1024i1:$AT, (int_ppc_mma_dmxor v1024i1:$ATi, v1024i1:$AB))]>,
RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;

def DMSETDMRZ : XForm_AT3<31, 2, 177, (outs dmr:$AT), (ins),
"dmsetdmrz $AT", NoItinerary, []>;
"dmsetdmrz $AT", NoItinerary,
[(set v1024i1:$AT, (int_ppc_mma_dmsetdmrz))]>;
}
134 changes: 134 additions & 0 deletions llvm/test/CodeGen/PowerPC/dmr-enable.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix \
; RUN: -mcpu=future -ppc-asm-full-reg-names \
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE

; Calls @llvm.ppc.mma.dmsetdmrz and stores the <1024 x i1> result to %resp.
; %vp1 is not used by this function.
define void @tdmrz(ptr nocapture readonly %vp1, ptr nocapture %resp) {
; CHECK-LABEL: tdmrz:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: dmsetdmrz dmr0
; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-NEXT: stxvp vsp34, 96(r4)
; CHECK-NEXT: stxvp vsp36, 64(r4)
; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-NEXT: stxvp vsp34, 32(r4)
; CHECK-NEXT: stxvp vsp36, 0(r4)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: tdmrz:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: dmsetdmrz dmr0
; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r4)
; CHECK-BE-NEXT: stxvp vsp34, 64(r4)
; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r4)
; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
; CHECK-BE-NEXT: blr
entry:
%z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
store <1024 x i1> %z, ptr %resp, align 32
ret void
}

; Loads a <1024 x i1> value from %vp1, passes it through
; @llvm.ppc.mma.dmmr, and stores the result to %resp.
define void @tdmmr(ptr nocapture readonly %vp1, ptr nocapture %resp) {
; CHECK-LABEL: tdmmr:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: lxvp vsp34, 64(r3)
; CHECK-NEXT: lxvp vsp36, 96(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
; CHECK-NEXT: dmmr dmr0, dmr0
; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-NEXT: stxvp vsp34, 96(r4)
; CHECK-NEXT: stxvp vsp36, 64(r4)
; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-NEXT: stxvp vsp34, 32(r4)
; CHECK-NEXT: stxvp vsp36, 0(r4)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: tdmmr:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
; CHECK-BE-NEXT: dmmr dmr0, dmr0
; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r4)
; CHECK-BE-NEXT: stxvp vsp34, 64(r4)
; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r4)
; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
; CHECK-BE-NEXT: blr
entry:
%l = load <1024 x i1>, ptr %vp1, align 32
%c = call <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1> %l)
store <1024 x i1> %c, ptr %resp, align 32
ret void
}

define void @tdmxor(ptr nocapture readonly %vp1, ptr %vp2, ptr nocapture %resp) {
; CHECK-LABEL: tdmxor:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: lxvp vsp34, 64(r3)
; CHECK-NEXT: lxvp vsp36, 96(r3)
; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
; CHECK-NEXT: lxvp vsp34, 0(r4)
; CHECK-NEXT: lxvp vsp36, 32(r4)
; CHECK-NEXT: dmxxinstfdmr512 wacc_hi1, vsp36, vsp34, 1
; CHECK-NEXT: lxvp vsp34, 64(r4)
; CHECK-NEXT: lxvp vsp36, 96(r4)
; CHECK-NEXT: dmxxinstfdmr512 wacc1, vsp36, vsp34, 0
; CHECK-NEXT: dmxor dmr0, dmr1
; CHECK-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
; CHECK-NEXT: stxvp vsp36, 64(r5)
; CHECK-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-NEXT: stxvp vsp34, 32(r5)
; CHECK-NEXT: stxvp vsp36, 0(r5)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: tdmxor:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxvp vsp34, 96(r3)
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: lxvp vsp34, 32(r3)
; CHECK-BE-NEXT: lxvp vsp36, 0(r3)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I am looking at an older RFC; the version I am looking at contains dmxxinstdmr512, not dmxxinstfdmr512.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think dmxxinstfdmr512 was renamed to dmxxinstdmr512.

; CHECK-BE-NEXT: lxvp vsp34, 96(r4)
; CHECK-BE-NEXT: lxvp vsp36, 64(r4)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi1, vsp36, vsp34, 1
; CHECK-BE-NEXT: lxvp vsp34, 32(r4)
; CHECK-BE-NEXT: lxvp vsp36, 0(r4)
; CHECK-BE-NEXT: dmxxinstfdmr512 wacc1, vsp36, vsp34, 0
; CHECK-BE-NEXT: dmxor dmr0, dmr1
; CHECK-BE-NEXT: dmxxextfdmr512 wacc_hi0, vsp34, vsp36, 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For dmxxextfdmr512, shouldn't the signature be something like dmxxextfdmr512 vsp, vsp, wacc, 0|1?

dmxxextfdmr512 XAp,XBp,AS,P

Register operand data layout: 
* src : DMR[AS][4xP[+0..3]]
* tgt : VSR[XAp[+1]] VSR[XBp[+1]]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The older RFC, under which this instruction was added, had the signature dmxxextfdmr512 AS,XAp,XBp,P; the latest one has dmxxextfdmr512 XAp,XBp,AS,P.

; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
; CHECK-BE-NEXT: stxvp vsp34, 64(r5)
; CHECK-BE-NEXT: dmxxextfdmr512 wacc0, vsp34, vsp36, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r5)
; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
; CHECK-BE-NEXT: blr
entry:
%l = load <1024 x i1>, ptr %vp1, align 32
%r = load <1024 x i1>, ptr %vp2, align 32
%x = call <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1> %l, <1024 x i1> %r)
store <1024 x i1> %x, ptr %resp, align 32
ret void
}

declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)